import json
import pandas as pd
from missense_kinase_toolkit.databases import requests_wrapper, utils_requests
[docs]
class Pfam:
"""Class to interact with the Pfam API."""
[docs]
def __init__(
self,
uniprot_id: str,
) -> None:
"""Initialize Pfam Class object.
Attributes
----------
url : str
Pfam API URL
"""
self.url = "https://www.ebi.ac.uk/interpro/api/entry/pfam/protein/UniProt/"
self.uniprot_id = uniprot_id
self._pfam = self.query_pfam_api()
[docs]
def query_pfam_api(self):
"""Queries Pfam API for UniProt ID as DataFrame object.
Returns
-------
pd.DataFrame | str | None
DataFrame with Pfam domain information if request is successful, None if response is empty or request fails
"""
url = f"{self.url}{self.uniprot_id}"
header = {"Accept": "application/json"}
res = requests_wrapper.get_cached_session().get(
url,
headers=header
)
if res.ok:
if len(res.text) == 0:
print(f"No PFAM domains found: {self.uniprot_id}")
return None
else:
list_json = json.loads(res.text)["results"]
# metadata for UniProt ID
list_metadata = [entry["metadata"] for entry in list_json]
list_metadata = [{"pfam_accession" if k == "accession" else k:v for k,v in entry.items()} for entry in list_metadata]
# Pfam domains locations
list_locations = [entry["proteins"][0]["entry_protein_locations"][0]["fragments"][0] for entry in list_json]
# model information
list_model = [entry["proteins"][0]["entry_protein_locations"][0] for entry in list_json]
[entry.pop("fragments", None) for entry in list_model]
# protein information
# do last because pop is an in-place operation
list_protein = [entry["proteins"][0] for entry in list_json]
[entry.pop("entry_protein_locations", None) for entry in list_protein]
list_protein = [{"uniprot" if k == "accession" else k:v for k,v in entry.items()} for entry in list_protein]
df_concat = pd.concat(
[
pd.DataFrame(list_protein),
pd.DataFrame(list_metadata),
pd.DataFrame(list_locations),
pd.DataFrame(list_model)
],
axis=1
)
return df_concat
else:
utils_requests.print_status_code_if_res_not_ok(res)
return None
[docs]
def find_pfam_domain(
input_id: str,
input_position: int,
df_ref: pd.DataFrame,
col_ref_id: str,
col_ref_start: None | str = None,
col_ref_end: None | str = None,
col_ref_domain : None | str = None,
) -> str | None:
"""Find Pfam domain for a given HGNC symbol and position
Parameters
----------
input_id : str
Input ID that matches
input_position : int
Codon position
df_ref : pd.DataFrame
DataFrame with Pfam domain information
col_ref_id : str
Column that contains the IDs to match to in the df_ref dataframe
col_ref_start : None | str
Column containing the domain start position; if None defaults to "start" (Pfam API default)
col_ref_end : None | str
Column containing the domain end position; if None defaults to "end" (Pfam API default)
col_ref_domain : None | str
Column containing the domain name; if None defaults to "name" (Pfam API default)
Returns
-------
str | None
Pfam domain if found, None if not found
"""
if col_ref_start is None:
col_ref_start = "start"
if col_ref_end is None:
col_ref_end = "end"
if col_ref_domain is None:
col_ref_domain = "name"
df_temp = df_ref.loc[df_ref[col_ref_id] == input_id].reset_index()
try:
domain = df_temp.loc[((input_position >= df_temp[col_ref_start]) &
(input_position <= df_temp[col_ref_end])), col_ref_domain].values[0]
return domain
except:
return None