Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,001 changes: 149 additions & 852 deletions docs/usage/embeddings_analysis.ipynb

Large diffs are not rendered by default.

76 changes: 50 additions & 26 deletions docs/usage/mutation_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,18 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/nab/anaconda3/envs/pyeed_niklas_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import sys\n",
"\n",
Expand All @@ -38,15 +47,15 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"📡 Connected to database.\n",
"All data has been wiped from the database.\n"
"The provided date does not match the current date. Date is you gave is 2025-03-19 actual date is 2025-04-09\n"
]
}
],
Expand Down Expand Up @@ -76,7 +85,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -102,18 +111,21 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ed852d438ab480fa4d1c6129eacfd26",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">/home/nab/anaconda3/envs/pyeed_niklas_env/lib/python3.10/site-packages/rich/live.py:231: UserWarning: install \n",
"\"ipywidgets\" for Jupyter support\n",
" warnings.warn('install \"ipywidgets\" for Jupyter support')\n",
"</pre>\n"
],
"text/plain": [
"Output()"
"/home/nab/anaconda3/envs/pyeed_niklas_env/lib/python3.10/site-packages/rich/live.py:231: UserWarning: install \n",
"\"ipywidgets\" for Jupyter support\n",
" warnings.warn('install \"ipywidgets\" for Jupyter support')\n"
]
},
"metadata": {},
Expand All @@ -123,7 +135,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Region ids: [143, 129, 128, 69, 9]\n",
"Region ids: [5206, 5205, 5203, 5201, 5207]\n",
"len of ids: 5\n"
]
},
Expand Down Expand Up @@ -182,7 +194,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -199,7 +211,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -230,19 +242,31 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'from_positions': [241, 125, 272], 'to_positions': [241, 125, 272], 'from_monomers': ['R', 'V', 'D'], 'to_monomers': ['S', 'I', 'N']}\n"
"{'from_positions': [272, 241, 125], 'to_positions': [272, 241, 125], 'from_monomers': ['D', 'R', 'V'], 'to_monomers': ['N', 'S', 'I']}\n"
]
}
],
"source": [
"print(mutations_protein)"
"print(mutations_protein)\n",
"\n",
"\n",
"# remove duplicate relationships; there are many duplicates between the same DNA and the same organism\n",
"# just keep the first one and remove the rest\n",
"query_remove_double_relationship = \"\"\"\n",
"MATCH (d:DNA {accession_id: 'KT405476.1'})-[r:ORIGINATES_FROM]-(e)\n",
"WITH d, r, e\n",
"ORDER BY id(r)\n",
"LIMIT 1\n",
"DELETE r\n",
"\"\"\"\n",
"\n"
]
},
{
Expand All @@ -263,21 +287,21 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mutation on position 705 -> 705 with a nucleotide change of G -> A\n",
"Mutation on position 395 -> 395 with a nucleotide change of T -> G\n",
"Mutation on position 137 -> 137 with a nucleotide change of A -> G\n",
"Mutation on position 17 -> 17 with a nucleotide change of T -> C\n",
"Mutation on position 473 -> 473 with a nucleotide change of T -> C\n",
"Mutation on position 395 -> 395 with a nucleotide change of T -> G\n",
"Mutation on position 198 -> 198 with a nucleotide change of C -> A\n",
"Mutation on position 716 -> 716 with a nucleotide change of G -> A\n",
"Mutation on position 705 -> 705 with a nucleotide change of G -> A\n",
"Mutation on position 473 -> 473 with a nucleotide change of T -> C\n",
"Mutation on position 720 -> 720 with a nucleotide change of A -> C\n",
"Mutation on position 198 -> 198 with a nucleotide change of C -> A\n"
"Mutation on position 137 -> 137 with a nucleotide change of A -> G\n"
]
}
],
Expand All @@ -296,7 +320,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "pyeed_niklas",
"display_name": "pyeed_niklas_env",
"language": "python",
"name": "python3"
},
Expand All @@ -310,7 +334,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
"version": "3.10.16"
}
},
"nbformat": 4,
Expand Down
98 changes: 93 additions & 5 deletions src/pyeed/adapter/uniprot_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Organism,
Protein,
Reaction,
Region,
Site,
)

Expand Down Expand Up @@ -64,25 +65,112 @@ def add_to_db(self, response: Response) -> None:
self.add_reaction(record, protein)

self.add_sites(record, protein)
self.add_regions(record, protein)
self.add_catalytic_activity(record, protein)
self.add_go(record, protein)

def add_sites(self, record: dict[str, Any], protein: Protein) -> None:
    """Create ``Site`` nodes for BINDING and ACT_SITE features of a UniProt
    record and connect them to *protein* with their residue positions.

    Keys of ``data_dict`` are ``"<name>$<kind>"`` so a single dict can hold
    both binding sites (keyed by ligand name) and active sites (keyed by the
    feature category) without the two kinds colliding.

    Args:
        record: Parsed UniProt entry; features are read from
            ``record["features"]`` (missing key treated as "no features").
        protein: Already-persisted protein node to attach the sites to.
    """
    data_dict: dict[str, list[int]] = defaultdict(list)

    for feature in record.get("features", []):
        if feature["type"] == "BINDING":
            # One entry per residue of the inclusive [begin, end] range.
            for position in range(int(feature["begin"]), int(feature["end"]) + 1):
                data_dict[feature["ligand"]["name"] + "$binding"].append(position)
        elif feature["type"] == "ACT_SITE":
            for position in range(int(feature["begin"]), int(feature["end"]) + 1):
                data_dict[feature["category"] + "$site"].append(position)

    for entry, positions in data_dict.items():
        # rpartition keeps the whole name intact even if it contains '$'.
        name, _, kind = entry.rpartition("$")
        if kind == "binding":
            annotation = Annotation.BINDING_SITE.value
        elif kind == "site":
            annotation = Annotation.ACTIVE_SITE.value
        else:
            # Defensive: keys are only ever built with the two suffixes above.
            continue

        site = Site(
            name=name,
            annotation=annotation,
        )
        site.save()

        protein.site.connect(site, {"positions": positions})

def add_regions(self, record: dict[str, Any], protein: Protein) -> None:
    """Create ``Region`` nodes for secondary-structure and processing
    features of a UniProt record and connect them to *protein* with their
    start/end coordinates.

    Args:
        record: Parsed UniProt entry; features are read from
            ``record["features"]`` (missing key treated as "no features").
        protein: Already-persisted protein node to attach the regions to.
    """
    # Map the UniProt feature type to the '$'-tagged name suffix and the
    # annotation value stored on the Region node. One table replaces the
    # previous pair of five-branch if/elif chains and guarantees every
    # handled type has an annotation (no unbound-variable path).
    feature_map: dict[str, tuple[str, str]] = {
        "HELIX": ("helix", Annotation.ALPHAHELIX.value),
        "STRAND": ("strand", Annotation.BETASTRAND.value),
        "TURN": ("turn", Annotation.TURN.value),
        "SIGNAL": ("signal", Annotation.SIGNAL.value),
        "PROPEP": ("propep", Annotation.PROPEP.value),
    }

    for feature in record.get("features", []):
        mapped = feature_map.get(feature["type"])
        if mapped is None:
            continue  # feature type we do not model as a Region
        suffix, annotation = mapped

        region = Region(
            name=feature["category"] + "$" + suffix,
            annotation=annotation,
        )
        region.save()

        # Start/end are stored on the relationship, mirroring how sites
        # store their positions.
        protein.region.connect(
            region,
            {"start": int(feature["begin"]), "end": int(feature["end"])},
        )

def add_catalytic_activity(self, record: dict[str, Any], protein: Protein) -> None:
    """Persist CATALYTIC_ACTIVITY comments of a UniProt record as
    ``Reaction`` nodes and connect them to *protein*.

    Args:
        record: Parsed UniProt entry; comments are read from
            ``record["comments"]``. Entries without comments are common
            and are treated as "no catalytic activity", not as an error.
        protein: Already-persisted protein node to attach reactions to.
    """
    try:
        for reference in record.get("comments", []):
            if reference["type"] != "CATALYTIC_ACTIVITY":
                continue
            catalytic_annotation = Reaction.get_or_save(
                # Rhea id may be absent; store None rather than "None".
                rhea_id=str(reference["id"]) if reference.get("id") else None,
            )
            # Connect only if the Protein model exposes a reaction
            # relationship (kept for backward compatibility).
            if hasattr(protein, "reaction"):
                protein.reaction.connect(catalytic_annotation)

    except Exception as e:
        # Boundary catch: one malformed entry must not abort the whole
        # import run; log and move on.
        logger.error(
            f"Error saving catalytic activity for {protein.accession_id}: {e}"
        )

def get_substrates_and_products_from_rhea(
self, rhea_id: str
) -> dict[str, List[str]]:
Expand Down
2 changes: 2 additions & 0 deletions src/pyeed/analysis/embedding_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ def find_nearest_neighbors_based_on_vector_index(
db: DatabaseConnector,
index_name: str = "embedding_index",
number_of_neighbors: int = 50,
skip: int = 0,
) -> list[tuple[str, float]]:
"""
This function finds the nearest neighbors of a query protein based on the vector index.
Expand Down Expand Up @@ -412,6 +413,7 @@ def find_nearest_neighbors_based_on_vector_index(
YIELD node AS fprotein, score
WHERE score > 0.95
RETURN fprotein.accession_id, score
SKIP {skip}
"""
results = db.execute_read(query_find_nearest_neighbors)
neighbors: list[tuple[str, float]] = [
Expand Down
2 changes: 2 additions & 0 deletions src/pyeed/analysis/sequence_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@ def __init__(
gap_open: int = -1,
gap_exted: int = 0,
substitution_matrix: str = "None",
node_type: str = "Protein",
) -> None:
self.mode = mode
self.match = match
self.mismatch = mismatch
self.gap_open = gap_open
self.gap_extend = gap_exted
self.substitution_matrix = substitution_matrix
self.node_type = node_type

def _align(
self,
Expand Down
14 changes: 5 additions & 9 deletions src/pyeed/analysis/standard_numbering.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,11 +401,10 @@ def apply_standard_numbering_pairwise(
if node_type == "DNA" and region_ids_neo4j is not None:
query = """
MATCH (s:StandardNumbering {name: $name})
MATCH (r:Region)
MATCH (d:DNA)-[e:HAS_REGION]-(r:Region)-[:HAS_STANDARD_NUMBERING]-(s)
WHERE id(r) IN $region_ids_neo4j
MATCH (r:Region)<-[:HAS_STANDARD_NUMBERING]-(s)
WHERE r.accession_id IN $list_of_seq_ids
RETURN r.accession_id AS accession_id
AND d.accession_id IN $list_of_seq_ids
RETURN d.accession_id AS accession_id
"""

results = db.execute_read(
Expand Down Expand Up @@ -442,8 +441,7 @@ def apply_standard_numbering_pairwise(
logger.info(f"Pairs: {pairs}")

# Run the pairwise alignment using the PairwiseAligner.
pairwise_aligner = PairwiseAligner()

pairwise_aligner = PairwiseAligner(node_type=node_type)
input = (list_of_seq_ids or []) + [base_sequence_id]
if not input:
raise ValueError("No input sequences provided")
Expand All @@ -458,7 +456,7 @@ def apply_standard_numbering_pairwise(
region_ids_neo4j=region_ids_neo4j,
)

logger.info(f"Pairwise alignment results: {results_pairwise}")
# logger.info(f"Pairwise alignment results: {results_pairwise}")

if results_pairwise is None:
raise ValueError("Pairwise alignment failed - no results returned")
Expand All @@ -484,8 +482,6 @@ def apply_standard_numbering_pairwise(
base_sequence_id, converted_alignment
)

logger.info(f"Positions: {positions}")

# Ensure the standard numbering node exists in the database.
StandardNumbering.get_or_save(
name=self.name,
Expand Down
Loading