Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,001 changes: 149 additions & 852 deletions docs/usage/embeddings_analysis.ipynb

Large diffs are not rendered by default.

76 changes: 50 additions & 26 deletions docs/usage/mutation_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,18 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/nab/anaconda3/envs/pyeed_niklas_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import sys\n",
"\n",
Expand All @@ -38,15 +47,15 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"📡 Connected to database.\n",
"All data has been wiped from the database.\n"
"The provided date does not match the current date. Date is you gave is 2025-03-19 actual date is 2025-04-09\n"
]
}
],
Expand Down Expand Up @@ -76,7 +85,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -102,18 +111,21 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6ed852d438ab480fa4d1c6129eacfd26",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">/home/nab/anaconda3/envs/pyeed_niklas_env/lib/python3.10/site-packages/rich/live.py:231: UserWarning: install \n",
"\"ipywidgets\" for Jupyter support\n",
" warnings.warn('install \"ipywidgets\" for Jupyter support')\n",
"</pre>\n"
],
"text/plain": [
"Output()"
"/home/nab/anaconda3/envs/pyeed_niklas_env/lib/python3.10/site-packages/rich/live.py:231: UserWarning: install \n",
"\"ipywidgets\" for Jupyter support\n",
" warnings.warn('install \"ipywidgets\" for Jupyter support')\n"
]
},
"metadata": {},
Expand All @@ -123,7 +135,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Region ids: [143, 129, 128, 69, 9]\n",
"Region ids: [5206, 5205, 5203, 5201, 5207]\n",
"len of ids: 5\n"
]
},
Expand Down Expand Up @@ -182,7 +194,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -199,7 +211,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -230,19 +242,31 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'from_positions': [241, 125, 272], 'to_positions': [241, 125, 272], 'from_monomers': ['R', 'V', 'D'], 'to_monomers': ['S', 'I', 'N']}\n"
"{'from_positions': [272, 241, 125], 'to_positions': [272, 241, 125], 'from_monomers': ['D', 'R', 'V'], 'to_monomers': ['N', 'S', 'I']}\n"
]
}
],
"source": [
"print(mutations_protein)"
"print(mutations_protein)\n",
"\n",
"\n",
"# remove duplicate relationships; there are many duplicates between the same DNA and the same organism\n",
"# just keep the first one and remove the rest\n",
"query_remove_double_relationship = \"\"\"\n",
"MATCH (d:DNA {accession_id: 'KT405476.1'})-[r:ORIGINATES_FROM]-(e)\n",
"WITH d, r, e\n",
"ORDER BY id(r)\n",
"LIMIT 1\n",
"DELETE r\n",
"\"\"\"\n",
"\n"
]
},
{
Expand All @@ -263,21 +287,21 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mutation on position 705 -> 705 with a nucleotide change of G -> A\n",
"Mutation on position 395 -> 395 with a nucleotide change of T -> G\n",
"Mutation on position 137 -> 137 with a nucleotide change of A -> G\n",
"Mutation on position 17 -> 17 with a nucleotide change of T -> C\n",
"Mutation on position 473 -> 473 with a nucleotide change of T -> C\n",
"Mutation on position 395 -> 395 with a nucleotide change of T -> G\n",
"Mutation on position 198 -> 198 with a nucleotide change of C -> A\n",
"Mutation on position 716 -> 716 with a nucleotide change of G -> A\n",
"Mutation on position 705 -> 705 with a nucleotide change of G -> A\n",
"Mutation on position 473 -> 473 with a nucleotide change of T -> C\n",
"Mutation on position 720 -> 720 with a nucleotide change of A -> C\n",
"Mutation on position 198 -> 198 with a nucleotide change of C -> A\n"
"Mutation on position 137 -> 137 with a nucleotide change of A -> G\n"
]
}
],
Expand All @@ -296,7 +320,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "pyeed_niklas",
"display_name": "pyeed_niklas_env",
"language": "python",
"name": "python3"
},
Expand All @@ -310,7 +334,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
"version": "3.10.16"
}
},
"nbformat": 4,
Expand Down
98 changes: 93 additions & 5 deletions src/pyeed/adapter/uniprot_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Organism,
Protein,
Reaction,
Region,
Site,
)

Expand Down Expand Up @@ -64,25 +65,112 @@ def add_to_db(self, response: Response) -> None:
self.add_reaction(record, protein)

self.add_sites(record, protein)
self.add_regions(record, protein)
self.add_catalytic_activity(record, protein)
self.add_go(record, protein)

def add_sites(self, record: dict[str, Any], protein: Protein) -> None:
    """Create ``Site`` nodes for BINDING and ACT_SITE features of a UniProt
    record and connect them to *protein* with their residue positions.

    Keys of ``data_dict`` are ``"<name>$<kind>"`` so a single dict can hold
    both binding sites (keyed by ligand name) and active sites (keyed by the
    feature category) without the two kinds colliding.

    Args:
        record: Parsed UniProt entry; features are read from
            ``record["features"]`` (missing key treated as "no features").
        protein: Already-persisted protein node to attach the sites to.
    """
    data_dict: dict[str, list[int]] = defaultdict(list)

    for feature in record.get("features", []):
        if feature["type"] == "BINDING":
            # One entry per residue of the inclusive [begin, end] range.
            for position in range(int(feature["begin"]), int(feature["end"]) + 1):
                data_dict[feature["ligand"]["name"] + "$binding"].append(position)
        elif feature["type"] == "ACT_SITE":
            for position in range(int(feature["begin"]), int(feature["end"]) + 1):
                data_dict[feature["category"] + "$site"].append(position)

    for entry, positions in data_dict.items():
        # rpartition keeps the whole name intact even if it contains '$'.
        name, _, kind = entry.rpartition("$")
        if kind == "binding":
            annotation = Annotation.BINDING_SITE.value
        elif kind == "site":
            annotation = Annotation.ACTIVE_SITE.value
        else:
            # Defensive: keys are only ever built with the two suffixes above.
            continue

        site = Site(
            name=name,
            annotation=annotation,
        )
        site.save()

        protein.site.connect(site, {"positions": positions})

def add_regions(self, record: dict[str, Any], protein: Protein) -> None:
    """Create ``Region`` nodes for secondary-structure and processing
    features of a UniProt record and connect them to *protein* with their
    start/end coordinates.

    Args:
        record: Parsed UniProt entry; features are read from
            ``record["features"]`` (missing key treated as "no features").
        protein: Already-persisted protein node to attach the regions to.
    """
    # Map the UniProt feature type to the '$'-tagged name suffix and the
    # annotation value stored on the Region node. One table replaces the
    # previous pair of five-branch if/elif chains and guarantees every
    # handled type has an annotation (no unbound-variable path).
    feature_map: dict[str, tuple[str, str]] = {
        "HELIX": ("helix", Annotation.ALPHAHELIX.value),
        "STRAND": ("strand", Annotation.BETASTRAND.value),
        "TURN": ("turn", Annotation.TURN.value),
        "SIGNAL": ("signal", Annotation.SIGNAL.value),
        "PROPEP": ("propep", Annotation.PROPEP.value),
    }

    for feature in record.get("features", []):
        mapped = feature_map.get(feature["type"])
        if mapped is None:
            continue  # feature type we do not model as a Region
        suffix, annotation = mapped

        region = Region(
            name=feature["category"] + "$" + suffix,
            annotation=annotation,
        )
        region.save()

        # Start/end are stored on the relationship, mirroring how sites
        # store their positions.
        protein.region.connect(
            region,
            {"start": int(feature["begin"]), "end": int(feature["end"])},
        )

def add_catalytic_activity(self, record: dict[str, Any], protein: Protein) -> None:
    """Persist CATALYTIC_ACTIVITY comments of a UniProt record as
    ``Reaction`` nodes and connect them to *protein*.

    Args:
        record: Parsed UniProt entry; comments are read from
            ``record["comments"]``. Entries without comments are common
            and are treated as "no catalytic activity", not as an error.
        protein: Already-persisted protein node to attach reactions to.
    """
    try:
        for reference in record.get("comments", []):
            if reference["type"] != "CATALYTIC_ACTIVITY":
                continue
            catalytic_annotation = Reaction.get_or_save(
                # Rhea id may be absent; store None rather than "None".
                rhea_id=str(reference["id"]) if reference.get("id") else None,
            )
            # Connect only if the Protein model exposes a reaction
            # relationship (kept for backward compatibility).
            if hasattr(protein, "reaction"):
                protein.reaction.connect(catalytic_annotation)

    except Exception as e:
        # Boundary catch: one malformed entry must not abort the whole
        # import run; log and move on.
        logger.error(
            f"Error saving catalytic activity for {protein.accession_id}: {e}"
        )

def get_substrates_and_products_from_rhea(
self, rhea_id: str
) -> dict[str, List[str]]:
Expand Down
2 changes: 2 additions & 0 deletions src/pyeed/analysis/embedding_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ def find_nearest_neighbors_based_on_vector_index(
db: DatabaseConnector,
index_name: str = "embedding_index",
number_of_neighbors: int = 50,
skip: int = 0,
) -> list[tuple[str, float]]:
"""
This function finds the nearest neighbors of a query protein based on the vector index.
Expand Down Expand Up @@ -412,6 +413,7 @@ def find_nearest_neighbors_based_on_vector_index(
YIELD node AS fprotein, score
WHERE score > 0.95
RETURN fprotein.accession_id, score
SKIP {skip}
"""
results = db.execute_read(query_find_nearest_neighbors)
neighbors: list[tuple[str, float]] = [
Expand Down
2 changes: 2 additions & 0 deletions src/pyeed/analysis/sequence_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@ def __init__(
gap_open: int = -1,
gap_exted: int = 0,
substitution_matrix: str = "None",
node_type: str = "Protein",
) -> None:
self.mode = mode
self.match = match
self.mismatch = mismatch
self.gap_open = gap_open
self.gap_extend = gap_exted
self.substitution_matrix = substitution_matrix
self.node_type = node_type

def _align(
self,
Expand Down
14 changes: 5 additions & 9 deletions src/pyeed/analysis/standard_numbering.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,11 +401,10 @@ def apply_standard_numbering_pairwise(
if node_type == "DNA" and region_ids_neo4j is not None:
query = """
MATCH (s:StandardNumbering {name: $name})
MATCH (r:Region)
MATCH (d:DNA)-[e:HAS_REGION]-(r:Region)-[:HAS_STANDARD_NUMBERING]-(s)
WHERE id(r) IN $region_ids_neo4j
MATCH (r:Region)<-[:HAS_STANDARD_NUMBERING]-(s)
WHERE r.accession_id IN $list_of_seq_ids
RETURN r.accession_id AS accession_id
AND d.accession_id IN $list_of_seq_ids
RETURN d.accession_id AS accession_id
"""

results = db.execute_read(
Expand Down Expand Up @@ -442,8 +441,7 @@ def apply_standard_numbering_pairwise(
logger.info(f"Pairs: {pairs}")

# Run the pairwise alignment using the PairwiseAligner.
pairwise_aligner = PairwiseAligner()

pairwise_aligner = PairwiseAligner(node_type=node_type)
input = (list_of_seq_ids or []) + [base_sequence_id]
if not input:
raise ValueError("No input sequences provided")
Expand All @@ -458,7 +456,7 @@ def apply_standard_numbering_pairwise(
region_ids_neo4j=region_ids_neo4j,
)

logger.info(f"Pairwise alignment results: {results_pairwise}")
# logger.info(f"Pairwise alignment results: {results_pairwise}")

if results_pairwise is None:
raise ValueError("Pairwise alignment failed - no results returned")
Expand All @@ -484,8 +482,6 @@ def apply_standard_numbering_pairwise(
base_sequence_id, converted_alignment
)

logger.info(f"Positions: {positions}")

# Ensure the standard numbering node exists in the database.
StandardNumbering.get_or_save(
name=self.name,
Expand Down
Loading