CogStack · adam-sutton-1992 · Mar 16, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 2, 2026
diff --git a/.github/workflows/medcat-embedding-linker_ci.yml b/.github/workflows/medcat-embedding-linker_ci.yml
@@ -7,7 +7,7 @@ on:
       - 'medcat-embedding-linker/v*.*.*'  
   pull_request:
     paths:
-      - 'medcat-embedding-linker/**'
+      - 'medcat-plugins/embedding-linker/**'
       - '.github/workflows/medcat-embedding-linker**'
 
 permissions:

diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/config.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/config.py
@@ -1,17 +1,42 @@
 from typing import Optional, Any
-
 from medcat.config import Linking
 
 
 class EmbeddingLinking(Linking):
     """The config exclusively used for the embedding linker"""
+
     comp_name: str = "embedding_linker"
     """Changing compoenent name"""
     filter_before_disamb: bool = False
+    """Training on names or CUIs. If True all names of all CUIs will be used to train. 
+    If False, only CUIs' preferred (or longest) names will be used to train. Training on
+    names is more expensive computationally (and RAM/VRAM), but can lead to better 
+    performance."""
+    train_on_names: bool = True
     """Filtering CUIs before disambiguation"""
-    train: bool = False
-    """The embedding linker never needs to be trained in its 
-    current implementation."""
+    training_batch_size: int = 32
+    """The size of the batch to be used for training."""
+    embed_per_n_batches: int = 0
+    """How many batches to train on before re-embedding all the names in the context
+    model. This is used to control how often the context model is updated during 
+    training."""
+    use_similarity_threshold: bool = True
+    """Do we have a similarity threshold we care about?"""
+    negative_sampling_k: int = 10
+    """How many negative samples to generate for each positive sample during 
+    training."""
+    negative_sampling_candidate_pool_size: int = 4096
+    """When generating negative samples, sample top_n candidates to consider when 
+    sampling. Higher numbers will make training slower but can provide varied negative 
+    samples."""
+    negative_sampling_temperature: float = 0.1
+    """Temperature to use when generating negative samples in training. Lower 
+    temperatures will make the sampling more focused on the highest scoring candidates, 
+    while higher temperatures will make it more random. Must be > 0."""
+    use_mention_attention: bool = True
+    """Improves performance and fun to say. Mention attention can help the model focus 
+    on the most relevant parts of the context when making linking decisions. Will only 
+    pool on the tokens that contain the entity mention, with no context."""
     long_similarity_threshold: float = 0.0
     """Used in the inference step to choose the best CUI given the
     link candidates. Testing shows a threshold of 0.7 increases precision
@@ -26,11 +51,16 @@ class EmbeddingLinking(Linking):
     embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
     """Name of the embedding model. It must be downloadable from 
     huggingface linked from an appropriate file directory"""
+    use_projection_layer: bool = True
+    """Projection-layer default for trainable embedding linker."""
+    top_n_layers_to_unfreeze: int = 0
+    """LM unfreezing default for trainable embedding linker.
+    -1 unfreezes all LM layers, 0 freezes all LM layers, 
+    n unfreezes the top n layers."""
     max_token_length: int = 64
     """Max number of tokens to be embedded from a name.
     If the max token length is changed then the linker will need to be created
-    with a new config.
-    """
+    with a new config."""
     embedding_batch_size: int = 4096
     """How many pieces names can be embedded at once, useful when 
     embedding name2info names, cui2info names"""
@@ -44,5 +74,3 @@ class EmbeddingLinking(Linking):
     use_ner_link_candidates: bool = True
     """Link candidates are provided by some NER steps. This will flag if 
     you want to trust them or not."""
-    use_similarity_threshold: bool = True
-    """Do we have a similarity threshold we care about?"""