bltlab · claire-yq · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 2, 2026
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
 
     steps:
       - uses: actions/checkout@v4

diff --git a/README.md b/README.md
@@ -21,8 +21,8 @@ To install the latest official release of SeqScore, run: `pip install seqscore`.
 This will install the package and add the command `seqscore` in your Python
 environment.
 
-SeqScore requires Python 3.9 or higher. It is tested on Python 3.9,
-3.10, 3.11, 3.12, and 3.13.
+SeqScore requires Python 3.10 or higher. It is tested on Python 3.10, 3.11, 3.12,
+3.13, and 3.14.
 
 ## License
 
@@ -600,7 +600,7 @@ To install from a clone of this repository, use:
 
 ## Setting up an environment for development
 
-1. Create an environment: `conda create -yn seqscore python=3.9`
+1. Create an environment: `conda create -yn seqscore python=3.10`
 2. Activate the environment: `conda activate seqscore`
 3. Install seqscore: `pip install -e .`
 4. Install development dependencies: `pip install -r requirements.txt`

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.mypy]
-python_version = 3.9
+python_version = "3.10"
 strict_optional = false
 disallow_untyped_defs = true
 disallow_untyped_calls = true
@@ -13,4 +13,4 @@ ignore_missing_imports = true
 
 [tool.ruff]
 line-length = 90
-target-version = "py39"
+target-version = "py310"
diff --git a/requirements.txt b/requirements.txt
@@ -6,12 +6,12 @@
 types-tabulate
 
 # For testing
-pytest==8.3.5
-pytest-cov==5.0.0
+pytest==9.0.3
+pytest-cov>=7.1.0
 
 # For development
-mypy==1.14.1
-ruff==0.9.10
+mypy==2.1.0
+ruff==0.15.15
 
 # Documentation build
 # Disabled for now since we don't need them

diff --git a/seqscore/conll.py b/seqscore/conll.py
@@ -43,16 +43,36 @@ class CoNLLFormatError(Exception):
     pass
 
 
+@attrs(frozen=True)
+class LineSpec:
+    """Defines the token and NER label field indices for a CoNLL-format line"""
+
+    token_index: int = attrib()
+    ner_label_index: int = attrib()
+
+    def __attrs_post_init__(self) -> None:
+        # This will only catch cases where the indices are identical, not
+        # when they refer to the same position, such as 1 and -1 in a
+        # sequence of length two
+        if self.token_index == self.ner_label_index:
+            raise ValueError(
+                f"Token index ({self.token_index}) and "
+                f"label index ({self.ner_label_index}) cannot be the same"
+            )
+
+
 @attrs(frozen=True)
 class _CoNLLToken:
     text: str = attrib()
     label: str = attrib()
     is_docstart: bool = attrib()
     line_num: int = attrib()
-    other_fields: tuple[str, ...] = attrib()
+    orig_fields: tuple[str, ...] = attrib()
 
     @classmethod
-    def from_line(cls, line: str, line_num: int, source_name: str) -> "_CoNLLToken":
+    def from_line(
+        cls, line: str, line_num: int, source_name: str, line_spec: LineSpec
+    ) -> "_CoNLLToken":
         # Note: The caller must strip the line of any trailing whitespace
         # TODO: Sense the file rather than the line so we get consistency across lines
         # Try tab first since it's safer, then space
@@ -72,16 +92,17 @@ def from_line(cls, line: str, line_num: int, source_name: str) -> "_CoNLLToken":
                     f"Line {line_num} of {source_name} is not delimited by space or tab: {repr(line)}"
                 )
 
-        text = splits[0]
-        label = splits[-1]
-        other_fields = tuple(splits[1:-1])
+        text = splits[line_spec.token_index]
+        label = splits[line_spec.ner_label_index]
+        orig_fields = tuple(splits)
         is_docstart = text == DOCSTART
-        return cls(text, label, is_docstart, line_num, other_fields)
+        return cls(text, label, is_docstart, line_num, orig_fields)
 
 
 @attrs(frozen=True)
 class CoNLLIngester:
     encoding: Encoding = attrib()
+    line_spec: LineSpec = attrib()
     parse_comment_lines: bool = attrib(default=False, kw_only=True)
     ignore_document_boundaries: bool = attrib(default=True, kw_only=True)
 
@@ -113,7 +134,7 @@ def ingest(
                 continue
 
             # Create mentions from tokens in sequence
-            tokens, labels, line_nums, other_fields = self._decompose_sequence(
+            tokens, labels, line_nums, orig_fields = self._decompose_sequence(
                 source_sequence
             )
 
@@ -186,7 +207,7 @@ def ingest(
                 tokens,
                 labels,
                 mentions,
-                other_fields=other_fields,
+                orig_fields=orig_fields,
                 provenance=SequenceProvenance(line_nums[0], source_name),
                 comment=comment,
             )
@@ -198,13 +219,17 @@ def ingest(
             yield document
 
     def validate(
-        self, source: TextIO, source_name: str
+        self,
+        source: TextIO,
+        source_name: str,
     ) -> list[list[SequenceValidationResult]]:
         all_results: list[list[SequenceValidationResult]] = []
         document_results: list[SequenceValidationResult] = []
 
         for source_sequence, _ in self._parse_file(
-            source, source_name, parse_comments=self.parse_comment_lines
+            source,
+            source_name,
+            parse_comments=self.parse_comment_lines,
         ):
             if source_sequence[0].is_docstart:
                 # We can ony receive DOCSTART in a sequence by itself, see _parse_file.
@@ -242,12 +267,15 @@ def _decompose_sequence(
         tokens = tuple(tok.text for tok in source_sequence)
         labels = tuple(tok.label for tok in source_sequence)
         line_nums = tuple(tok.line_num for tok in source_sequence)
-        other_fields = tuple(tok.other_fields for tok in source_sequence)
-        return tokens, labels, line_nums, other_fields
+        orig_fields = tuple(tok.orig_fields for tok in source_sequence)
+        return tokens, labels, line_nums, orig_fields
 
-    @classmethod
     def _parse_file(
-        cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False
+        self,
+        input_file: TextIO,
+        source_name: str,
+        *,
+        parse_comments: bool = False,
     ) -> Iterable[tuple[tuple[_CoNLLToken, ...], Optional[str]]]:
         sequence: list = []
         comment: Optional[str] = None
@@ -272,14 +300,14 @@ def _parse_file(
             if not line.strip():
                 # Clear out sequence if there's anything in it
                 if sequence:
-                    cls._check_sequence(sequence)
+                    self._check_sequence(sequence)
                     yield tuple(sequence), comment
                     sequence = []
                     comment = None
                 # Always skip empty lines
                 continue
 
-            token = _CoNLLToken.from_line(line, line_num, source_name)
+            token = _CoNLLToken.from_line(line, line_num, source_name, self.line_spec)
             # Skip document starts, but ensure sequence is empty when we reach them
             if token.is_docstart:
                 if sequence:
@@ -289,15 +317,15 @@ def _parse_file(
                 else:
                     # Yield it by itself. Since the sequence variable is empty, leave it unchanged.
                     tmp_sent = (token,)
-                    cls._check_sequence(tmp_sent)
+                    self._check_sequence(tmp_sent)
                     # Don't return the comment yet, it will be returned with the sequence
                     yield tmp_sent, None
             else:
                 sequence.append(token)
 
         # Finish the last sequence if needed
         if sequence:
-            cls._check_sequence(sequence)
+            self._check_sequence(sequence)
             yield tuple(sequence), comment
 
     @staticmethod
@@ -315,6 +343,7 @@ def ingest_conll_file(
     input_path: PathType,
     mention_encoding_name: str,
     file_encoding: str,
+    line_spec: LineSpec,
     *,
     repair: Optional[str] = None,
     ignore_document_boundaries: bool,
@@ -331,6 +360,7 @@ def ingest_conll_file(
 
     ingester = CoNLLIngester(
         mention_encoding,
+        line_spec,
         parse_comment_lines=parse_comment_lines,
         ignore_document_boundaries=ignore_document_boundaries,
     )
@@ -343,13 +373,15 @@ def validate_conll_file(
     input_path: str,
     mention_encoding_name: str,
     file_encoding: str,
+    line_spec: LineSpec,
     *,
     ignore_document_boundaries: bool,
     parse_comment_lines: bool,
 ) -> ValidationResult:
     encoding = get_encoding(mention_encoding_name)
     ingester = CoNLLIngester(
         encoding,
+        line_spec,
         parse_comment_lines=parse_comment_lines,
         ignore_document_boundaries=ignore_document_boundaries,
     )
@@ -374,6 +406,7 @@ def repair_conll_file(
     mention_encoding_name: str,
     repair: Optional[str],
     file_encoding: str,
+    line_spec: LineSpec,
     output_delim: str,
     *,
     ignore_document_boundaries: bool,
@@ -384,6 +417,7 @@ def repair_conll_file(
         input_file,
         mention_encoding_name,
         file_encoding,
+        line_spec,
         repair=repair,
         ignore_document_boundaries=ignore_document_boundaries,
         parse_comment_lines=parse_comment_lines,
@@ -415,6 +449,7 @@ def write_docs_using_encoding(
     mention_encoding_name: str,
     file_encoding: str,
     delim: str,
+    line_spec: LineSpec,
     output_path: PathType,
 ) -> None:
     mention_encoding = get_encoding(mention_encoding_name)
@@ -423,7 +458,12 @@ def write_docs_using_encoding(
     with open(output_path, "w", encoding=file_encoding) as file:
         for doc in docs:
             write_doc_using_encoding(
-                doc, mention_encoding, delim, file, output_docstart=output_docstart
+                doc,
+                mention_encoding,
+                delim,
+                file,
+                line_spec,
+                output_docstart=output_docstart,
             )
 
 
@@ -432,32 +472,42 @@ def write_doc_using_encoding(
     encoding: Encoding,
     delim: str,
     file: TextIO,
+    line_spec: LineSpec,
     *,
     output_docstart: bool,
 ) -> None:
     if output_docstart:
-        # Get a single token to figure out how many other_fields entries it has
-        sequence_other_fields = doc[0].other_fields
-        fields = [DOCSTART]
-        if sequence_other_fields:
-            fields.extend([EMPTY_OTHER_FIELD for _ in sequence_other_fields[0]])
-        fields.append(encoding.dialect.outside)
-
+        # Get the fields of the first token of the first sentence
+        if doc[0].orig_fields:
+            # to figure out how many fields there are
+            sequence_orig_fields = doc[0].orig_fields[0]
+            # Create the right number of fields
+            fields = [EMPTY_OTHER_FIELD] * len(sequence_orig_fields)
+            # Fill in the token and label
+            fields[line_spec.token_index] = DOCSTART
+            fields[line_spec.ner_label_index] = encoding.dialect.outside
+        else:
+            fields = [DOCSTART, encoding.dialect.outside]
+        # Write output
         print(delim.join(fields), file=file)
         print(file=file)
 
     for sequence in doc:
         labels = encoding.encode_sequence(sequence)
-        # Lengths of labels and other_fields have previously been checked to match tokens
-        for (token, other_fields), label in zip(
-            sequence.tokens_with_other_fields(), labels
+        # Lengths of labels and orig_fields have previously been checked to match tokens
+        for (token, orig_fields), label in zip(
+            sequence.tokens_with_orig_fields(), labels
         ):
-            fields = [token]
-            if other_fields:
-                fields.extend(other_fields)
-            fields.append(label)
+            if orig_fields:
+                fields = list(orig_fields)
+                fields[line_spec.token_index] = token
+                fields[line_spec.ner_label_index] = label
+            else:
+                fields = [token, label]
+            # Write output
             print(delim.join(fields), file=file)
 
+        # Print an empty line after each sequence
         print(file=file)
 
 
@@ -468,6 +518,7 @@ def score_conll_files(
     mention_encoding_name: str,
     repair: Optional[str],
     file_encoding: str,
+    line_spec: LineSpec,
     *,
     ignore_document_boundaries: bool,
     parse_comment_lines: bool,
@@ -483,6 +534,7 @@ def score_conll_files(
         reference_file,
         mention_encoding_name,
         file_encoding,
+        line_spec,
         repair=repair,
         ignore_document_boundaries=ignore_document_boundaries,
         parse_comment_lines=parse_comment_lines,
@@ -507,6 +559,7 @@ def score_conll_files(
             pred_file,
             mention_encoding_name,
             file_encoding,
+            line_spec,
             repair=repair,
             ignore_document_boundaries=ignore_document_boundaries,
             parse_comment_lines=parse_comment_lines,

diff --git a/seqscore/model.py b/seqscore/model.py
@@ -60,7 +60,7 @@ class LabeledSequence(Sequence[str]):
     tokens: tuple[str, ...] = attrib(converter=tuplify_strs)
     labels: tuple[str, ...] = attrib(converter=tuplify_strs)
     mentions: tuple[Mention, ...] = attrib(default=(), converter=_tuplify_mentions)
-    other_fields: Optional[tuple[tuple[str, ...], ...]] = attrib(
+    orig_fields: Optional[tuple[tuple[str, ...], ...]] = attrib(
         default=None, kw_only=True, converter=tuplify_optional_nested_strs
     )
     provenance: Optional[SequenceProvenance] = attrib(
@@ -79,9 +79,9 @@ def __attrs_post_init__(self) -> None:
         if not self.tokens:
             raise ValueError("Tokens and labels must be non-empty")
 
-        if self.other_fields and len(self.tokens) != len(self.other_fields):
+        if self.orig_fields and len(self.tokens) != len(self.orig_fields):
             raise ValueError(
-                f"Tokens ({len(self.tokens)}) and other_fields ({len(self.other_fields)}) "
+                f"Tokens ({len(self.tokens)}) and orig_fields ({len(self.orig_fields)}) "
                 "must be of the same length"
             )
 
@@ -126,11 +126,11 @@ def __str__(self) -> str:
     def tokens_with_labels(self) -> tuple[tuple[str, str], ...]:
         return tuple(zip(self.tokens, self.labels))
 
-    def tokens_with_other_fields(
+    def tokens_with_orig_fields(
         self,
     ) -> tuple[tuple[str, Optional[tuple[str, ...]]], ...]:
-        if self.other_fields:
-            return tuple(zip(self.tokens, self.other_fields))
+        if self.orig_fields:
+            return tuple(zip(self.tokens, self.orig_fields))
         else:
             return tuple(zip(self.tokens, repeat(None)))