Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]

steps:
- uses: actions/checkout@v4
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ To install the latest official release of SeqScore, run: `pip install seqscore`.
This will install the package and add the command `seqscore` in your Python
environment.

SeqScore requires Python 3.9 or higher. It is tested on Python 3.9,
3.10, 3.11, 3.12, and 3.13.
SeqScore requires Python 3.10 or higher. It is tested on Python 3.10, 3.11, 3.12,
3.13, and 3.14.

## License

Expand Down Expand Up @@ -600,7 +600,7 @@ To install from a clone of this repository, use:

## Setting up an environment for development

1. Create an environment: `conda create -yn seqscore python=3.9`
1. Create an environment: `conda create -yn seqscore python=3.10`
2. Activate the environment: `conda activate seqscore`
3. Install seqscore: `pip install -e .`
4. Install development dependencies: `pip install -r requirements.txt`
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[tool.mypy]
python_version = 3.9
python_version = "3.10"
strict_optional = false
disallow_untyped_defs = true
disallow_untyped_calls = true
Expand All @@ -13,4 +13,4 @@ ignore_missing_imports = true

[tool.ruff]
line-length = 90
target-version = "py39"
target-version = "py310"
8 changes: 4 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
types-tabulate

# For testing
pytest==8.3.5
pytest-cov==5.0.0
pytest==9.0.3
pytest-cov>=7.1.0

# For development
mypy==1.14.1
ruff==0.9.10
mypy==2.1.0
ruff==0.15.15

# Documentation build
# Disabled for now since we don't need them
Expand Down
119 changes: 86 additions & 33 deletions seqscore/conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,36 @@ class CoNLLFormatError(Exception):
pass


@attrs(frozen=True)
class LineSpec:
    """Positions of the token and NER label within the fields of a CoNLL-format line.

    Both values are used to index into the list of fields obtained by
    splitting a line on its delimiter.
    """

    # Index of the field holding the token text
    token_index: int = attrib()
    # Index of the field holding the NER label
    ner_label_index: int = attrib()

    def __attrs_post_init__(self) -> None:
        """Reject a spec whose token and label indices are literally equal.

        Raises:
            ValueError: If ``token_index`` equals ``ner_label_index``.
        """
        # Only literal equality can be detected here. Two different values
        # that resolve to the same position (for example 1 and -1 in a
        # two-field line) are not caught, since the number of fields is
        # not known at construction time.
        if self.token_index != self.ner_label_index:
            return

        raise ValueError(
            f"Token index ({self.token_index}) and "
            f"label index ({self.ner_label_index}) cannot be the same"
        )


@attrs(frozen=True)
class _CoNLLToken:
text: str = attrib()
label: str = attrib()
is_docstart: bool = attrib()
line_num: int = attrib()
other_fields: tuple[str, ...] = attrib()
orig_fields: tuple[str, ...] = attrib()

@classmethod
def from_line(cls, line: str, line_num: int, source_name: str) -> "_CoNLLToken":
def from_line(
cls, line: str, line_num: int, source_name: str, line_spec: LineSpec
) -> "_CoNLLToken":
# Note: The caller must strip the line of any trailing whitespace
# TODO: Sense the file rather than the line so we get consistency across lines
# Try tab first since it's safer, then space
Expand All @@ -72,16 +92,17 @@ def from_line(cls, line: str, line_num: int, source_name: str) -> "_CoNLLToken":
f"Line {line_num} of {source_name} is not delimited by space or tab: {repr(line)}"
)

text = splits[0]
label = splits[-1]
other_fields = tuple(splits[1:-1])
text = splits[line_spec.token_index]
label = splits[line_spec.ner_label_index]
orig_fields = tuple(splits)
is_docstart = text == DOCSTART
return cls(text, label, is_docstart, line_num, other_fields)
return cls(text, label, is_docstart, line_num, orig_fields)


@attrs(frozen=True)
class CoNLLIngester:
encoding: Encoding = attrib()
line_spec: LineSpec = attrib()
parse_comment_lines: bool = attrib(default=False, kw_only=True)
ignore_document_boundaries: bool = attrib(default=True, kw_only=True)

Expand Down Expand Up @@ -113,7 +134,7 @@ def ingest(
continue

# Create mentions from tokens in sequence
tokens, labels, line_nums, other_fields = self._decompose_sequence(
tokens, labels, line_nums, orig_fields = self._decompose_sequence(
source_sequence
)

Expand Down Expand Up @@ -186,7 +207,7 @@ def ingest(
tokens,
labels,
mentions,
other_fields=other_fields,
orig_fields=orig_fields,
provenance=SequenceProvenance(line_nums[0], source_name),
comment=comment,
)
Expand All @@ -198,13 +219,17 @@ def ingest(
yield document

def validate(
self, source: TextIO, source_name: str
self,
source: TextIO,
source_name: str,
) -> list[list[SequenceValidationResult]]:
all_results: list[list[SequenceValidationResult]] = []
document_results: list[SequenceValidationResult] = []

for source_sequence, _ in self._parse_file(
source, source_name, parse_comments=self.parse_comment_lines
source,
source_name,
parse_comments=self.parse_comment_lines,
):
if source_sequence[0].is_docstart:
# We can only receive DOCSTART in a sequence by itself, see _parse_file.
Expand Down Expand Up @@ -242,12 +267,15 @@ def _decompose_sequence(
tokens = tuple(tok.text for tok in source_sequence)
labels = tuple(tok.label for tok in source_sequence)
line_nums = tuple(tok.line_num for tok in source_sequence)
other_fields = tuple(tok.other_fields for tok in source_sequence)
return tokens, labels, line_nums, other_fields
orig_fields = tuple(tok.orig_fields for tok in source_sequence)
return tokens, labels, line_nums, orig_fields

@classmethod
def _parse_file(
cls, input_file: TextIO, source_name: str, *, parse_comments: bool = False
self,
input_file: TextIO,
source_name: str,
*,
parse_comments: bool = False,
) -> Iterable[tuple[tuple[_CoNLLToken, ...], Optional[str]]]:
sequence: list = []
comment: Optional[str] = None
Expand All @@ -272,14 +300,14 @@ def _parse_file(
if not line.strip():
# Clear out sequence if there's anything in it
if sequence:
cls._check_sequence(sequence)
self._check_sequence(sequence)
yield tuple(sequence), comment
sequence = []
comment = None
# Always skip empty lines
continue

token = _CoNLLToken.from_line(line, line_num, source_name)
token = _CoNLLToken.from_line(line, line_num, source_name, self.line_spec)
# Skip document starts, but ensure sequence is empty when we reach them
if token.is_docstart:
if sequence:
Expand All @@ -289,15 +317,15 @@ def _parse_file(
else:
# Yield it by itself. Since the sequence variable is empty, leave it unchanged.
tmp_sent = (token,)
cls._check_sequence(tmp_sent)
self._check_sequence(tmp_sent)
# Don't return the comment yet, it will be returned with the sequence
yield tmp_sent, None
else:
sequence.append(token)

# Finish the last sequence if needed
if sequence:
cls._check_sequence(sequence)
self._check_sequence(sequence)
yield tuple(sequence), comment

@staticmethod
Expand All @@ -315,6 +343,7 @@ def ingest_conll_file(
input_path: PathType,
mention_encoding_name: str,
file_encoding: str,
line_spec: LineSpec,
*,
repair: Optional[str] = None,
ignore_document_boundaries: bool,
Expand All @@ -331,6 +360,7 @@ def ingest_conll_file(

ingester = CoNLLIngester(
mention_encoding,
line_spec,
parse_comment_lines=parse_comment_lines,
ignore_document_boundaries=ignore_document_boundaries,
)
Expand All @@ -343,13 +373,15 @@ def validate_conll_file(
input_path: str,
mention_encoding_name: str,
file_encoding: str,
line_spec: LineSpec,
*,
ignore_document_boundaries: bool,
parse_comment_lines: bool,
) -> ValidationResult:
encoding = get_encoding(mention_encoding_name)
ingester = CoNLLIngester(
encoding,
line_spec,
parse_comment_lines=parse_comment_lines,
ignore_document_boundaries=ignore_document_boundaries,
)
Expand All @@ -374,6 +406,7 @@ def repair_conll_file(
mention_encoding_name: str,
repair: Optional[str],
file_encoding: str,
line_spec: LineSpec,
output_delim: str,
*,
ignore_document_boundaries: bool,
Expand All @@ -384,6 +417,7 @@ def repair_conll_file(
input_file,
mention_encoding_name,
file_encoding,
line_spec,
repair=repair,
ignore_document_boundaries=ignore_document_boundaries,
parse_comment_lines=parse_comment_lines,
Expand Down Expand Up @@ -415,6 +449,7 @@ def write_docs_using_encoding(
mention_encoding_name: str,
file_encoding: str,
delim: str,
line_spec: LineSpec,
output_path: PathType,
) -> None:
mention_encoding = get_encoding(mention_encoding_name)
Expand All @@ -423,7 +458,12 @@ def write_docs_using_encoding(
with open(output_path, "w", encoding=file_encoding) as file:
for doc in docs:
write_doc_using_encoding(
doc, mention_encoding, delim, file, output_docstart=output_docstart
doc,
mention_encoding,
delim,
file,
line_spec,
output_docstart=output_docstart,
)


Expand All @@ -432,32 +472,42 @@ def write_doc_using_encoding(
encoding: Encoding,
delim: str,
file: TextIO,
line_spec: LineSpec,
*,
output_docstart: bool,
) -> None:
if output_docstart:
# Get a single token to figure out how many other_fields entries it has
sequence_other_fields = doc[0].other_fields
fields = [DOCSTART]
if sequence_other_fields:
fields.extend([EMPTY_OTHER_FIELD for _ in sequence_other_fields[0]])
fields.append(encoding.dialect.outside)

# Get the fields of the first token of the first sentence
if doc[0].orig_fields:
# to figure out how many fields there are
sequence_orig_fields = doc[0].orig_fields[0]
# Create the right number of fields
fields = [EMPTY_OTHER_FIELD] * len(sequence_orig_fields)
# Fill in the token and label
fields[line_spec.token_index] = DOCSTART
fields[line_spec.ner_label_index] = encoding.dialect.outside
else:
fields = [DOCSTART, encoding.dialect.outside]
# Write output
print(delim.join(fields), file=file)
print(file=file)

for sequence in doc:
labels = encoding.encode_sequence(sequence)
# Lengths of labels and other_fields have previously been checked to match tokens
for (token, other_fields), label in zip(
sequence.tokens_with_other_fields(), labels
# Lengths of labels and orig_fields have previously been checked to match tokens
for (token, orig_fields), label in zip(
sequence.tokens_with_orig_fields(), labels
):
fields = [token]
if other_fields:
fields.extend(other_fields)
fields.append(label)
if orig_fields:
fields = list(orig_fields)
fields[line_spec.token_index] = token
fields[line_spec.ner_label_index] = label
else:
fields = [token, label]
# Write output
print(delim.join(fields), file=file)

# Print an empty line after each sequence
print(file=file)


Expand All @@ -468,6 +518,7 @@ def score_conll_files(
mention_encoding_name: str,
repair: Optional[str],
file_encoding: str,
line_spec: LineSpec,
*,
ignore_document_boundaries: bool,
parse_comment_lines: bool,
Expand All @@ -483,6 +534,7 @@ def score_conll_files(
reference_file,
mention_encoding_name,
file_encoding,
line_spec,
repair=repair,
ignore_document_boundaries=ignore_document_boundaries,
parse_comment_lines=parse_comment_lines,
Expand All @@ -507,6 +559,7 @@ def score_conll_files(
pred_file,
mention_encoding_name,
file_encoding,
line_spec,
repair=repair,
ignore_document_boundaries=ignore_document_boundaries,
parse_comment_lines=parse_comment_lines,
Expand Down
12 changes: 6 additions & 6 deletions seqscore/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class LabeledSequence(Sequence[str]):
tokens: tuple[str, ...] = attrib(converter=tuplify_strs)
labels: tuple[str, ...] = attrib(converter=tuplify_strs)
mentions: tuple[Mention, ...] = attrib(default=(), converter=_tuplify_mentions)
other_fields: Optional[tuple[tuple[str, ...], ...]] = attrib(
orig_fields: Optional[tuple[tuple[str, ...], ...]] = attrib(
default=None, kw_only=True, converter=tuplify_optional_nested_strs
)
provenance: Optional[SequenceProvenance] = attrib(
Expand All @@ -79,9 +79,9 @@ def __attrs_post_init__(self) -> None:
if not self.tokens:
raise ValueError("Tokens and labels must be non-empty")

if self.other_fields and len(self.tokens) != len(self.other_fields):
if self.orig_fields and len(self.tokens) != len(self.orig_fields):
raise ValueError(
f"Tokens ({len(self.tokens)}) and other_fields ({len(self.other_fields)}) "
f"Tokens ({len(self.tokens)}) and orig_fields ({len(self.orig_fields)}) "
"must be of the same length"
)

Expand Down Expand Up @@ -126,11 +126,11 @@ def __str__(self) -> str:
def tokens_with_labels(self) -> tuple[tuple[str, str], ...]:
return tuple(zip(self.tokens, self.labels))

def tokens_with_other_fields(
def tokens_with_orig_fields(
self,
) -> tuple[tuple[str, Optional[tuple[str, ...]]], ...]:
if self.other_fields:
return tuple(zip(self.tokens, self.other_fields))
if self.orig_fields:
return tuple(zip(self.tokens, self.orig_fields))
else:
return tuple(zip(self.tokens, repeat(None)))

Expand Down
Loading
Loading