Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
f5b5572
ci: gate docstring quality and coverage in CI (#616)
planetf1 Mar 18, 2026
1d9480a
style: fix ruff formatting in audit_coverage.py
planetf1 Mar 18, 2026
0c8ea1c
ci: remove soft-fail mode labels from docs workflow summary
planetf1 Mar 18, 2026
52dceb3
docs: improve doc check reporting with line numbers, GHA annotations,…
planetf1 Mar 19, 2026
ad7201f
fix: repair orphan check for Mintlify v2 docs.json and improve summar…
planetf1 Mar 19, 2026
5909fc0
ci: per-kind quality breakdown in summary table, drop misleading skip…
planetf1 Mar 19, 2026
6ddcc03
docs: add fix hints and doc refs to coverage and MDX validation output
planetf1 Mar 19, 2026
467c01d
docs: add annotation gap checks and public-API-only doc filter
planetf1 Mar 19, 2026
65b66b7
fix: align coverage scope with doc generator's public-API filter
planetf1 Mar 19, 2026
88d42bc
feat: add param_type_mismatch and return_type_mismatch docstring checks
planetf1 Mar 19, 2026
c874c68
docs: document param_type_mismatch and return_type_mismatch check kinds
planetf1 Mar 19, 2026
2a50045
fix: correct parent __init__.py path for module files in _is_public_s…
planetf1 Mar 19, 2026
8b38c30
style: fix ruff formatting and EN dash in validate.py
planetf1 Mar 19, 2026
272ed5f
revert: restore full D suppression for cli/ (see #705)
planetf1 Mar 19, 2026
8234e9b
docs: add artifact download link to quality gate failure summary
planetf1 Mar 19, 2026
4bfcd0a
feat: add --warn-only to validate.py for pre-commit informational mode
planetf1 Mar 19, 2026
a8b80de
docs: fix 36 docstring quality gate failures across 17 files
planetf1 Mar 20, 2026
c9c4325
fix: suppress pre-existing mypy errors exposed by new type annotations
planetf1 Mar 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 84 additions & 30 deletions .github/workflows/docs-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,31 @@ jobs:
id: audit_coverage
run: |
set -o pipefail
uv run python tooling/docs-autogen/audit_coverage.py --docs-dir docs/docs/api --threshold 80 --quality 2>&1 \
uv run python tooling/docs-autogen/audit_coverage.py --docs-dir docs/docs/api --threshold 80 2>&1 \
| tee /tmp/audit_coverage.log
continue-on-error: ${{ inputs.strict_validation != true }}

- name: Docstring quality gate
id: quality_gate
run: |
set -o pipefail
uv run python tooling/docs-autogen/audit_coverage.py \
--docs-dir docs/docs/api \
--quality --fail-on-quality --threshold 100 \
--orphans \
--output /tmp/quality_report.json 2>&1 \
| tee /tmp/quality_gate.log
# -- Upload artifact for deploy job --------------------------------------

- name: Upload quality report
if: always()
uses: actions/upload-artifact@v7
with:
name: docstring-quality-report
path: /tmp/quality_report.json
retention-days: 30

- name: Upload docs artifact
if: success() || (inputs.strict_validation != true)
uses: actions/upload-artifact@v7
Expand Down Expand Up @@ -141,12 +160,11 @@ jobs:
markdownlint_outcome = "${{ steps.markdownlint.outcome }}"
validate_outcome = "${{ steps.validate_mdx.outcome }}"
coverage_outcome = "${{ steps.audit_coverage.outcome }}"
strict = "${{ inputs.strict_validation }}" == "true"
mode = "" if strict else " *(soft-fail)*"
quality_gate_outcome = "${{ steps.quality_gate.outcome }}"
lint_log = read_log("/tmp/markdownlint.log")
validate_log = read_log("/tmp/validate_mdx.log")
coverage_log = read_log("/tmp/audit_coverage.log")
quality_gate_log = read_log("/tmp/quality_gate.log")
# Count markdownlint issues (lines matching file:line:col format)
lint_issues = len([l for l in lint_log.splitlines() if re.match(r'.+:\d+:\d+ ', l)])
Expand Down Expand Up @@ -186,45 +204,81 @@ jobs:
mdx_detail = parse_validate_detail(validate_log)
# Docstring quality annotation emitted by audit_coverage.py into the log
# Format: ::notice title=Docstring quality::message
# or ::warning title=Docstring quality::message
quality_match = re.search(r"::(notice|warning|error) title=Docstring quality::(.+)", coverage_log)
if quality_match:
quality_level, quality_msg = quality_match.group(1), quality_match.group(2)
quality_icon = "✅" if quality_level == "notice" else "⚠️"
quality_status = "pass" if quality_level == "notice" else "warning"
quality_detail = re.sub(r"\s*—\s*see job summary.*$", "", quality_msg)
quality_row = f"| Docstring Quality | {quality_icon} {quality_status}{mode} | {quality_detail} |"
# Parse per-kind counts from the quality gate log.
# _print_quality_report emits section headers like:
# " Missing docstrings (12)"
# " Missing Args section (5)"
# Capture label -> count from those lines, then build a compact
# per-kind breakdown for the summary table cell.
kind_short = {
"Missing docstrings": "missing",
"Short docstrings": "short",
"Missing Args section": "no_args",
"Missing Returns section": "no_returns",
"Missing Yields section (generator)": "no_yields",
"Missing Raises section": "no_raises",
"Missing class Args section": "no_class_args",
"Duplicate Args: in class + __init__ (Option C violation)": "dup_init_args",
"Param name mismatches (documented but not in signature)": "param_mismatch",
"TypedDict phantom fields (documented but not declared)": "td_phantom",
"TypedDict undocumented fields (declared but missing from Attributes:)": "td_undoc",
"Missing parameter type annotations (type absent from API docs)": "missing_param_type",
"Missing return type annotations (type absent from API docs)": "missing_return_type",
"Param type mismatch (docstring vs annotation)": "param_type_mismatch",
"Return type mismatch (docstring vs annotation)": "return_type_mismatch",
}
section_re = re.compile(r"^\s{2}(.+?)\s+\((\d+)\)\s*$", re.MULTILINE)
kind_counts = {}
for m in section_re.finditer(quality_gate_log):
label, count = m.group(1), int(m.group(2))
short = kind_short.get(label)
if short:
kind_counts[short] = count
if kind_counts:
parts = [f"{v} {k}" for k, v in kind_counts.items()]
quality_gate_detail = ", ".join(parts)
else:
quality_row = None
# Fall back to the summary annotation message
qm = re.search(r"::(notice|warning|error) title=Docstring quality::(.+)", quality_gate_log)
quality_gate_detail = re.sub(r"\s*—\s*see job summary.*$", "", qm.group(2)) if qm else ""
# Split coverage log at quality section to avoid duplicate output in collapsibles
quality_start = coverage_log.find("🔬 Running docstring quality")
if quality_start != -1:
quality_log = coverage_log[quality_start:]
coverage_display_log = coverage_log[:quality_start].strip()
else:
quality_log = ""
coverage_display_log = coverage_log
CONTRIB_URL = (
"https://github.com/generative-computing/mellea/blob/main"
"/docs/docs/guide/CONTRIBUTING.md"
)
REPO = "${{ github.repository }}"
RUN_ID = "${{ github.run_id }}"
ARTIFACT_URL = f"https://github.com/{REPO}/actions/runs/{RUN_ID}#artifacts"
lines = [
"## Docs Build — Validation Summary\n",
"| Check | Result | Details |",
"|-------|--------|---------|",
f"| Markdownlint | {icon(markdownlint_outcome)} {markdownlint_outcome}{mode} | {lint_detail} |",
f"| MDX Validation | {icon(validate_outcome)} {validate_outcome}{mode} | {mdx_detail} |",
f"| API Coverage | {icon(coverage_outcome)} {coverage_outcome}{mode} | {cov_detail} |",
f"| Markdownlint | {icon(markdownlint_outcome)} {markdownlint_outcome} | {lint_detail} |",
f"| MDX Validation | {icon(validate_outcome)} {validate_outcome} | {mdx_detail} |",
f"| API Coverage | {icon(coverage_outcome)} {coverage_outcome} | {cov_detail} |",
f"| Docstring Quality | {icon(quality_gate_outcome)} {quality_gate_outcome} | {quality_gate_detail} |",
]
if quality_row:
lines.append(quality_row)
lines.append("")
# When the quality gate fails, surface a direct link to the fix reference.
# Per-kind Ref: URLs in the log output are inside a ```text``` block and
# don't render as links there.
if quality_gate_outcome == "failure":
lines += [
"> ❌ **Docstring quality gate failed.** "
f"See the [CI docstring checks reference]({CONTRIB_URL}#ci-docstring-checks-reference) "
"for per-kind fix instructions, or expand **Docstring quality details** below for the full list. \n"
f"> The full machine-readable report is available as the [`docstring-quality-report` artifact]({ARTIFACT_URL}).",
"",
]
for title, log, limit in [
("Markdownlint output", lint_log, 5_000),
("MDX validation output", validate_log, 5_000),
("API coverage output", coverage_display_log, 5_000),
("Docstring quality details", quality_log, 1_000_000),
("API coverage output", coverage_log, 5_000),
("Docstring quality details", quality_gate_log, 1_000_000),
]:
if log:
lines += [
Expand Down
13 changes: 6 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,16 @@ repos:
hooks:
- id: docs-mdx-validate
name: Validate generated MDX docs
entry: bash -c 'test -d docs/docs/api && uv run --no-sync python tooling/docs-autogen/validate.py docs/docs/api --skip-coverage || true'
entry: bash -c 'test -d docs/docs/api && uv run --no-sync python tooling/docs-autogen/validate.py docs/docs/api --skip-coverage --warn-only || true'
language: system
pass_filenames: false
files: (docs/docs/.*\.mdx$|tooling/docs-autogen/)
# TODO(#616): Move to normal commit flow once docstring quality issues reach 0.
# Griffe loads the full package (~10s), so this is manual-only for now to avoid
# slowing down every Python commit. Re-enable (remove stages: [manual]) and add
# --fail-on-quality once quality issues are resolved.
# Docstring quality gate — manual only (CI is the hard gate via docs-publish.yml).
# Run locally with: pre-commit run docs-docstring-quality --hook-stage manual
# Requires generated API docs (run `uv run python tooling/docs-autogen/build.py` first).
- id: docs-docstring-quality
name: Audit docstring quality (informational)
entry: bash -c 'test -d docs/docs/api && uv run --no-sync python tooling/docs-autogen/audit_coverage.py --quality --docs-dir docs/docs/api || true'
name: Audit docstring quality
entry: uv run --no-sync python tooling/docs-autogen/audit_coverage.py --quality --fail-on-quality --threshold 0 --docs-dir docs/docs/api
language: system
pass_filenames: false
files: (mellea/.*\.py$|cli/.*\.py$)
Expand Down
21 changes: 21 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,25 @@ differs in type or behaviour from the constructor input — for example, when a
argument is wrapped into a `CBlock`, or when a class-level constant is relevant to
callers. Pure-echo entries that repeat `Args:` verbatim should be omitted.

**`TypedDict` classes are a special case.** Their fields *are* the entire public
contract, so when an `Attributes:` section is present it must exactly match the
declared fields. The audit will flag:

- `typeddict_phantom` — `Attributes:` documents a field that is not declared in the `TypedDict`
- `typeddict_undocumented` — a declared field is absent from the `Attributes:` section

```python
class ConstraintResult(TypedDict):
"""Result of a constraint check.
Attributes:
passed: Whether the constraint was satisfied.
reason: Human-readable explanation.
"""
passed: bool
reason: str
```

#### Validating docstrings

Run the coverage and quality audit to check your changes before committing:
Expand All @@ -194,6 +213,8 @@ Key checks the audit enforces:
| `no_args` | Standalone function has params but no `Args:` section |
| `no_returns` | Function has a non-trivial return annotation but no `Returns:` section |
| `param_mismatch` | `Args:` documents names not present in the actual signature |
| `typeddict_phantom` | `TypedDict` `Attributes:` documents a field not declared in the class |
| `typeddict_undocumented` | `TypedDict` has a declared field absent from its `Attributes:` section |

**IDE hover verification** — open any of these existing classes in VS Code and hover
over the class name or a constructor call to confirm the hover card shows `Args:` once
Expand Down
2 changes: 1 addition & 1 deletion cli/alora/intrinsic_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def upload_intrinsic(
base_model (str): Base model ID or path (e.g.
``"ibm-granite/granite-3.3-2b-instruct"``). Must contain at most
one ``"/"`` separator.
type (Literal["lora", "alora"]): Adapter type, used as the leaf
type (Literal['lora', 'alora']): Adapter type, used as the leaf
directory name in the repository layout.
io_yaml (str): Path to the ``io.yaml`` configuration file for
intrinsic input/output processing.
Expand Down
23 changes: 19 additions & 4 deletions cli/alora/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,14 @@
import typer
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainerCallback
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainerCallback,
TrainerControl,
TrainerState,
TrainingArguments,
)
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer

# Handle MPS with old PyTorch versions on macOS only
Expand All @@ -39,7 +46,9 @@
)


def load_dataset_from_json(json_path, tokenizer, invocation_prompt):
def load_dataset_from_json(
json_path: str, tokenizer: AutoTokenizer, invocation_prompt: str
) -> Dataset:
"""Load a JSONL dataset and format it for SFT training.
Reads ``item``/``label`` pairs from a JSONL file and builds a HuggingFace
Expand Down Expand Up @@ -73,7 +82,7 @@ def load_dataset_from_json(json_path, tokenizer, invocation_prompt):
return Dataset.from_dict({"input": inputs, "target": targets})


def formatting_prompts_func(example):
def formatting_prompts_func(example: dict) -> list[str]:
"""Concatenate input and target columns for SFT prompt formatting.
Args:
Expand Down Expand Up @@ -101,7 +110,13 @@ class SaveBestModelCallback(TrainerCallback):
def __init__(self):
self.best_eval_loss = float("inf")

def on_evaluate(self, args, state, control, **kwargs):
def on_evaluate(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs,
):
"""Save the adapter weights if the current evaluation loss is a new best.
Called automatically by the HuggingFace Trainer after each evaluation
Expand Down
26 changes: 26 additions & 0 deletions cli/decompose/decompose.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,19 @@ class DecompVersion(StrEnum):
def reorder_subtasks(
subtasks: list[DecompSubtasksResult],
) -> list[DecompSubtasksResult]:
"""Topologically sort subtasks by their ``depends_on`` relationships.
Args:
subtasks: List of subtask dicts, each with a ``"tag"`` and optional
``"depends_on"`` field.
Returns:
list[DecompSubtasksResult]: The subtasks reordered so that dependencies
come before dependents, with numbering prefixes updated.
Raises:
ValueError: If a circular dependency is detected.
"""
subtask_map = {subtask["tag"].lower(): subtask for subtask in subtasks}

graph = {}
Expand Down Expand Up @@ -78,6 +91,19 @@ def reorder_subtasks(
def verify_user_variables(
decomp_data: DecompPipelineResult, input_var: list[str] | None
) -> DecompPipelineResult:
"""Validate that all required input variables and dependencies exist.
Args:
decomp_data: The decomposition pipeline result containing subtasks.
input_var: User-provided input variable names, or ``None`` for none.
Returns:
DecompPipelineResult: The (possibly reordered) decomposition data.
Raises:
ValueError: If a subtask requires an input variable that was not
provided, or depends on a subtask tag that does not exist.
"""
if input_var is None:
input_var = []

Expand Down
6 changes: 3 additions & 3 deletions cli/eval/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __init__(
self.score = score
self.validation_reason = validation_reason

def to_dict(self):
def to_dict(self) -> dict:
"""Serialise the input evaluation result to a plain dictionary.
Returns:
Expand Down Expand Up @@ -84,7 +84,7 @@ def __init__(self, test_eval: TestBasedEval, input_results: list[InputEvalResult
self.test_eval = test_eval
self.input_results = input_results

def to_dict(self):
def to_dict(self) -> dict:
"""Serialise the test evaluation result to a plain dictionary.
Returns:
Expand Down Expand Up @@ -366,7 +366,7 @@ def execute_test_eval(
return test_result


def parse_judge_output(judge_output: str):
def parse_judge_output(judge_output: str) -> tuple[int | None, str]:
"""Parse score and justification from a judge model's output string.
Args:
Expand Down
Loading
Loading