Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
f5b5572
ci: gate docstring quality and coverage in CI (#616)
planetf1 Mar 18, 2026
1d9480a
style: fix ruff formatting in audit_coverage.py
planetf1 Mar 18, 2026
0c8ea1c
ci: remove soft-fail mode labels from docs workflow summary
planetf1 Mar 18, 2026
52dceb3
docs: improve doc check reporting with line numbers, GHA annotations,…
planetf1 Mar 19, 2026
ad7201f
fix: repair orphan check for Mintlify v2 docs.json and improve summar…
planetf1 Mar 19, 2026
5909fc0
ci: per-kind quality breakdown in summary table, drop misleading skip…
planetf1 Mar 19, 2026
6ddcc03
docs: add fix hints and doc refs to coverage and MDX validation output
planetf1 Mar 19, 2026
467c01d
docs: add annotation gap checks and public-API-only doc filter
planetf1 Mar 19, 2026
65b66b7
fix: align coverage scope with doc generator's public-API filter
planetf1 Mar 19, 2026
88d42bc
feat: add param_type_mismatch and return_type_mismatch docstring checks
planetf1 Mar 19, 2026
c874c68
docs: document param_type_mismatch and return_type_mismatch check kinds
planetf1 Mar 19, 2026
2a50045
fix: correct parent __init__.py path for module files in _is_public_s…
planetf1 Mar 19, 2026
8b38c30
style: fix ruff formatting and EN dash in validate.py
planetf1 Mar 19, 2026
272ed5f
revert: restore full D suppression for cli/ (see #705)
planetf1 Mar 19, 2026
8234e9b
docs: add artifact download link to quality gate failure summary
planetf1 Mar 19, 2026
4bfcd0a
feat: add --warn-only to validate.py for pre-commit informational mode
planetf1 Mar 19, 2026
a8b80de
docs: fix 36 docstring quality gate failures across 17 files
planetf1 Mar 20, 2026
c9c4325
fix: suppress pre-existing mypy errors exposed by new type annotations
planetf1 Mar 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 84 additions & 30 deletions .github/workflows/docs-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,31 @@ jobs:
id: audit_coverage
run: |
set -o pipefail
uv run python tooling/docs-autogen/audit_coverage.py --docs-dir docs/docs/api --threshold 80 --quality 2>&1 \
uv run python tooling/docs-autogen/audit_coverage.py --docs-dir docs/docs/api --threshold 80 2>&1 \
| tee /tmp/audit_coverage.log
continue-on-error: ${{ inputs.strict_validation != true }}

- name: Docstring quality gate
id: quality_gate
run: |
set -o pipefail
uv run python tooling/docs-autogen/audit_coverage.py \
--docs-dir docs/docs/api \
--quality --fail-on-quality --threshold 100 \
--orphans \
--output /tmp/quality_report.json 2>&1 \
| tee /tmp/quality_gate.log
# -- Upload artifact for deploy job --------------------------------------

- name: Upload quality report
if: always()
uses: actions/upload-artifact@v7
with:
name: docstring-quality-report
path: /tmp/quality_report.json
retention-days: 30

- name: Upload docs artifact
if: success() || (inputs.strict_validation != true)
uses: actions/upload-artifact@v7
Expand Down Expand Up @@ -141,12 +160,11 @@ jobs:
markdownlint_outcome = "${{ steps.markdownlint.outcome }}"
validate_outcome = "${{ steps.validate_mdx.outcome }}"
coverage_outcome = "${{ steps.audit_coverage.outcome }}"
strict = "${{ inputs.strict_validation }}" == "true"
mode = "" if strict else " *(soft-fail)*"
quality_gate_outcome = "${{ steps.quality_gate.outcome }}"
lint_log = read_log("/tmp/markdownlint.log")
validate_log = read_log("/tmp/validate_mdx.log")
coverage_log = read_log("/tmp/audit_coverage.log")
quality_gate_log = read_log("/tmp/quality_gate.log")
# Count markdownlint issues (lines matching file:line:col format)
lint_issues = len([l for l in lint_log.splitlines() if re.match(r'.+:\d+:\d+ ', l)])
Expand Down Expand Up @@ -186,45 +204,81 @@ jobs:
mdx_detail = parse_validate_detail(validate_log)
# Docstring quality annotation emitted by audit_coverage.py into the log
# Format: ::notice title=Docstring quality::message
# or ::warning title=Docstring quality::message
quality_match = re.search(r"::(notice|warning|error) title=Docstring quality::(.+)", coverage_log)
if quality_match:
quality_level, quality_msg = quality_match.group(1), quality_match.group(2)
quality_icon = "✅" if quality_level == "notice" else "⚠️"
quality_status = "pass" if quality_level == "notice" else "warning"
quality_detail = re.sub(r"\s*—\s*see job summary.*$", "", quality_msg)
quality_row = f"| Docstring Quality | {quality_icon} {quality_status}{mode} | {quality_detail} |"
# Parse per-kind counts from the quality gate log.
# _print_quality_report emits section headers like:
# " Missing docstrings (12)"
# " Missing Args section (5)"
# Capture label -> count from those lines, then build a compact
# per-kind breakdown for the summary table cell.
kind_short = {
"Missing docstrings": "missing",
"Short docstrings": "short",
"Missing Args section": "no_args",
"Missing Returns section": "no_returns",
"Missing Yields section (generator)": "no_yields",
"Missing Raises section": "no_raises",
"Missing class Args section": "no_class_args",
"Duplicate Args: in class + __init__ (Option C violation)": "dup_init_args",
"Param name mismatches (documented but not in signature)": "param_mismatch",
"TypedDict phantom fields (documented but not declared)": "td_phantom",
"TypedDict undocumented fields (declared but missing from Attributes:)": "td_undoc",
"Missing parameter type annotations (type absent from API docs)": "missing_param_type",
"Missing return type annotations (type absent from API docs)": "missing_return_type",
"Param type mismatch (docstring vs annotation)": "param_type_mismatch",
"Return type mismatch (docstring vs annotation)": "return_type_mismatch",
}
section_re = re.compile(r"^\s{2}(.+?)\s+\((\d+)\)\s*$", re.MULTILINE)
kind_counts = {}
for m in section_re.finditer(quality_gate_log):
label, count = m.group(1), int(m.group(2))
short = kind_short.get(label)
if short:
kind_counts[short] = count
if kind_counts:
parts = [f"{v} {k}" for k, v in kind_counts.items()]
quality_gate_detail = ", ".join(parts)
else:
quality_row = None
# Fall back to the summary annotation message
qm = re.search(r"::(notice|warning|error) title=Docstring quality::(.+)", quality_gate_log)
quality_gate_detail = re.sub(r"\s*—\s*see job summary.*$", "", qm.group(2)) if qm else ""
# Split coverage log at quality section to avoid duplicate output in collapsibles
quality_start = coverage_log.find("🔬 Running docstring quality")
if quality_start != -1:
quality_log = coverage_log[quality_start:]
coverage_display_log = coverage_log[:quality_start].strip()
else:
quality_log = ""
coverage_display_log = coverage_log
CONTRIB_URL = (
"https://github.com/generative-computing/mellea/blob/main"
"/docs/docs/guide/CONTRIBUTING.md"
)
REPO = "${{ github.repository }}"
RUN_ID = "${{ github.run_id }}"
ARTIFACT_URL = f"https://github.com/{REPO}/actions/runs/{RUN_ID}#artifacts"
lines = [
"## Docs Build — Validation Summary\n",
"| Check | Result | Details |",
"|-------|--------|---------|",
f"| Markdownlint | {icon(markdownlint_outcome)} {markdownlint_outcome}{mode} | {lint_detail} |",
f"| MDX Validation | {icon(validate_outcome)} {validate_outcome}{mode} | {mdx_detail} |",
f"| API Coverage | {icon(coverage_outcome)} {coverage_outcome}{mode} | {cov_detail} |",
f"| Markdownlint | {icon(markdownlint_outcome)} {markdownlint_outcome} | {lint_detail} |",
f"| MDX Validation | {icon(validate_outcome)} {validate_outcome} | {mdx_detail} |",
f"| API Coverage | {icon(coverage_outcome)} {coverage_outcome} | {cov_detail} |",
f"| Docstring Quality | {icon(quality_gate_outcome)} {quality_gate_outcome} | {quality_gate_detail} |",
]
if quality_row:
lines.append(quality_row)
lines.append("")
# When the quality gate fails, surface a direct link to the fix reference.
# Per-kind Ref: URLs in the log output are inside a ```text``` block and
# don't render as links there.
if quality_gate_outcome == "failure":
lines += [
"> ❌ **Docstring quality gate failed.** "
f"See the [CI docstring checks reference]({CONTRIB_URL}#ci-docstring-checks-reference) "
"for per-kind fix instructions, or expand **Docstring quality details** below for the full list. \n"
f"> The full machine-readable report is available as the [`docstring-quality-report` artifact]({ARTIFACT_URL}).",
"",
]
for title, log, limit in [
("Markdownlint output", lint_log, 5_000),
("MDX validation output", validate_log, 5_000),
("API coverage output", coverage_display_log, 5_000),
("Docstring quality details", quality_log, 1_000_000),
("API coverage output", coverage_log, 5_000),
("Docstring quality details", quality_gate_log, 1_000_000),
]:
if log:
lines += [
Expand Down
13 changes: 6 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,16 @@ repos:
hooks:
- id: docs-mdx-validate
name: Validate generated MDX docs
entry: bash -c 'test -d docs/docs/api && uv run --no-sync python tooling/docs-autogen/validate.py docs/docs/api --skip-coverage || true'
entry: bash -c 'test -d docs/docs/api && uv run --no-sync python tooling/docs-autogen/validate.py docs/docs/api --skip-coverage --warn-only || true'
language: system
pass_filenames: false
files: (docs/docs/.*\.mdx$|tooling/docs-autogen/)
# TODO(#616): Move to normal commit flow once docstring quality issues reach 0.
# Griffe loads the full package (~10s), so this is manual-only for now to avoid
# slowing down every Python commit. Re-enable (remove stages: [manual]) and add
# --fail-on-quality once quality issues are resolved.
# Docstring quality gate — manual only (CI is the hard gate via docs-publish.yml).
# Run locally with: pre-commit run docs-docstring-quality --hook-stage manual
# Requires generated API docs (run `uv run python tooling/docs-autogen/build.py` first).
- id: docs-docstring-quality
name: Audit docstring quality (informational)
entry: bash -c 'test -d docs/docs/api && uv run --no-sync python tooling/docs-autogen/audit_coverage.py --quality --docs-dir docs/docs/api || true'
name: Audit docstring quality
entry: uv run --no-sync python tooling/docs-autogen/audit_coverage.py --quality --fail-on-quality --threshold 0 --docs-dir docs/docs/api
language: system
pass_filenames: false
files: (mellea/.*\.py$|cli/.*\.py$)
Expand Down
21 changes: 21 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,25 @@ differs in type or behaviour from the constructor input — for example, when a
argument is wrapped into a `CBlock`, or when a class-level constant is relevant to
callers. Pure-echo entries that repeat `Args:` verbatim should be omitted.

**`TypedDict` classes are a special case.** Their fields *are* the entire public
contract, so when an `Attributes:` section is present it must exactly match the
declared fields. The audit will flag:

- `typeddict_phantom` — `Attributes:` documents a field that is not declared in the `TypedDict`
- `typeddict_undocumented` — a declared field is absent from the `Attributes:` section

```python
class ConstraintResult(TypedDict):
"""Result of a constraint check.
Attributes:
passed: Whether the constraint was satisfied.
reason: Human-readable explanation.
"""
passed: bool
reason: str
```

#### Validating docstrings

Run the coverage and quality audit to check your changes before committing:
Expand All @@ -194,6 +213,8 @@ Key checks the audit enforces:
| `no_args` | Standalone function has params but no `Args:` section |
| `no_returns` | Function has a non-trivial return annotation but no `Returns:` section |
| `param_mismatch` | `Args:` documents names not present in the actual signature |
| `typeddict_phantom` | `TypedDict` `Attributes:` documents a field not declared in the class |
| `typeddict_undocumented` | `TypedDict` has a declared field absent from its `Attributes:` section |

**IDE hover verification** — open any of these existing classes in VS Code and hover
over the class name or a constructor call to confirm the hover card shows `Args:` once
Expand Down
2 changes: 1 addition & 1 deletion cli/alora/intrinsic_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def upload_intrinsic(
base_model (str): Base model ID or path (e.g.
``"ibm-granite/granite-3.3-2b-instruct"``). Must contain at most
one ``"/"`` separator.
type (Literal["lora", "alora"]): Adapter type, used as the leaf
type (Literal['lora', 'alora']): Adapter type, used as the leaf
directory name in the repository layout.
io_yaml (str): Path to the ``io.yaml`` configuration file for
intrinsic input/output processing.
Expand Down
23 changes: 19 additions & 4 deletions cli/alora/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,14 @@
import typer
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainerCallback
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainerCallback,
TrainerControl,
TrainerState,
TrainingArguments,
)
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer

# Handle MPS with old PyTorch versions on macOS only
Expand All @@ -39,7 +46,9 @@
)


def load_dataset_from_json(json_path, tokenizer, invocation_prompt):
def load_dataset_from_json(
json_path: str, tokenizer: AutoTokenizer, invocation_prompt: str
) -> Dataset:
"""Load a JSONL dataset and format it for SFT training.
Reads ``item``/``label`` pairs from a JSONL file and builds a HuggingFace
Expand Down Expand Up @@ -73,7 +82,7 @@ def load_dataset_from_json(json_path, tokenizer, invocation_prompt):
return Dataset.from_dict({"input": inputs, "target": targets})


def formatting_prompts_func(example):
def formatting_prompts_func(example: dict) -> list[str]:
"""Concatenate input and target columns for SFT prompt formatting.
Args:
Expand Down Expand Up @@ -101,7 +110,13 @@ class SaveBestModelCallback(TrainerCallback):
def __init__(self):
self.best_eval_loss = float("inf")

def on_evaluate(self, args, state, control, **kwargs):
def on_evaluate(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs,
):
"""Save the adapter weights if the current evaluation loss is a new best.
Called automatically by the HuggingFace Trainer after each evaluation
Expand Down
26 changes: 26 additions & 0 deletions cli/decompose/decompose.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,19 @@ class DecompVersion(StrEnum):
def reorder_subtasks(
subtasks: list[DecompSubtasksResult],
) -> list[DecompSubtasksResult]:
"""Topologically sort subtasks by their ``depends_on`` relationships.
Args:
subtasks: List of subtask dicts, each with a ``"tag"`` and optional
``"depends_on"`` field.
Returns:
list[DecompSubtasksResult]: The subtasks reordered so that dependencies
come before dependents, with numbering prefixes updated.
Raises:
ValueError: If a circular dependency is detected.
"""
subtask_map = {subtask["tag"].lower(): subtask for subtask in subtasks}

graph = {}
Expand Down Expand Up @@ -78,6 +91,19 @@ def reorder_subtasks(
def verify_user_variables(
decomp_data: DecompPipelineResult, input_var: list[str] | None
) -> DecompPipelineResult:
"""Validate that all required input variables and dependencies exist.
Args:
decomp_data: The decomposition pipeline result containing subtasks.
input_var: User-provided input variable names, or ``None`` for none.
Returns:
DecompPipelineResult: The (possibly reordered) decomposition data.
Raises:
ValueError: If a subtask requires an input variable that was not
provided, or depends on a subtask tag that does not exist.
"""
if input_var is None:
input_var = []

Expand Down
6 changes: 3 additions & 3 deletions cli/eval/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __init__(
self.score = score
self.validation_reason = validation_reason

def to_dict(self):
def to_dict(self) -> dict:
"""Serialise the input evaluation result to a plain dictionary.
Returns:
Expand Down Expand Up @@ -84,7 +84,7 @@ def __init__(self, test_eval: TestBasedEval, input_results: list[InputEvalResult
self.test_eval = test_eval
self.input_results = input_results

def to_dict(self):
def to_dict(self) -> dict:
"""Serialise the test evaluation result to a plain dictionary.
Returns:
Expand Down Expand Up @@ -366,7 +366,7 @@ def execute_test_eval(
return test_result


def parse_judge_output(judge_output: str):
def parse_judge_output(judge_output: str) -> tuple[int | None, str]:
"""Parse score and justification from a judge model's output string.
Args:
Expand Down
Loading
Loading