ablab · andrewprzh · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/.github/workflows/Mouse.ONT_simulated.polyA.yml b/.github/workflows/Mouse.ONT_simulated.polyA.yml
@@ -0,0 +1,76 @@
+name: Mouse ONT simulated polyA prediction
+
+on:
+  workflow_dispatch:
+  schedule:
+  - cron: '0 6 * * 5'
+
+env:
+  RUN_NAME: Mouse.ONT_simulated.polyA
+  LAUNCHER: ${{github.workspace}}/isoquant_tests/github/run_pipeline.py
+  CFG_DIR: /abga/work/andreyp/ci_isoquant/data
+  BIN_PATH: /abga/work/andreyp/ci_isoquant/bin/
+  OUTPUT_BASE: /abga/work/andreyp/ci_isoquant/output/${{github.ref_name}}/
+
+concurrency:
+  group: ${{github.workflow}}
+  cancel-in-progress: false
+
+jobs:
+  check-changes:
+    runs-on:
+      labels: [isoquant]
+    name: 'Check for recent changes'
+    outputs:
+      has_changes: ${{steps.check.outputs.has_changes}}
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: 'Check for commits in last 7 days'
+        id: check
+        run: |
+          # Always run on manual trigger
+          if [ "${{github.event_name}}" = "workflow_dispatch" ]; then
+            echo "has_changes=true" >> $GITHUB_OUTPUT
+            exit 0
+          fi
+          # Check for commits in last 7 days
+          COMMITS=$(git log --oneline --since="7 days ago" | wc -l)
+          if [ "$COMMITS" -gt 0 ]; then
+            echo "Found $COMMITS commits in last 7 days"
+            echo "has_changes=true" >> $GITHUB_OUTPUT
+          else
+            echo "No commits in last 7 days, skipping"
+            echo "has_changes=false" >> $GITHUB_OUTPUT
+          fi
+
+  launch-runner:
+    needs: check-changes
+    if: needs.check-changes.outputs.has_changes == 'true'
+    runs-on:
+      labels: [isoquant]
+    name: 'Running IsoQuant and QC'
+
+    steps:
+      - name: 'Cleanup'
+        run: >
+          set -e &&
+          shopt -s dotglob &&
+          rm -rf *
+
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 1
+
+      - name: 'IsoQuant polyA prediction'
+        if: always()
+        shell: bash
+        env:
+          STEP_NAME: Mouse.ONT_simulated.polyA
+        run: |
+          export PATH=$PATH:${{env.BIN_PATH}}
+          python3 ${{env.LAUNCHER}} ${{env.CFG_DIR}}/${{env.STEP_NAME}}.yaml -o ${{env.OUTPUT_BASE}}
diff --git a/.gitignore b/.gitignore
@@ -107,3 +107,7 @@ venv.bak/
 
 # Claude Code documentation (private)
 .claude/
+
+# polyA/TSS XGBoost training artifacts (regenerated by scripts/train_polya_tss_model.py)
+isoquant_lib/data/model_df.csv
+src/model_df.csv
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -7,6 +7,7 @@ include requirements_tests.txt
 
 recursive-include isoquant_lib *.py
 recursive-include isoquant_lib/test_data *
+recursive-include isoquant_lib/data *.json
 
 prune isoquant_tests/out_*
 prune .claude

diff --git a/README.md b/README.md
@@ -29,6 +29,13 @@ assigns reads to the annotated isoforms based on their intron and exon structure
 IsoQuant further performs annotated gene, isoform, exon, and intron quantification.
 If reads are grouped (e.g. according to a cell type), counts are reported according to the provided grouping.
 
+When a reference annotation is supplied, IsoQuant additionally predicts polyA cleavage
+sites per transcript by clustering read end positions, filtering peaks with a
+pre-trained XGBoost classifier, and flagging each peak as `Known` (within 10 bp of the
+annotated transcript end) or `Novel`. Passing `--fl_data` enables the same prediction
+for transcription start sites. Results are written to `*.polyA_prediction.tsv` and
+`*.TSS_prediction.tsv` alongside the standard counts files (see [docs/output.md](docs/output.md)).
+
 The latest IsoQuant version can be downloaded from [github.com/ablab/IsoQuant/releases/latest](https://github.com/ablab/IsoQuant/releases/latest).
 
 Full IsoQuant documentation is available at [ablab.github.io/IsoQuant](https://ablab.github.io/IsoQuant/).

diff --git a/docs/output.md b/docs/output.md
@@ -29,6 +29,29 @@ If `--count_exons` is set, exon and intron counts will be produced:
 * `SAMPLE_ID.exon_counts.tsv` - reference exon inclusion/exclusion read counts;
 * `SAMPLE_ID.intron_counts.tsv` - reference intron inclusion/exclusion read counts;
 
+#### PolyA / TSS site prediction
+
+Whenever a gene annotation is supplied via `--genedb`, IsoQuant accumulates
+per-transcript histograms of read polyA positions, detects peaks, filters them
+with a pre-trained XGBoost classifier, and classifies each retained peak as
+`Known` (within 10 bp of the annotated transcript end) or `Novel`:
+
+* `SAMPLE_ID.polyA_prediction.tsv` - predicted polyA cleavage sites per reference transcript.
+
+If `--fl_data` is also supplied (reads represent full-length transcripts), the
+same machinery is applied to read start positions:
+
+* `SAMPLE_ID.TSS_prediction.tsv` - predicted transcription start sites per reference transcript.
+
+Columns: `chromosome`, `transcript_id`, `gene_id`, `prediction` (genomic
+coordinate of the peak), `counts` (number of reads supporting the peak window),
+`flag` (`Known` / `Novel`). When `--read_group` is set, an additional file per
+grouping strategy is produced with two extra columns, `counts_byGroup` and
+`group_id`:
+
+* `SAMPLE_ID.polyA_prediction_grouped_<strategy>`
+* `SAMPLE_ID.TSS_prediction_grouped_<strategy>` (only with `--fl_data`)
+
 If `--read_group` is set or multiple files are provided, the per-group expression values for reference features will be also computed:
 
 #### Default grouped counts in linear format

diff --git a/isoquant.py b/isoquant.py
@@ -338,6 +338,25 @@ def add_hidden_option(*args, **kwargs):  # show command only with --full-help
     add_hidden_option("--cage", help="bed file with CAGE peaks", type=str, default=None)
     add_hidden_option("--cage-shift", type=int, default=50, help="interval before read start to look for CAGE peak")
 
+    # PolyA / TSS training-data collection (developer-only).
+    # When either flag is given to a normal IsoQuant run, the matching
+    # terminal-position counter switches from inference to dumping per-peak
+    # features (the eight FEATURE_COLUMNS plus `chromosome` and a `true_peak`
+    # 0/1 label computed against the annotated transcript end) to the CSV
+    # path supplied as the flag value. The XGBoost model is not consulted;
+    # rows are accumulated, not filtered. Feed the CSV to
+    # `misc/train_polya_tss_model.py` to fit a fresh model.
+    #   python isoquant.py … --genedb GENEDB --collect_polya_training peaks.csv
+    #   python misc/train_polya_tss_model.py --features peaks.csv \
+    #       --output isoquant_lib/data/model_polya.json
+    # For TSS, add --fl_data and --collect_tss_training tss_peaks.csv.
+    # These flags are intentionally hidden from --help/--full_help; the run
+    # emits a clearly marked warning when they are used.
+    add_hidden_option("--collect_polya_training", type=str, default=None,
+                      help="Developer: dump per-peak features + true_peak label for polyA training to this CSV path.")
+    add_hidden_option("--collect_tss_training", type=str, default=None,
+                      help="Developer: dump per-peak features + true_peak label for TSS training to this CSV path.")
+
     isoquant_version = "3.12.0"
     try:
         with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "VERSION")) as version_f:
@@ -1187,6 +1206,15 @@ def main(cmd_args):
         parser.print_usage()
         sys.exit(IsoQuantExitCode.SUCCESS)
     set_logger(args)
+    if getattr(args, "collect_polya_training", None) or getattr(args, "collect_tss_training", None):
+        logger.warning("=" * 78)
+        logger.warning("DEVELOPER MODE: polyA/TSS training-data collection enabled.")
+        logger.warning("Per-peak features will be written to the supplied CSV path INSTEAD of")
+        logger.warning("running the production XGBoost peak filter. Predictions in")
+        logger.warning("*.polyA_prediction.tsv / *.TSS_prediction.tsv will be empty.")
+        logger.warning("This mode is intended only for retraining the shipped model; see")
+        logger.warning("misc/train_polya_tss_model.py and .claude/POLYA_TSS_TRAINING.md.")
+        logger.warning("=" * 78)
     args = check_and_load_args(args, parser)
     create_output_dirs(args)
     set_additional_params(args)

diff --git a/isoquant_lib/assignment_aggregator.py b/isoquant_lib/assignment_aggregator.py
@@ -15,6 +15,7 @@
     create_gene_counter,
     create_transcript_counter,
 )
+from .terminal_counter import PolyACounter, TSSCounter
 from .assignment_io import (
     IOSupport,
     BEDPrinter,
@@ -100,6 +101,18 @@ def __init__(self, args, sample, string_pools, gffutils_db=None, chr_id=None, gz
             self.intron_counter = IntronCounter(intron_counts_path)
             self.global_counter.add_counters([self.exon_counter, self.intron_counter])
 
+        # polyA / TSS terminal-position prediction (ungrouped). PolyA requires
+        # only the gene annotation; TSS also requires --fl_data because read
+        # start coordinates without full-length evidence are unreliable.
+        if self.args.genedb:
+            polya_path = sample.get_polya_prediction_file(chr_id) if chr_id else sample.out_polya_prediction_tsv
+            self.polya_counter = PolyACounter(self.args, polya_path)
+            self.global_counter.add_counter(self.polya_counter)
+            if self.args.fl_data:
+                tss_path = sample.get_tss_prediction_file(chr_id) if chr_id else sample.out_tss_prediction_tsv
+                self.tss_counter = TSSCounter(self.args, tss_path)
+                self.global_counter.add_counter(self.tss_counter)
+
         if self.args.read_group and self.args.genedb:
             for group_idx, strategy_name in enumerate(self.grouping_strategy_names):
                 # Use chr-specific paths if chr_id is provided
@@ -134,6 +147,29 @@ def __init__(self, args, sample, string_pools, gffutils_db=None, chr_id=None, gz
                     intron_counter = IntronCounter(intron_out_file, string_pools=self.string_pools, group_index=group_idx)
                     self.global_counter.add_counters([exon_counter, intron_counter])
 
+                # Grouped polyA / TSS prediction (one file per grouping strategy).
+                # Skipped in training-collection mode -- only the ungrouped counter
+                # writes the per-chr training fragment, and grouped predictions
+                # are not produced in dev mode.
+                if not getattr(self.args, "collect_polya_training", None):
+                    if chr_id:
+                        polya_out_file = sample.get_grouped_counts_file(chr_id, "polyA_prediction", strategy_name)
+                    else:
+                        polya_out_file = f"{sample.out_polya_prediction_grouped_tsv}_{strategy_name}"
+                    grouped_polya_counter = PolyACounter(self.args, polya_out_file,
+                                                         string_pools=self.string_pools,
+                                                         group_index=group_idx)
+                    self.global_counter.add_counter(grouped_polya_counter)
+                if self.args.fl_data and not getattr(self.args, "collect_tss_training", None):
+                    if chr_id:
+                        tss_out_file = sample.get_grouped_counts_file(chr_id, "TSS_prediction", strategy_name)
+                    else:
+                        tss_out_file = f"{sample.out_tss_prediction_grouped_tsv}_{strategy_name}"
+                    grouped_tss_counter = TSSCounter(self.args, tss_out_file,
+                                                     string_pools=self.string_pools,
+                                                     group_index=group_idx)
+                    self.global_counter.add_counter(grouped_tss_counter)
+
         if self.args.read_group and not self.args.no_model_construction:
             for group_idx, strategy_name in enumerate(self.grouping_strategy_names):
                 if chr_id:

diff --git a/isoquant_lib/data/model_polya.json b/isoquant_lib/data/model_polya.json
diff --git a/isoquant_lib/data/model_polya_old.json b/isoquant_lib/data/model_polya_old.json
diff --git a/isoquant_lib/data/model_tss.json b/isoquant_lib/data/model_tss.json
diff --git a/isoquant_lib/data/model_tss_old.json b/isoquant_lib/data/model_tss_old.json
diff --git a/isoquant_lib/dataset_processor.py b/isoquant_lib/dataset_processor.py
@@ -739,6 +739,23 @@ def merge_assignments(self, sample, aggregator, chr_ids):
             merge_counts(counter, sample.prefix, chr_ids, unaligned)
             # counter.convert_counts_to_tpm(self.args.normalization_method)
 
+        # Developer training-data collection: concatenate per-chr training
+        # fragments (written by TerminalCounter._dump_training_features) into
+        # the user-supplied CSV path. Per-chr fragments live next to the
+        # per-chr prediction TSVs with a .training.csv suffix.
+        polya_csv = getattr(self.args, "collect_polya_training", None)
+        if polya_csv:
+            with open(polya_csv, "w") as fh:
+                merge_files(sample.out_polya_prediction_tsv + ".training.csv",
+                            sample.prefix, chr_ids, fh, header_lines=1)
+            logger.info("PolyA training features written to %s", polya_csv)
+        tss_csv = getattr(self.args, "collect_tss_training", None)
+        if tss_csv and self.args.fl_data:
+            with open(tss_csv, "w") as fh:
+                merge_files(sample.out_tss_prediction_tsv + ".training.csv",
+                            sample.prefix, chr_ids, fh, header_lines=1)
+            logger.info("TSS training features written to %s", tss_csv)
+
     def merge_transcript_models(self, label, aggregator, chr_ids, gff_printer):
         merge_files(gff_printer.model_fname, label, chr_ids, gff_printer.out_gff, copy_header=False)
         if gff_printer.output_r2t:

diff --git a/isoquant_lib/input_data_storage.py b/isoquant_lib/input_data_storage.py
@@ -73,6 +73,10 @@ def _init_paths(self):
         self.out_exon_grouped_counts_tsv = self._make_path(self.prefix + ".exon_grouped")
         self.out_intron_grouped_counts_tsv = self._make_path(self.prefix + ".intron_grouped")
         self.out_t2t_tsv = self._make_path(self.prefix + ".novel_vs_known.SQANTI-like.tsv")
+        self.out_polya_prediction_tsv = self._make_path(self.prefix + ".polyA_prediction.tsv")
+        self.out_tss_prediction_tsv = self._make_path(self.prefix + ".TSS_prediction.tsv")
+        self.out_polya_prediction_grouped_tsv = self._make_path(self.prefix + ".polyA_prediction_grouped")
+        self.out_tss_prediction_grouped_tsv = self._make_path(self.prefix + ".TSS_prediction_grouped")
         self.barcodes_tsv = self._make_path(self.prefix + ".barcoded_reads")
         self.barcodes_done = self._make_aux_path(self.prefix + ".barcodes_done")
         self.barcodes_split_reads = self._make_aux_path(self.prefix + ".split_barcodes")
@@ -182,6 +186,14 @@ def get_t2t_tsv_file(self, chr_id: str) -> str:
         """Get path to SQANTI-like TSV for a chromosome."""
         return self._make_path(self.get_chr_prefix(chr_id) + ".novel_vs_known.SQANTI-like.tsv")
 
+    def get_polya_prediction_file(self, chr_id: str) -> str:
+        """Get path to polyA prediction TSV for a chromosome."""
+        return self._make_path(self.get_chr_prefix(chr_id) + ".polyA_prediction.tsv")
+
+    def get_tss_prediction_file(self, chr_id: str) -> str:
+        """Get path to TSS prediction TSV for a chromosome."""
+        return self._make_path(self.get_chr_prefix(chr_id) + ".TSS_prediction.tsv")
+
     def get_grouped_counts_file(self, chr_id: str, feature: str, strategy_name: str) -> str:
         """Get path to grouped counts file for a chromosome.