nf-core · pinin4fjords · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/modules/nf-core/custom/orfmerge/environment.yml b/modules/nf-core/custom/orfmerge/environment.yml
@@ -0,0 +1,9 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::pandas=2.3.0 # imported by the main.nf stub to report its version; presumably also used by the orfmerge.py template - confirm
+  - conda-forge::python=3.12.11 # interpreter that runs the stub's inline version script
+  - conda-forge::pyyaml=6.0.2 # provides the `yaml` module the stub uses to write versions.yml
diff --git a/modules/nf-core/custom/orfmerge/main.nf b/modules/nf-core/custom/orfmerge/main.nf
@@ -0,0 +1,55 @@
+process CUSTOM_ORFMERGE { // Turns one or more BED12 + TSV inputs into a single catalogue (BED12, TSV, ORF-to-gene map, MultiQC table) via the orfmerge.py template
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f1019bd22c111267bcb670fdb128829776f0ca6adfa7b0e2d126f91577d08e3/data' :
+        'community.wave.seqera.io/library/python_pandas_pyyaml:75514f9f977be607' }"
+
+    input:
+    tuple val(meta), path(bed12s, arity: '1..*', stageAs: 'beds/*'), path(tsvs, arity: '1..*', stageAs: 'tsvs/*')
+
+    output:
+    tuple val(meta), path("${prefix}.catalogue.bed12")    , emit: bed12
+    tuple val(meta), path("${prefix}.catalogue.tsv")      , emit: catalogue_tsv
+    tuple val(meta), path("${prefix}.orf_to_gene.tsv")    , emit: orf_to_gene_tsv
+    tuple val(meta), path("${prefix}.catalogue.mqc.tsv")  , emit: multiqc
+    path "versions.yml"                                   , emit: versions, topic: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    prefix = task.ext.prefix ?: "${meta.id}" // no `def`: the output: paths above reference prefix; falls back to meta.id when ext.prefix is unset
+    args   = task.ext.args ?: '' // not referenced in this file; presumably consumed by the orfmerge.py template - confirm
+    template 'orfmerge.py' // the real script body lives in the orfmerge.py template, not in this file
+
+    stub:
+    prefix = task.ext.prefix ?: "${meta.id}" // same prefix rule as script:, so the stub creates the declared output names
+    """
+    touch ${prefix}.catalogue.bed12
+    touch ${prefix}.catalogue.tsv
+    touch ${prefix}.orf_to_gene.tsv
+    touch ${prefix}.catalogue.mqc.tsv
+
+    python - <<END
+import platform
+import pandas
+import yaml
+
+with open("versions.yml", "w") as fh:
+    yaml.safe_dump(
+        {
+            "${task.process}": {
+                "python": platform.python_version(),
+                "pandas": pandas.__version__,
+            }
+        },
+        fh,
+        default_flow_style=False,
+        sort_keys=False,
+    )
+END
+    """
+}
diff --git a/modules/nf-core/custom/orfmerge/meta.yml b/modules/nf-core/custom/orfmerge/meta.yml
@@ -0,0 +1,147 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "custom_orfmerge"
+description: |
+  Cluster normalised per-sample, per-caller ORF predictions into a single
+  cohort-level catalogue. Pair with `custom/orfnormalise` upstream and
+  (typically) `bedtools/getfasta` + `seqkit/translate` downstream to obtain
+  the AA FASTA.
+
+  Strategy is class-aware (operating on the harmonised `orf_class` written
+  by `custom/orfnormalise`):
+
+    - canonical_cds:        collapse by (transcript_id, strand). One
+                            canonical CDS per transcript by definition.
+    - uORF, dORF, other:    collapse by (transcript_id, strand, start,
+                            end). A single transcript can host multiple
+                            distinct uORFs / dORFs / internal ORFs, so
+                            keying on the outer span keeps them in
+                            separate clusters while still merging
+                            cross-caller calls that agree on coordinates.
+    - novel_u, smORF:       greedy reciprocal-overlap clustering on the
+                            outer genomic span at `--reciprocal-overlap`
+                            (default 0.8). Catches fuzzy cross-caller
+                            matches and exact-coordinate collapses in
+                            one pass. Order-dependent at the boundary:
+                            a chain A-B-C where A-B and B-C overlap at
+                            ~0.85 but A-C only at ~0.75 may cluster as
+                            {A,B,C} or {A,B}+{C} depending on iteration
+                            order. Rare in practice at 0.8.
+
+  Cross-caller consensus is recorded in two column families on the
+  catalogue TSV:
+
+    - `called_by_<caller>`: 0/1 indicator per supported caller
+                            (ribotish, ribocode, ribotricer, rpbp, price).
+    - `score_<caller>`:     best score from that caller within the cluster.
+                            Score direction is per-caller (p-values are
+                            minimised; Bayes factors / phase scores are
+                            maximised).
+
+  Emits a small MultiQC custom-content TSV (per-class counts) for
+  inclusion in downstream MultiQC reports.
+keywords:
+  - orf
+  - ribo-seq
+  - catalogue
+  - merge
+  - clustering
+tools:
+  - "orfmerge":
+      description: |
+        Python helper that clusters normalised ORF BED12+TSV pairs across
+        callers and samples into one unified catalogue, recording per-caller
+        provenance and best score in the output table.
+      tool_dev_url: "https://github.com/nf-core/modules/blob/master/modules/nf-core/custom/orfmerge/main.nf"
+      licence: ["MIT"]
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map. Typically a cohort-level `[ id: 'allsamples' ]`.
+    - bed12s:
+        type: file
+        description: |
+          One or more normalised BED12 files (output of `custom/orfnormalise`).
+          Files are co-indexed with `tsvs` only via shared `orf_id`s; per-row
+          alignment is not required.
+        pattern: "*.bed12"
+        ontologies:
+          - edam: http://edamontology.org/format_3586 # BED
+    - tsvs:
+        type: file
+        description: |
+          One or more normalised sidecar TSV files (output of
+          `custom/orfnormalise`).
+        pattern: "*.tsv"
+        ontologies:
+          - edam: http://edamontology.org/format_3475 # TSV
+output:
+  bed12:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map matching the input meta.
+      - ${prefix}.catalogue.bed12:
+          type: file
+          description: Merged ORF catalogue as BED12 (genomic blocks).
+          pattern: "*.catalogue.bed12"
+          ontologies:
+            - edam: http://edamontology.org/format_3586 # BED
+  catalogue_tsv:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map matching the input meta.
+      - ${prefix}.catalogue.tsv:
+          type: file
+          description: |
+            Per-ORF table with `called_by_<caller>` and `score_<caller>`
+            columns for each supported caller, plus orf_class, aa_length,
+            and host gene_id / transcript_id.
+          pattern: "*.catalogue.tsv"
+          ontologies:
+            - edam: http://edamontology.org/format_3475 # TSV
+  orf_to_gene_tsv:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map matching the input meta.
+      - ${prefix}.orf_to_gene.tsv:
+          type: file
+          description: |
+            One row per (orf_id, gene_id, transcript_id) mapping. An ORF can
+            map to multiple host transcripts/genes when callers picked
+            different annotated isoforms; downstream gene-level aggregation
+            collapses these.
+          pattern: "*.orf_to_gene.tsv"
+          ontologies:
+            - edam: http://edamontology.org/format_3475 # TSV
+  multiqc:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map matching the input meta.
+      - ${prefix}.catalogue.mqc.tsv:
+          type: file
+          description: |
+            MultiQC custom-content TSV (per-class ORF counts).
+          pattern: "*.catalogue.mqc.tsv"
+          ontologies:
+            - edam: http://edamontology.org/format_3475 # TSV
+  versions:
+    - versions.yml:
+        type: file
+        description: File containing software versions
+        pattern: "versions.yml"
+        ontologies:
+          - edam: http://edamontology.org/format_3750 # YAML
+topics:
+  versions: # mirrors the `versions` output above: the same versions.yml file is sent to this topic
+    - versions.yml:
+        type: file
+        description: File containing software versions
+authors:
+  - "@pinin4fjords"
+maintainers:
+  - "@pinin4fjords"