Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions modules/nf-core/samtools/coverage/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,6 @@ process SAMTOOLS_COVERAGE {
def prefix = task.ext.prefix ?: "${meta.id}"
"""
echo "#rname\tstartpos\tendpos\tnumreads\tcovbases\tcoverage\tmeandepth\tmeanbaseq\tmeanmapq" > ${prefix}.txt
echo "chr21\t16570000\t16610000\t8741\t39996\t99.9875\t32.4854\t29.6\t59.8" >> ${prefix}.txt
"""
}
18 changes: 9 additions & 9 deletions modules/nf-core/samtools/coverage/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
{
"id": "test"
},
"test.txt:md5,ea885d54c0223dff86e2e44f5bc08374"
"test.txt:md5,5b5fbaf1b8a9645ab2e9d34891ae165a"
]
],
"versions_samtools": [
Expand All @@ -19,11 +19,11 @@
]
}
],
"timestamp": "2026-05-21T15:45:07.736128337",
"meta": {
"nf-test": "0.9.3",
"nextflow": "25.10.4"
},
"timestamp": "2026-03-19T08:56:33.66881"
"nf-test": "0.9.5",
"nextflow": "26.04.1"
}
},
"test_samtools_coverage_bam": {
"content": [
Expand All @@ -45,11 +45,11 @@
]
}
],
"timestamp": "2026-03-19T08:56:13.846408",
"meta": {
"nf-test": "0.9.3",
"nextflow": "25.10.4"
},
"timestamp": "2026-03-19T08:56:13.846408"
}
},
"test_samtools_coverage_cram": {
"content": [
Expand Down Expand Up @@ -86,10 +86,10 @@
]
}
],
"timestamp": "2026-03-19T08:56:19.040435",
"meta": {
"nf-test": "0.9.3",
"nextflow": "25.10.4"
},
"timestamp": "2026-03-19T08:56:19.040435"
}
}
}
62 changes: 50 additions & 12 deletions subworkflows/nf-core/bam_subsampledepth_samtools/main.nf
Original file line number Diff line number Diff line change
@@ -1,42 +1,80 @@
include { SAMTOOLS_DEPTH } from '../../../modules/nf-core/samtools/depth'
include { GAWK } from '../../../modules/nf-core/gawk'
include { SAMTOOLS_VIEW } from '../../../modules/nf-core/samtools/view'
include { SAMTOOLS_COVERAGE } from '../../../modules/nf-core/samtools/coverage'
include { SAMTOOLS_VIEW } from '../../../modules/nf-core/samtools/view'

workflow BAM_SUBSAMPLEDEPTH_SAMTOOLS {

take:
ch_bam_bai // channel: [ val(meta), path(bam), path(bai) ]
ch_depth // channel: [ val(meta), val(depth)]
ch_fasta // channel: [ val(meta), path(fasta), path(fai) ]
ch_regions // channel: [ val(meta), path(region) ]

main:

// Compute mean depth
SAMTOOLS_DEPTH(ch_bam_bai, [[], []])
ch_regions_branched = ch_regions
.branch { _meta, region ->
with_bed: region
without_bed: true
}

ch_regions_parsed = ch_regions_branched.with_bed
.splitCsv(header: false, sep:'\t')
.map{ meta, row -> [
meta, "${row[0]}:${row[1]}-${row[2]}"
]}
// For samples WITH BED file: compute coverage per region
ch_coverage_with_regions = ch_bam_bai
.combine(ch_regions_parsed)
.map{ meta, bam, bai, _metaR, region ->
[ meta + ["region": region], bam, bai ]
}

// For samples WITHOUT BED file: compute whole-genome coverage
ch_coverage_without_regions = ch_bam_bai
.combine(ch_regions_branched.without_bed)
.map{ meta, bam, bai, _metaR, _region_file -> [ meta, bam, bai ] }

// Use GAWK to get mean depth
GAWK(SAMTOOLS_DEPTH.out.tsv, [], false)
SAMTOOLS_COVERAGE(
ch_coverage_with_regions
.mix(ch_coverage_without_regions),
ch_fasta
)

// Compute downsampling factor
ch_mean_depth = GAWK.out.output
.splitCsv(header: false, sep:'\t')
ch_mean_depth = SAMTOOLS_COVERAGE.out.coverage
.splitCsv(header: true, sep:'\t')
.filter{ _meta, row -> row.meandepth as Float > 0 }
.map{ meta, row ->
[ meta, row[0] as Float ]
def keys_to_keep = meta.keySet() - ['region']
[
meta.subMap(keys_to_keep),
row.meandepth as Float
]
}
.groupTuple()
.map{ meta, mean ->
[ meta, mean.sum() / mean.size() ]
}

ch_input_subsample = ch_bam_bai
.join(ch_mean_depth)
.combine(ch_depth)
.map{ meta, bam, index, mean, metaD, depth ->
[ meta + metaD + ['subsample_fraction': depth as Float / mean, 'depth': depth ], bam, index ]
[ meta + metaD + ['subsample_fraction': depth as Float / mean, 'mean_depth': mean, 'depth': depth ], bam, index ]
}

ch_input_subsample
.filter{ meta, _bam, _index -> meta.subsample_fraction >= 1 }
.subscribe{ meta, _bam, _index ->
error "Sample ${meta.id} has mean depth of ${meta.mean_depth} and requested depth of ${meta.depth}, resulting in a subsampling fraction of ${meta.subsample_fraction}. As this is >= 1, no subsampling can be performed for this sample."
}

// Downsample
SAMTOOLS_VIEW(
ch_input_subsample,
ch_fasta,
[[], []],
[[], []],
ch_regions,
[]
)

Expand Down
15 changes: 12 additions & 3 deletions subworkflows/nf-core/bam_subsampledepth_samtools/meta.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
name: "bam_subsampledepth_samtools"
description: Subsample a BAM/CRAM/SAM file using samtools to a given mean depth
description: |
Subsample a BAM/CRAM/SAM file using samtools to a given mean depth.
"region", "subsample_fraction", "mean_depth" and "depth" keys will be added to the meta
map to distinguish the different file generated and therefore shouldn't be used.
keywords:
- subsample
- bam
- sam
- cram
components:
- samtools/depth
- samtools/coverage
- samtools/view
- gawk
input:
- ch_bam:
type: file
Expand All @@ -29,6 +31,13 @@ input:
The reference genome channel containing the fasta files and its index
Structure: [ val(meta), path(fasta), path(fai) ]
pattern: "*.{fa(sta)?}"
- ch_region:
type: file
description: |
The reference genome channel containing the different regions to consider
for the coverage calculation and downsampling.
Structure: [ val(meta), path(region) ]
pattern: "*.bed"
output:
- bam_subsampled:
type: file
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
process {
withName: GAWK {
ext.args2 = "'{ total += \$3 } END { print total/NR }'"
ext.suffix = "txt"
withName: SAMTOOLS_COVERAGE {
ext.args = { meta.region ? "--region ${meta.region}" : "" }
}

withName: SAMTOOLS_VIEW { // Index writing and subsampling command are necessary
Expand Down
115 changes: 100 additions & 15 deletions subworkflows/nf-core/bam_subsampledepth_samtools/tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,22 @@ nextflow_workflow {
tag "subworkflows/bam_subsampledepth_samtools"

tag "samtools"
tag "samtools/depth"
tag "samtools/coverage"
tag "samtools/view"
tag "gawk"

test("Downsample to 4X and 2X") {
when {
workflow {
"""
input[0] = channel.fromList([
[
[id: "NA12878"],
file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA12878/NA12878.s.bam", checkIfExists:true),
file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA12878/NA12878.s.bam.bai", checkIfExists:true),
],
[
[id: "NA19401"],
file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA19401/NA19401.s.bam", checkIfExists:true),
file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA19401/NA19401.s.bam.bai", checkIfExists:true),
],
regions_file = channel.of("chr21\t16570000\t16610000","chr22\t16570000\t16610000").collectFile(name: "regions.bed", newLine: true)
input[0] = channel.of([
[id: "NA12878"],
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.bam", checkIfExists:true),
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.bam.bai", checkIfExists:true),
], [
[id: "NA19401"],
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA19401.chr21_22.bam", checkIfExists:true),
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA19401.chr21_22.bam.bai", checkIfExists:true),
])
input[1] = channel.of([
[my_depth: "2x"], 2
Expand All @@ -38,9 +35,10 @@ nextflow_workflow {
])
input[2] = channel.of([
[id: "GRCh38"],
file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/reference_genome/GRCh38.s.fa.gz", checkIfExists:true),
file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/reference_genome/GRCh38.s.fa.gz.fai", checkIfExists:true)
file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genomeGRCh38_chr21_22.fa.gz", checkIfExists:true),
file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genomeGRCh38_chr21_22.fa.gz.fai", checkIfExists:true)
]).collect()
input[3] = channel.of([[id: "regions"]]).combine(regions_file).collect()
"""
}
}
Expand All @@ -55,4 +53,91 @@ nextflow_workflow {
).match()
}
}

test("Downsample to 2X - no region -- error") {
tag "test"
when {
workflow {
"""
input[0] = channel.of([
[id: "NA12878"],
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.bam", checkIfExists:true),
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.bam.bai", checkIfExists:true),
])
input[1] = channel.of([[my_depth: "2x"], 2])
input[2] = channel.of([
[id: "GRCh38"],
file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genomeGRCh38_chr21_22.fa.gz", checkIfExists:true),
file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genomeGRCh38_chr21_22.fa.gz.fai", checkIfExists:true)
]).collect()
input[3] = channel.of([[id: "regions"], []]).collect()
"""
}
}

then {
assert workflow.failed
assert workflow.errorMessage.contains("Sample NA12878 has mean depth of 0.02665365021675825 and requested depth of 2, resulting in a subsampling fraction of 75.03662664345003. As this is >= 1, no subsampling can be performed for this sample.")
}
}

test("Downsample to 2X - no region") {
tag "test"
when {
workflow {
"""
input[0] = channel.of([
[id: "NA12878"],
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.bam", checkIfExists:true),
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.bam.bai", checkIfExists:true),
])
input[1] = channel.of([[my_depth: "0.01x"], 0.01]) // Lower valued as we don't have the full chromosome
input[2] = channel.of([
[id: "GRCh38"],
file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genomeGRCh38_chr21_22.fa.gz", checkIfExists:true),
file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genomeGRCh38_chr21_22.fa.gz.fai", checkIfExists:true)
]).collect()
input[3] = channel.of([[id: "regions"], []]).collect()
"""
}
}

then {
assert workflow.success
assert snapshot(
workflow.out.bam_subsampled.collect{ meta, bam_file, bai_file -> [
meta, bam_file, bai_file,
bam(bam_file).getReads().size()
]}
).match()
}
}

test("Downsample to 2X - no region -- stub") {
tag "stub"
options "-stub"
when {
workflow {
"""
input[0] = channel.of([
[id: "NA12878"],
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.bam", checkIfExists:true),
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/bam/NA12878.chr21_22.bam.bai", checkIfExists:true),
])
input[1] = channel.of([[my_depth: "2x"], 2])
input[2] = channel.of([
[id: "GRCh38"],
file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genomeGRCh38_chr21_22.fa.gz", checkIfExists:true),
file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genomeGRCh38_chr21_22.fa.gz.fai", checkIfExists:true)
]).collect()
input[3] = channel.of([[], []])
"""
}
}

then {
assert workflow.success
assert snapshot(sanitizeOutput(workflow.out)).match()
}
}
}
Loading
Loading