Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
e952e82
feat: add additional probe filtering
adthrasher Dec 9, 2025
65b22a9
chore: add snps output
adthrasher Dec 9, 2025
2dbf660
chore: bump minfi docker image
adthrasher Dec 9, 2025
624f56f
chore: update file format
adthrasher Dec 9, 2025
1418d8e
chore: remove quotes from tabular output
adthrasher Dec 9, 2025
c34478b
chore: ruff format
adthrasher Dec 9, 2025
712b11d
chore: lint
adthrasher Dec 9, 2025
3042a9d
chore: bump pandas
adthrasher Dec 9, 2025
2ed6ed7
chore: combine snp probe lists
adthrasher Dec 10, 2025
ae1c8e9
chore: lint
adthrasher Dec 10, 2025
fe6511b
chore: cleanup linked files
adthrasher Dec 10, 2025
6ad8b26
chore: iterative merge of snp probe lists
adthrasher Dec 10, 2025
da8c773
chore: fix argument
adthrasher Dec 11, 2025
70ef378
chore: add probes on sex chromosomes to output
adthrasher Dec 11, 2025
57b1e3a
chore: add high p-val probe list
adthrasher Dec 15, 2025
d464f49
chore: print non-genomic probe list
adthrasher Dec 15, 2025
9070d43
chore: add non-genomic probe list to task output
adthrasher Dec 15, 2025
6d107fc
chore: rename output
adthrasher Dec 16, 2025
112a7c5
chore: cleanup output docs and add merging of non-genomic probe lists
adthrasher Dec 16, 2025
44e9584
chore: make output optional
adthrasher Dec 16, 2025
cdddb5b
chore: make output optional
adthrasher Dec 16, 2025
1593476
chore: remove branch containers
adthrasher Dec 16, 2025
9d7604f
chore: update CHANGELOG
adthrasher Dec 16, 2025
c0d9388
chore: Apply suggestions from code review
adthrasher Jan 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docker/minfi/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ RUN R --no-save <<SCRIPT
BiocManager::install("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
SCRIPT

COPY --from=scripts --chmod=777 methylation/methylation-preprocess.R /scripts/methylation/methylation-preprocess.R
COPY --from=scripts --chmod=777 methylation/methylation-preprocess.R /scripts/methylation/methylation-preprocess.R
COPY --from=scripts --chmod=777 methylation/list-sex-probes.R /scripts/methylation/list-sex-probes.R
2 changes: 1 addition & 1 deletion docker/minfi/package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "minfi",
"version": "1.48.0",
"revision": "7"
"revision": "8"
}
2 changes: 1 addition & 1 deletion docker/pandas/package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "pandas",
"version": "2.2.1",
"revision": "6"
"revision": "7"
}
11 changes: 11 additions & 0 deletions scripts/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/).

Any change to the `scripts/` directory should be accompanied by version increases in the `docker/` directory! If you are editing this file, please ensure these changes propagate!

## 2025 December

### Added

- Added `list-sex-probes.R` to dump the list of probes that align to the sex chromosomes [#283](https://github.com/stjudecloud/workflows/pull/283)

### Changed

- `filter.py` now accepts an optional set of files listing probes to exclude [#283](https://github.com/stjudecloud/workflows/pull/283)
- `methylation-preprocess.R` now outputs non-genomic probes, SNP affected probes, and non-SNP affected probes [#283](https://github.com/stjudecloud/workflows/pull/283)

## 2025 September

### Added
Expand Down
18 changes: 17 additions & 1 deletion scripts/methylation/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ def get_args():
help="Fraction of samples that must exceed the p-value threshold.",
)
parser.add_argument("--pval", type=str, help="P-values CSV file.")
parser.add_argument(
"--exclude", type=str, action="append", help="Files with probes to exclude."
)
parser.add_argument("beta", type=str, help="Beta values CSV file.")
args = parser.parse_args()

Expand All @@ -40,6 +43,15 @@ def get_args():
if __name__ == "__main__":
args = get_args()

# Read probes to exclude
probes_to_exclude = []
if args.exclude is not None:
for exclude_file in args.exclude:
with open(exclude_file, "r") as ef:
for line in ef:
probes_to_exclude.append(line.strip())
print("Number of probes to exclude:", len(probes_to_exclude))

# Read p-values and find probes with high p-value in too many samples
high_pval_probes = []
if args.pval is not None:
Expand All @@ -66,6 +78,10 @@ def get_args():
"high_pval_probes.csv", index=False, header=False
)

# Combine with probes to exclude
exclude_probe_list = set(high_pval_probes).union(set(probes_to_exclude))
print("Total number of probes to exclude:", len(exclude_probe_list))

# Read beta values and compute standard deviation
data = []

Expand All @@ -78,7 +94,7 @@ def get_args():

probe = line[0]
sd = np.std([float(x) for x in line[1:]])
if probe not in high_pval_probes:
if probe not in exclude_probe_list:
data.append([probe, sd])

sd_df = pd.DataFrame(data, columns=["probe", "sd"]).set_index("probe")
Expand Down
15 changes: 15 additions & 0 deletions scripts/methylation/list-sex-probes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
library(minfi)
library(IlluminaHumanMethylationEPICmanifest)
library(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)


ann <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
sex_probes <- rownames(ann)[ann$chr == "chrX" | ann$chr == "chrY"]

write.table(
sex_probes,
file = "sex_probes.txt",
quote = FALSE,
row.names = FALSE,
col.names = FALSE
)
35 changes: 35 additions & 0 deletions scripts/methylation/methylation-preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ saveRDS(r_set, paste0(args$out_base, ".RSet.rds"))
gr_set <- mapToGenome(r_set)
saveRDS(gr_set, paste0(args$out_base, ".GRSet.rds"))

non_genomic_probes <- setdiff(featureNames(r_set), featureNames(gr_set))

# Get the list of sites with SNPs
snps <- getSnpInfo(gr_set)
gr_set_snps <- addSnpInfo(gr_set)
# Remove probes with SNPs at CpG or single-base extension sites
gr_set_snps <- dropLociWithSnps(gr_set_snps, snps = c("SBE", "CpG"), maf = 0)
probes_without_snps <- featureNames(gr_set_snps)

# Take the genomic mapped RatioSet and fill Beta values (non-normalized).
# Get the NON-normalized beta values:
beta <- getBeta(gr_set)
Expand All @@ -81,6 +90,32 @@ write.csv(sample_names, paste0(args$out_base, ".sampleNames.csv"))
probe_names <- featureNames(gr_set)
write.csv(probe_names, paste0(args$out_base, ".probeNames.csv"))

# Write probes with SNPs
probes_with_snps <- setdiff(probe_names, probes_without_snps)
write.table(
probes_with_snps,
paste0(args$out_base, ".probes_with_snps.tab"),
row.names = FALSE,
col.names = FALSE,
quote = FALSE,
)
write.table(
probes_without_snps,
paste0(args$out_base, ".probes_without_snps.tab"),
row.names = FALSE,
col.names = FALSE,
quote = FALSE,
)

# Write non-genomic probe list
write.table(
non_genomic_probes,
paste0(args$out_base, ".non_genomic_probes.tab"),
row.names = FALSE,
col.names = FALSE,
quote = FALSE,
)

gr <- granges(gr_set)
write.csv(gr, paste0(args$out_base, ".gr.csv"))

Expand Down
11 changes: 10 additions & 1 deletion workflows/methylation/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,16 @@
All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/).


## 2025 December

### Added

- Now outputs list of probes that map to sex chromosomes [#283](https://github.com/stjudecloud/workflows/pull/283)
- Pipeline writes out the list of filtered probes [#283](https://github.com/stjudecloud/workflows/pull/283)
- Pipeline writes out list of probes that are affected by SNPs [#283](https://github.com/stjudecloud/workflows/pull/283)
- Pipeline writes out list of probes that were filtered for not mapping to genomic coordinates [#283](https://github.com/stjudecloud/workflows/pull/283)

## 2025 September

### Added
Expand Down
14 changes: 12 additions & 2 deletions workflows/methylation/methylation-cohort.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,22 @@ workflow methylation_cohort {
umap_embedding: "UMAP embedding for all samples",
umap_plot: "UMAP plot for all samples",
probe_pvalues: "Matrix (in CSV format) containing detection p-values for every (common) probe on the array as rows and all of the input samples as columns.",
high_pval_probes: "List of probes that were filtered out due to high p-values",
}
allowNestedInputs: true
}

parameter_meta {
unfiltered_normalized_beta: "Array of unfiltered normalized beta values for each sample"
sex_probe_list: "List of probes mapping to sex chromosomes to optionally filter"
p_values: "Array of detection p-value files for each sample."
skip_pvalue_check: "Skip filtering based on p-values, even if `p_values` is supplied."
num_probes: "Number of probes to use when filtering to the top `num_probes` probes with the highest standard deviation."
}

input {
Array[File] unfiltered_normalized_beta
File? sex_probe_list
Array[File] p_values = []
Boolean skip_pvalue_check = false
Int num_probes = 10000
Expand Down Expand Up @@ -120,6 +123,7 @@ workflow methylation_cohort {
),
p_values = pval_file,
num_probes,
additional_probes_to_exclude = select_all([sex_probe_list]),
}

call generate_umap { input:
Expand All @@ -142,6 +146,7 @@ workflow methylation_cohort {
File umap_embedding = generate_umap.umap
File umap_plot = plot_umap.umap_plot
File? probe_pvalues = pval_file
File? high_pval_probes = filter_probes.high_pval_probes
}
}

Expand Down Expand Up @@ -191,7 +196,7 @@ task combine_data {
}

runtime {
container: "ghcr.io/stjudecloud/pandas:2.2.1-6"
container: "ghcr.io/stjudecloud/pandas:2.2.1-7"
memory: "~{memory_gb} GB"
cpu: 1
disks: "~{disk_size_gb} GB"
Expand All @@ -206,12 +211,14 @@ task filter_probes {
outputs: {
filtered_beta_values: "Filtered beta values for all samples",
filtered_probes: "Probes that were retained after filtering.",
high_pval_probes: "Probes that were filtered out due to high p-values",
}
}

parameter_meta {
beta_values: "Beta values for all samples"
p_values: "P-values for all samples"
additional_probes_to_exclude: "Additional probes to exclude from the analysis"
prefix: "Prefix for the output files. The extensions `.beta.csv` and `.probes.csv` will be appended."
pval_threshold: "P-value cutoff to determine poor quality probes"
pval_sample_fraction: "Fraction of samples that must exceed p-value threshold to exclude probe"
Expand All @@ -221,6 +228,7 @@ task filter_probes {
input {
File beta_values
File? p_values
Array[File] additional_probes_to_exclude = []
String prefix = "filtered"
Float pval_threshold = 0.01
Float pval_sample_fraction = 0.5
Expand All @@ -237,16 +245,18 @@ task filter_probes {
--pval-threshold ~{pval_threshold} \
--pval-sample-fraction ~{pval_sample_fraction} \
~{"--pval '" + p_values + "'"} \
~{sep(" ", prefix("--exclude ", quote(additional_probes_to_exclude)))} \
"~{beta_values}"
>>>

output {
File filtered_beta_values = "~{prefix}.beta.csv"
File filtered_probes = "~{prefix}.probes.csv"
File? high_pval_probes = "high_pval_probes.csv"
}

runtime {
container: "ghcr.io/stjudecloud/pandas:2.2.1-6"
container: "ghcr.io/stjudecloud/pandas:2.2.1-7"
memory: "8 GB"
cpu: 1
disks: "~{disk_size_gb} GB"
Expand Down
41 changes: 40 additions & 1 deletion workflows/methylation/methylation-preprocess.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ task process_raw_idats {
m_values: "M values",
probe_names: "Probe names found on the array",
probe_pvalues: "Matrix (in CSV format) containing detection p-values for every (common) probe on the array as rows.",
probes_with_snps: "List of probes that contain SNPs",
probes_without_snps: "List of probes that do not contain SNPs",
non_genomic_probes: "List of probes that do not map to the genome",
}
}

Expand Down Expand Up @@ -45,6 +48,8 @@ task process_raw_idats {
--idat_base "~{idat_base}" \
--out_base "~{out_base}" \
--seed ~{seed}

rm ./*.idat
>>>

output {
Expand All @@ -58,13 +63,47 @@ task process_raw_idats {
File m_values = out_base + ".m_values.csv"
File probe_names = out_base + ".probeNames.csv"
File probe_pvalues = out_base + ".detectionP.csv"
File probes_with_snps = out_base + ".probes_with_snps.tab"
File probes_without_snps = out_base + ".probes_without_snps.tab"
File non_genomic_probes = out_base + ".non_genomic_probes.tab"
}

runtime {
container: "ghcr.io/stjudecloud/minfi:1.48.0-7"
container: "ghcr.io/stjudecloud/minfi:1.48.0-8"
memory: "8 GB"
cpu: 1
disks: "~{disk_size_gb} GB"
maxRetries: 1
}
}

task list_sex_probes {
meta {
description: "List probes that map to the sex chromosomes"
outputs: {
probe_list: "List of probe names that map to the sex chromosomes"
}
}

parameter_meta {}

input {}

command <<<
set -euo pipefail

Rscript /scripts/methylation/list-sex-probes.R
>>>

output {
File probe_list = "sex_probes.txt"
}

runtime {
container: "ghcr.io/stjudecloud/minfi:1.48.0-8"
memory: "8 GB"
cpu: 1
disks: "2 GB"
maxRetries: 1
}
}
Loading
Loading