Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ jobs:
- name: Run black formatter
run: poetry run black --check --exclude '__pycache__|scripts|sentieon_cli/archive.py' --line-length 79 sentieon_cli
- name: Run mypy
run: poetry run mypy sentieon_cli
run: poetry run mypy sentieon_cli --exclude scripts
- name: Run flake8
run: poetry run flake8 . --extend-ignore E231,E221, --exclude .github/scripts/license_message.py,sentieon_cli/scripts/gvcf_combine.py,sentieon_cli/scripts/vcf_mod.py,sentieon_cli/scripts/hybrid_anno.py,sentieon_cli/scripts/hybrid_select.py,sentieon_cli/archive.py,playground,tests # false+ from python 3.12
run: poetry run flake8 . --extend-ignore E231,E221, --exclude .github/scripts/license_message.py,sentieon_cli/scripts/gvcf_combine.py,sentieon_cli/scripts/vcf_mod.py,sentieon_cli/scripts/hybrid_anno.py,sentieon_cli/scripts/hybrid_select.py,sentieon_cli/archive.py,sentieon_cli/scripts/trimalt.py,playground,tests # false+ from python 3.12
- name: Run the automated tests
run: poetry run pytest -v
- name: Run doct tests
Expand Down
10 changes: 8 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,10 @@ jobs:
- name: Install bcftools
run: |
sudo apt-get update
sudo apt-get install -y bcftools
sudo apt-get install libcurl4-gnutls-dev
curl -L https://github.com/samtools/bcftools/releases/download/1.22/bcftools-1.22.tar.bz2 | tar -jxf -
cd bcftools-1.22
./configure && sudo make install
- name: Install samtools
run: |
curl -L https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 | tar -jxf -
Expand Down Expand Up @@ -236,7 +239,10 @@ jobs:
- name: Install bcftools
run: |
sudo apt-get update
sudo apt-get install -y bcftools
sudo apt-get install libcurl4-gnutls-dev
curl -L https://github.com/samtools/bcftools/releases/download/1.22/bcftools-1.22.tar.bz2 | tar -jxf -
cd bcftools-1.22
./configure && sudo make install
- name: Install samtools
run: |
curl -L https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 | tar -jxf -
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ A command-line interface for the Sentieon software

Download the latest tar.gz file from the GitHub release page, https://github.com/sentieon/sentieon-cli/releases/ and install the package with pip:
```sh
curl -LO https://github.com/Sentieon/sentieon-cli/releases/download/v1.4.0/sentieon_cli-1.4.0.tar.gz
pip install sentieon_cli-1.4.0.tar.gz
curl -LO https://github.com/Sentieon/sentieon-cli/releases/download/v1.5.0/sentieon_cli-1.5.0.tar.gz
pip install sentieon_cli-1.5.0.tar.gz
```

## Installation with Poetry
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

[project]
name = "sentieon_cli"
version = "1.4.0"
version = "1.5.0"
description = "entry point for sentieon command-line tools"
authors = [
{name = "Don Freed", email = "don.freed@sentieon.com"},
Expand Down
7 changes: 7 additions & 0 deletions sentieon_cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .dnascope_hybrid import DNAscopeHybridPipeline
from .dnascope_longread import DNAscopeLRPipeline
from .pangenome import PangenomePipeline
from .sentieon_pangenome import SentieonPangenome


def main():
Expand Down Expand Up @@ -50,6 +51,12 @@ def main():
pipeline.add_arguments(pangenome_subparser)
pangenome_subparser.set_defaults(pipeline=pipeline.main)

# Sentieon pangenome
pipeline = SentieonPangenome()
sentieon_pangenome_subparser = subparsers.add_parser("sentieon-pangenome")
pipeline.add_arguments(sentieon_pangenome_subparser)
sentieon_pangenome_subparser.set_defaults(pipeline=pipeline.main)

args = parser.parse_args()
args.pipeline(args)

Expand Down
188 changes: 188 additions & 0 deletions sentieon_cli/base_pangenome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
"""
A base class for pangenome pipelines
"""

import copy
import pathlib
import sys
from typing import List, Optional

from . import command_strings as cmds
from .job import Job
from .pipeline import BasePipeline
from .util import parse_rg_line, path_arg


class BasePangenome(BasePipeline):
    """Shared base class for the pangenome pipelines.

    Declares the command-line arguments common to the pangenome pipelines
    (graph, haplotype and model bundle files, the sample fastq files and
    their readgroups) and provides input validation helpers plus
    construction of the KMC k-mer counting job.
    """

    # Copy the inherited argument table so that the additions below do not
    # modify ``BasePipeline.params``.
    params = copy.deepcopy(BasePipeline.params)
    params.update(
        {
            # Required arguments
            "gbz": {
                "help": "The pangenome graph file in GBZ format.",
                "required": True,
                "type": path_arg(exists=True, is_file=True),
            },
            "hapl": {
                "help": "The haplotype file.",
                "required": True,
                "type": path_arg(exists=True, is_file=True),
            },
            "model_bundle": {
                "flags": ["-m", "--model_bundle"],
                "help": "The model bundle file.",
                "required": True,
                "type": path_arg(exists=True, is_file=True),
            },
            "r1_fastq": {
                "nargs": "*",
                "help": "Sample R1 fastq files.",
                "type": path_arg(exists=True, is_file=True),
            },
            "r2_fastq": {
                "nargs": "*",
                "help": "Sample R2 fastq files.",
                "type": path_arg(exists=True, is_file=True),
            },
            "readgroups": {
                "nargs": "*",
                "help": (
                    "Readgroup information for the fastq files. Only the ID "
                    "and SM attributes are used."
                ),
            },
            # Additional arguments
            "bam_format": {
                "help": (
                    "Use the BAM format instead of CRAM for output aligned "
                    "files."
                ),
                "action": "store_true",
            },
            "dbsnp": {
                "flags": ["-d", "--dbsnp"],
                "help": (
                    "dbSNP vcf file. Supplying this file will annotate "
                    "variants with their dbSNP refSNP ID numbers."
                ),
                "type": path_arg(exists=True, is_file=True),
            },
            "kmer_memory": {
                "help": "Memory limit for KMC in GB.",
                "default": 30,
                "type": int,
            },
        }
    )

    positionals = BasePipeline.positionals

    def __init__(self) -> None:
        """Initialise every pipeline attribute to its unset default."""
        super().__init__()
        self.gbz: Optional[pathlib.Path] = None
        self.hapl: Optional[pathlib.Path] = None
        self.model_bundle: Optional[pathlib.Path] = None
        self.r1_fastq: List[pathlib.Path] = []
        self.r2_fastq: List[pathlib.Path] = []
        self.readgroups: List[str] = []
        self.bam_format = False
        self.dbsnp: Optional[pathlib.Path] = None
        self.kmer_memory = 30
        # The T1K inputs have no entry in ``params`` above; they are only
        # given defaults here so that ``validate_t1k`` can read them.
        self.t1k_hla_seq: Optional[pathlib.Path] = None
        self.t1k_hla_coord: Optional[pathlib.Path] = None
        self.t1k_kir_seq: Optional[pathlib.Path] = None
        self.t1k_kir_coord: Optional[pathlib.Path] = None

    def validate_fastq_rg(self, r1_required=False) -> None:
        """Validate the fastq and readgroup arguments.

        Checks that there is one readgroup per R1 fastq file, that every
        readgroup carries a non-empty SM tag and an ID tag, and that all
        readgroups name the same sample. On the first failure an error is
        logged and the process exits with status 2.

        Args:
            r1_required: When true, also fail if no R1 fastq was supplied.
        """
        # argparse leaves an omitted ``nargs="*"`` option as None; treat
        # that the same as an empty list rather than failing in len().
        r1_fastq = self.r1_fastq or []
        readgroups = self.readgroups or []

        if r1_required and not r1_fastq:
            self.logger.error("Please supply --r1_fastq arguments")
            sys.exit(2)

        if len(r1_fastq) != len(readgroups):
            self.logger.error(
                "The number of readgroups does not equal the number of fastq "
                "files"
            )
            sys.exit(2)

        # Validate readgroups
        rg_sample = None
        for rg in readgroups:
            # Readgroups typed on the command line carry a literal
            # backslash-t; convert it to a real tab before parsing.
            rg_dict = parse_rg_line(rg.replace(r"\t", "\t"))
            rg_sm = rg_dict.get("SM")
            if not rg_sm:
                self.logger.error(
                    "Found a readgroup without a SM tag: %s",
                    str(rg),
                )
                sys.exit(2)
            if rg_sample and rg_sample != rg_sm:
                self.logger.error(
                    "Inconsistent readgroup sample information found in: %s",
                    str(rg),
                )
                sys.exit(2)
            rg_sample = rg_sm
            if "ID" not in rg_dict:
                self.logger.error(
                    "Found a readgroup without an ID tag: %s",
                    str(rg),
                )
                sys.exit(2)

    def validate_t1k(self) -> None:
        """Check that the T1K seq and coord files are supplied in pairs.

        For both HLA and KIR, either both files or neither must be set.
        Otherwise an error is logged and the process exits with status 2.
        """
        file_pairs = (
            ("HLA", self.t1k_hla_seq, self.t1k_hla_coord),
            ("KIR", self.t1k_kir_seq, self.t1k_kir_coord),
        )
        for gene, seq, coord in file_pairs:
            # Exactly one of the two being set is the error case.
            if bool(seq) != bool(coord):
                self.logger.error(
                    "For %s calling, both the seq and coord fasta files "
                    "need to be supplied. Exiting",
                    gene,
                )
                sys.exit(2)

    def build_kmc_job(
        self, kmer_prefix: pathlib.Path, job_threads: int
    ) -> Job:
        """Build the KMC k-mer counting job.

        Unless this is a dry run, the paths of all R1 fastq files followed
        by all R2 fastq files are written, one per line, to
        ``<kmer_prefix>.paths``, which is handed to KMC as its input list.

        Args:
            kmer_prefix: Output prefix passed to the KMC command; also the
                stem of the generated ``.paths`` file.
            job_threads: Thread count recorded on the returned ``Job``. The
                KMC command itself is given ``self.cores`` threads.

        Returns:
            The ``Job`` wrapping the KMC command.
        """
        # Create file list for KMC
        file_list = pathlib.Path(str(kmer_prefix) + ".paths")

        # R1 files first, then R2 files if present. Either may be None
        # when the option was not supplied.
        all_fastqs = [*(self.r1_fastq or []), *(self.r2_fastq or [])]

        # Write file list
        if not self.dry_run:
            with open(file_list, "w") as f:
                f.writelines(f"{fq}\n" for fq in all_fastqs)

        # Create KMC job
        kmc_cmd = cmds.cmd_kmc(
            kmer_prefix,
            file_list,
            self.tmp_dir,
            memory=self.kmer_memory,
            threads=self.cores,
        )
        return Job(kmc_cmd, "kmc", job_threads)
Loading