Sentieon · DonFreed · Dec 16, 2025 · Dec 7, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -45,9 +45,9 @@ jobs:
       - name: Run black formatter
         run: poetry run black --check --exclude '__pycache__|scripts|sentieon_cli/archive.py' --line-length 79 sentieon_cli
       - name: Run mypy
-        run: poetry run mypy sentieon_cli
+        run: poetry run mypy sentieon_cli --exclude scripts  # --exclude is a regex matched against paths
       - name: Run flake8
-        run: poetry run flake8 . --extend-ignore E231,E221, --exclude .github/scripts/license_message.py,sentieon_cli/scripts/gvcf_combine.py,sentieon_cli/scripts/vcf_mod.py,sentieon_cli/scripts/hybrid_anno.py,sentieon_cli/scripts/hybrid_select.py,sentieon_cli/archive.py,playground,tests # false+ from python 3.12
+        run: poetry run flake8 . --extend-ignore E231,E221, --exclude .github/scripts/license_message.py,sentieon_cli/scripts/gvcf_combine.py,sentieon_cli/scripts/vcf_mod.py,sentieon_cli/scripts/hybrid_anno.py,sentieon_cli/scripts/hybrid_select.py,sentieon_cli/archive.py,sentieon_cli/scripts/trimalt.py,playground,tests # false+ from python 3.12
       - name: Run the automated tests
         run: poetry run pytest -v
       - name: Run doct tests

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -117,7 +117,10 @@ jobs:
       - name: Install bcftools
         run: |
           sudo apt-get update
-          sudo apt-get install -y bcftools
+          sudo apt-get install -y libcurl4-gnutls-dev  # -y: never wait for a confirmation prompt in CI
+          curl -L https://github.com/samtools/bcftools/releases/download/1.22/bcftools-1.22.tar.bz2 | tar -jxf -
+          cd bcftools-1.22
+          ./configure && sudo make install
       - name: Install samtools
         run: |
           curl -L https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 | tar -jxf -
@@ -236,7 +239,10 @@ jobs:
       - name: Install bcftools
         run: |
           sudo apt-get update
-          sudo apt-get install -y bcftools
+          sudo apt-get install -y libcurl4-gnutls-dev  # -y: never wait for a confirmation prompt in CI
+          curl -L https://github.com/samtools/bcftools/releases/download/1.22/bcftools-1.22.tar.bz2 | tar -jxf -
+          cd bcftools-1.22
+          ./configure && sudo make install
       - name: Install samtools
         run: |
           curl -L https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 | tar -jxf -

diff --git a/README.md b/README.md
@@ -8,8 +8,8 @@ A command-line interface for the Sentieon software
 
 Download the latest tar.gz file from the GitHub release page, https://github.com/sentieon/sentieon-cli/releases/ and install the package with pip:
 ```sh
-curl -LO https://github.com/Sentieon/sentieon-cli/releases/download/v1.4.0/sentieon_cli-1.4.0.tar.gz
-pip install sentieon_cli-1.4.0.tar.gz
+curl -LO https://github.com/Sentieon/sentieon-cli/releases/download/v1.5.0/sentieon_cli-1.5.0.tar.gz
+pip install sentieon_cli-1.5.0.tar.gz
 ```
 
 ## Installation with Poetry

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 
 [project]
 name = "sentieon_cli"
-version = "1.4.0"
+version = "1.5.0"
 description = "entry point for sentieon command-line tools"
 authors = [
     {name = "Don Freed", email = "don.freed@sentieon.com"},

diff --git a/sentieon_cli/__init__.py b/sentieon_cli/__init__.py
@@ -3,6 +3,7 @@
 from .dnascope_hybrid import DNAscopeHybridPipeline
 from .dnascope_longread import DNAscopeLRPipeline
 from .pangenome import PangenomePipeline
+from .sentieon_pangenome import SentieonPangenome
 
 
 def main():
@@ -50,6 +51,12 @@ def main():
     pipeline.add_arguments(pangenome_subparser)
     pangenome_subparser.set_defaults(pipeline=pipeline.main)
 
+    # Sentieon pangenome
+    pipeline = SentieonPangenome()
+    sentieon_pangenome_subparser = subparsers.add_parser("sentieon-pangenome")
+    pipeline.add_arguments(sentieon_pangenome_subparser)
+    sentieon_pangenome_subparser.set_defaults(pipeline=pipeline.main)
+
     args = parser.parse_args()
     args.pipeline(args)
 

diff --git a/sentieon_cli/base_pangenome.py b/sentieon_cli/base_pangenome.py
@@ -0,0 +1,214 @@
+"""
+A base class for pangenome pipelines
+"""
+
+import copy
+import pathlib
+import sys
+from typing import List, Optional
+
+from . import command_strings as cmds
+from .job import Job
+from .pipeline import BasePipeline
+from .util import parse_rg_line, path_arg
+
+
+class BasePangenome(BasePipeline):
+    """Common base class for the pangenome pipelines.
+
+    Extends the ``BasePipeline`` argument table with the graph, haplotype,
+    model bundle, fastq and readgroup inputs, and provides helpers to
+    validate those inputs and to build the KMC k-mer counting job.
+    """
+
+    # Deep copy so the additions below do not leak into BasePipeline.params
+    params = copy.deepcopy(BasePipeline.params)
+    params.update(
+        {
+            # Required arguments
+            "gbz": {
+                "help": "The pangenome graph file in GBZ format.",
+                "required": True,
+                "type": path_arg(exists=True, is_file=True),
+            },
+            "hapl": {
+                "help": "The haplotype file.",
+                "required": True,
+                "type": path_arg(exists=True, is_file=True),
+            },
+            "model_bundle": {
+                "flags": ["-m", "--model_bundle"],
+                "help": "The model bundle file.",
+                "required": True,
+                "type": path_arg(exists=True, is_file=True),
+            },
+            "r1_fastq": {
+                "nargs": "*",
+                "help": "Sample R1 fastq files.",
+                "type": path_arg(exists=True, is_file=True),
+            },
+            "r2_fastq": {
+                "nargs": "*",
+                "help": "Sample R2 fastq files.",
+                "type": path_arg(exists=True, is_file=True),
+            },
+            "readgroups": {
+                "nargs": "*",
+                "help": (
+                    "Readgroup information for the fastq files. Only the ID "
+                    "and SM attributes are used."
+                ),
+            },
+            # Additional arguments
+            "bam_format": {
+                "help": (
+                    "Use the BAM format instead of CRAM for output aligned "
+                    "files."
+                ),
+                "action": "store_true",
+            },
+            "dbsnp": {
+                "flags": ["-d", "--dbsnp"],
+                "help": (
+                    "dbSNP vcf file. Supplying this file will annotate "
+                    "variants with their dbSNP refSNP ID numbers."
+                ),
+                "type": path_arg(exists=True, is_file=True),
+            },
+            "kmer_memory": {
+                "help": "Memory limit for KMC in GB.",
+                "default": 30,
+                "type": int,
+            },
+        }
+    )
+
+    # NOTE(review): unlike params, this is the base class's own object, not
+    # a copy -- confirm that nothing mutates it in place.
+    positionals = BasePipeline.positionals
+
+    def __init__(self) -> None:
+        """Set each argument attribute to its unset default."""
+        super().__init__()
+        self.gbz: Optional[pathlib.Path] = None
+        self.hapl: Optional[pathlib.Path] = None
+        self.model_bundle: Optional[pathlib.Path] = None
+        self.r1_fastq: List[pathlib.Path] = []
+        self.r2_fastq: List[pathlib.Path] = []
+        self.readgroups: List[str] = []
+        self.bam_format = False
+        self.dbsnp: Optional[pathlib.Path] = None
+        self.kmer_memory = 30
+        # The T1K inputs have no entry in `params` above; within this class
+        # they are only read by validate_t1k().
+        self.t1k_hla_seq: Optional[pathlib.Path] = None
+        self.t1k_hla_coord: Optional[pathlib.Path] = None
+        self.t1k_kir_seq: Optional[pathlib.Path] = None
+        self.t1k_kir_coord: Optional[pathlib.Path] = None
+
+    def validate_fastq_rg(self, r1_required: bool = False) -> None:
+        """Check the fastq and readgroup arguments, exiting on any problem.
+
+        Logs an error and exits with status 2 when R1 files are required
+        but missing, when the number of readgroups differs from the number
+        of R1 files, when a readgroup has no (or an empty) SM tag or no ID
+        tag, or when the readgroups do not all carry the same SM value.
+        The number of R2 files is not checked.
+
+        Args:
+            r1_required: When true, at least one R1 fastq must be supplied.
+        """
+        if not self.r1_fastq and r1_required:
+            self.logger.error("Please supply --r1_fastq arguments")
+            sys.exit(2)
+
+        if len(self.r1_fastq) != len(self.readgroups):
+            self.logger.error(
+                "The number of readgroups does not equal the number of fastq "
+                "files"
+            )
+            sys.exit(2)
+
+        # All readgroups must name a single sample and carry an ID
+        rg_sample = None
+        for rg in self.readgroups:
+            # Turn a literal backslash-t into a real tab before parsing
+            rg_dict = parse_rg_line(rg.replace(r"\t", "\t"))
+            rg_sm = rg_dict.get("SM")
+            if not rg_sm:
+                self.logger.error(
+                    "Found a readgroup without a SM tag: %s",
+                    str(rg),
+                )
+                sys.exit(2)
+            if rg_sample and rg_sample != rg_sm:
+                self.logger.error(
+                    "Inconsistent readgroup sample information found in: %s",
+                    str(rg),
+                )
+                sys.exit(2)
+            rg_sample = rg_sm
+            if "ID" not in rg_dict:
+                self.logger.error(
+                    "Found a readgroup without an ID tag: %s",
+                    str(rg),
+                )
+                sys.exit(2)
+
+    def validate_t1k(self) -> None:
+        """Require the T1K seq and coord files to be supplied in pairs.
+
+        Logs an error and exits with status 2 when only one of the two HLA
+        files, or only one of the two KIR files, is set.
+        """
+        if bool(self.t1k_hla_seq) != bool(self.t1k_hla_coord):
+            self.logger.error(
+                "For HLA calling, both the seq and coord fasta files need to "
+                "be supplied. Exiting"
+            )
+            sys.exit(2)
+
+        if bool(self.t1k_kir_seq) != bool(self.t1k_kir_coord):
+            self.logger.error(
+                "For KIR calling, both the seq and coord fasta files need to "
+                "be supplied. Exiting"
+            )
+            sys.exit(2)
+
+    def build_kmc_job(
+        self, kmer_prefix: pathlib.Path, job_threads: int
+    ) -> Job:
+        """Write the KMC input file list and build the k-mer counting job.
+
+        Unless ``self.dry_run`` is set, the R1 and then the R2 fastq paths
+        are written, one per line, to ``<kmer_prefix>.paths``.
+
+        Args:
+            kmer_prefix: Passed to ``cmd_kmc``; ``.paths`` is appended to
+                it to name the file list.
+            job_threads: Passed to ``Job`` as its third argument.
+
+        Returns:
+            The ``Job`` wrapping the ``cmd_kmc`` command.
+        """
+        file_list = pathlib.Path(str(kmer_prefix) + ".paths")
+        # R1 files first, then R2 (r2_fastq may be empty)
+        all_fastqs = [*self.r1_fastq, *self.r2_fastq]
+
+        if not self.dry_run:
+            with open(file_list, "w") as f:
+                f.writelines(f"{fq}\n" for fq in all_fastqs)
+
+        # NOTE(review): KMC is given self.cores threads while the Job is
+        # registered with job_threads -- confirm the two are meant to differ.
+        return Job(
+            cmds.cmd_kmc(
+                kmer_prefix,
+                file_list,
+                self.tmp_dir,
+                memory=self.kmer_memory,
+                threads=self.cores,
+            ),
+            "kmc",
+            job_threads,
+        )