Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
ae8d914
refactor inference tagging and stabilize test gates
mtcicero26 May 7, 2026
e265ac4
harden output helpers and share streaming filters
mtcicero26 May 7, 2026
299c571
cover daf call input source checks
mtcicero26 May 7, 2026
b02317f
cover daf raw call output parity
mtcicero26 May 7, 2026
c1feace
remove unused bam merge helpers
mtcicero26 May 7, 2026
bbb784c
cover call model resolution
mtcicero26 May 7, 2026
d7f30d9
centralize region bam cat cleanup
mtcicero26 May 8, 2026
9d09d01
centralize region bam merge fallback
mtcicero26 May 8, 2026
564b8aa
harden bam sort index wrappers
mtcicero26 May 8, 2026
383ddef
extract region bam concat fallback
mtcicero26 May 8, 2026
bd1f5b3
type region worker contracts
mtcicero26 May 8, 2026
d05b624
share region result aggregation
mtcicero26 May 8, 2026
3f45481
cover region worker cleanup failures
mtcicero26 May 8, 2026
53ca4bb
extract fused inference stage helpers
mtcicero26 May 8, 2026
68f8f15
trim fused recall copy overhead
mtcicero26 May 8, 2026
eddd407
speed up daf context encoding
mtcicero26 May 8, 2026
70c9225
benchmark daf encoder fast path
mtcicero26 May 8, 2026
696cb0b
Make ruff lint gate pass
mtcicero26 May 8, 2026
cd27575
Report per-read worker failures
mtcicero26 May 8, 2026
a44a63f
Speed up m6a context encoding
mtcicero26 May 8, 2026
1fab1de
Reduce Viterbi memory traffic
mtcicero26 May 8, 2026
3f2398b
Avoid HMM observation copies
mtcicero26 May 8, 2026
a650e3f
Split mode-specific read encoding
mtcicero26 May 8, 2026
14efedc
Freeze HMM logs for inference workers
mtcicero26 May 8, 2026
78aafe6
Reuse footprint extraction helper
mtcicero26 May 8, 2026
2e864fc
Speed up footprint run extraction
mtcicero26 May 8, 2026
f9d4ac5
Reuse apply tag writer in region workers
mtcicero26 May 8, 2026
6ffdc32
Avoid duplicate footprint state scan
mtcicero26 May 8, 2026
a2fb129
Close fused DAF reference handle
mtcicero26 May 8, 2026
0ec03ad
Close recall TF BAM handles on failures
mtcicero26 May 8, 2026
bd427c6
Extract region planning helpers
mtcicero26 May 8, 2026
911b168
Extract streaming drain helpers
mtcicero26 May 8, 2026
060b039
Close posterior writers on apply failures
mtcicero26 May 8, 2026
3b30365
Close fused reference handles on failures
mtcicero26 May 8, 2026
0d223f2
Extract region posterior TSV helpers
mtcicero26 May 8, 2026
10d85e1
Avoid ML tag list copies
mtcicero26 May 8, 2026
500fd4b
Close TSV posterior writer on export failures
mtcicero26 May 8, 2026
df75c91
Close TSV backend handles on failures
mtcicero26 May 8, 2026
a2cc102
Close HDF5 posterior writer after finalize failures
mtcicero26 May 8, 2026
0b6566b
Close extract worker beds on open failures
mtcicero26 May 8, 2026
66d5c81
Close stats BAM handles with context managers
mtcicero26 May 8, 2026
4fecb09
Close DAF encoder handles on validation failures
mtcicero26 May 8, 2026
9abc165
Close tagged BAM on BED open failures
mtcicero26 May 8, 2026
f054693
Batch DAF reference fallback fetches
mtcicero26 May 8, 2026
cdb5921
Close score database connections on failures
mtcicero26 May 8, 2026
d433a13
Use explicit DAF encoder progress guard
mtcicero26 May 14, 2026
b8aaf01
Extract streaming worker helpers
mtcicero26 May 14, 2026
8938d5e
Extract region worker helpers
mtcicero26 May 14, 2026
1e47ab0
Extract inference multiprocessing context
mtcicero26 May 14, 2026
10cf806
Extract streaming pipeline coordinators
mtcicero26 May 14, 2026
014b4c3
Extract region pipeline coordinators
mtcicero26 May 14, 2026
d24fd3f
Extract legacy apply pipeline
mtcicero26 May 14, 2026
a89a392
Reuse read filters in legacy pipeline
mtcicero26 May 14, 2026
8f544c1
Reuse read filters in region workers
mtcicero26 May 14, 2026
0ce12dd
Defer dispatcher model loading
mtcicero26 May 14, 2026
7e5f536
Reuse consensus TF reference maps
mtcicero26 May 14, 2026
c8172bd
Stream AQ parser bytes
mtcicero26 May 14, 2026
0c7868c
Pass through failed TF recall reads
mtcicero26 May 14, 2026
0004530
Keep TF recall payload tags compact
mtcicero26 May 14, 2026
3699616
Avoid extract tag array copies
mtcicero26 May 14, 2026
f06628d
Add workflow regression coverage
mtcicero26 May 14, 2026
2fb6f33
Expand workflow regression modes
mtcicero26 May 14, 2026
15bb17a
Broaden workflow input regressions
mtcicero26 May 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
523 changes: 523 additions & 0 deletions docs/REFACTOR_AUDIT.md

Large diffs are not rendered by default.

14 changes: 12 additions & 2 deletions fiberhmm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@

__version__ = "2.11.0"

from fiberhmm.core.hmm import FiberHMM
from fiberhmm.core.bam_reader import ContextEncoder, FiberRead, read_bam
from fiberhmm.core.model_io import load_model, save_model, load_model_with_metadata
from fiberhmm.core.hmm import FiberHMM
from fiberhmm.core.model_io import load_model, load_model_with_metadata, save_model

__all__ = [
"ContextEncoder",
"FiberHMM",
"FiberRead",
"load_model",
"load_model_with_metadata",
"read_bam",
"save_model",
]
41 changes: 21 additions & 20 deletions fiberhmm/cli/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,23 @@
Applies trained HMM to call chromatin footprints from fiber-seq BAM files.
"""

import argparse
import os
import sys
import glob
import shutil
import tempfile
import argparse
import numpy as np

import pandas as pd

from fiberhmm.core.model_io import load_model_with_metadata
from fiberhmm.inference.parallel import process_bam_for_footprints
from fiberhmm.inference.stats import FootprintStats, collect_stats_from_bam
from fiberhmm.cli.common import (
add_mode_args, add_filter_args, add_edge_trim_args,
add_parallel_args, add_stats_args, add_version_args,
add_edge_trim_args,
add_filter_args,
add_mode_args,
add_parallel_args,
add_stats_args,
add_version_args,
)
from fiberhmm.core.model_io import load_model_with_metadata
from fiberhmm.inference.parallel import process_bam_for_footprints
from fiberhmm.inference.stats import collect_stats_from_bam


def parse_args():
Expand Down Expand Up @@ -178,7 +179,7 @@ def main():
# Load model with metadata
print(f"Loading model from {model_path}")
model, model_context_size, model_mode = load_model_with_metadata(model_path)
print(f"Model loaded successfully")
print("Model loaded successfully")
print(f" Start probs: {model.startprob_}")
print(f" Transition matrix:\n{model.transmat_}")

Expand Down Expand Up @@ -206,9 +207,9 @@ def main():
# Show optimization status
from fiberhmm.core.hmm import HAS_NUMBA
if HAS_NUMBA:
print(f" Numba JIT: enabled (fast)")
print(" Numba JIT: enabled (fast)")
else:
print(f" Numba JIT: disabled (pip install numba for ~10x speedup)")
print(" Numba JIT: disabled (pip install numba for ~10x speedup)")

# Determine context size (command line overrides model)
if args.context_size is not None:
Expand Down Expand Up @@ -274,21 +275,21 @@ def main():
print(f" Min MAPQ: {args.min_mapq}")
print(f" Mod prob threshold: {args.prob_threshold}/255")
if args.circular:
print(f" Circular mode: enabled")
print(" Circular mode: enabled")
if with_scores:
print(f" Confidence scores: enabled")
print(" Confidence scores: enabled")
if args.scores_db:
print(f" Scores database: {db_path}")
if mode == 'daf':
print(f" Strand detection: automatic (C=+, G=-)")
print(" Strand detection: automatic (C=+, G=-)")
elif mode == 'nanopore-fiber':
print(f" Strand detection: none (A-centered only)")
print(" Strand detection: none (A-centered only)")
if args.no_msps:
print(f" MSP output: disabled (--no-msps)")
print(" MSP output: disabled (--no-msps)")
else:
print(f" MSP min size: {msp_min_size} bp")
if args.stats:
print(f" Stats: enabled")
print(" Stats: enabled")
print()

# Parse chromosomes
Expand Down Expand Up @@ -394,7 +395,7 @@ def main():
print("\nDone!", file=sys.stderr)
else:
print("\nDone!")
print(f"\nTo extract BED12/bigBed for browser visualization:")
print("\nTo extract BED12/bigBed for browser visualization:")
print(f" fiberhmm-extract-tags -i {output_bam}")


Expand Down
6 changes: 3 additions & 3 deletions fiberhmm/cli/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@
fiberhmm-call -i in.bam -o - --enzyme hia5 --seq pacbio | ft fire - -
"""
import argparse
import os
import sys

from fiberhmm.core.model_io import load_model_with_metadata
from fiberhmm.models import SUPPORTED_ENZYMES, get_model_path as _get_bundled_model
from fiberhmm.inference.parallel import (
_process_bam_streaming_pipeline_fused,
_process_bam_region_parallel_fused,
_process_bam_streaming_pipeline_fused,
)
from fiberhmm.inference.tf_recaller import ENZYME_PRESETS
from fiberhmm.models import SUPPORTED_ENZYMES
from fiberhmm.models import get_model_path as _get_bundled_model


def parse_args():
Expand Down
6 changes: 3 additions & 3 deletions fiberhmm/cli/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ def add_filter_args(parser: argparse.ArgumentParser,
"""Add read filtering arguments (--min-mapq, --prob-threshold, --min-read-length)."""
parser.add_argument(
'--min-mapq', '-q', type=int, default=min_mapq,
help=f"Minimum mapping quality; reads below this are written to output "
f"unchanged without footprint/nucleosome tags. Default 0 (call on "
f"all mapped reads). Pass a positive value to filter."
help="Minimum mapping quality; reads below this are written to output "
"unchanged without footprint/nucleosome tags. Default 0 (call on "
"all mapped reads). Pass a positive value to filter."
)
parser.add_argument(
'--prob-threshold', type=int, default=prob_threshold,
Expand Down
34 changes: 22 additions & 12 deletions fiberhmm/cli/consensus_tfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,18 @@
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks

from fiberhmm.io.ma_tags import parse_ma_tag, parse_aq_array

from fiberhmm.io.ma_tags import parse_aq_array, parse_ma_tag

# ── Query → reference coordinate conversion ──────────────────────────────────

def _ref_interval_from_map(ref_map, q_start: int, length: int) -> Tuple[int, int] | Tuple[None, None]:
"""Return half-open reference coordinates for a query interval."""
mapped = [p for p in ref_map[q_start: q_start + length] if p is not None]
if not mapped:
return None, None
return mapped[0], mapped[-1] + 1 # half-open [start, end)


def _ref_interval(read: pysam.AlignedSegment,
q_start: int, length: int) -> Tuple[int, int] | Tuple[None, None]:
"""Return (ref_start, ref_end) in half-open BED format for a query interval.
Expand All @@ -52,11 +59,7 @@ def _ref_interval(read: pysam.AlignedSegment,
and takes min/max of non-None entries. Returns (None, None) if the entire
footprint falls inside an insertion relative to the reference (rare).
"""
ref_map = read.get_reference_positions(full_length=True)
mapped = [p for p in ref_map[q_start: q_start + length] if p is not None]
if not mapped:
return None, None
return mapped[0], mapped[-1] + 1 # half-open [start, end)
return _ref_interval_from_map(read.get_reference_positions(full_length=True), q_start, length)


# ── TF call extraction ────────────────────────────────────────────────────────
Expand All @@ -73,7 +76,7 @@ def _iter_tf_calls(read: pysam.AlignedSegment,
except KeyError:
return
try:
aq = list(read.get_tag('AQ'))
aq = read.get_tag('AQ')
except KeyError:
aq = []

Expand All @@ -91,6 +94,7 @@ def _iter_tf_calls(read: pysam.AlignedSegment,
# ann_idx must increment for ALL annotations (nuc, msp, tf) to stay
# in sync with the flat per_ann list built by parse_aq_array.
ann_idx = 0
ref_map = None
for name, _strand_field, _qspec, intervals in parsed['raw_types']:
for (q_start, length) in intervals:
quals = per_ann[ann_idx] if ann_idx < len(per_ann) else []
Expand All @@ -100,7 +104,9 @@ def _iter_tf_calls(read: pysam.AlignedSegment,
tq = int(quals[0]) if len(quals) >= 1 else 0
if tq < min_tq:
continue
ref_start, ref_end = _ref_interval(read, q_start, length)
if ref_map is None:
ref_map = read.get_reference_positions(full_length=True)
ref_start, ref_end = _ref_interval_from_map(ref_map, q_start, length)
if ref_start is None:
continue
yield ref_start, ref_end, strand, tq
Expand Down Expand Up @@ -186,8 +192,10 @@ def _process_chrom(chrom: str, chrom_len: int,
continue
idx = int(np.searchsorted(peak_pos, center))
candidates: list[tuple[int, int]] = []
if idx > 0: candidates.append((int(peaks_arr[idx - 1]), idx - 1))
if idx < len(peaks_arr): candidates.append((int(peaks_arr[idx]), idx))
if idx > 0:
candidates.append((int(peaks_arr[idx - 1]), idx - 1))
if idx < len(peaks_arr):
candidates.append((int(peaks_arr[idx]), idx))
if not candidates:
continue
closest_val, closest_idx = min(candidates, key=lambda x: abs(x[0] - center))
Expand Down Expand Up @@ -269,7 +277,9 @@ def main() -> None:
bed_out = open(args.output, 'w')

try:
_banner = lambda msg: print(f"[fiberhmm-consensus-tfs] {msg}", file=sys.stderr)
def _banner(msg):
print(f"[fiberhmm-consensus-tfs] {msg}", file=sys.stderr)

_banner(f"input: {args.input}")
_banner(f"min-mapq: {args.min_mapq}")
_banner(f"min-tq: {args.min_tq}")
Expand Down
1 change: 0 additions & 1 deletion fiberhmm/cli/daf_encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
and encodes them as IUPAC Y/R with an st:Z tag for fiberhmm-apply --mode daf.
"""

import sys
import argparse

from fiberhmm.cli.common import add_version_args
Expand Down
36 changes: 21 additions & 15 deletions fiberhmm/cli/export_posteriors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,31 @@

import argparse
import os
import sys
import time
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional, Set, Tuple

import numpy as np
from typing import Dict, List, Optional, Tuple, Set
from concurrent.futures import ProcessPoolExecutor, as_completed
import pysam
from tqdm import tqdm

from fiberhmm.cli.common import (
add_edge_trim_args,
add_mode_args,
add_parallel_args,
add_verbose_args,
add_version_args,
)

# Package imports
from fiberhmm.core.bam_reader import (
encode_from_query_sequence, detect_daf_strand,
get_reference_positions, ContextEncoder
detect_daf_strand,
encode_from_query_sequence,
get_reference_positions,
)
from fiberhmm.core.model_io import load_model_with_metadata
from fiberhmm.core.hmm import FiberHMM
from fiberhmm.core.model_io import freeze_model_for_inference, load_model_with_metadata
from fiberhmm.inference.parallel import _get_genome_regions
from fiberhmm.cli.common import (
add_mode_args, add_parallel_args, add_edge_trim_args,
add_verbose_args, add_version_args,
)


def _detect_format(output_path: str, format_arg: str) -> str:
Expand Down Expand Up @@ -154,14 +159,15 @@ def _init_worker(model_path: str, params: dict):
os.environ['NUMBA_CACHE_DIR'] = ''

_worker_model, _, _ = load_model_with_metadata(model_path, normalize=True)
_worker_model = freeze_model_for_inference(_worker_model)
_worker_params = params

# Warmup numba JIT with a dummy sequence
dummy = np.zeros(100, dtype=np.int32)
try:
_worker_model.predict(dummy)
_worker_model.predict_proba(dummy)
except:
except Exception:
pass # OK if warmup fails


Expand Down Expand Up @@ -345,9 +351,10 @@ def on_results(chrom, results):
fp_sizes=fiber['footprint_sizes'],
)

_process_regions(regions, input_bam, model_path, params, n_cores, verbose, on_results)

total = writer.close()
try:
_process_regions(regions, input_bam, model_path, params, n_cores, verbose, on_results)
finally:
total = writer.close()

if verbose:
out_file = writer.output_path
Expand Down Expand Up @@ -603,7 +610,6 @@ def get_fibers_spanning(self, chrom: str, start: int, end: int) -> List['FiberPo
return self._load_fibers(chrom, indices)

def _load_fibers(self, chrom: str, indices: np.ndarray) -> List['FiberPosterior']:
import h5py
grp = self.h5[chrom]
ids = grp['fiber_ids']
starts = grp['fiber_starts']
Expand Down
Loading