Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
14741da
chore: ignore repro/ and uv.lock
d-laub May 28, 2026
cda4906
docs(spec): prefetching dataloader design
d-laub May 29, 2026
f19c47b
docs(plan): prefetching dataloader implementation plan
d-laub May 29, 2026
43c12a3
style: ruff formatting
d-laub May 29, 2026
167beea
feat(haps): add _allele_bytes_sum for exact variant footprint
d-laub May 29, 2026
c6643e0
feat(dataset): _output_bytes_per_instance reference + haplotypes
d-laub May 29, 2026
41c6603
feat(dataset): _output_bytes_per_instance annotated branch
d-laub May 29, 2026
c8c41b6
feat(dataset): _output_bytes_per_instance variants branch with var_fi…
d-laub May 29, 2026
4e9a032
feat(dataset): _output_bytes_per_instance tracks branch
d-laub May 29, 2026
e63a082
feat(chunked): ChunkPlanner and slice_chunk
d-laub May 29, 2026
6f55955
test(chunked): slice_chunk parity with direct indexing
d-laub May 29, 2026
a296ba7
feat: mode='buffered' dataloader
d-laub May 29, 2026
9440756
feat(shm): hand-rolled slot header + dense round-trip
d-laub May 29, 2026
e7ff54f
feat(shm): Ragged and RaggedVariants serialization
d-laub May 29, 2026
637f734
feat(producer): subprocess entrypoint for double_buffered mode
d-laub May 29, 2026
5161d93
feat: mode='double_buffered' dataloader happy path
d-laub May 29, 2026
eecb8a6
test(double_buffered): producer crash + shm cleanup
d-laub May 29, 2026
7597a10
docs(skill): document mode/buffer_bytes/copy/heartbeat_seconds on to_…
d-laub May 29, 2026
00eb4dd
fix(double_buffered): replay all Dataset settings in producer subprocess
d-laub May 29, 2026
9b541fb
refactor: trim dead code and over-commenting per CLAUDE.md
d-laub May 29, 2026
e7e2f5e
style: ruff formatting
d-laub May 29, 2026
0f441c5
style: remove unused variables and imports
d-laub May 29, 2026
09a2f9c
feat(bench): scaffold dataloader bench axis constants
d-laub May 29, 2026
0cadabb
feat(bench): enumerate deduped dataloader bench cells
d-laub May 29, 2026
09196fe
feat(bench): add BED resize + per-region-length dataset prep
d-laub May 29, 2026
13b8192
feat(bench): add exact output-bytes table helper
d-laub May 29, 2026
d179f2b
fix(bench): open datasets with hg38 reference; revert out-of-scope _o…
d-laub May 29, 2026
be3aa68
feat(bench): add per-cell measurement protocol
d-laub May 29, 2026
47a173f
feat(bench): add CSV header/append helpers
d-laub May 29, 2026
4010dc6
feat(bench): add bench.py thread-pinned orchestration
d-laub May 29, 2026
538de1f
feat(bench): add 3x4 small-multiples results plot
d-laub May 29, 2026
577a4bd
fix(double_buffered): size shm slots for serialized ragged footprint
d-laub May 29, 2026
4d4365e
fix(double_buffered): serialize RaggedAnnotatedHaps (annotated output)
d-laub May 29, 2026
b9ecd5e
chore(bench): cap buffer_bytes at 512 MiB for workstation RAM
d-laub May 29, 2026
93c79c0
fix(double_buffered): release producer+shm per loader, not at process…
d-laub May 29, 2026
35158f5
perf(bench): open hg38 reference as memmap (in_memory=False)
d-laub May 29, 2026
d8b1708
chore(bench): add 195-cell results + plot; fix baseline series in plot
d-laub May 29, 2026
473b9b9
feat(bench): add MiB/s bandwidth plot; trim 1KG regression test data
d-laub May 29, 2026
596edea
style: ruff format
d-laub May 29, 2026
8b637ab
feat(bench): log CPU/system/microarch info for all benchmarks
d-laub May 29, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ data/
.claude/worktrees/*
.worktrees/
scratch/
repro/
experiments/dataloader/tmp/
uv.lock

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
4 changes: 2 additions & 2 deletions docs/source/basenji2_eval.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,16 @@
"import genvarloader as gvl\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import polars.selectors as cs\n",
"import polars as pl\n",
"import polars.selectors as cs\n",
"import scipy.stats as st\n",
"import seaborn as sns\n",
"import seqpro as sp\n",
"import torch\n",
"from basenji2_pytorch import Basenji2, basenji2_params, basenji2_weights\n",
"from einops import rearrange\n",
"from genoray import PGEN\n",
"from genoray.exprs import is_snp, is_biallelic\n",
"from genoray.exprs import is_biallelic, is_snp\n",
"from tqdm.auto import tqdm"
]
},
Expand Down
4 changes: 3 additions & 1 deletion docs/source/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ import genvarloader as gvl

pos_strand = gvl.BigWigs.from_table("pos", "pos_strand.tsv")
neg_strand = gvl.BigWigs.from_table("neg", "neg_strand.tsv")
gvl.write("path/to/dataset.gvl", bed="path/to/regions.bed", tracks=[pos_strand, neg_strand])
gvl.write(
"path/to/dataset.gvl", bed="path/to/regions.bed", tracks=[pos_strand, neg_strand]
)
```

## How does GVL handle negative stranded regions provided to [`gvl.write()`](api.md#genvarloader.write)?
Expand Down
4 changes: 2 additions & 2 deletions docs/source/geuvadis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
"import numba as nb\n",
"import numpy as np\n",
"import polars as pl\n",
"import seqpro as sp\n",
"import pooch\n",
"from loguru import logger\n",
"import seqpro as sp\n",
"from einops import rearrange\n",
"from loguru import logger\n",
"from tqdm.auto import tqdm"
]
},
Expand Down
6 changes: 5 additions & 1 deletion docs/source/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ import genvarloader as gvl
dataset = gvl.Dataset.open(path="cool_dataset.gvl", reference="hg38.fa")
train_samples = ["David", "Aaron"]
train_dataset = dataset.subset_to(regions="train_regions.bed", samples=train_samples)
train_dataloader = train_dataset.to_dataloader(batch_size=32, shuffle=True, num_workers=1)
train_dataloader = train_dataset.to_dataloader(
batch_size=32, shuffle=True, num_workers=1
)

# use it in your training loop
for haplotypes, tracks in train_dataloader:
Expand All @@ -107,11 +109,13 @@ dataset[:10, :5] # first 10 regions and first 5 samples
import seqpro as sp
from einops import rearrange


def transform(haplotypes, tracks):
ohe = sp.DNA.ohe(haplotypes)
ohe = rearrange(ohe, "... length alphabet -> ... alphabet length")
return ohe, tracks


transformed_dataset = dataset.with_settings(transform=transform)
```

Expand Down
7 changes: 4 additions & 3 deletions docs/source/splicing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"import genvarloader as gvl\n",
"import polars as pl\n",
"import pooch\n",
"from pathlib import Path\n",
"from tempfile import TemporaryDirectory"
"import pooch"
]
},
{
Expand Down
18 changes: 9 additions & 9 deletions docs/source/write.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ Once your data is prepared, you can use [`gvl.write()`](api.md#genvarloader.writ
import genvarloader as gvl

gvl.write(
path='1000_genomes_haplotypes.gvl',
bed='tiling_windows.bed',
variants='all_chroms.bcf',
path="1000_genomes_haplotypes.gvl",
bed="tiling_windows.bed",
variants="all_chroms.bcf",
# OR variants='all_chroms.pgen',
)
```
Expand All @@ -69,13 +69,13 @@ This dataset would have haplotypes available for all samples in `all_chroms.bcf`

```python
gvl.write(
path='1000_genomes_lncRNA.gvl',
bed='lncRNA.bed', # can be varying length regions
variants='all_chroms.bcf',
path="1000_genomes_lncRNA.gvl",
bed="lncRNA.bed", # can be varying length regions
variants="all_chroms.bcf",
tracks=[
gvl.BigWigs.from_table('pos', 'pos_strands.tsv'),
gvl.BigWigs.from_table('neg', 'pos_strands.tsv'),
]
gvl.BigWigs.from_table("pos", "pos_strands.tsv"),
gvl.BigWigs.from_table("neg", "pos_strands.tsv"),
],
)
```

Expand Down
52 changes: 30 additions & 22 deletions docs/superpowers/plans/2026-05-08-get-splice-bed.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ Per-row notes (do not put in fixture, just for plan readers):
Helper to write the fixture:

```python
GTF_TEXT = "\t".join # placeholder marker; the real fixture uses the literal string above
GTF_TEXT = (
"\t".join
) # placeholder marker; the real fixture uses the literal string above
```

In the test file, write the string verbatim. Use tabs (not spaces) between fields.
Expand All @@ -76,14 +78,14 @@ import genvarloader as gvl


GTF_TEXT = (
"1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\"; gene_name \"GENEA\"; transcript_id \"T1\"; exon_number \"1\"; transcript_support_level \"1\";\n"
"1\ttest\tCDS\t300\t308\t.\t+\t0\tgene_id \"G1\"; gene_name \"GENEA\"; transcript_id \"T1\"; exon_number \"2\"; transcript_support_level \"1\";\n"
"1\ttest\tCDS\t100\t108\t.\t+\t0\tgene_id \"G1\"; gene_name \"GENEA\"; transcript_id \"T1\"; exon_number \"1\"; transcript_support_level \"1\";\n"
"2\ttest\tCDS\t500\t506\t.\t-\t0\tgene_id \"G2\"; gene_name \"GENEB\"; transcript_id \"T2\"; exon_number \"1\"; transcript_support_level \"1\";\n"
"2\ttest\tCDS\t600\t606\t.\t-\t0\tgene_id \"G2\"; gene_name \"GENEB\"; transcript_id \"T2\"; exon_number \"2\"; transcript_support_level \"1\";\n"
"3\ttest\tCDS\t700\t705\t.\t+\t0\tgene_id \"G3\"; gene_name \"GENEC\"; transcript_id \"T3\"; exon_number \"1\"; transcript_support_level \"2\";\n"
"4\ttest\tCDS\t800\t804\t.\t+\t0\tgene_id \"G4\"; transcript_id \"T4\"; exon_number \"1\"; transcript_support_level \"1\";\n"
"1\ttest\tfive_prime_utr\t50\t99\t.\t+\t.\tgene_id \"G1\"; gene_name \"GENEA\"; transcript_id \"T1\";\n"
'1\ttest\texon\t100\t200\t.\t+\t.\tgene_id "G1"; gene_name "GENEA"; transcript_id "T1"; exon_number "1"; transcript_support_level "1";\n'
'1\ttest\tCDS\t300\t308\t.\t+\t0\tgene_id "G1"; gene_name "GENEA"; transcript_id "T1"; exon_number "2"; transcript_support_level "1";\n'
'1\ttest\tCDS\t100\t108\t.\t+\t0\tgene_id "G1"; gene_name "GENEA"; transcript_id "T1"; exon_number "1"; transcript_support_level "1";\n'
'2\ttest\tCDS\t500\t506\t.\t-\t0\tgene_id "G2"; gene_name "GENEB"; transcript_id "T2"; exon_number "1"; transcript_support_level "1";\n'
'2\ttest\tCDS\t600\t606\t.\t-\t0\tgene_id "G2"; gene_name "GENEB"; transcript_id "T2"; exon_number "2"; transcript_support_level "1";\n'
'3\ttest\tCDS\t700\t705\t.\t+\t0\tgene_id "G3"; gene_name "GENEC"; transcript_id "T3"; exon_number "1"; transcript_support_level "2";\n'
'4\ttest\tCDS\t800\t804\t.\t+\t0\tgene_id "G4"; transcript_id "T4"; exon_number "1"; transcript_support_level "1";\n'
'1\ttest\tfive_prime_utr\t50\t99\t.\t+\t.\tgene_id "G1"; gene_name "GENEA"; transcript_id "T1";\n'
)


Expand Down Expand Up @@ -127,14 +129,18 @@ def test_chrom_end_unchanged(gtf_path: Path):

def test_dropped_non_cds_rows(gtf_path: Path):
"""exon and five_prime_utr rows are removed."""
bed = gvl.get_splice_bed(gtf_path, transcript_support_level=None, require_multiple_of_3=False)
bed = gvl.get_splice_bed(
gtf_path, transcript_support_level=None, require_multiple_of_3=False
)
# Every surviving row corresponds to a CDS feature; we have 6 CDS rows in fixture.
assert bed.height == 6


def test_sorted_output(gtf_path: Path):
"""Output is sorted by chrom (natural), then chromStart."""
bed = gvl.get_splice_bed(gtf_path, transcript_support_level=None, require_multiple_of_3=False)
bed = gvl.get_splice_bed(
gtf_path, transcript_support_level=None, require_multiple_of_3=False
)
chroms = bed["chrom"].to_list()
starts = bed["chromStart"].to_list()
assert chroms == sorted(chroms, key=lambda c: (len(c), c)) # natural order
Expand Down Expand Up @@ -166,7 +172,10 @@ def test_tsl_explicit_value(gtf_path: Path):
def test_contigs_filter(gtf_path: Path):
"""contigs=['1'] restricts to chr 1 rows."""
bed = gvl.get_splice_bed(
gtf_path, contigs=["1"], transcript_support_level=None, require_multiple_of_3=False
gtf_path,
contigs=["1"],
transcript_support_level=None,
require_multiple_of_3=False,
)
assert bed["chrom"].unique().to_list() == ["1"]

Expand Down Expand Up @@ -250,9 +259,11 @@ def get_splice_bed(
if contigs is not None:
lf = lf.filter(pl.col("seqname").is_in(contigs))

lf = lf.filter(pl.col("feature") == "CDS").rename(
{"seqname": "chrom", "start": "chromStart", "end": "chromEnd"}
)
lf = lf.filter(pl.col("feature") == "CDS").rename({
"seqname": "chrom",
"start": "chromStart",
"end": "chromEnd",
})

lf = lf.with_columns(
pl.col("chromStart") - 1,
Expand All @@ -272,7 +283,9 @@ def get_splice_bed(
drop_cols.append("transcript_len")

if transcript_support_level is not None:
lf = lf.filter(sp.gtf.attr("transcript_support_level") == transcript_support_level)
lf = lf.filter(
sp.gtf.attr("transcript_support_level") == transcript_support_level
)

df = lf.drop(drop_cols).collect()
return sp.bed.sort(df)
Expand Down Expand Up @@ -326,12 +339,7 @@ from ._dataset._write import get_splice_bed, write
And add `"get_splice_bed",` to the `__all__` list (e.g. immediately after `"write"`):

```python
__all__ = [
"write",
"get_splice_bed",
"Dataset",
...
]
__all__ = ["write", "get_splice_bed", "Dataset", ...]
```

- [ ] **Step 2: Run the test suite — expect all green**
Expand Down
Loading
Loading