Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ website/core.db
build/
__pycache__/
*.sqlite3
*.sqlite
env
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ python sample_registry/app.py

How you want to deploy this will depend on your needs, facilities, and ability. We have it deployed on a Kubernetes cluster but you could also 1) just run it in development mode from a lab computer or 2) set up Nginx/Apache on a dedicated server or 3) run it serverlessly in the cloud (e.g. with [Zappa](https://github.com/zappa/Zappa) on AWS) or 4) do something else. There are lots of well-documented examples of deploying Flask sites out there; look around and find what works best for you.

When running, it will default to using a SQLite3 database located in the root of this repository (automatically created if it doesn't already exist). You can change to use a different backend by setting the `SAMPLE_REGISTRY_DB_URI` environment variable before running the app. For example, another sqlite database could be specified with a URI like this: `export SAMPLE_REGISTRY_DB_URI=sqlite:////path/to/db.sqlite3`.
When running, it will default to using a SQLite3 database located in the root of this repository (automatically created if it doesn't already exist). You can change to use a different backend by setting the `SAMPLE_REGISTRY_DB_URI` environment variable before running the app. For example, another sqlite database could be specified with a URI like this: `export SAMPLE_REGISTRY_DB_URI=sqlite:////path/to/db.sqlite`.

## Using the library

Expand Down
11 changes: 8 additions & 3 deletions sample_registry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
__version__ = "1.3.0"


# Root directory of the sequencing data archive. Taken from the
# SAMPLE_REGISTRY_ARCHIVE_ROOT environment variable when it is set, otherwise
# falls back to "/mnt/isilon/microbiome/".
ARCHIVE_ROOT = Path(
os.environ.get("SAMPLE_REGISTRY_ARCHIVE_ROOT", "/mnt/isilon/microbiome/")
)
# Doesn't include "NA" because that's what we fill in for missing values
NULL_VALUES: list[Optional[str]] = [
None,
Expand All @@ -34,16 +38,17 @@ def sample_registry_version():
"Missing database connection information in environment, using test SQLite database\n"
)
SQLALCHEMY_DATABASE_URI = (
f"sqlite:///{Path(__file__).parent.parent.resolve()}/sample_registry.sqlite3"
f"sqlite:///{Path(__file__).parent.parent.resolve()}/sample_registry.sqlite"
)


if "PYTEST_VERSION" in os.environ:
# Set SQLALCHEMY_DATABASE_URI to an in-memory SQLite database for testing
SQLALCHEMY_DATABASE_URI = "sqlite:///:memory:"

# Create database engine
engine = create_engine(SQLALCHEMY_DATABASE_URI, echo=False)

sys.stderr.write(f"Connecting to database at {SQLALCHEMY_DATABASE_URI}\n")
engine = create_engine(SQLALCHEMY_DATABASE_URI)

# Create database session
Session = sessionmaker(bind=engine)
Expand Down
82 changes: 78 additions & 4 deletions sample_registry/app.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import csv
import pickle
import os
from collections import defaultdict
from datetime import datetime
from flask import (
Flask,
make_response,
Expand All @@ -10,11 +12,12 @@
redirect,
send_file,
send_from_directory,
jsonify,
)
from flask_sqlalchemy import SQLAlchemy
from io import StringIO
from pathlib import Path
from sample_registry import SQLALCHEMY_DATABASE_URI
from sample_registry import ARCHIVE_ROOT, SQLALCHEMY_DATABASE_URI
from sample_registry.models import (
Base,
Annotation,
Expand All @@ -33,14 +36,15 @@
# whatever production server you are using instead. It's ok to leave this in when running the dev server.
app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1, x_prefix=1)

# Sanitize and RO db connection
SQLALCHEMY_DATABASE_URI = f"{SQLALCHEMY_DATABASE_URI.split('?')[0]}?mode=ro"
app.config["SQLALCHEMY_DATABASE_URI"] = SQLALCHEMY_DATABASE_URI
print(SQLALCHEMY_DATABASE_URI)
# Ensure SQLite explicitly opens in read-only mode
app.config["SQLALCHEMY_ENGINE_OPTIONS"] = {"connect_args": {"uri": True}}
db = SQLAlchemy(model_class=Base)
db.init_app(app)

with app.app_context():
db.create_all()


@app.route("/favicon.ico")
def favicon():
Expand Down Expand Up @@ -294,6 +298,76 @@ def show_stats():
)


def _parsed_month(date_str: str):
for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%y", "%m/%d/%Y"):
try:
return datetime.strptime(date_str, fmt).strftime("%Y-%m")
except ValueError:
continue
return None


def _archive_size_for_run(run, warnings):
archive_path = (ARCHIVE_ROOT / run.data_uri).parent
run_label = f"CMR{run.run_accession:06d}"

if not archive_path.exists():
warnings.append(f"{run_label}: Archive path {archive_path} does not exist")
return 0

if not archive_path.is_dir():
warnings.append(f"{run_label}: Archive path {archive_path} is not a directory")
return 0

total_size = 0
for entry in archive_path.rglob("*"):
try:
if entry.is_file():
total_size += entry.stat().st_size
except OSError as exc:
warnings.append(f"{run_label}: Error accessing {entry}: {exc}")

if total_size == 0:
warnings.append(f"{run_label}: Archive at {archive_path} has size 0 bytes")

return total_size


@app.route("/api/archive_sizes")
def archive_sizes():
    """Report the total archive size per month as JSON.

    Every run returned by the database query is bucketed by the month of its
    ``run_date`` and the size of its archive directory is added to that
    bucket. Runs whose date cannot be parsed are skipped with a warning.

    Returns:
        A JSON response shaped as
        ``{"by_month": [{"month": "YYYY-MM", "size_bytes": int}, ...],
        "warnings": [str, ...]}``. Months are in ascending order. At most
        ``max_warnings`` warnings are returned, followed by a truncation
        marker when more were produced.
    """
    max_warnings = 50
    warnings = []
    totals_by_month = defaultdict(int)

    for run in db.session.query(Run).all():
        month_label = _parsed_month(run.run_date)
        if not month_label:
            warnings.append(
                f"CMR{run.run_accession:06d}: Unable to parse run_date '{run.run_date}'"
            )
            continue

        totals_by_month[month_label] += _archive_size_for_run(run, warnings)

    # Apply the cap once, after collection. The per-run helper appends to the
    # same list, so capping only the date warnings inside the loop left the
    # response unbounded, and the marker was added even when exactly
    # ``max_warnings`` warnings existed and nothing had been dropped.
    if len(warnings) > max_warnings:
        warnings = warnings[:max_warnings]
        warnings.append("... Additional warnings truncated ...")

    by_month = [
        {"month": month, "size_bytes": size_bytes}
        for month, size_bytes in sorted(totals_by_month.items())
    ]

    return jsonify({"by_month": by_month, "warnings": warnings})


@app.route("/archive")
def archive():
    """Serve the archive usage page rendered from ``archive.html``."""
    page = render_template("archive.html")
    return page


@app.route("/download/<run_acc>", methods=["GET", "POST"])
def download(run_acc):
ext = run_acc[-4:]
Expand Down
1 change: 0 additions & 1 deletion sample_registry/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
StandardHostSpecies,
)


STANDARD_TAGS = {
"SampleType": "sample_type",
"SubjectID": "subject_id",
Expand Down
1 change: 0 additions & 1 deletion sample_registry/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from sample_registry.registrar import SampleRegistry
from seqBackupLib.illumina import IlluminaFastq


SAMPLES_DESC = """\
Add new samples to the registry, with annotations.
"""
Expand Down
98 changes: 98 additions & 0 deletions sample_registry/templates/archive.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{% extends 'base.html' %}

{% block head %}
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.5.1/dist/chart.umd.min.js" integrity="sha256-SERKgtTty1vsDxll+qzd4Y2cF9swY9BCq62i9wXJ9Uo=" crossorigin="anonymous"></script>
{% endblock %}

{% block body %}
<div class="container mt-4">
<div class="row">
<div class="col-12">
<h2>Archive usage</h2>
<p class="text-muted">Net archive size grouped by month. Data are gathered directly from the NFS archive paths for each run.</p>
</div>
</div>
<div class="row">
<div class="col-12">
<canvas id="archiveChart" aria-label="Archive usage chart" role="img"></canvas>
</div>
</div>
<div class="row mt-3">
<div class="col-12">
<p>
<button class="btn btn-link p-0" type="button" data-bs-toggle="collapse" data-bs-target="#warningsCollapse" aria-expanded="true" aria-controls="warningsCollapse">
Warnings
</button>
</p>
<div class="collapse show" id="warningsCollapse">
<ul id="warningsList" class="list-group list-group-flush border rounded"></ul>
</div>
</div>
</div>
</div>

<script>
// Canvas element (id "archiveChart") that the usage chart is drawn on.
const chartElement = document.getElementById('archiveChart');
// List element (id "warningsList") that receives one item per warning.
const warningsList = document.getElementById('warningsList');

/**
 * Replace the contents of the warnings list with the given messages.
 * A single muted placeholder row is shown when there is nothing to report.
 *
 * @param {string[]} [warnings] Warning messages to display.
 */
function renderWarnings(warnings) {
  // Build one <li> with the given classes and text and add it to the list.
  const addRow = (className, text) => {
    const row = document.createElement('li');
    row.className = className;
    row.textContent = text;
    warningsList.appendChild(row);
  };

  // Start from an empty list so repeated calls do not accumulate rows.
  warningsList.innerHTML = '';

  const hasWarnings = Boolean(warnings) && warnings.length !== 0;
  if (!hasWarnings) {
    addRow('list-group-item text-muted', 'No warnings to display.');
    return;
  }

  warnings.forEach((message) => addRow('list-group-item', message));
}

/**
 * Draw the per-month archive usage as a bar chart on the archive canvas.
 *
 * @param {{month: string, size_bytes: number}[]} byMonth Per-month totals
 *   as returned by the archive sizes API.
 */
function renderChart(byMonth) {
  // Sizes arrive in bytes; the chart shows them divided by 1024^3.
  const bytesPerUnit = 1024 * 1024 * 1024;
  const months = byMonth.map(({ month }) => month);
  const sizes = byMonth.map(({ size_bytes: sizeBytes }) => sizeBytes / bytesPerUnit);

  const context = chartElement.getContext('2d');

  const dataset = {
    label: 'Archive size (GB)',
    data: sizes,
    backgroundColor: 'rgba(54, 162, 235, 0.6)',
    borderColor: 'rgba(54, 162, 235, 1)',
    borderWidth: 1,
  };

  const yAxis = {
    beginAtZero: true,
    title: {
      display: true,
      text: 'Size (GB)',
    },
  };

  new Chart(context, {
    type: 'bar',
    data: { labels: months, datasets: [dataset] },
    options: { scales: { y: yAxis } },
  });
}

// Load the per-month totals and warnings from the JSON API and render them.
// The URL is generated with url_for, like the navigation links in base.html,
// so it stays correct when the app is served under a URL prefix.
fetch("{{ url_for('archive_sizes') }}")
  .then((response) => {
    // A non-2xx reply is typically an HTML error page; report the status
    // instead of letting response.json() fail with a confusing parse error.
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} ${response.statusText}`);
    }
    return response.json();
  })
  .then(({ by_month: byMonth, warnings }) => {
    renderChart(byMonth);
    renderWarnings(warnings);
  })
  .catch((error) => {
    // Surface any failure in the warnings list so the page is never silently empty.
    renderWarnings([`Error loading archive data: ${error}`]);
  });
</script>
{% endblock %}
9 changes: 5 additions & 4 deletions sample_registry/templates/base.html
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,11 @@ <h2>
</h2>
</div>
<div class="col-sm-auto">
<a style="padding-left: 20px" href="{{ url_for('show_runs') }}">Runs</a>
<a style="padding-left: 10px" href="{{ url_for('show_tags') }}">Metadata</a>
<a style="padding-left: 10px" href="{{ url_for('show_stats') }}">Stats</a>
</div>
<a style="padding-left: 20px" href="{{ url_for('show_runs') }}">Runs</a>
<a style="padding-left: 10px" href="{{ url_for('show_tags') }}">Metadata</a>
<a style="padding-left: 10px" href="{{ url_for('show_stats') }}">Stats</a>
<a style="padding-left: 10px" href="{{ url_for('archive') }}">Archive</a>
</div>
<div class="col-sm-auto">
<img src="{{ url_for('static', filename='img/ricks_dna.png') }}" alt="" style="width: 100px; height: 50px; margin-left: auto;">
</div>
Expand Down
9 changes: 2 additions & 7 deletions sample_registry/templates/browse_runs.html
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,8 @@ <h2>Sequencing runs</h2>
</tr>
</thead>
<tbody>
{% for run, sample_count in sample_counts.items() %}
{% if run.machine_type.startswith('Illumina') %}
{% set platform = 'Illumina' %}
{% else %}
{% set platform = run.machine_type %}
{% endif %}
{% set platform = platform + ' ' + run.machine_kit %}
{% for run, sample_count in sample_counts.items() %}
{% set platform = run.machine_type + ' ' + run.machine_kit %}
<tr>
<td><a href="{{ url_for('show_runs', run_acc=run.run_accession) }}">{{ "CMR{:06d}".format(run.run_accession) }}</a></td>
<td><span class="date">{{ run.run_date }}</span></td>
Expand Down
2 changes: 1 addition & 1 deletion sample_registry/templates/show_run.html
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ <h2>
<li><strong>Date:</strong> {{ run.run_date }}</li>
<li><strong>Lane:</strong> {{ run.lane }}</li>
<li><strong>Platform:</strong> {{ run.machine_type }} {{ run.machine_kit }}</li>
<li><strong>Data file:</strong> <a href="respublica.research.chop.edu:/mnt/isilon/microbiome/{{ run.data_uri }}">{{ run.data_uri.split('/')|last }}</a></li>
<li><strong>Data file:</strong> /mnt/isilon/microbiome/{{ run.data_uri }}</li>
</ul>
<p>
<strong>Export metadata for all samples:</strong><br />
Expand Down
1 change: 0 additions & 1 deletion tests/test_mapping.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import io
from sample_registry.mapping import SampleTable


NORMAL_TSV = """\
SampleID BarcodeSequence HostSpecies SubjectID
S1 GCCT Human Hu23
Expand Down
1 change: 0 additions & 1 deletion tests/test_register.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
register_host_species,
)


samples = [
{
"SampleID": "abc123",
Expand Down