Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*.egg-info/
.env
/exported
.github/
/model_checkpoints
__pycache__/
*.py[cod]
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,11 @@ python3 -m elp_rumble.data_creation.create_tfrecords
python3 -m elp_rumble.data_creation.convert_audio_to_spec_tfrecords
```

During preprocessing, normalization statistics are saved to `data/normalization_stats.json`.
`create_tfrecords` writes `audio_mean` and `audio_std`, and
`convert_audio_to_spec_tfrecords` writes `spec_mean` and `spec_std`.
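Running both scripts produces a single JSON file with all four keys; rerunning either script overwrites only the keys that script computes and leaves the others untouched.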
Comment on lines +352 to +355
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This section mentions the output path and which scripts write which keys, but it’s still missing two items called out in issue #15/PR description: (1) an example JSON snippet showing the expected shape, and (2) a brief note on overwrite/upsert behavior (reruns overwrite only the keys they compute). Adding both here would make the README sufficient for downstream consumers without needing to read the PR description.

Copilot uses AI. Check for mistakes.

Once TFRecords are created, no manual path edits are required for CNN. CNN data paths come from `src/elp_rumble/input_pipeline/spectrogram_tfrecords.py` and `src/elp_rumble/config/paths.py`.

For RNN-only workflows, dataset file names are defined in `src/elp_rumble/models/rnn_config.py`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import os
import numpy as np
import tensorflow as tf
from elp_rumble.config.paths import TFRECORDS_AUDIO_DIR, TFRECORDS_SPECTROGRAM_DIR
from elp_rumble.config.paths import DATA_ROOT, TFRECORDS_AUDIO_DIR, TFRECORDS_SPECTROGRAM_DIR
from .utils import upsert_normalization_stats

INPUT_AUDIO_TFR_FOLDER = TFRECORDS_AUDIO_DIR
OUTPUT_SPEC_FOLDER = TFRECORDS_SPECTROGRAM_DIR
Expand Down Expand Up @@ -119,6 +120,12 @@ def main():
datasets[i] = (apply_stft(dataset, frame_length, frame_step, sample_rate, max_frequency), name)

global_mean, global_std = compute_global_stats(datasets)
normalization_stats_path = DATA_ROOT / "normalization_stats.json"
upsert_normalization_stats(
normalization_stats_path,
{"spec_mean": global_mean, "spec_std": global_std},
)
print(f"Saved spectrogram normalization stats to {normalization_stats_path}")

for dataset, file_name in datasets:
normalized_dataset = dataset.map(lambda spectrogram, label: (normalize_spectrogram(spectrogram, global_mean, global_std), label), num_parallel_calls=tf.data.AUTOTUNE)
Expand Down
8 changes: 8 additions & 0 deletions src/elp_rumble/data_creation/create_tfrecords.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
load_wavs_into_dataset,
normalize_dataset,
stratified_split,
upsert_normalization_stats,
write_tfrecords,
)
from elp_rumble.config.paths import (
DATA_ROOT,
POS_TRAIN_VAL_CLIPS_DIR,
TRAIN_VAL_NEG_CLIPS_DIR,
POS_HOLDOUT_TEST_CLIPS_DIR,
Expand Down Expand Up @@ -41,6 +43,12 @@ def main():

# Compute statistics
global_mean, global_std = compute_statistics(combined_dataset)
normalization_stats_path = DATA_ROOT / "normalization_stats.json"
upsert_normalization_stats(
normalization_stats_path,
{"audio_mean": global_mean, "audio_std": global_std},
)
print(f"Saved audio normalization stats to {normalization_stats_path}")

del combined_dataset

Expand Down
18 changes: 18 additions & 0 deletions src/elp_rumble/data_creation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from scipy.signal import resample, butter, lfilter
import numpy as np
import os
import json
import tensorflow as tf
from collections import Counter
from sklearn.model_selection import train_test_split
Expand Down Expand Up @@ -157,6 +158,23 @@ def compute_statistics(dataset):
mean, std = total_sum / total_count, np.sqrt((total_sum_sq / total_count) - (total_sum / total_count) ** 2)
return mean, std


def upsert_normalization_stats(stats_path, updates):
    """Merge normalization stats into the JSON file at ``stats_path``.

    Keys already present in the file are kept unless they also appear in
    ``updates``, in which case they are overwritten. The file (and its
    parent directory, if any) is created when it does not exist yet.

    Args:
        stats_path: Destination JSON file, as a ``str`` or ``os.PathLike``.
        updates: Mapping of stat name to a numeric value; every value is
            converted with ``float()`` before being written.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON.
        TypeError, ValueError: If a value in ``updates`` cannot be
            converted to ``float``.
    """
    stats_path = os.fspath(stats_path)

    # os.path.dirname() is "" for a bare file name such as
    # "normalization_stats.json", and os.makedirs("") raises, so only create
    # the parent directory when the path actually has one.
    parent_dir = os.path.dirname(stats_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    existing = {}
    if os.path.exists(stats_path):
        with open(stats_path, "r", encoding="utf-8") as f:
            existing = json.load(f)

    existing.update({k: float(v) for k, v in updates.items()})

    # Write to a sibling temp file and swap it into place so readers never
    # observe a half-written stats file.
    tmp_path = f"{stats_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        # sort_keys=True keeps the serialized layout deterministic; without
        # it the key order would depend on insertion/update history.
        json.dump(existing, f, indent=2, sort_keys=True)
    os.replace(tmp_path, stats_path)

def normalize_dataset(dataset, mean, std):
    """Return ``dataset`` mapped so each element becomes ``(element - mean) / std``.

    The transformation is handed to ``dataset.map``; whatever that call
    returns is returned unchanged.
    """

    def _standardize(audio):
        return (audio - mean) / std

    return dataset.map(_standardize)

Expand Down