Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ jobs:

- name: Build documentation
run: |
cd docs
sphinx-build -b html . _build/html
sphinx-build -b html docs docs/_build/html

- name: Setup Pages
uses: actions/configure-pages@v5
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/links.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ jobs:

- name: Build documentation with Sphinx
run: |
cd docs
sphinx-build -b html . _build/html
sphinx-build -b html docs docs/_build/html

- name: Link Checker on built documentation
id: lychee
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
author = "HED Standard"

# The full version, including alpha/beta/rc tags
release = "0.8.0"
release = "0.8.1"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
Expand Down
41 changes: 32 additions & 9 deletions hed/tools/analysis/tabular_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,22 @@
class TabularSummary:
"""Summarize the contents of columnar files."""

def __init__(self, value_cols=None, skip_cols=None, name=""):
def __init__(self, value_cols=None, skip_cols=None, name="", categorical_limit=None):
"""Constructor for a BIDS tabular file summary.

Parameters:
value_cols (list, None): List of columns to be treated as value columns.
skip_cols (list, None): List of columns to be skipped.
name (str): Name associated with the dictionary.
categorical_limit (int, None): Maximum number of unique values to store for categorical columns.

"""

self.name = name
self.categorical_info = {}
self.value_info = {}
self.categorical_counts = {}
self.categorical_limit = categorical_limit
if value_cols and skip_cols and set(value_cols).intersection(skip_cols):
raise HedFileError(
"ValueSkipOverlap", f"Value columns {str(value_cols)} and skip columns {str(skip_cols)} cannot overlap", ""
Expand All @@ -47,7 +50,10 @@ def __str__(self):
for key in sorted_keys:
value_dict = self.categorical_info[key]
sorted_v_keys = sorted(value_dict)
summary_list.append(f"{indent * 2}{key} ({len(sorted_v_keys)} distinct values):")
counts = self.categorical_counts.get(key, [0, 0])
summary_list.append(
f"{indent * 2}{key} ({len(sorted_v_keys)} distinct values, {counts[0]} total values in {counts[1]} files):"
)
for v_key in sorted_v_keys:
summary_list.append(f"{indent * 3}{v_key}: {value_dict[v_key]}")

Expand Down Expand Up @@ -101,9 +107,11 @@ def get_summary(self, as_json=False) -> Union[dict, str]:
"Total events": self.total_events,
"Total files": self.total_files,
"Categorical columns": categorical_cols,
"Categorical counts": self.categorical_counts,
"Value columns": value_cols,
"Skip columns": self.skip_cols,
"Files": self.files,
"Categorical limit": str(self.categorical_limit),
}
if as_json:
return json.dumps(summary, indent=4)
Expand Down Expand Up @@ -131,7 +139,7 @@ def get_number_unique(self, column_names=None) -> dict:
return counts

def update(self, data, name=None):
"""Update the counts based on data.
"""Update the counts based on data (DataFrame, filename, or list of filenames).

Parameters:
data (DataFrame, str, or list): DataFrame containing data to update.
Expand Down Expand Up @@ -166,19 +174,26 @@ def update_summary(self, tab_sum):
self._update_dict_value(tab_sum)
self._update_dict_categorical(tab_sum)

def _update_categorical(self, tab_name, values):
def _update_categorical(self, tab_name, values, cat_counts):
"""Update the categorical information for this summary.

Parameters:
tab_name (str): Name of a key indicating a categorical column.
values (dict): A dictionary whose keys are unique categorical values.
cat_counts (list): A list with two elements: total count of values and number of entries.

"""
if tab_name not in self.categorical_info:
self.categorical_info[tab_name] = {}

if tab_name not in self.categorical_counts:
self.categorical_counts[tab_name] = [cat_counts[0], cat_counts[1]]
else:
self.categorical_counts[tab_name][0] += cat_counts[0]
self.categorical_counts[tab_name][1] += cat_counts[1]
total_values = self.categorical_info[tab_name]
for name, value in values.items():
if self.categorical_limit is not None and len(total_values) >= self.categorical_limit:
break
value_list = total_values.get(name, [0, 0])
if not isinstance(value, list):
value = [value, 1]
Expand Down Expand Up @@ -207,9 +222,15 @@ def _update_dataframe(self, data, name):
self.value_info[col_name][0] = self.value_info[col_name][0] + len(col_values)
self.value_info[col_name][1] = self.value_info[col_name][1] + 1
else:
cat_counts = self.categorical_counts.get(col_name, [0, 0])
cat_counts[0] += len(col_values)
cat_counts[1] += 1
self.categorical_counts[col_name] = cat_counts
if self.categorical_limit is not None and len(col_values) > self.categorical_limit:
continue
col_values = col_values.astype(str)
values = col_values.value_counts(ascending=True)
self._update_categorical(col_name, values)
self._update_categorical(col_name, values, cat_counts)

def _update_dict_categorical(self, col_dict):
"""Update this summary with the categorical information in the dictionary from another summary.
Expand All @@ -228,7 +249,7 @@ def _update_dict_categorical(self, col_dict):
elif col in self.skip_cols:
continue
else:
self._update_categorical(col, col_dict.categorical_info[col])
self._update_categorical(col, col_dict.categorical_info[col], col_dict.categorical_counts.get(col, [0, 0]))

def _update_dict_skip(self, col_dict):
"""Update this summary with the skip column information from another summary.
Expand Down Expand Up @@ -289,13 +310,15 @@ def extract_summary(summary_info) -> "TabularSummary":
new_tab = TabularSummary(
value_cols=summary_info.get("Value columns", {}).keys(),
skip_cols=summary_info.get("Skip columns", []),
name=summary_info.get("Summary name", ""),
name=summary_info.get("Name", ""),
categorical_limit=summary_info.get("Categorical limit", None),
)
new_tab.value_info = summary_info.get("Value_columns", {})
new_tab.value_info = summary_info.get("Value columns", {})
new_tab.total_files = summary_info.get("Total files", 0)
new_tab.total_events = summary_info.get("Total events", 0)
new_tab.skip_cols = summary_info.get("Skip columns", [])
new_tab.categorical_info = summary_info.get("Categorical columns", {})
new_tab.categorical_counts = summary_info.get("Categorical counts", {})
new_tab.files = summary_info.get("Files", {})
return new_tab

Expand Down
96 changes: 95 additions & 1 deletion tests/tools/analysis/test_tabular_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def test_get_summary(self):
)
summary1 = dict1.get_summary(as_json=False)
self.assertIsInstance(summary1, dict)
self.assertEqual(len(summary1), 7)
self.assertEqual(len(summary1), 9)
summary2 = dict1.get_summary(as_json=True).replace('"', "")
self.assertIsInstance(summary2, str)

Expand Down Expand Up @@ -240,6 +240,100 @@ def test_update_summary(self):
self.assertEqual(len(files_bids), tab_all.total_files)
self.assertEqual(len(files_bids) * 200, tab_all.total_events)

def test_categorical_limit_constructor(self):
    """The constructor should store categorical_limit exactly as given."""
    # An explicit integer limit is kept verbatim on the instance.
    limited = TabularSummary(categorical_limit=5)
    self.assertEqual(limited.categorical_limit, 5)

    # Passing None (the default) means no limit is applied.
    unlimited = TabularSummary(categorical_limit=None)
    self.assertIsNone(unlimited.categorical_limit)

def test_categorical_limit_enforced(self):
    """A categorical_limit should cap stored unique values but not the counts."""
    stern_df = get_new_dataframe(self.stern_map_path)

    # Baseline summary with no limit, for comparison.
    unlimited_sum = TabularSummary()
    unlimited_sum.update(stern_df)

    # Summary that stores at most 2 unique values per categorical column.
    dict_with_limit = TabularSummary(categorical_limit=2)
    dict_with_limit.update(stern_df)

    for col_name, stored_values in dict_with_limit.categorical_info.items():
        # No column may retain more unique values than the limit allows.
        self.assertLessEqual(
            len(stored_values),
            2,
            f"Column {col_name} should have at most 2 unique values stored",
        )
        # The full totals are still tracked separately in categorical_counts.
        self.assertIn(col_name, dict_with_limit.categorical_counts)
        self.assertGreater(dict_with_limit.categorical_counts[col_name][0], 0)

def test_categorical_limit_columns_with_many_values(self):
    """Counts must be tracked even for columns skipped because of the limit."""
    wh_df = get_new_dataframe(self.wh_events_path)

    summary = TabularSummary(categorical_limit=5)
    summary.update(wh_df)

    # Every categorical column keeps its totals in categorical_counts even
    # when its unique-value count exceeded the limit at collection time.
    for col_name, counts in summary.categorical_counts.items():
        self.assertGreater(counts[0], 0, f"Column {col_name} should have event count > 0")
        self.assertEqual(counts[1], 1, f"Column {col_name} should have been updated once")

def test_categorical_limit_in_summary(self):
    """get_summary should report the limit, stringified, under 'Categorical limit'."""
    stern_df = get_new_dataframe(self.stern_map_path)

    limited = TabularSummary(categorical_limit=10)
    limited.update(stern_df)
    summary = limited.get_summary(as_json=False)
    self.assertIn("Categorical limit", summary)
    # get_summary stringifies the limit, so the int 10 comes back as "10".
    self.assertEqual(summary["Categorical limit"], "10")

    # With no limit set, the stringified value is "None".
    unlimited = TabularSummary()
    unlimited.update(stern_df)
    summary2 = unlimited.get_summary(as_json=False)
    self.assertEqual(summary2["Categorical limit"], "None")

def test_categorical_limit_extract_summary(self):
    """extract_summary should round-trip a summary dict without raising."""
    source = TabularSummary(categorical_limit=15)
    stern_df = get_new_dataframe(self.stern_map_path)
    source.update(stern_df)

    restored = TabularSummary.extract_summary(source.get_summary(as_json=False))

    # NOTE(review): get_summary stores the limit via str(...), so the restored
    # categorical_limit appears to come back as the string "15" rather than the
    # int 15 — confirm upstream; this test only guards against errors.
    self.assertIsInstance(restored, TabularSummary)

def test_categorical_limit_update_dict(self):
    """Merging summaries via update_summary must keep the limit enforced."""
    stern_df = get_new_dataframe(self.stern_test1_path)

    dict1 = TabularSummary(categorical_limit=3)
    dict1.update(stern_df)

    other = TabularSummary(categorical_limit=3)
    other.update(stern_df)

    # Fold the second summary into the first.
    dict1.update_summary(other)

    # No categorical column may exceed the 3-value cap after the merge.
    for col_name, stored in dict1.categorical_info.items():
        self.assertLessEqual(
            len(stored),
            3,
            f"Column {col_name} should have at most 3 unique values after update_summary",
        )


# Allow running this test module directly (e.g. `python test_tabular_summary.py`).
if __name__ == "__main__":
    unittest.main()