Skip to content

Commit bade438

Browse files
authored
Add CSV output option to imas process-db-analysis (#85)
1 parent 836798d commit bade438

File tree

2 files changed

+163
-7
lines changed

2 files changed

+163
-7
lines changed

imas/command/db_analysis.py

Lines changed: 102 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import re
88
import readline
99
import sys
10+
from csv import writer as csvwriter
11+
from collections import Counter, defaultdict
1012
from dataclasses import dataclass, field
1113
from pathlib import Path
1214
from typing import Dict, Iterable, List, Optional
@@ -139,12 +141,70 @@ def ids_info(idsfile: Path):
139141
}
140142

141143

144+
@dataclass
class _PathUsage:
    """Usage counters for a single IDS, aggregated over data entries or occurrences."""

    # How many data entries (when used in ``usage_per_entry``) or occurrences
    # (when used in ``usage_per_occurrence``) contained this IDS.
    num_occurrences: int = 0
    # Maps a filled path inside the IDS to the number of times it was seen filled.
    path_counter: Counter = field(default_factory=Counter)
148+
149+
150+
def _write_usage_stats_to_csv(
151+
writer, usage_per_entry, usage_per_occurrence, num_entries
152+
):
153+
"""Write usage statistics to csv file.
154+
155+
Args:
156+
writer: an instance of csv.writer
157+
usage_per_entry: path usage statistics per data entry
158+
usage_per_occurrence: path usage statistics per occurrence
159+
num_entries: number of data entries
160+
"""
161+
# Write header
162+
writer.writerow(
163+
[
164+
"IDS",
165+
"Path in IDS",
166+
"Uses errorbar",
167+
"Frequency (without occurrences)",
168+
"Frequency (with occurences)",
169+
]
170+
)
171+
172+
for ids_name in sorted(usage_per_entry):
173+
entry_usage = usage_per_entry[ids_name]
174+
occurrence_usage = usage_per_occurrence[ids_name]
175+
176+
# Usage statistics of the IDS (# entries with this IDS / # entries)
177+
freq = entry_usage.num_occurrences / num_entries
178+
writer.writerow([ids_name, "", "", freq, ""])
179+
180+
for path, entry_count in sorted(entry_usage.path_counter.items()):
181+
if "_error_" in path:
182+
continue # Skip error nodes
183+
occurrence_count = occurrence_usage.path_counter[path]
184+
185+
uses_error = f"{path}_error_upper" in entry_usage.path_counter
186+
# Frequency without occurrences, see GH#84 for details
187+
freq1 = entry_count / entry_usage.num_occurrences
188+
# Frequency with occurences
189+
freq2 = occurrence_count / occurrence_usage.num_occurrences
190+
191+
# Write data row
192+
writer.writerow([ids_name, path, "X" if uses_error else "", freq1, freq2])
193+
194+
195+
_csv_help_text = (
196+
"Write analysis output to the provided CSV file. For details, "
197+
"see https://github.com/iterorganization/IMAS-Python/issues/84."
198+
)
199+
200+
142201
@click.command("process-db-analysis")
143202
@click.argument(
144203
"infiles", metavar="INPUT_FILES...", nargs=-1, type=infile_path, required=True
145204
)
146205
@click.option("--show-empty-ids", is_flag=True, help="Show empty IDSs in the overview.")
147-
def process_db_analysis(infiles, show_empty_ids):
206+
@click.option("--csv", type=outfile_path, help=_csv_help_text)
207+
def process_db_analysis(infiles, show_empty_ids, csv):
148208
"""Process supplied Data Entry analyses, and display statistics.
149209
150210
\b
@@ -153,9 +213,10 @@ def process_db_analysis(infiles, show_empty_ids):
153213
"""
154214
setup_rich_log_handler(False)
155215

156-
factory = imas.IDSFactory()
157-
filled_per_ids = {ids_name: set() for ids_name in factory.ids_names()}
158-
logger.info("Using Data Dictionary version %s.", factory.dd_version)
216+
usage_per_entry = defaultdict(_PathUsage)
217+
usage_per_occurrence = defaultdict(_PathUsage)
218+
num_entries = 0
219+
159220
logger.info("Reading %d input files...", len(infiles))
160221

161222
# Read input data and collate usage info per IDS
@@ -164,17 +225,51 @@ def process_db_analysis(infiles, show_empty_ids):
164225
data = json.load(file)
165226

166227
for entry in data:
228+
usage_for_this_entry = defaultdict(_PathUsage)
167229
for ids_info in entry["ids_info"]:
168-
fill_info = filled_per_ids[ids_info["name"]]
169-
fill_info.update(ids_info["filled_data"])
230+
ids_name = ids_info["name"]
231+
filled_paths = ids_info["filled_data"]
232+
# Update counters for this entry
233+
usage_for_this_entry[ids_name].path_counter.update(filled_paths)
234+
# Update counters for all occurrecnes
235+
usage_per_occurrence[ids_name].num_occurrences += 1
236+
usage_per_occurrence[ids_name].path_counter.update(filled_paths)
237+
# Update data entry usage
238+
for ids_name, usage in usage_for_this_entry.items():
239+
usage_per_entry[ids_name].num_occurrences += 1
240+
usage_per_entry[ids_name].path_counter.update(usage.path_counter.keys())
241+
num_entries += 1
170242

171243
logger.info("Done reading input files.")
244+
245+
if csv is not None:
246+
# Output to CSV file
247+
logger.info("Writing output to CSV file: %s", csv)
248+
with open(csv, "w") as csvfile:
249+
writer = csvwriter(csvfile)
250+
_write_usage_stats_to_csv(
251+
writer, usage_per_entry, usage_per_occurrence, num_entries
252+
)
253+
logger.info("Done.")
254+
return
255+
172256
logger.info("Analyzing filled data...")
257+
factory = imas.IDSFactory()
258+
logger.info("Using Data Dictionary version %s.", factory.dd_version)
173259

174260
# Construct AnalysisNodes per IDS
175261
analysis_nodes: Dict[str, _AnalysisNode] = {}
176-
for ids_name, filled in filled_per_ids.items():
262+
for ids_name, usage in usage_per_occurrence.items():
263+
if ids_name not in factory.ids_names():
264+
logger.warning(
265+
"Founds IDS %s in data files, but this IDS is not available "
266+
"in DD version %s. Statistics will not be tracked.",
267+
ids_name,
268+
factory.dd_version,
269+
)
270+
continue
177271
metadata = factory.new(ids_name).metadata
272+
filled = set(usage.path_counter.keys())
178273
ids_analysis_node = _AnalysisNode("")
179274

180275
def walk_metadata_tree(metadata: IDSMetadata, node: _AnalysisNode):

imas/test/test_cli.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,64 @@ def test_db_analysis(tmp_path, requires_imas):
3939
)
4040
assert process_result.exit_code == 0, process_result.output
4141
assert "core_profiles" in process_result.output
42+
43+
44+
@pytest.mark.cli
def test_db_analysis_csv(tmp_path, requires_imas):
    # Entry 1: equilibrium in occurrences 0, 1 and 2 (each occurrence filled a
    # bit more than the previous), wall in occurrences 0 and 1.
    with DBEntry(f"imas:hdf5?path={tmp_path}/entry1", "w") as entry:
        eq_ids = entry.factory.equilibrium()
        eq_ids.ids_properties.homogeneous_time = 2
        entry.put(eq_ids)
        eq_ids.ids_properties.comment = "filled"
        entry.put(eq_ids, 1)
        eq_ids.ids_properties.homogeneous_time = 1
        eq_ids.time = [1.0]
        eq_ids.time_slice.resize(1)
        eq_ids.time_slice[0].boundary.psi = 1.0
        eq_ids.time_slice[0].boundary.psi_error_upper = 0.1
        entry.put(eq_ids, 2)
        wall_ids = entry.factory.wall()
        wall_ids.ids_properties.homogeneous_time = 2
        entry.put(wall_ids)
        wall_ids.first_wall_surface_area = 1.0
        entry.put(wall_ids, 1)
    # Entry 2: only a minimal equilibrium in occurrence 0.
    with DBEntry(f"imas:hdf5?path={tmp_path}/entry2", "w") as entry:
        eq_ids = entry.factory.equilibrium()
        eq_ids.ids_properties.homogeneous_time = 2
        eq_ids.ids_properties.comment = "also filled"
        entry.put(eq_ids)

    cli = CliRunner()
    with cli.isolated_filesystem(temp_dir=tmp_path) as workdir:
        # First run the analysis on both entries, producing the JSON file.
        analyze_res = cli.invoke(
            analyze_db, [f"{tmp_path}/entry1", f"{tmp_path}/entry2"]
        )
        assert analyze_res.exit_code == 0

        analysis_file = Path(workdir) / "imas-db-analysis.json.gz"
        assert analysis_file.exists()
        # Then process the analysis with CSV output enabled.
        process_res = cli.invoke(
            process_db_analysis, [str(analysis_file), "--csv", "output.csv"]
        )
        assert process_res.exit_code == 0

        # NOTE: the "occurences" spelling below deliberately matches the
        # header written by _write_usage_stats_to_csv.
        expected = """\
IDS,Path in IDS,Uses errorbar,Frequency (without occurrences),Frequency (with occurences)
equilibrium,,,1.0,
equilibrium,ids_properties/comment,,1.0,0.75
equilibrium,ids_properties/homogeneous_time,,1.0,1.0
equilibrium,ids_properties/version_put/access_layer,,1.0,1.0
equilibrium,ids_properties/version_put/access_layer_language,,1.0,1.0
equilibrium,ids_properties/version_put/data_dictionary,,1.0,1.0
equilibrium,time,,0.5,0.25
equilibrium,time_slice/boundary/psi,X,0.5,0.25
wall,,,0.5,
wall,first_wall_surface_area,,1.0,0.5
wall,ids_properties/homogeneous_time,,1.0,1.0
wall,ids_properties/version_put/access_layer,,1.0,1.0
wall,ids_properties/version_put/access_layer_language,,1.0,1.0
wall,ids_properties/version_put/data_dictionary,,1.0,1.0
"""  # noqa: E501 (line too long)
        assert Path("output.csv").read_text() == expected

0 commit comments

Comments
 (0)