Skip to content

Commit bade438

Browse files
authored
Add CSV output option to imas process-db-analysis (#85)
1 parent 836798d commit bade438

File tree

2 files changed

+163
-7
lines changed

2 files changed

+163
-7
lines changed

imas/command/db_analysis.py

Lines changed: 102 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import re
88
import readline
99
import sys
10+
from csv import writer as csvwriter
11+
from collections import Counter, defaultdict
1012
from dataclasses import dataclass, field
1113
from pathlib import Path
1214
from typing import Dict, Iterable, List, Optional
@@ -139,12 +141,70 @@ def ids_info(idsfile: Path):
139141
}
140142

141143

144+
@dataclass
class _PathUsage:
    """Usage counters for a single IDS, aggregated over data entries or occurrences."""

    # How many data entries (when used in ``usage_per_entry``) or occurrences
    # (when used in ``usage_per_occurrence``) contained this IDS.
    num_occurrences: int = 0
    # Maps a filled path inside the IDS to the number of times it was seen filled.
    path_counter: Counter = field(default_factory=Counter)
148+
149+
150+
def _write_usage_stats_to_csv(
151+
writer, usage_per_entry, usage_per_occurrence, num_entries
152+
):
153+
"""Write usage statistics to csv file.
154+
155+
Args:
156+
writer: an instance of csv.writer
157+
usage_per_entry: path usage statistics per data entry
158+
usage_per_occurrence: path usage statistics per occurrence
159+
num_entries: number of data entries
160+
"""
161+
# Write header
162+
writer.writerow(
163+
[
164+
"IDS",
165+
"Path in IDS",
166+
"Uses errorbar",
167+
"Frequency (without occurrences)",
168+
"Frequency (with occurences)",
169+
]
170+
)
171+
172+
for ids_name in sorted(usage_per_entry):
173+
entry_usage = usage_per_entry[ids_name]
174+
occurrence_usage = usage_per_occurrence[ids_name]
175+
176+
# Usage statistics of the IDS (# entries with this IDS / # entries)
177+
freq = entry_usage.num_occurrences / num_entries
178+
writer.writerow([ids_name, "", "", freq, ""])
179+
180+
for path, entry_count in sorted(entry_usage.path_counter.items()):
181+
if "_error_" in path:
182+
continue # Skip error nodes
183+
occurrence_count = occurrence_usage.path_counter[path]
184+
185+
uses_error = f"{path}_error_upper" in entry_usage.path_counter
186+
# Frequency without occurrences, see GH#84 for details
187+
freq1 = entry_count / entry_usage.num_occurrences
188+
# Frequency with occurences
189+
freq2 = occurrence_count / occurrence_usage.num_occurrences
190+
191+
# Write data row
192+
writer.writerow([ids_name, path, "X" if uses_error else "", freq1, freq2])
193+
194+
195+
_csv_help_text = (
196+
"Write analysis output to the provided CSV file. For details, "
197+
"see https://github.com/iterorganization/IMAS-Python/issues/84."
198+
)
199+
200+
142201
@click.command("process-db-analysis")
143202
@click.argument(
144203
"infiles", metavar="INPUT_FILES...", nargs=-1, type=infile_path, required=True
145204
)
146205
@click.option("--show-empty-ids", is_flag=True, help="Show empty IDSs in the overview.")
147-
def process_db_analysis(infiles, show_empty_ids):
206+
@click.option("--csv", type=outfile_path, help=_csv_help_text)
207+
def process_db_analysis(infiles, show_empty_ids, csv):
148208
"""Process supplied Data Entry analyses, and display statistics.
149209
150210
\b
@@ -153,9 +213,10 @@ def process_db_analysis(infiles, show_empty_ids):
153213
"""
154214
setup_rich_log_handler(False)
155215

156-
factory = imas.IDSFactory()
157-
filled_per_ids = {ids_name: set() for ids_name in factory.ids_names()}
158-
logger.info("Using Data Dictionary version %s.", factory.dd_version)
216+
usage_per_entry = defaultdict(_PathUsage)
217+
usage_per_occurrence = defaultdict(_PathUsage)
218+
num_entries = 0
219+
159220
logger.info("Reading %d input files...", len(infiles))
160221

161222
# Read input data and collate usage info per IDS
@@ -164,17 +225,51 @@ def process_db_analysis(infiles, show_empty_ids):
164225
data = json.load(file)
165226

166227
for entry in data:
228+
usage_for_this_entry = defaultdict(_PathUsage)
167229
for ids_info in entry["ids_info"]:
168-
fill_info = filled_per_ids[ids_info["name"]]
169-
fill_info.update(ids_info["filled_data"])
230+
ids_name = ids_info["name"]
231+
filled_paths = ids_info["filled_data"]
232+
# Update counters for this entry
233+
usage_for_this_entry[ids_name].path_counter.update(filled_paths)
234+
# Update counters for all occurrecnes
235+
usage_per_occurrence[ids_name].num_occurrences += 1
236+
usage_per_occurrence[ids_name].path_counter.update(filled_paths)
237+
# Update data entry usage
238+
for ids_name, usage in usage_for_this_entry.items():
239+
usage_per_entry[ids_name].num_occurrences += 1
240+
usage_per_entry[ids_name].path_counter.update(usage.path_counter.keys())
241+
num_entries += 1
170242

171243
logger.info("Done reading input files.")
244+
245+
if csv is not None:
246+
# Output to CSV file
247+
logger.info("Writing output to CSV file: %s", csv)
248+
with open(csv, "w") as csvfile:
249+
writer = csvwriter(csvfile)
250+
_write_usage_stats_to_csv(
251+
writer, usage_per_entry, usage_per_occurrence, num_entries
252+
)
253+
logger.info("Done.")
254+
return
255+
172256
logger.info("Analyzing filled data...")
257+
factory = imas.IDSFactory()
258+
logger.info("Using Data Dictionary version %s.", factory.dd_version)
173259

174260
# Construct AnalysisNodes per IDS
175261
analysis_nodes: Dict[str, _AnalysisNode] = {}
176-
for ids_name, filled in filled_per_ids.items():
262+
for ids_name, usage in usage_per_occurrence.items():
263+
if ids_name not in factory.ids_names():
264+
logger.warning(
265+
"Founds IDS %s in data files, but this IDS is not available "
266+
"in DD version %s. Statistics will not be tracked.",
267+
ids_name,
268+
factory.dd_version,
269+
)
270+
continue
177271
metadata = factory.new(ids_name).metadata
272+
filled = set(usage.path_counter.keys())
178273
ids_analysis_node = _AnalysisNode("")
179274

180275
def walk_metadata_tree(metadata: IDSMetadata, node: _AnalysisNode):

imas/test/test_cli.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,64 @@ def test_db_analysis(tmp_path, requires_imas):
3939
)
4040
assert process_result.exit_code == 0, process_result.output
4141
assert "core_profiles" in process_result.output
42+
43+
44+
@pytest.mark.cli
def test_db_analysis_csv(tmp_path, requires_imas):
    # Entry 1: equilibrium in occurrences 0, 1 and 2 (each occurrence filled a
    # bit more than the previous), wall in occurrences 0 and 1.
    with DBEntry(f"imas:hdf5?path={tmp_path}/entry1", "w") as entry:
        eq_ids = entry.factory.equilibrium()
        eq_ids.ids_properties.homogeneous_time = 2
        entry.put(eq_ids)
        eq_ids.ids_properties.comment = "filled"
        entry.put(eq_ids, 1)
        eq_ids.ids_properties.homogeneous_time = 1
        eq_ids.time = [1.0]
        eq_ids.time_slice.resize(1)
        eq_ids.time_slice[0].boundary.psi = 1.0
        eq_ids.time_slice[0].boundary.psi_error_upper = 0.1
        entry.put(eq_ids, 2)
        wall_ids = entry.factory.wall()
        wall_ids.ids_properties.homogeneous_time = 2
        entry.put(wall_ids)
        wall_ids.first_wall_surface_area = 1.0
        entry.put(wall_ids, 1)
    # Entry 2: only a minimal equilibrium in occurrence 0.
    with DBEntry(f"imas:hdf5?path={tmp_path}/entry2", "w") as entry:
        eq_ids = entry.factory.equilibrium()
        eq_ids.ids_properties.homogeneous_time = 2
        eq_ids.ids_properties.comment = "also filled"
        entry.put(eq_ids)

    cli = CliRunner()
    with cli.isolated_filesystem(temp_dir=tmp_path) as workdir:
        # First run the analysis on both entries, producing the JSON file.
        analyze_res = cli.invoke(
            analyze_db, [f"{tmp_path}/entry1", f"{tmp_path}/entry2"]
        )
        assert analyze_res.exit_code == 0

        analysis_file = Path(workdir) / "imas-db-analysis.json.gz"
        assert analysis_file.exists()
        # Then process the analysis with CSV output enabled.
        process_res = cli.invoke(
            process_db_analysis, [str(analysis_file), "--csv", "output.csv"]
        )
        assert process_res.exit_code == 0

        # NOTE: the "occurences" spelling below deliberately matches the
        # header written by _write_usage_stats_to_csv.
        expected = """\
IDS,Path in IDS,Uses errorbar,Frequency (without occurrences),Frequency (with occurences)
equilibrium,,,1.0,
equilibrium,ids_properties/comment,,1.0,0.75
equilibrium,ids_properties/homogeneous_time,,1.0,1.0
equilibrium,ids_properties/version_put/access_layer,,1.0,1.0
equilibrium,ids_properties/version_put/access_layer_language,,1.0,1.0
equilibrium,ids_properties/version_put/data_dictionary,,1.0,1.0
equilibrium,time,,0.5,0.25
equilibrium,time_slice/boundary/psi,X,0.5,0.25
wall,,,0.5,
wall,first_wall_surface_area,,1.0,0.5
wall,ids_properties/homogeneous_time,,1.0,1.0
wall,ids_properties/version_put/access_layer,,1.0,1.0
wall,ids_properties/version_put/access_layer_language,,1.0,1.0
wall,ids_properties/version_put/data_dictionary,,1.0,1.0
"""  # noqa: E501 (line too long)
        assert Path("output.csv").read_text() == expected

0 commit comments

Comments
 (0)