77import re
88import readline
99import sys
10+ from csv import writer as csvwriter
11+ from collections import Counter , defaultdict
1012from dataclasses import dataclass , field
1113from pathlib import Path
1214from typing import Dict , Iterable , List , Optional
@@ -139,12 +141,70 @@ def ids_info(idsfile: Path):
139141 }
140142
141143
@dataclass
class _PathUsage:
    """Accumulator for how often an IDS (and its data paths) is filled.

    Used in two roles by ``process_db_analysis``: counting per data entry
    and counting per IDS occurrence.
    """

    # Number of entries/occurrences in which this IDS was seen filled.
    num_occurrences: int = 0
    # Maps a filled data path to the number of entries/occurrences
    # containing it.
    path_counter: Counter = field(default_factory=Counter)
148+
149+
150+ def _write_usage_stats_to_csv (
151+ writer , usage_per_entry , usage_per_occurrence , num_entries
152+ ):
153+ """Write usage statistics to csv file.
154+
155+ Args:
156+ writer: an instance of csv.writer
157+ usage_per_entry: path usage statistics per data entry
158+ usage_per_occurrence: path usage statistics per occurrence
159+ num_entries: number of data entries
160+ """
161+ # Write header
162+ writer .writerow (
163+ [
164+ "IDS" ,
165+ "Path in IDS" ,
166+ "Uses errorbar" ,
167+ "Frequency (without occurrences)" ,
168+ "Frequency (with occurences)" ,
169+ ]
170+ )
171+
172+ for ids_name in sorted (usage_per_entry ):
173+ entry_usage = usage_per_entry [ids_name ]
174+ occurrence_usage = usage_per_occurrence [ids_name ]
175+
176+ # Usage statistics of the IDS (# entries with this IDS / # entries)
177+ freq = entry_usage .num_occurrences / num_entries
178+ writer .writerow ([ids_name , "" , "" , freq , "" ])
179+
180+ for path , entry_count in sorted (entry_usage .path_counter .items ()):
181+ if "_error_" in path :
182+ continue # Skip error nodes
183+ occurrence_count = occurrence_usage .path_counter [path ]
184+
185+ uses_error = f"{ path } _error_upper" in entry_usage .path_counter
186+ # Frequency without occurrences, see GH#84 for details
187+ freq1 = entry_count / entry_usage .num_occurrences
188+ # Frequency with occurences
189+ freq2 = occurrence_count / occurrence_usage .num_occurrences
190+
191+ # Write data row
192+ writer .writerow ([ids_name , path , "X" if uses_error else "" , freq1 , freq2 ])
193+
194+
# Help text for the --csv option of process-db-analysis; kept as a module
# constant so the click decorator line stays short.
_csv_help_text = (
    "Write analysis output to the provided CSV file. For details, "
    "see https://github.com/iterorganization/IMAS-Python/issues/84."
)
199+
200+
142201@click .command ("process-db-analysis" )
143202@click .argument (
144203 "infiles" , metavar = "INPUT_FILES..." , nargs = - 1 , type = infile_path , required = True
145204)
146205@click .option ("--show-empty-ids" , is_flag = True , help = "Show empty IDSs in the overview." )
147- def process_db_analysis (infiles , show_empty_ids ):
206+ @click .option ("--csv" , type = outfile_path , help = _csv_help_text )
207+ def process_db_analysis (infiles , show_empty_ids , csv ):
148208 """Process supplied Data Entry analyses, and display statistics.
149209
150210 \b
@@ -153,9 +213,10 @@ def process_db_analysis(infiles, show_empty_ids):
153213 """
154214 setup_rich_log_handler (False )
155215
156- factory = imas .IDSFactory ()
157- filled_per_ids = {ids_name : set () for ids_name in factory .ids_names ()}
158- logger .info ("Using Data Dictionary version %s." , factory .dd_version )
216+ usage_per_entry = defaultdict (_PathUsage )
217+ usage_per_occurrence = defaultdict (_PathUsage )
218+ num_entries = 0
219+
159220 logger .info ("Reading %d input files..." , len (infiles ))
160221
161222 # Read input data and collate usage info per IDS
@@ -164,17 +225,51 @@ def process_db_analysis(infiles, show_empty_ids):
164225 data = json .load (file )
165226
166227 for entry in data :
228+ usage_for_this_entry = defaultdict (_PathUsage )
167229 for ids_info in entry ["ids_info" ]:
168- fill_info = filled_per_ids [ids_info ["name" ]]
169- fill_info .update (ids_info ["filled_data" ])
230+ ids_name = ids_info ["name" ]
231+ filled_paths = ids_info ["filled_data" ]
232+ # Update counters for this entry
233+ usage_for_this_entry [ids_name ].path_counter .update (filled_paths )
234+ # Update counters for all occurrecnes
235+ usage_per_occurrence [ids_name ].num_occurrences += 1
236+ usage_per_occurrence [ids_name ].path_counter .update (filled_paths )
237+ # Update data entry usage
238+ for ids_name , usage in usage_for_this_entry .items ():
239+ usage_per_entry [ids_name ].num_occurrences += 1
240+ usage_per_entry [ids_name ].path_counter .update (usage .path_counter .keys ())
241+ num_entries += 1
170242
171243 logger .info ("Done reading input files." )
244+
245+ if csv is not None :
246+ # Output to CSV file
247+ logger .info ("Writing output to CSV file: %s" , csv )
248+ with open (csv , "w" ) as csvfile :
249+ writer = csvwriter (csvfile )
250+ _write_usage_stats_to_csv (
251+ writer , usage_per_entry , usage_per_occurrence , num_entries
252+ )
253+ logger .info ("Done." )
254+ return
255+
172256 logger .info ("Analyzing filled data..." )
257+ factory = imas .IDSFactory ()
258+ logger .info ("Using Data Dictionary version %s." , factory .dd_version )
173259
174260 # Construct AnalysisNodes per IDS
175261 analysis_nodes : Dict [str , _AnalysisNode ] = {}
176- for ids_name , filled in filled_per_ids .items ():
262+ for ids_name , usage in usage_per_occurrence .items ():
263+ if ids_name not in factory .ids_names ():
264+ logger .warning (
265+ "Founds IDS %s in data files, but this IDS is not available "
266+ "in DD version %s. Statistics will not be tracked." ,
267+ ids_name ,
268+ factory .dd_version ,
269+ )
270+ continue
177271 metadata = factory .new (ids_name ).metadata
272+ filled = set (usage .path_counter .keys ())
178273 ids_analysis_node = _AnalysisNode ("" )
179274
180275 def walk_metadata_tree (metadata : IDSMetadata , node : _AnalysisNode ):
0 commit comments