33from AliasDataFrame import *
Utility helpers: an extension of the pandas DataFrame that supports on-demand computed columns (aliases).
55"""
6-
76import pandas as pd
87import numpy as np
98import json
109import uproot
1110import ROOT # type: ignore
11+ import matplotlib .pyplot as plt
12+ import networkx as nx
13+
1214class AliasDataFrame :
1315 """
1416 A wrapper for pandas DataFrame that supports on-demand computed columns (aliases)
@@ -29,9 +31,25 @@ class AliasDataFrame:
2931 def __init__ (self , df ):
3032 self .df = df
3133 self .aliases = {}
34+ self .alias_dtypes = {} # Optional output types for each alias
3235
33- def add_alias (self , name , expression ):
36+ def add_alias (self , name , expression , dtype = None ):
37+ try :
38+ dummy_env = {k : 1 for k in list (self .df .columns ) + list (self .aliases .keys ())}
39+ dummy_env .update (self ._default_functions ())
40+ eval (expression , self ._default_functions (), dummy_env )
41+ except Exception as e :
42+ print (f"[Alias add warning] '{ name } ' may be invalid: { e } " )
3443 self .aliases [name ] = expression
44+ if dtype is not None :
45+ self .alias_dtypes [name ] = dtype
46+
47+ def _default_functions (self ):
48+ import math
49+ env = {k : getattr (math , k ) for k in dir (math ) if not k .startswith ("_" )}
50+ env .update ({k : getattr (np , k ) for k in dir (np ) if not k .startswith ("_" )})
51+ env ["np" ] = np
52+ return env
3553
3654 def _resolve_dependencies (self ):
3755 from collections import defaultdict
@@ -44,6 +62,18 @@ def _resolve_dependencies(self):
4462 dependencies [name ].add (token )
4563 return dependencies
4664
65+ def plot_alias_dependencies (self ):
66+ deps = self ._resolve_dependencies ()
67+ G = nx .DiGraph ()
68+ for alias , subdeps in deps .items ():
69+ for dep in subdeps :
70+ G .add_edge (dep , alias )
71+ pos = nx .spring_layout (G )
72+ plt .figure (figsize = (10 , 6 ))
73+ nx .draw (G , pos , with_labels = True , node_color = 'lightblue' , edge_color = 'gray' , node_size = 2000 , font_size = 10 , arrows = True )
74+ plt .title ("Alias Dependency Graph" )
75+ plt .show ()
76+
4777 def _topological_sort (self ):
4878 from collections import defaultdict , deque
4979
@@ -76,7 +106,9 @@ def validate_aliases(self):
76106 broken = []
77107 for name , expr in self .aliases .items ():
78108 try :
79- eval (expr , {}, self .df )
109+ local_env = {col : self .df [col ] for col in self .df .columns }
110+ local_env .update ({k : self .df [k ] for k in self .aliases if k in self .df })
111+ eval (expr , self ._default_functions (), local_env )
80112 except Exception :
81113 broken .append (name )
82114 return broken
@@ -97,12 +129,17 @@ def describe_aliases(self):
97129 for k , v in deps .items ():
98130 print (f" { k } : { sorted (v )} " )
99131
100- def materialize_alias0 (self , name ):
132+ def materialize_alias0 (self , name , dtype = None ):
101133 if name in self .aliases :
102134 local_env = {col : self .df [col ] for col in self .df .columns }
103135 local_env .update ({k : self .df [k ] for k in self .aliases if k in self .df })
104- self .df [name ] = eval (self .aliases [name ], {}, local_env )
105- def materialize_alias (self , name , cleanTemporary = False ):
136+ result = eval (self .aliases [name ], self ._default_functions (), local_env )
137+ result_dtype = dtype or self .alias_dtypes .get (name )
138+ if result_dtype is not None :
139+ result = result .astype (result_dtype )
140+ self .df [name ] = result
141+
142+ def materialize_alias (self , name , cleanTemporary = False , dtype = None ):
106143 if name not in self .aliases :
107144 return
108145 to_materialize = []
@@ -120,14 +157,17 @@ def visit(n):
120157
121158 visit (name )
122159
123- # Track which ones were newly created
124160 original_columns = set (self .df .columns )
125161
126162 for alias in to_materialize :
127163 local_env = {col : self .df [col ] for col in self .df .columns }
128164 local_env .update ({k : self .df [k ] for k in self .aliases if k in self .df })
129165 try :
130- self .df [alias ] = eval (self .aliases [alias ], {}, local_env )
166+ result = eval (self .aliases [alias ], self ._default_functions (), local_env )
167+ result_dtype = dtype or self .alias_dtypes .get (alias )
168+ if result_dtype is not None :
169+ result = result .astype (result_dtype )
170+ self .df [alias ] = result
131171 except Exception as e :
132172 print (f"Failed to materialize { alias } : { e } " )
133173
@@ -136,14 +176,17 @@ def visit(n):
136176 if alias != name and alias not in original_columns :
137177 self .df .drop (columns = [alias ], inplace = True )
138178
139-
140- def materialize_all (self ):
179+ def materialize_all (self , dtype = None ):
141180 order = self ._topological_sort ()
142181 for name in order :
143182 try :
144183 local_env = {col : self .df [col ] for col in self .df .columns }
145184 local_env .update ({k : self .df [k ] for k in self .df .columns if k in self .aliases })
146- self .df [name ] = eval (self .aliases [name ], {}, local_env )
185+ result = eval (self .aliases [name ], self ._default_functions (), local_env )
186+ result_dtype = dtype or self .alias_dtypes .get (name )
187+ if result_dtype is not None :
188+ result = result .astype (result_dtype )
189+ self .df [name ] = result
147190 except Exception as e :
148191 print (f"Failed to materialize { name } : { e } " )
149192
@@ -166,13 +209,11 @@ def export_tree(self, filename, treename="tree", dropAliasColumns=True):
166209 export_cols = [col for col in self .df .columns if col not in self .aliases ]
167210 else :
168211 export_cols = list (self .df .columns )
169- # Convert float16 columns to float32 for ROOT compatibility
170212 dtype_casts = {col : np .float32 for col in export_cols if self .df [col ].dtype == np .float16 }
171213 export_df = self .df [export_cols ].astype (dtype_casts )
172214
173215 with uproot .recreate (filename ) as f :
174216 f [treename ] = export_df
175- # Update the ROOT file with aliases
176217 f = ROOT .TFile .Open (filename , "UPDATE" )
177218 tree = f .Get (treename )
178219 for alias , expr in self .aliases .items ():
0 commit comments