Skip to content

Commit 4a7d520

Browse files
author
miranov25
committed
Add dtype support and alias dependency graph to AliasDataFrame
- Allow an optional dtype per alias via `add_alias(..., dtype=...)`
- Enable a global dtype override in `materialize_alias` and `materialize_all`
- Add `plot_alias_dependencies()` for visualizing alias dependencies
- Improve alias validation with support for numpy/math functions
1 parent 1ba0686 commit 4a7d520

File tree

1 file changed

+54
-13
lines changed

1 file changed

+54
-13
lines changed

UTILS/dfextensions/AliasDataFrame.py

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33
from AliasDataFrame import *
44
Utility helpers: an extension of the pandas DataFrame that supports on-demand computed columns (aliases)
55
"""
6-
76
import pandas as pd
87
import numpy as np
98
import json
109
import uproot
1110
import ROOT # type: ignore
11+
import matplotlib.pyplot as plt
12+
import networkx as nx
13+
1214
class AliasDataFrame:
1315
"""
1416
A wrapper for pandas DataFrame that supports on-demand computed columns (aliases)
@@ -29,9 +31,25 @@ class AliasDataFrame:
2931
def __init__(self, df):
3032
self.df = df
3133
self.aliases = {}
34+
self.alias_dtypes = {} # Optional output types for each alias
3235

33-
def add_alias(self, name, expression):
36+
def add_alias(self, name, expression, dtype=None):
    """Register an alias expression, optionally with an output dtype.

    The expression is trial-evaluated with every existing column and alias
    bound to the scalar ``1``. A failure only prints a warning; the alias is
    registered either way.

    Args:
        name: Name of the alias (the column it produces when materialized).
        expression: Python expression string over column/alias names and the
            helper functions returned by ``_default_functions``.
        dtype: Optional dtype stored for this alias in ``alias_dtypes``.
    """
    try:
        # Helper functions go in as globals and the dummy columns as locals,
        # so a column that shares its name with a helper (e.g. "sign") wins,
        # exactly as it does when the alias is materialized.
        functions = self._default_functions()
        dummy_env = {k: 1 for k in list(self.df.columns) + list(self.aliases)}
        # NOTE(security): eval runs arbitrary Python; expressions must come
        # from a trusted source.
        eval(expression, functions, dummy_env)
    except Exception as e:
        # Best-effort validation only: scalar stand-ins cannot emulate every
        # column operation, so this is a warning rather than an error.
        print(f"[Alias add warning] '{name}' may be invalid: {e}")
    self.aliases[name] = expression
    if dtype is not None:
        self.alias_dtypes[name] = dtype
46+
47+
def _default_functions(self):
48+
import math
49+
env = {k: getattr(math, k) for k in dir(math) if not k.startswith("_")}
50+
env.update({k: getattr(np, k) for k in dir(np) if not k.startswith("_")})
51+
env["np"] = np
52+
return env
3553

3654
def _resolve_dependencies(self):
3755
from collections import defaultdict
@@ -44,6 +62,18 @@ def _resolve_dependencies(self):
4462
dependencies[name].add(token)
4563
return dependencies
4664

65+
def plot_alias_dependencies(self):
    """Draw the alias dependency graph and display it with matplotlib.

    Each dependency reported by ``_resolve_dependencies`` becomes a directed
    edge pointing from the dependency to the alias that uses it.
    """
    deps = self._resolve_dependencies()
    graph = nx.DiGraph()
    graph.add_edges_from(
        (dependency, alias)
        for alias, needed in deps.items()
        for dependency in needed
    )
    layout = nx.spring_layout(graph)
    plt.figure(figsize=(10, 6))
    draw_options = {
        "with_labels": True,
        "node_color": "lightblue",
        "edge_color": "gray",
        "node_size": 2000,
        "font_size": 10,
        "arrows": True,
    }
    nx.draw(graph, layout, **draw_options)
    plt.title("Alias Dependency Graph")
    plt.show()
76+
4777
def _topological_sort(self):
4878
from collections import defaultdict, deque
4979

@@ -76,7 +106,9 @@ def validate_aliases(self):
76106
broken = []
77107
for name, expr in self.aliases.items():
78108
try:
79-
eval(expr, {}, self.df)
109+
local_env = {col: self.df[col] for col in self.df.columns}
110+
local_env.update({k: self.df[k] for k in self.aliases if k in self.df})
111+
eval(expr, self._default_functions(), local_env)
80112
except Exception:
81113
broken.append(name)
82114
return broken
@@ -97,12 +129,17 @@ def describe_aliases(self):
97129
for k, v in deps.items():
98130
print(f" {k}: {sorted(v)}")
99131

100-
def materialize_alias0(self, name):
132+
def materialize_alias0(self, name, dtype=None):
    """Evaluate one alias and store the result as a column of ``self.df``.

    Only the named alias is evaluated; anything its expression refers to
    must already be a column of ``self.df``. Names that are not registered
    aliases are ignored.

    Args:
        name: Alias to materialize.
        dtype: Optional output dtype. When given it takes precedence over the
            dtype registered for the alias in ``alias_dtypes``.
    """
    if name not in self.aliases:
        return
    # Columns are the eval locals, so they shadow same-named helper
    # functions, which are supplied as globals.
    local_env = {col: self.df[col] for col in self.df.columns}
    # NOTE(security): eval runs arbitrary Python; alias expressions must come
    # from a trusted source.
    result = eval(self.aliases[name], self._default_functions(), local_env)
    # Explicit None check: a caller-supplied dtype must never be dropped just
    # because the dtype object happens to evaluate as falsy.
    result_dtype = dtype if dtype is not None else self.alias_dtypes.get(name)
    if result_dtype is not None:
        if not hasattr(result, "astype"):
            # Constant expressions (e.g. "1") yield plain Python scalars with
            # no astype(); broadcast them over the frame's index first.
            result = pd.Series(result, index=self.df.index)
        result = result.astype(result_dtype)
    self.df[name] = result
141+
142+
def materialize_alias(self, name, cleanTemporary=False, dtype=None):
106143
if name not in self.aliases:
107144
return
108145
to_materialize = []
@@ -120,14 +157,17 @@ def visit(n):
120157

121158
visit(name)
122159

123-
# Track which ones were newly created
124160
original_columns = set(self.df.columns)
125161

126162
for alias in to_materialize:
127163
local_env = {col: self.df[col] for col in self.df.columns}
128164
local_env.update({k: self.df[k] for k in self.aliases if k in self.df})
129165
try:
130-
self.df[alias] = eval(self.aliases[alias], {}, local_env)
166+
result = eval(self.aliases[alias], self._default_functions(), local_env)
167+
result_dtype = dtype or self.alias_dtypes.get(alias)
168+
if result_dtype is not None:
169+
result = result.astype(result_dtype)
170+
self.df[alias] = result
131171
except Exception as e:
132172
print(f"Failed to materialize {alias}: {e}")
133173

@@ -136,14 +176,17 @@ def visit(n):
136176
if alias != name and alias not in original_columns:
137177
self.df.drop(columns=[alias], inplace=True)
138178

139-
140-
def materialize_all(self):
179+
def materialize_all(self, dtype=None):
    """Materialize every alias as a column of ``self.df``.

    Aliases are evaluated in the order returned by ``_topological_sort``.
    An alias that fails is reported on stdout and skipped; the remaining
    aliases are still attempted.

    Args:
        dtype: Optional dtype applied to every result. When given it takes
            precedence over the per-alias dtypes in ``alias_dtypes``.
    """
    # The helper namespace does not depend on the alias; build it once.
    functions = self._default_functions()
    for name in self._topological_sort():
        try:
            # Rebuilt per alias so columns created earlier in this loop are
            # visible to the aliases that follow.
            local_env = {col: self.df[col] for col in self.df.columns}
            # NOTE(security): eval runs arbitrary Python; alias expressions
            # must come from a trusted source.
            result = eval(self.aliases[name], functions, local_env)
            result_dtype = dtype if dtype is not None else self.alias_dtypes.get(name)
            if result_dtype is not None:
                if not hasattr(result, "astype"):
                    # Constant expressions yield plain Python scalars with no
                    # astype(); broadcast them over the frame's index first.
                    result = pd.Series(result, index=self.df.index)
                result = result.astype(result_dtype)
            self.df[name] = result
        except Exception as e:
            # Deliberate best-effort: one broken alias must not stop the rest.
            print(f"Failed to materialize {name}: {e}")
149192

@@ -166,13 +209,11 @@ def export_tree(self, filename, treename="tree", dropAliasColumns=True):
166209
export_cols = [col for col in self.df.columns if col not in self.aliases]
167210
else:
168211
export_cols = list(self.df.columns)
169-
# Convert float16 columns to float32 for ROOT compatibility
170212
dtype_casts = {col: np.float32 for col in export_cols if self.df[col].dtype == np.float16}
171213
export_df = self.df[export_cols].astype(dtype_casts)
172214

173215
with uproot.recreate(filename) as f:
174216
f[treename] = export_df
175-
# Update the ROOT file with aliases
176217
f = ROOT.TFile.Open(filename, "UPDATE")
177218
tree = f.Get(treename)
178219
for alias, expr in self.aliases.items():

0 commit comments

Comments
 (0)