Skip to content

Commit 664db50

Browse files
author
miranov25
committed
Add dependency-aware alias materialization with optional cleanup and verbosity
- Introduced `materialize_aliases(targets, cleanTemporary=True, verbose=False)` method:
  - Builds a dependency graph among defined aliases using NetworkX.
  - Topologically sorts dependencies to ensure correct materialization order.
  - Materializes only the requested aliases and their dependencies.
  - Optionally cleans up intermediate (temporary) columns not in the target list.
  - Includes verbose logging to trace evaluation and cleanup steps.
- Improves memory efficiency and control when working with layered alias chains.
- Ensures robust handling of mixed alias and non-alias columns.
1 parent f77f57c commit 664db50

File tree

1 file changed

+52
-1
lines changed

1 file changed

+52
-1
lines changed

UTILS/dfextensions/AliasDataFrame.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import ROOT # type: ignore
1111
import matplotlib.pyplot as plt
1212
import networkx as nx
13-
13+
import re
1414

1515

1616
def convert_expr_to_root(expr):
@@ -257,6 +257,57 @@ def materialize_all(self, dtype=None):
257257
self.df[name] = result
258258
except Exception as e:
259259
print(f"Failed to materialize {name}: {e}")
260+
def materialize_aliases(self, targets, cleanTemporary=True, verbose=False):
    """Materialize the requested aliases and every alias they depend on.

    A dependency graph is built over ``self.aliases`` by scanning each alias
    expression for word tokens that are themselves alias names.  The aliases
    needed for ``targets`` are evaluated in topological order through
    ``self.materialize_alias``; aliases whose column already exists in
    ``self.df`` are left untouched.

    Args:
        targets: Alias name or iterable of alias names to materialize.
            Names that are not keys of ``self.aliases`` are skipped.
        cleanTemporary: When true, columns that this call added only as
            intermediate dependencies (i.e. not listed in ``targets``) are
            dropped from ``self.df`` afterwards.
        verbose: When true, print a trace of skipped targets, materialized
            aliases and cleaned-up columns.

    Returns:
        list: Names passed to ``self.materialize_alias`` by this call, in
        evaluation order.  This includes temporaries that were subsequently
        dropped because of ``cleanTemporary``.

    Raises:
        networkx.NetworkXUnfeasible: If the aliases required for ``targets``
            depend on each other cyclically.
    """
    # Normalise ``targets`` once: a bare string must be treated as one name
    # (not iterated per character) and a generator must survive the second
    # pass made during cleanup.
    if isinstance(targets, str):
        targets = [targets]
    else:
        targets = list(targets)

    # Step 1: build the dependency graph (edge: dependency -> dependent).
    token_pattern = re.compile(r'\b\w+\b')
    graph = nx.DiGraph()
    # Register every alias as a node so that aliases without any alias
    # dependencies are still part of the graph and get materialized.
    graph.add_nodes_from(self.aliases)
    for alias, expr in self.aliases.items():
        for token in token_pattern.findall(expr):
            if token in self.aliases:
                graph.add_edge(token, alias)

    # Step 2: collect the targets plus their transitive dependencies.
    required = set()
    for target in targets:
        if target not in self.aliases:
            if verbose:
                print(f"[materialize_aliases] Skipping non-alias target: {target}")
            continue
        required |= nx.ancestors(graph, target)
        required.add(target)

    try:
        ordered = list(nx.topological_sort(graph.subgraph(required)))
    except nx.NetworkXUnfeasible as err:
        # Same exception type as before, but say which aliases are involved.
        raise nx.NetworkXUnfeasible(
            f"[materialize_aliases] Cyclic alias dependency among: {sorted(required)}"
        ) from err

    # Step 3: materialize whatever is not yet a column of the frame.
    added = []
    for name in ordered:
        if name in self.df.columns:
            continue
        if verbose:
            print(f"[materialize_aliases] Materializing: {name}")
        self.materialize_alias(name)
        added.append(name)

    # Step 4: optionally drop intermediates that were not requested.
    if cleanTemporary:
        requested = set(targets)
        temporaries = [
            col for col in added
            if col not in requested and col in self.df.columns
        ]
        if verbose:
            for col in temporaries:
                print(f"[materialize_aliases] Cleaning up temporary column: {col}")
        if temporaries:
            self.df.drop(columns=temporaries, inplace=True)

    return added
260311

261312
def save(self, path_prefix, dropAliasColumns=True):
262313
import pyarrow as pa

0 commit comments

Comments
 (0)