Skip to content

Commit 8337e9a

Browse files
author
miranov25
committed
AliasDataFrame is a small utility that extends pandas.DataFrame functionality by enabling:
* **Lazy evaluation of derived columns via named aliases** * **Automatic dependency resolution across aliases** * **Persistence via Parquet + JSON or ROOT TTree (via `uproot` + `PyROOT`)** * **ROOT-compatible TTree export/import including alias metadata**
1 parent e6940e9 commit 8337e9a

File tree

2 files changed

+286
-0
lines changed

2 files changed

+286
-0
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
### `AliasDataFrame` – A Lightweight Wrapper for Pandas with Alias Support
2+
3+
`AliasDataFrame` is a small utility that extends `pandas.DataFrame` functionality by enabling:
4+
5+
* **Lazy evaluation of derived columns via named aliases**
6+
* **Automatic dependency resolution across aliases**
7+
* **Persistence via Parquet + JSON or ROOT TTree (via `uproot` + `PyROOT`)**
8+
* **ROOT-compatible TTree export/import including alias metadata**
9+
10+
---
11+
12+
#### 🔧 Example Usage
13+
14+
```python
15+
import pandas as pd
16+
from AliasDataFrame import AliasDataFrame
17+
18+
# Base DataFrame
19+
df = pd.DataFrame({"x": [1, 2], "y": [10, 20]})
20+
adf = AliasDataFrame(df)
21+
22+
# Add aliases (on-demand expressions)
23+
adf.add_alias("z", "x + y")
24+
adf.add_alias("w", "z * 2")
25+
26+
# Materialize evaluated columns
27+
adf.materialize_all()
28+
print(adf.df)
29+
```
30+
31+
---
32+
33+
#### 📦 Persistence
34+
35+
##### Save to Parquet + Aliases JSON:
36+
37+
```python
38+
adf.save("mydata")
39+
```
40+
41+
##### Load from disk:
42+
43+
```python
44+
adf2 = AliasDataFrame.load("mydata")
45+
adf2.describe_aliases()
46+
```
47+
48+
---
49+
50+
#### 🌲 ROOT TTree Support
51+
52+
##### Export to `.root` with aliases:
53+
54+
```python
55+
adf.export_tree("mytree.root", treename="myTree", dropAliasColumns=True)
56+
```
57+
58+
This uses `uproot` for writing columns and `PyROOT` to set alias metadata via `TTree::SetAlias`.
59+
60+
##### Read `.root` file back:
61+
62+
```python
63+
adf2 = adf.read_tree("mytree.root", treename="myTree")
64+
```
65+
66+
---
67+
68+
#### 🔍 Introspection
69+
70+
```python
71+
adf.describe_aliases()
72+
```
73+
74+
Outputs:
75+
76+
* Defined aliases
77+
* Broken/inconsistent aliases
78+
* Dependency graph
79+
80+
---
81+
82+
#### 🧠 Notes
83+
84+
* Dependencies across aliases are auto-resolved via topological sort.
85+
* Cycles in alias definitions are detected and reported.
86+
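* Aliases are **not materialized** by default; an alias is written to `.parquet` only if it has been materialized into a column before saving (the alias expressions themselves are always stored in the JSON file).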
87+
* `float16` columns are auto-upcast to `float32` for ROOT compatibility.
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
"""AliasDataFrame.py
2+
import sys, os; sys.path.insert(1, os.environ["O2DPG"] + "/UTILS/dfextensions")
3+
from AliasDataFrame import *
4+
Utility extension of the pandas DataFrame that supports on-demand computed columns (aliases).
5+
"""
6+
7+
import pandas as pd
8+
import numpy as np
9+
import json
10+
import os
11+
import uproot
12+
13+
class AliasDataFrame:
    """
    A wrapper for pandas DataFrame that supports on-demand computed columns (aliases)
    with dependency tracking and persistence.

    Attributes:
        df: the wrapped pandas DataFrame; materialized aliases are added to it as columns.
        aliases: dict mapping alias name -> expression string. Expressions are evaluated
            with ``eval`` using the DataFrame columns as variables.

    NOTE(review): alias expressions are executed with ``eval``. Alias definitions loaded
    from JSON or ROOT files must therefore come from a trusted source.

    Example usage:
    >>> import pandas as pd
    >>> df = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})
    >>> adf = AliasDataFrame(df)
    >>> adf.add_alias("z", "x + y")
    >>> adf.add_alias("w", "z * 2")
    >>> adf.materialize_all()
    >>> print(adf.df)
    You can also save and load the dataframe along with aliases:
    >>> adf.save("mydata")
    >>> adf2 = AliasDataFrame.load("mydata")
    >>> adf2.describe_aliases()
    """

    def __init__(self, df):
        """Wrap *df* (kept by reference, not copied) with an empty alias table."""
        self.df = df
        self.aliases = {}

    def add_alias(self, name, expression):
        """Register (or overwrite) alias *name* defined by the expression string."""
        self.aliases[name] = expression

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _identifiers(expression):
        """Return the identifier-like tokens of *expression* in order of appearance.

        A regular expression is used instead of splitting on a fixed operator list so
        that every operator (``>``, ``%``, ``**``, ``,`` ...) separates names. Names
        directly preceded by a dot (attribute access) or by a digit (exponent part of
        a number such as ``1e5``) are skipped.
        """
        import re  # local import, consistent with the other function-scope imports here

        return re.findall(r"(?<![\w.])[A-Za-z_]\w*", expression)

    def _alias_dependencies(self, name):
        """Return the aliases referenced by alias *name*, unique, in order of appearance."""
        found = []
        for token in self._identifiers(self.aliases[name]):
            if token in self.aliases and token not in found:
                found.append(token)
        return found

    def _dependency_order(self):
        """Order aliases so that dependencies come first (Kahn's algorithm).

        Returns:
            (ordered, unresolved): *ordered* lists the aliases that can be evaluated,
            dependencies first; *unresolved* lists (in definition order) the aliases
            that are part of, or depend on, a dependency cycle.
        """
        from collections import deque

        deps = {name: self._alias_dependencies(name) for name in self.aliases}
        indegree = {name: len(required) for name, required in deps.items()}
        # Lists (not sets) keep the resulting order deterministic.
        dependents = {name: [] for name in self.aliases}
        for name, required in deps.items():
            for dep in required:
                dependents[dep].append(name)

        queue = deque(name for name in self.aliases if indegree[name] == 0)
        ordered = []
        while queue:
            node = queue.popleft()
            ordered.append(node)
            for dependent in dependents[node]:
                indegree[dependent] -= 1
                if indegree[dependent] == 0:
                    queue.append(dependent)

        resolved = set(ordered)
        unresolved = [name for name in self.aliases if name not in resolved]
        return ordered, unresolved

    def _materialize(self, name):
        """Evaluate alias *name* against the current columns and store it in ``self.df``."""
        env = {col: self.df[col] for col in self.df.columns}
        # NOTE(review): eval on the alias expression -- trusted input only.
        self.df[name] = eval(self.aliases[name], {}, env)

    # ------------------------------------------------------------------
    # Dependency handling
    # ------------------------------------------------------------------
    def _resolve_dependencies(self):
        """Return a mapping alias -> set of aliases its expression refers to.

        Only aliases that have at least one dependency appear as keys.
        """
        from collections import defaultdict

        dependencies = defaultdict(set)
        for name in self.aliases:
            for dep in self._alias_dependencies(name):
                dependencies[name].add(dep)
        return dependencies

    def _topological_sort(self):
        """Return all alias names ordered so that dependencies precede dependents.

        Raises:
            ValueError: if the alias definitions contain a dependency cycle.
        """
        ordered, unresolved = self._dependency_order()
        if unresolved:
            raise ValueError("Cycle detected in alias dependencies")
        return ordered

    # ------------------------------------------------------------------
    # Introspection
    # ------------------------------------------------------------------
    def validate_aliases(self):
        """Return the names of aliases that cannot be evaluated, in definition order.

        Aliases are evaluated in dependency order inside a temporary environment, so
        an alias that refers to another (not yet materialized) alias is not reported
        as broken. Aliases involved in a dependency cycle are reported as broken.
        ``self.df`` is not modified.
        """
        ordered, unresolved = self._dependency_order()
        failed = set(unresolved)
        env = {col: self.df[col] for col in self.df.columns}
        for name in ordered:
            try:
                # NOTE(review): eval on the alias expression -- trusted input only.
                env[name] = eval(self.aliases[name], {}, env)
            except Exception:
                failed.add(name)
        return [name for name in self.aliases if name in failed]

    def describe_aliases(self):
        """Print the alias definitions, the broken aliases (if any) and the dependencies."""
        print("Aliases:")
        for name, expr in self.aliases.items():
            print(f"  {name}: {expr}")

        broken = self.validate_aliases()
        if broken:
            print("\nBroken Aliases:")
            for name in broken:
                print(f"  {name}")

        print("\nDependencies:")
        deps = self._resolve_dependencies()
        for k, v in deps.items():
            print(f"  {k}: {sorted(v)}")

    # ------------------------------------------------------------------
    # Materialization
    # ------------------------------------------------------------------
    def materialize_alias0(self, name):
        """Materialize a single alias without resolving its alias dependencies.

        Does nothing if *name* is not a known alias. Unlike ``materialize_alias``,
        evaluation errors propagate to the caller.
        """
        if name in self.aliases:
            self._materialize(name)

    def materialize_alias(self, name, cleanTemporary=False):
        """Materialize alias *name* together with the aliases it depends on.

        Args:
            name: alias to materialize; unknown names are silently ignored.
            cleanTemporary: if True, dependency aliases that were created only for this
                call (i.e. were not columns beforehand) are dropped again afterwards.

        Evaluation failures are reported with ``print`` and do not raise.
        """
        if name not in self.aliases:
            return

        to_materialize = []
        visited = set()

        def visit(alias):
            # Depth-first post-order: dependencies are appended before the alias itself.
            # The visited set also protects against infinite recursion on cycles.
            if alias in visited:
                return
            visited.add(alias)
            for dep in self._alias_dependencies(alias):
                visit(dep)
            to_materialize.append(alias)

        visit(name)

        # Track which ones were newly created
        original_columns = set(self.df.columns)

        for alias in to_materialize:
            try:
                self._materialize(alias)
            except Exception as e:
                print(f"Failed to materialize {alias}: {e}")

        if cleanTemporary:
            # Only drop columns that actually exist: a dependency whose evaluation
            # failed above was never created and must not be dropped.
            temporary = [
                alias for alias in to_materialize
                if alias != name
                and alias not in original_columns
                and alias in self.df.columns
            ]
            if temporary:
                self.df.drop(columns=temporary, inplace=True)

    def materialize_all(self):
        """Materialize every alias in dependency order.

        Evaluation failures are reported with ``print`` and do not raise.

        Raises:
            ValueError: if the alias definitions contain a dependency cycle.
        """
        for name in self._topological_sort():
            try:
                self._materialize(name)
            except Exception as e:
                print(f"Failed to materialize {name}: {e}")

    # ------------------------------------------------------------------
    # Persistence: Parquet + JSON
    # ------------------------------------------------------------------
    def save(self, path_prefix):
        """Write ``<path_prefix>.parquet`` (the DataFrame) and ``<path_prefix>.aliases.json``."""
        self.df.to_parquet(f"{path_prefix}.parquet", compression="zstd")
        with open(f"{path_prefix}.aliases.json", "w") as f:
            json.dump(self.aliases, f, indent=2)

    @staticmethod
    def load(path_prefix):
        """Load an AliasDataFrame previously written by ``save``."""
        df = pd.read_parquet(f"{path_prefix}.parquet")
        with open(f"{path_prefix}.aliases.json") as f:
            aliases = json.load(f)
        adf = AliasDataFrame(df)
        adf.aliases = aliases
        return adf

    # ------------------------------------------------------------------
    # Persistence: ROOT TTree
    # ------------------------------------------------------------------
    def export_tree(self, filename, treename="tree", dropAliasColumns=True):
        """Write the DataFrame to a ROOT TTree and attach the aliases via ``TTree::SetAlias``.

        Args:
            filename: output ROOT file (recreated).
            treename: name of the tree inside the file.
            dropAliasColumns: if True, columns whose name is an alias are not written
                as branches (they remain available through the alias definition).

        Raises:
            ValueError: if the freshly written tree cannot be found again by ROOT.
        """
        if dropAliasColumns:
            export_cols = [col for col in self.df.columns if col not in self.aliases]
        else:
            export_cols = list(self.df.columns)
        # Convert float16 columns to float32 for ROOT compatibility
        dtype_casts = {col: np.float32 for col in export_cols if self.df[col].dtype == np.float16}
        export_df = self.df[export_cols].astype(dtype_casts)

        with uproot.recreate(filename) as f:
            f[treename] = export_df

        if not self.aliases:
            # Nothing to attach: the uproot output is complete and PyROOT is not needed.
            return

        import ROOT
        root_file = ROOT.TFile.Open(filename, "UPDATE")
        if not root_file or root_file.IsZombie():
            raise OSError(f"Cannot open ROOT file '{filename}' for update")
        try:
            tree = root_file.Get(treename)
            if not tree:
                raise ValueError(f"Tree '{treename}' not found in file '{filename}'")
            for alias, expr in self.aliases.items():
                tree.SetAlias(alias, expr)
            tree.Write("", ROOT.TObject.kOverwrite)
        finally:
            # Close the file even if setting or writing the aliases fails.
            root_file.Close()

    def read_tree(self, filename, treename="tree"):
        """Read a TTree into a new AliasDataFrame, restoring the aliases stored in the tree.

        The branches are read with ``uproot``; the alias definitions are read with PyROOT.
        The instance the method is called on is not modified.

        Raises:
            OSError: if the file cannot be opened by ROOT.
            ValueError: if the tree is not found in the file.
        """
        import ROOT  # function-scope import, as in export_tree

        with uproot.open(filename) as f:
            df = f[treename].arrays(library="pd")
        adf = AliasDataFrame(df)

        # Aliases are only read, so the file is opened read-only.
        root_file = ROOT.TFile.Open(filename, "READ")
        if not root_file or root_file.IsZombie():
            raise OSError(f"Cannot open ROOT file '{filename}'")
        try:
            tree = root_file.Get(treename)
            if not tree:
                raise ValueError(f"Tree '{treename}' not found in file '{filename}'")
            alias_list = tree.GetListOfAliases()
            # The list is a null pointer when the tree has no aliases.
            if alias_list:
                for alias in alias_list:
                    adf.aliases[alias.GetName()] = alias.GetTitle()
        finally:
            root_file.Close()
        return adf

0 commit comments

Comments
 (0)