Skip to content

Commit b188456

Browse files
author
miranov25
committed
special treatment for constants
1 parent 60e26cb commit b188456

File tree

2 files changed

+77
-21
lines changed

2 files changed

+77
-21
lines changed

UTILS/dfextensions/AliasDataFrame.py

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,6 @@ class AliasDataFrame:
1515
"""
1616
A wrapper for pandas DataFrame that supports on-demand computed columns (aliases)
1717
with dependency tracking and persistence.
18-
Example usage:
19-
>>> df = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})
20-
>>> adf = AliasDataFrame(df)
21-
>>> adf.add_alias("z", "x + y")
22-
>>> adf.add_alias("w", "z * 2")
23-
>>> adf.materialize_all()
24-
>>> print(adf.df)
25-
You can also save and load the dataframe along with aliases:
26-
>>> adf.save("mydata")
27-
>>> adf2 = AliasDataFrame.load("mydata")
28-
>>> adf2.describe_aliases()
2918
"""
3019

3120
def __init__(self, df):
@@ -35,10 +24,6 @@ def __init__(self, df):
3524
self.constant_aliases = set() # Optional set of constants that should not be materialized
3625

3726
def add_alias(self, name, expression, dtype=None, is_constant=False):
38-
"""
39-
Add an alias expression to the DataFrame.
40-
Optionally specify output dtype and whether it's a constant (scalar-only).
41-
"""
4227
try:
4328
dummy_env = {k: 1 for k in list(self.df.columns) + list(self.aliases.keys())}
4429
dummy_env.update(self._default_functions())
@@ -142,6 +127,12 @@ def materialize_alias0(self, name, dtype=None):
142127
if name in self.aliases:
143128
local_env = {col: self.df[col] for col in self.df.columns}
144129
local_env.update({k: self.df[k] for k in self.aliases if k in self.df})
130+
for cname in self.constant_aliases:
131+
try:
132+
val = eval(self.aliases[cname], self._default_functions())
133+
local_env[cname] = val
134+
except Exception as e:
135+
print(f"[Alias constant] Failed to evaluate constant '{cname}': {e}")
145136
result = eval(self.aliases[name], self._default_functions(), local_env)
146137
result_dtype = dtype or self.alias_dtypes.get(name)
147138
if result_dtype is not None:
@@ -177,6 +168,12 @@ def visit(n):
177168
continue
178169
local_env = {col: self.df[col] for col in self.df.columns}
179170
local_env.update({k: self.df[k] for k in self.aliases if k in self.df})
171+
for cname in self.constant_aliases:
172+
try:
173+
val = eval(self.aliases[cname], self._default_functions())
174+
local_env[cname] = val
175+
except Exception as e:
176+
print(f"[Alias constant] Failed to evaluate constant '{cname}': {e}")
180177
try:
181178
result = eval(self.aliases[alias], self._default_functions(), local_env)
182179
result_dtype = dtype or self.alias_dtypes.get(alias)
@@ -202,6 +199,9 @@ def materialize_all(self, dtype=None):
202199
try:
203200
local_env = {col: self.df[col] for col in self.df.columns}
204201
local_env.update({k: self.df[k] for k in self.df.columns if k in self.aliases})
202+
for cname in self.constant_aliases:
203+
val = eval(self.aliases[cname], self._default_functions())
204+
local_env[cname] = val
205205
result = eval(self.aliases[name], self._default_functions(), local_env)
206206
result_dtype = dtype or self.alias_dtypes.get(name)
207207
if result_dtype is not None:
@@ -222,19 +222,18 @@ def save(self, path_prefix, dropAliasColumns=True):
222222
else:
223223
cols = list(self.df.columns)
224224

225-
# Save Parquet with metadata
226225
table = pa.Table.from_pandas(self.df[cols])
227226
metadata = {
228227
"aliases": json.dumps(self.aliases),
229-
"dtypes": json.dumps({k: v.__name__ for k, v in self.alias_dtypes.items()})
228+
"dtypes": json.dumps({k: v.__name__ for k, v in self.alias_dtypes.items()}),
229+
"constants": json.dumps(list(self.constant_aliases))
230230
}
231231
existing_meta = table.schema.metadata or {}
232232
combined_meta = existing_meta.copy()
233233
combined_meta.update({k.encode(): v.encode() for k, v in metadata.items()})
234234
table = table.replace_schema_metadata(combined_meta)
235235
pq.write_table(table, f"{path_prefix}.parquet", compression="zstd")
236236

237-
# Also write JSON file for explicit tracking
238237
with open(f"{path_prefix}.aliases.json", "w") as f:
239238
json.dump(metadata, f, indent=2)
240239

@@ -245,17 +244,19 @@ def load(path_prefix):
245244
df = table.to_pandas()
246245
adf = AliasDataFrame(df)
247246

248-
# Try metadata first
249247
meta = table.schema.metadata or {}
250248
if b"aliases" in meta and b"dtypes" in meta:
251249
adf.aliases = json.loads(meta[b"aliases"].decode())
252250
adf.alias_dtypes = {k: getattr(np, v) for k, v in json.loads(meta[b"dtypes"].decode()).items()}
251+
if b"constants" in meta:
252+
adf.constant_aliases = set(json.loads(meta[b"constants"].decode()))
253253
else:
254-
# Fallback to JSON
255254
with open(f"{path_prefix}.aliases.json") as f:
256255
data = json.load(f)
257256
adf.aliases = json.loads(data["aliases"])
258257
adf.alias_dtypes = {k: getattr(np, v) for k, v in json.loads(data["dtypes"]).items()}
258+
if "constants" in data:
259+
adf.constant_aliases = set(json.loads(data["constants"]))
259260

260261
return adf
261262

@@ -272,7 +273,13 @@ def export_tree(self, filename, treename="tree", dropAliasColumns=True):
272273
f = ROOT.TFile.Open(filename, "UPDATE")
273274
tree = f.Get(treename)
274275
for alias, expr in self.aliases.items():
275-
tree.SetAlias(alias, expr)
276+
expr_str = expr
277+
try:
278+
val = float(expr)
279+
expr_str = f"({val}+0)"
280+
except Exception:
281+
pass
282+
tree.SetAlias(alias, expr_str)
276283
tree.Write("", ROOT.TObject.kOverwrite)
277284
f.Close()
278285

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import unittest
2+
import pandas as pd
3+
import numpy as np
4+
from AliasDataFrame import AliasDataFrame # Adjust this if you're using a different import method
5+
6+
class TestAliasDataFrame(unittest.TestCase):
7+
def setUp(self):
8+
df = pd.DataFrame({
9+
"x": np.arange(5),
10+
"y": np.arange(5, 10),
11+
"CTPLumi_countsFV0": np.array([2000, 2100, 2200, 2300, 2400])
12+
})
13+
self.adf = AliasDataFrame(df)
14+
15+
def test_basic_alias(self):
16+
self.adf.add_alias("z", "x + y")
17+
self.adf.materialize_alias("z")
18+
expected = self.adf.df["x"] + self.adf.df["y"]
19+
pd.testing.assert_series_equal(self.adf.df["z"], expected, check_names=False)
20+
21+
def test_dtype(self):
22+
self.adf.add_alias("z", "x + y", dtype=np.float16)
23+
self.adf.materialize_alias("z")
24+
self.assertEqual(self.adf.df["z"].dtype, np.float16)
25+
26+
def test_constant(self):
27+
self.adf.add_alias("c", "42.0", dtype=np.float32, is_constant=True)
28+
self.adf.add_alias("z", "x + c")
29+
self.adf.materialize_alias("z")
30+
expected = self.adf.df["x"] + 42.0
31+
pd.testing.assert_series_equal(self.adf.df["z"], expected, check_names=False)
32+
33+
def test_dependency_order(self):
34+
self.adf.add_alias("a", "x + y")
35+
self.adf.add_alias("b", "a * 2")
36+
self.adf.materialize_alias("b")
37+
expected = (self.adf.df["x"] + self.adf.df["y"]) * 2
38+
pd.testing.assert_series_equal(self.adf.df["b"], expected, check_names=False)
39+
40+
def test_log_rate_with_constant(self):
41+
median = self.adf.df["CTPLumi_countsFV0"].median()
42+
self.adf.add_alias("countsFV0_median", f"{median}", dtype=np.float16, is_constant=True)
43+
self.adf.add_alias("logRate", "log(CTPLumi_countsFV0/countsFV0_median)", dtype=np.float16)
44+
self.adf.materialize_alias("logRate")
45+
expected = np.log(self.adf.df["CTPLumi_countsFV0"] / median).astype(np.float16)
46+
pd.testing.assert_series_equal(self.adf.df["logRate"], expected, check_names=False)
47+
48+
if __name__ == "__main__":
49+
unittest.main()

0 commit comments

Comments
 (0)