Skip to content

Commit b8e241e

Browse files
author
miranov25
committed
Save aliases directly to pyarrow metadata
1 parent 54de3fd commit b8e241e

File tree

1 file changed

+34
-8
lines changed

1 file changed

+34
-8
lines changed

UTILS/dfextensions/AliasDataFrame.py

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -191,23 +191,49 @@ def materialize_all(self, dtype=None):
191191
print(f"Failed to materialize {name}: {e}")
192192

193193
def save(self, path_prefix, dropAliasColumns=True):
194+
import pyarrow as pa
195+
import pyarrow.parquet as pq
196+
194197
if dropAliasColumns:
195198
cols = [c for c in self.df.columns if c not in self.aliases]
196199
else:
197200
cols = list(self.df.columns)
198-
self.df[cols].to_parquet(f"{path_prefix}.parquet", compression="zstd")
201+
202+
# Save Parquet with metadata
203+
table = pa.Table.from_pandas(self.df[cols])
204+
metadata = {
205+
"aliases": json.dumps(self.aliases),
206+
"dtypes": json.dumps({k: v.__name__ for k, v in self.alias_dtypes.items()})
207+
}
208+
existing_meta = table.schema.metadata or {}
209+
combined_meta = existing_meta.copy()
210+
combined_meta.update({k.encode(): v.encode() for k, v in metadata.items()})
211+
table = table.replace_schema_metadata(combined_meta)
212+
pq.write_table(table, f"{path_prefix}.parquet", compression="zstd")
213+
214+
# Also write JSON file for explicit tracking
199215
with open(f"{path_prefix}.aliases.json", "w") as f:
200-
json.dump({"aliases": self.aliases, "dtypes": {k: str(v) for k, v in self.alias_dtypes.items()}}, f, indent=2)
216+
json.dump(metadata, f, indent=2)
201217

202218
@staticmethod
203219
def load(path_prefix):
204-
df = pd.read_parquet(f"{path_prefix}.parquet")
205-
with open(f"{path_prefix}.aliases.json") as f:
206-
data = json.load(f)
220+
import pyarrow.parquet as pq
221+
table = pq.read_table(f"{path_prefix}.parquet")
222+
df = table.to_pandas()
207223
adf = AliasDataFrame(df)
208-
adf.aliases = data["aliases"]
209-
if "dtypes" in data:
210-
adf.alias_dtypes = {k: getattr(np, v) for k, v in data["dtypes"].items()}
224+
225+
# Try metadata first
226+
meta = table.schema.metadata or {}
227+
if b"aliases" in meta and b"dtypes" in meta:
228+
adf.aliases = json.loads(meta[b"aliases"].decode())
229+
adf.alias_dtypes = {k: getattr(np, v) for k, v in json.loads(meta[b"dtypes"].decode()).items()}
230+
else:
231+
# Fallback to JSON
232+
with open(f"{path_prefix}.aliases.json") as f:
233+
data = json.load(f)
234+
adf.aliases = json.loads(data["aliases"])
235+
adf.alias_dtypes = {k: getattr(np, v) for k, v in json.loads(data["dtypes"]).items()}
236+
211237
return adf
212238

213239
def export_tree(self, filename, treename="tree", dropAliasColumns=True):

0 commit comments

Comments
 (0)