1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@
.DS_Store
build/
dist/
*.egg-info/

## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
153 changes: 152 additions & 1 deletion py-src/data_formulator/tables_routes.py
Collaborator:
This is incorrect. We should implement this as a recalculate_derived_data function with two sub-functions:

  1. recalc_derived_data_py(): update a table derived via Python. We provide the list of input tables and the derived table's code, then return the new derived table by applying the Python code to the updated inputs.
  2. recalc_derived_data_sql(): simply rerun the stored query against DuckDB.

The backend decides which one to call based on whether the derived data is virtual. Handle each table update independently: when a table is updated, first find the list of derived tables it affects, then for each affected table gather its actual input data and rerun the code to get the new content. We don't need to do this recursively. (A sketch of the proposed dispatch and the SQL branch follows below.)
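For concreteness, a minimal sketch of the proposed SQL branch and the dispatch, assuming the same db_manager.connection(session['session_id']) / fetchdf() helpers used elsewhere in tables_routes.py; recalc_derived_data_sql and its argument names are hypothetical, not code from this PR:

```python
def recalc_derived_data_sql(derived_table_id, sql_query):
    """Hypothetical sketch of the SQL branch: re-run the derived table's stored
    query against the session's DuckDB database. Assumes the updated base tables
    have already been written back to DuckDB, so re-executing the query picks up
    the new rows."""
    with db_manager.connection(session['session_id']) as db:
        output_df = db.execute(sql_query).fetchdf()
    return {
        'id': derived_table_id,
        'status': 'success',
        'rows': output_df.to_dict(orient='records'),
        'columns': list(output_df.columns),
    }

# Per affected table, the endpoint would dispatch on the virtual flag:
#   result = (recalc_derived_data_sql(...) if derived_info.get('is_virtual')
#             else recalc_derived_data_py(...))
```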

Collaborator:
But for now, let's only do recalc_derived_data_py, since we initially support only Python data refresh. This simplifies the refresh logic for remote/DuckDB data.

Contributor Author:
Refactored into a recalculate_derived_data endpoint backed by a recalc_derived_data_py() function. Virtual/SQL table support is skipped for now and returns a 'skipped' status. Each affected table is processed independently with its actual input data. Fixed in commit f611d1f.

@@ -841,4 +841,155 @@ def data_loader_ingest_data_from_query():
        return jsonify({
            "status": "error",
            "message": safe_msg
        }), status_code


@tables_bp.route('/recalculate-derived-data', methods=['POST'])
def recalculate_derived_data():
    """Recalculate derived data by re-executing Python code on updated base table"""
    try:
        data = request.get_json()

        # Get updated base table data
        updated_table_id = data.get('updated_table_id')
        updated_table_rows = data.get('updated_table_rows')

        if not updated_table_id:
            return jsonify({"status": "error", "message": "No table ID provided"}), 400

        if not updated_table_rows:
            return jsonify({"status": "error", "message": "No updated data provided"}), 400

        # Get list of affected derived tables
        affected_derived_tables = data.get('affected_derived_tables', [])

        if not affected_derived_tables:
            # No derived tables to refresh, just return success
            return jsonify({
                "status": "success",
                "results": []
            })

        results = []

        # Process each affected derived table independently
        for derived_info in affected_derived_tables:
            try:
                derived_table_id = derived_info['id']
                code = derived_info['code']
                source_table_ids = derived_info['source_tables']
                is_virtual = derived_info.get('is_virtual', False)

                # For now, only support Python (non-virtual) tables
                if is_virtual:
                    results.append({
                        'id': derived_table_id,
                        'status': 'skipped',
                        'message': 'Virtual (DuckDB) table refresh not yet supported'
                    })
                    continue

                # Recalculate using Python
                result = recalc_derived_data_py(
                    updated_table_id=updated_table_id,
                    updated_table_rows=updated_table_rows,
                    derived_table_id=derived_table_id,
                    code=code,
                    source_table_ids=source_table_ids
                )

                results.append(result)

            except Exception as e:
                logger.error(f"Error recalculating derived table {derived_info.get('id')}: {str(e)}")
                results.append({
                    'id': derived_info.get('id'),
                    'status': 'error',
                    'message': str(e)
                })

        return jsonify({
            "status": "success",
            "results": results
        })

    except Exception as e:
        logger.error(f"Error recalculating derived data: {str(e)}")
        safe_msg, status_code = sanitize_db_error_message(e)
        return jsonify({
            "status": "error",
            "message": safe_msg
        }), status_code


def recalc_derived_data_py(updated_table_id, updated_table_rows, derived_table_id, code, source_table_ids):
    """
    Recalculate a Python-based derived table using updated input data.

    Args:
        updated_table_id: ID of the table that was updated
        updated_table_rows: New rows for the updated table
        derived_table_id: ID of the derived table to recalculate
        code: Python transformation code
        source_table_ids: List of source table IDs this derived table depends on

    Returns:
        dict with status, rows, and columns
    """
    from data_formulator.py_sandbox import run_transform_in_sandbox2020

    try:
        # Prepare input dataframes
        df_list = []

        for source_id in source_table_ids:
            if source_id == updated_table_id:
                # Use the updated data
                df = pd.DataFrame(updated_table_rows)
            else:
                # Fetch from database or state
                with db_manager.connection(session['session_id']) as db:
                    try:
                        result = db.execute(f"SELECT * FROM {source_id}").fetchdf()
                        df = result
                    except Exception as e:
                        logger.warning(f"Could not fetch table {source_id} from database: {str(e)}")
                        # Table might not be in database yet (in-memory only)
                        return {
                            'id': derived_table_id,
                            'status': 'error',
                            'message': f'Could not fetch source table: {source_id}'
                        }

            df_list.append(df)

        # Execute the transformation code in a subprocess for safety
        exec_result = run_transform_in_sandbox2020(code, df_list, exec_python_in_subprocess=True)

        if exec_result['status'] == 'ok':
            output_df = exec_result['content']

            # Convert to records format efficiently
            rows = output_df.to_dict(orient='records')
            columns = list(output_df.columns)

            return {
                'id': derived_table_id,
                'status': 'success',
                'rows': rows,
                'columns': columns
            }
        else:
            return {
                'id': derived_table_id,
                'status': 'error',
                'message': exec_result['content']
            }

    except Exception as e:
        logger.error(f"Error in recalc_derived_data_py for {derived_table_id}: {str(e)}")
        return {
            'id': derived_table_id,
            'status': 'error',
            'message': str(e)
        }
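For illustration, a sketch of how a client might call this endpoint, mirroring the keys that recalculate_derived_data() reads. The host, port, route prefix, table IDs, and sample rows are assumptions, and the 'code' value is a placeholder for whatever transform code the sandbox expects:

```python
import requests

# Hypothetical payload; key names mirror what recalculate_derived_data() reads.
payload = {
    "updated_table_id": "sales_raw",            # base table whose rows changed
    "updated_table_rows": [                     # its new contents
        {"region": "east", "amount": 120},
        {"region": "west", "amount": 95},
    ],
    "affected_derived_tables": [
        {
            "id": "sales_by_region",            # derived table to refresh
            "code": "<python transform code>",  # placeholder for the stored transform
            "source_tables": ["sales_raw"],     # inputs the transform depends on
            "is_virtual": False,                # virtual (SQL) tables return 'skipped'
        }
    ],
}

# Assumed base URL; the blueprint's actual prefix may differ. A valid session is
# also needed, since recalc_derived_data_py() reads session['session_id'].
resp = requests.post("http://localhost:5000/tables/recalculate-derived-data", json=payload)
print(resp.json())  # expect {"status": "success", "results": [...]}
```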
11 changes: 11 additions & 0 deletions src/app/dfSlice.tsx
Collaborator:
We don't need to update the concept shelf, since this operation requires all columns to stay the same.

Contributor Author:
Removed concept shelf update from updateTableRows action since columns must be identical. Fixed in commit f611d1f.

@@ -475,6 +475,17 @@ export const dataFormulatorSlice = createSlice({
            let attachedMetadata = action.payload.attachedMetadata;
            state.tables = state.tables.map(t => t.id == tableId ? {...t, attachedMetadata} : t);
        },
        updateTableRows: (state, action: PayloadAction<{tableId: string, rows: any[]}>) => {
            let tableId = action.payload.tableId;
            let rows = action.payload.rows;
            state.tables = state.tables.map(t => {
                if (t.id == tableId) {
                    // Update rows while preserving other table properties
                    return {...t, rows};
                }
                return t;
            });
        },
        extendTableWithNewFields: (state, action: PayloadAction<{tableId: string, columnName: string, values: any[], previousName: string | undefined, parentIDs: string[]}>) => {
            // extend the existing extTable with new columns from the new table
            let newValues = action.payload.values;