2 changes: 1 addition & 1 deletion scripts/experiment_effects.py
@@ -307,7 +307,7 @@ def generate_free_shipping_threshold_overlay(
     # Generate overlay data
     overlay_data = generator.generate_experiment_overlay(
         experiment_name='free_shipping_threshold_test_v1_1_1',
-        data_category='orders',WYC
+        data_category='orders',
         granularity='order_id',
         source_table_path='bigquery-public-data.thelook_ecommerce.orders',
         assignments_df=assignments_df
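The only change to this file removes a stray `WYC` accidentally appended after the `data_category` keyword argument; left in place, a bare positional token following a keyword argument would raise a `SyntaxError`.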
25 changes: 17 additions & 8 deletions scripts/generate_synthetic_data.py
@@ -93,10 +93,9 @@ def generate_and_upload_all(self, sample_pct: float = 100.0):
             orders_df = orders_df[orders_df['user_id'].isin(user_ids)]
             logging.info(f"Sampled {sample_pct}% of data: {len(users_df)} users")

-        # Generate synthetic datasets (LEGACY - imports from archives)
+        # Generate experiment assignments using modern method (independent of sampling)
         logging.info("Generating experiment assignments...")
-        from archives.experiment_assignments import generate_experiment_assignments
-        experiments_df = generate_experiment_assignments(users_df)
+        experiments_df = self._generate_free_shipping_threshold_assignments()
         self._upload_dataframe(experiments_df, "experiment_assignments")

         logging.info("Generating logistics data...")
@@ -232,15 +231,25 @@ def _upload_dataframe(self, df: pd.DataFrame, table_name: str):

         table_ref = f"{self.project_id}.{self.dataset_id}.{table_name}"

-        # Configure load job
+        # Convert DataFrame to records for JSON upload (same method as experiment_effects.py)
+        # Convert timestamps to strings for JSON serialization
+        df_copy = df.copy()
+        for col in df_copy.columns:
+            if df_copy[col].dtype == 'datetime64[ns]':
+                df_copy[col] = df_copy[col].dt.strftime('%Y-%m-%d %H:%M:%S')
+
+        records = df_copy.to_dict('records')
+
+        # Configure load job for JSON format
         job_config = bigquery.LoadJobConfig(
             write_disposition="WRITE_TRUNCATE",  # Replace existing data
-            autodetect=True  # Auto-detect schema
+            autodetect=True,  # Auto-detect schema
+            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
         )

-        # Upload data
-        job = self.client.load_table_from_dataframe(
-            df, table_ref, job_config=job_config
+        # Upload data using JSON format
+        job = self.client.load_table_from_json(
+            records, table_ref, job_config=job_config
         )
         job.result()  # Wait for completion

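For reference, the same load path as a self-contained sketch, assuming the `google-cloud-bigquery` and `pandas` packages are installed; the project, dataset, and table identifiers below are placeholders:

```python
import pandas as pd
from google.cloud import bigquery


def upload_df_as_json(client: bigquery.Client, df: pd.DataFrame, table_ref: str) -> None:
    """Load a DataFrame into BigQuery as newline-delimited JSON records."""
    df_copy = df.copy()
    # JSON has no timestamp type, so render datetime columns as strings first.
    for col in df_copy.columns:
        if df_copy[col].dtype == 'datetime64[ns]':
            df_copy[col] = df_copy[col].dt.strftime('%Y-%m-%d %H:%M:%S')

    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_TRUNCATE",  # Replace existing data
        autodetect=True,                     # Infer the schema from the records
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )
    job = client.load_table_from_json(
        df_copy.to_dict('records'), table_ref, job_config=job_config
    )
    job.result()  # Block until the load job completes


# Hypothetical usage; replace the identifiers with real project/dataset names.
client = bigquery.Client(project="my-project")
df = pd.DataFrame({
    "order_id": [1, 2],
    "created_at": pd.to_datetime(["2024-01-01 09:30:00", "2024-01-02 14:05:00"]),
})
upload_df_as_json(client, df, "my-project.my_dataset.orders_demo")
```

One likely motivation for switching from `load_table_from_dataframe` to `load_table_from_json`: the DataFrame route serializes through Parquet and requires `pyarrow`, while the JSON route avoids that dependency and its type-mapping quirks, at the cost of the explicit timestamp-to-string conversion shown above.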