65 changes: 24 additions & 41 deletions reports/ny_aeba_grid/notebooks/analysis.qmd
@@ -29,48 +29,31 @@ source("/workspaces/reports2/lib/ggplot/switchbox_theme.R")

```{r}

path_nyiso_hourly_load_csv <- "/workspaces/reports2/data/nyiso/hourly/20151025-20251026 NYISO Hourly Actual Load.csv"
path_nyiso_hourly_load_parquet <- "/workspaces/reports2/data/nyiso/hourly/nyiso_hourly_load.parquet"

# Based on the image, the CSV's columns are: Date, Load, Zone (and probably more for time parsing)
# Let's cleanly parse the timestamp, extract date/hour, and aggregate total load (sum) across all zones per hour

# first look for the parquet, fallback to loading and parsing CSV if not found
if (file.exists(path_nyiso_hourly_load_parquet)) {
nyiso_hourly_load <- read_parquet(path_nyiso_hourly_load_parquet)
} else {
nyiso_hourly_load <- read_csv(path_nyiso_hourly_load_csv)

nyiso_hourly_load <- nyiso_hourly_load |>
# Rename columns to lowercase for easier handling if needed
rename(
datetime = Date,
load = Load,
zone = Zone
) |>
# ensure load is numeric
mutate(load = as.numeric(load)) |>
# Parse the datetime (given as e.g. "10/25/2015 7:00:00 PM" -- note double space between date and hour)
mutate(
datetime = lubridate::mdy_hms(
datetime,
tz = "America/New_York",
quiet = TRUE
),
year = lubridate::year(datetime),
month = lubridate::month(datetime),
day = lubridate::day(datetime),
hour = lubridate::hour(datetime)
) |>
select(-datetime)

# save to parquet
write_parquet(
nyiso_hourly_load,
"/workspaces/reports2/data/nyiso/hourly/nyiso_hourly_load.parquet"
)
}
# Read processed NYISO hourly load data from S3
# Set up S3 filesystem with arrow
s3_bucket <- arrow::s3_bucket(
bucket = "data.sb",
region = "us-west-2"
)

s3_file_path <- "ny_aeba_grid/nyiso/hourly/nyiso_hourly_load.parquet"

nyiso_hourly_load <- tryCatch(
{
arrow::read_parquet(s3_bucket$path(s3_file_path))
},
error = function(e) {
cat("\n")
cat("ERROR: Could not read processed NYISO hourly load data from S3.\n")
cat("Bucket: data.sb\n")
cat("Path:", s3_file_path, "\n")
cat("Error details:", conditionMessage(e), "\n\n")
cat("The processed parquet file may not exist yet, or there may be an AWS credentials issue.\n")
cat("Please run the data processing script first:\n")
cat(" Rscript reports/ny_aeba_grid/utils/make_hourly_nyiso_load.R\n\n")
stop("Missing required data file on S3", call. = FALSE)
}
)

# add a "NY_STATE" zone, which is the sum of all zones
nyiso_monthly_peak_load <- nyiso_hourly_load |>
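Note on the AWS-credentials failure mode flagged in the `tryCatch` above: arrow's S3 filesystem resolves credentials through the standard AWS chain (environment variables, shared config/credentials files, instance profiles), and credentials can also be passed explicitly to `arrow::s3_bucket()`. A minimal sketch, assuming the same bucket and region as the notebook (placeholder values only, not part of this PR):

```r
# Option 1: rely on the standard AWS credential chain (env vars / ~/.aws).
Sys.setenv(
  AWS_ACCESS_KEY_ID = "<your-access-key-id>",        # placeholders, not real keys
  AWS_SECRET_ACCESS_KEY = "<your-secret-access-key>",
  AWS_DEFAULT_REGION = "us-west-2"
)
s3_bucket <- arrow::s3_bucket(bucket = "data.sb", region = "us-west-2")

# Option 2: pass credentials explicitly; extra arguments to s3_bucket() are
# forwarded to S3FileSystem$create().
s3_bucket <- arrow::s3_bucket(
  bucket = "data.sb",
  region = "us-west-2",
  access_key = Sys.getenv("AWS_ACCESS_KEY_ID"),
  secret_key = Sys.getenv("AWS_SECRET_ACCESS_KEY")
)
```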
65 changes: 65 additions & 0 deletions reports/ny_aeba_grid/utils/make_hourly_nyiso_load.R
@@ -0,0 +1,65 @@
#!/usr/bin/env Rscript

# Utility script to process NYISO hourly load data from S3
# Reads raw CSV, processes it, and uploads parquet to S3

library(tidyverse)
library(arrow)
library(lubridate)

# Set up S3 filesystem
s3_bucket <- arrow::s3_bucket(
bucket = "data.sb",
region = "us-west-2"
)

# S3 file paths (without bucket prefix)
s3_csv_file <- "ny_aeba_grid/nyiso/hourly/20151025-20251026 NYISO Hourly Actual Load.csv"
s3_parquet_file <- "ny_aeba_grid/nyiso/hourly/nyiso_hourly_load.parquet"

cat("Starting NYISO hourly load data processing...\n")
cat("Reading CSV from S3: data.sb/", s3_csv_file, "\n", sep = "")

# Read CSV from S3
nyiso_hourly_load <- arrow::read_csv_arrow(s3_bucket$path(s3_csv_file))

cat("CSV loaded. Processing data...\n")

# Apply transformations
nyiso_hourly_load <- nyiso_hourly_load |>
# Rename columns to lowercase for easier handling if needed
rename(
datetime = Date,
load = Load,
zone = Zone
) |>
# ensure load is numeric
mutate(load = as.numeric(load)) |>
# Parse the datetime (given as e.g. "10/25/2015 7:00:00 PM" -- note double space between date and hour)
mutate(
datetime = lubridate::mdy_hms(
datetime,
tz = "America/New_York",
quiet = TRUE
),
year = lubridate::year(datetime),
month = lubridate::month(datetime),
day = lubridate::day(datetime),
hour = lubridate::hour(datetime)
) |>
select(-datetime)

cat("Data processed successfully.\n")
cat("Writing parquet to S3: data.sb/", s3_parquet_file, "\n", sep = "")

# Write parquet to S3
arrow::write_parquet(
nyiso_hourly_load,
s3_bucket$path(s3_parquet_file)
)

cat("✓ Parquet file uploaded to S3 successfully!\n")
cat("Summary:\n")
cat(" - Total rows:", nrow(nyiso_hourly_load), "\n")
cat(" - Columns:", paste(names(nyiso_hourly_load), collapse = ", "), "\n")
cat(" - Year range:", min(nyiso_hourly_load$year), "to", max(nyiso_hourly_load$year), "\n")
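A quick read-back check after running the script (a hedged sketch using the same bucket and path as above, not part of this PR) is to open the uploaded parquet from S3 and confirm the row count and columns match the summary the script prints:

```r
library(arrow)

# Re-open the bucket and read back the file the script just wrote.
s3_bucket <- arrow::s3_bucket(bucket = "data.sb", region = "us-west-2")
check <- arrow::read_parquet(
  s3_bucket$path("ny_aeba_grid/nyiso/hourly/nyiso_hourly_load.parquet")
)

cat("Rows read back:", nrow(check), "\n")
cat("Columns:", paste(names(check), collapse = ", "), "\n")
```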
2 changes: 1 addition & 1 deletion reports/ri_hp_rates/notebooks/analysis.qmd
@@ -39,7 +39,7 @@ path_heat_pump_plots <- file.path(path_to_lib, "rates_analysis", "heat_pump_rate
path_create_housing_units <- file.path(path_to_lib, "rates_analysis", "create_sb_housing_units.R")

# Data paths
path_monthly_data <- file.path(path_to_data, "resstock", "2024_release2_tmy3", "load_curve_monthly")
path_monthly_data <- file.path(path_to_data, "resstock", "2024_release2_tmy3_2", "load_curve_monthly")
path_supply_year_metadata_dir <- file.path(path_to_data, "resstock", "2024_release2_tmy3", "metadata")
path_fuel_oil_supply_rates <- file.path(path_to_data, "eia", "heating_oil", "ri_eia_heating_oil_prices_monthly.parquet")
path_propane_supply_rates <- file.path(path_to_data, "eia", "propane", "ri_eia_propane_prices_monthly.parquet")