ercbk
diff --git a/‎NAMESPACE‎
Lines changed: 2 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎R/add-spatial-lags.R‎
Lines changed: 39 additions & 26 deletions b/‎R/add-spatial-lags.R‎
Lines changed: 39 additions & 26 deletions
diff --git a/‎R/skim-arrow.R‎
Lines changed: 201 additions & 0 deletions b/‎R/skim-arrow.R‎
Lines changed: 201 additions & 0 deletions
diff --git a/‎R/test-fable-resids.R‎
Lines changed: 19 additions & 19 deletions b/‎R/test-fable-resids.R‎
Lines changed: 19 additions & 19 deletions
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(print,skim_arrow)
 export("%>%")
 export(":=")
 export(.data)
@@ -18,6 +19,7 @@ export(expr)
 export(get_boot_ci)
 export(prewhitened_ccf)
 export(scale_by_mase)
+export(skim_arrow)
 export(sym)
 export(syms)
 export(test_fable_resids)
 
@@ -87,15 +87,6 @@
 #'
 #' cat(attributes(tib_spat_lags)$summ_wgts_spatlag_1, sep = "\n")
 #'
-#' rlang::check_installed(
-#'   "mirai (>= 2.1.0.9000)",
-#'   action = function(...) {
-#'     remotes::install_version('mirai',
-#'                              version = ">= 2.1.0.9000",
-#'                              repos = c('https://shikokuchuo.r-universe.dev',
-#'                                        'https://cloud.r-project.org'))
-#'   }
-#' )
 #'
 #' library(mirai)
 #'
@@ -116,12 +107,12 @@
 
 
 add_spatial_lags <- function(nblist,
-                             y,
-                             .data,
-                             lags,
-                             type = NULL,
-                             parallel = FALSE,
-                             ...) {
+                              y,
+                              .data,
+                              lags,
+                              type = NULL,
+                              parallel = FALSE,
+                              ...) {
 
   # ---------------- tests ------------------
   # Check if nblist is of class "nb"
@@ -151,15 +142,32 @@ add_spatial_lags <- function(nblist,
   }
   # -----------------------------------------
 
-  get_vec_lags <- function(lag_nb, vec_num, .data, lag, type, ...) {
+  get_vec_lags <- function(lag_nb, vec_num, .data, lag, type, dots) {
 
     # add weights to nb list
     if (is.null(type)) {
+      listw_args <- list(neighbours = lag_nb)
+      if (length(dots) != 0) {
+        listw_args <- append(listw_args, dots)
+      }
       ls_wts <-
-        spdep::nb2listw(lag_nb, ...)
+        do.call(
+          spdep::nb2listw,
+          listw_args
+        )
     } else {
+      listwdist_args <-
+        list(neighbours = lag_nb,
+             x = .data,
+             type = type)
+      if (length(dots) != 0) {
+        listwdist_args <- append(listwdist_args, dots)
+      }
       ls_wts <-
-        spdep::nb2listwdist(lag_nb, .data, type, ...)
+        do.call(
+          spdep::nb2listwdist,
+          listwdist_args
+        )
     }
 
     # get weights summary
@@ -200,20 +208,25 @@ add_spatial_lags <- function(nblist,
       purrr::map2(
         lags_nb,
         1:lags,
-        carrier::crate(
+        purrr::in_parallel(
           \(x1, x2) {
             get_vec_lags(
               x1,
-              !!vec_num,
-              !!.data,
+              vec_num,
+              .data,
               x2,
-              !!type,
-              !!!dots
+              type,
+              dots
             )
           },
-          get_vec_lags = get_vec_lags
+          get_vec_lags = get_vec_lags,
+          vec_num = vec_num,
+          .data = .data,
+          type = type,
+          dots = dots,
+          y = y # not a fun arg, but needed to add to env for glue variable naming
         ),
-        .parallel = TRUE
+        .progress = TRUE
       )
 
   } else {
@@ -229,7 +242,7 @@ add_spatial_lags <- function(nblist,
             .data,
             x2,
             type,
-            ...
+            dots
           )
         }
       )
 
@@ -0,0 +1,201 @@
+#' Skim an Arrow Dataset
+#'
+#' @description
+#' Provides a \{skimr\}-style summary of an Arrow Dataset with statistics
+#' organized by variable type. Computes summary statistics efficiently using
+#' Arrow's query engine without loading the full dataset into memory.
+#'
+#' @param ds An Arrow Dataset object created with `arrow::open_dataset()`.
+#'   This would probably work on any \{arrow\} data object with a schema.
+#'
+#' @return A list of class "skim_arrow". `overview` is always present; each of
+#'   the other elements is included only when the dataset has at least one
+#'   column of that type:
+#'   \item{overview}{A tibble with dataset dimensions and column type counts}
+#'   \item{numeric}{A tibble with statistics for numeric columns (missing_pct, mean, sd, min, max)}
+#'   \item{character}{A tibble with statistics for character columns (missing_pct, n_unique)}
+#'   \item{timestamp}{A tibble with statistics for timestamp columns (missing_pct, min, max)}
+#'
+#' @details
+#' The function classifies columns by type and computes appropriate summary
+#' statistics for each:
+#' \itemize{
+#'   \item Numeric columns: missing percentage, mean, standard deviation, min, max
+#'   \item Character columns: missing percentage, number of unique values
+#'   \item Timestamp columns: missing percentage, min, max (as POSIXct objects
+#'     displayed in UTC)
+#' }
+#'
+#' A column is classified by how the text description of its Arrow type
+#' (`field$type$ToString()`) begins. Nested or parameterised types whose
+#' description merely contains a numeric type name further along (for example
+#' a list of `int32` items, or a dictionary with `int32` indices) are therefore
+#' not treated as numeric. Columns that fall into none of the three groups are
+#' still counted in `n_cols` but get no per-type table.
+#'
+#' All computations are performed using Arrow's query engine, making this
+#' function efficient even for very large datasets stored in Parquet files.
+#'
+#' @examples
+#' \dontrun{
+#' # Open a directory of Parquet files
+#' ds <- arrow::open_dataset("path/to/parquet/files")
+#'
+#' # Get summary statistics
+#' summary <- skim_arrow(ds)
+#'
+#' # View all sections
+#' summary
+#'
+#' # Access specific sections
+#' summary$numeric
+#' summary$character
+#' summary$timestamp
+#' }
+#'
+#' @seealso \code{\link[arrow]{open_dataset}}, \code{\link[skimr]{skim}}
+#'
+#' @export
+skim_arrow <- function(ds) {
+
+  # Get schema to identify column types
+  schema <- ds$schema
+  col_names <- names(schema)
+
+  # Text description of every field's type. vapply() (rather than sapply())
+  # guarantees a character vector even when the schema has no fields, so the
+  # subsetting below never receives a list.
+  type_names <- vapply(
+    schema,
+    function(field) field$type$ToString(),
+    character(1),
+    USE.NAMES = FALSE
+  )
+
+  # Names of the columns whose type description matches `pattern`.
+  cols_of_type <- function(pattern) {
+    col_names[grepl(pattern, type_names, ignore.case = TRUE)]
+  }
+
+  # Patterns are anchored at the start of the description: an unanchored
+  # "int" would also hit descriptions that only mention an integer type
+  # somewhere inside them.
+  numeric_cols <- cols_of_type("^(u?int[0-9]+|halffloat|float|double|decimal)")
+  character_cols <- cols_of_type("^(large_)?(string|utf8)")
+  timestamp_cols <- cols_of_type("^timestamp")
+
+  # Build the summary query
+  result <- ds |>
+    dplyr::summarize(
+      # Missingness for ALL columns
+      dplyr::across(
+        dplyr::everything(),
+        ~mean(is.na(.)) * 100,
+        .names = "{.col}_missing_pct"
+      ),
+
+      # Numeric column stats
+      dplyr::across(
+        dplyr::all_of(numeric_cols),
+        list(
+          min = ~min(., na.rm = TRUE),
+          max = ~max(., na.rm = TRUE),
+          mean = ~mean(., na.rm = TRUE),
+          sd = ~sd(., na.rm = TRUE)
+        ),
+        .names = "{.col}_{.fn}"
+      ),
+
+      # Character column stats
+      dplyr::across(
+        dplyr::all_of(character_cols),
+        ~dplyr::n_distinct(., na.rm = TRUE),
+        .names = "{.col}_n_unique"
+      ),
+
+      # Timestamp column stats (min/max only)
+      dplyr::across(
+        dplyr::all_of(timestamp_cols),
+        list(
+          min = ~min(., na.rm = TRUE),
+          max = ~max(., na.rm = TRUE)
+        ),
+        .names = "{.col}_{.fn}"
+      )
+    ) |>
+    dplyr::collect()
+
+  # Extract one statistic for a set of columns from the one-row `result`.
+  # Columns are looked up by their exact "<column>_<stat>" name, so column
+  # names containing regex metacharacters are handled like any other name.
+  # Each value is converted on its own so that classed values (e.g. integer64
+  # results for large int64 columns) go through their own as.numeric() method
+  # instead of being coerced as part of a list.
+  stat_values <- function(cols, stat) {
+    vapply(
+      paste0(cols, "_", stat),
+      function(nm) as.numeric(result[[nm]][1]),
+      numeric(1),
+      USE.NAMES = FALSE
+    )
+  }
+
+  # Same as stat_values(), but returned as POSIXct (seconds since the epoch,
+  # displayed in UTC).
+  stat_times <- function(cols, stat) {
+    as.POSIXct(stat_values(cols, stat), origin = "1970-01-01", tz = "UTC")
+  }
+
+  # Overview table is always present
+  output <- list(
+    overview = dplyr::tibble(
+      n_rows = nrow(ds),
+      n_cols = length(col_names),
+      n_numeric = length(numeric_cols),
+      n_character = length(character_cols),
+      n_timestamp = length(timestamp_cols)
+    )
+  )
+
+  # Numeric variables table
+  if (length(numeric_cols) > 0) {
+    output$numeric <- dplyr::tibble(
+      variable = numeric_cols,
+      missing_pct = stat_values(numeric_cols, "missing_pct"),
+      mean = stat_values(numeric_cols, "mean"),
+      sd = stat_values(numeric_cols, "sd"),
+      min = stat_values(numeric_cols, "min"),
+      max = stat_values(numeric_cols, "max")
+    )
+  }
+
+  # Character variables table
+  if (length(character_cols) > 0) {
+    output$character <- dplyr::tibble(
+      variable = character_cols,
+      missing_pct = stat_values(character_cols, "missing_pct"),
+      n_unique = stat_values(character_cols, "n_unique")
+    )
+  }
+
+  # Timestamp variables table
+  if (length(timestamp_cols) > 0) {
+    output$timestamp <- dplyr::tibble(
+      variable = timestamp_cols,
+      missing_pct = stat_values(timestamp_cols, "missing_pct"),
+      min = stat_times(timestamp_cols, "min"),
+      max = stat_times(timestamp_cols, "max")
+    )
+  }
+
+  # Set class for custom print method
+  class(output) <- c("skim_arrow", "list")
+
+  output
+}
+
+#' Print a skim_arrow Summary
+#'
+#' Writes the overview table, then one titled table for each variable type
+#' present in the object (numeric, character, timestamp), giving a
+#' `skimr`-style layout. The per-type tables are printed with `n = Inf`.
+#'
+#' @param x A skim_arrow object (output from `skim_arrow()`)
+#' @param ... Additional arguments passed to print methods (currently unused)
+#'
+#' @return Invisibly returns the input object `x`
+#' @keywords internal
+#' @export
+print.skim_arrow <- function(x, ...) {
+  cat("\u2500\u2500 Data Summary \u2500\u2500\n\n")
+  print(x$overview)
+
+  # Header/table pairs in display order; a section that is missing from `x`
+  # has a NULL table and is skipped below.
+  sections <- list(
+    list(
+      header = "\n\u2500\u2500 Numeric Variables \u2500\u2500\n\n",
+      table = x$numeric
+    ),
+    list(
+      header = "\n\u2500\u2500 Character Variables \u2500\u2500\n\n",
+      table = x$character
+    ),
+    list(
+      header = "\n\u2500\u2500 Timestamp Variables \u2500\u2500\n\n",
+      table = x$timestamp
+    )
+  )
+
+  for (section in sections) {
+    if (is.null(section$table)) {
+      next
+    }
+    cat(section$header)
+    print(section$table, n = Inf)
+  }
+
+  invisible(x)
+}
@@ -18,14 +18,15 @@
 #'
 #' @examples
 #'
-#' library(dplyr, warn.conflicts = FALSE)
-#' library(fable, quietly = TRUE)
-#' library(furrr, quietly = TRUE)
-#' plan(multisession)
+#'  library(dplyr, warn.conflicts = FALSE)
+#'  library(fable, quietly = TRUE)
+#'  library(mirai)
 #'
-#' head(ohio_covid)[,1:6]
+#'  head(ohio_covid)[,1:6]
 #'
-#' models_dyn <- ohio_covid[ ,1:7] %>%
+#'  daemons(3)
+#'
+#'  models_dyn <- ohio_covid[ ,1:7] %>%
 #'   tidyr::pivot_longer(
 #'     cols = contains("lead"),
 #'     names_to = "lead",
@@ -37,21 +38,20 @@
 #'   tidyr::drop_na() %>%
 #'   tidyr::nest(data = c(date, cases, lead_deaths)) %>%
 #'   # Run a regression on lagged cases and date vs deaths
-#'   mutate(model = furrr::future_map(data, function(df) {
-#'     model(.data = df,
-#'           dyn_reg = ARIMA(lead_deaths ~ 1 + cases),
-#'           dyn_reg_trend = ARIMA(lead_deaths ~ 1 + cases + trend()),
-#'           dyn_reg_quad = ARIMA(lead_deaths ~ 1 + cases + poly(date, 2))
-#'     )
-#'   }
-#'   ))
-#' # shut down workers
-#' plan(sequential)
+#'   mutate(model = purrr::map(data, purrr::in_parallel(\(df) {
+#'     fabletools::model(
+#'       .data = df,
+#'       dyn_reg = fable::ARIMA(lead_deaths ~ 1 + cases),
+#'       dyn_reg_trend = fable::ARIMA(lead_deaths ~ 1 + cases + trend()),
+#'       dyn_reg_quad = fable::ARIMA(lead_deaths ~ 1 + cases + poly(date, 2))
+#'     )})))
 #'
-#' dyn_mod_tbl <- select(models_dyn, -data)
+#'  # shut down workers
+#'  daemons(0)
 #'
-#' fable_resid_res <- test_fable_resids(dyn_mod_tbl, grp_col = "lead", mod_col = "model")
-#' head(fable_resid_res)
+#'  dyn_mod_tbl <- select(models_dyn, -data)
+#'  fable_resid_res <- test_fable_resids(dyn_mod_tbl, grp_col = "lead", mod_col = "model")
+#'  head(fable_resid_res)