gowerc · gowerc · Jul 26, 2024 · Jul 29, 2024 · Jul 29, 2024 · Jul 29, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -17,7 +17,8 @@ Depends:
     R (>= 3.1.2)
 Imports:
     tibble,
-    assertthat
+    assertthat,
+    data.table
 Suggests: 
     testthat,
     lubridate,

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,6 +10,7 @@ S3method(print,diffdf)
 export(diffdf)
 export(diffdf_has_issues)
 export(diffdf_issuerows)
+import(data.table)
 importFrom(tibble,as_tibble)
 importFrom(tibble,rownames_to_column)
 importFrom(tibble,tibble)
diff --git a/R/diffdf.R b/R/diffdf.R
@@ -24,6 +24,7 @@
 #' @param check_column_order Should the column ordering be checked? (logical)
 #' @param check_df_class Do you want to check for differences in the class
 #' between `base` and `compare`? (logical)
+#' @import data.table
 #' @examples
 #' x <- subset(iris, -Species)
 #' x[1, 2] <- 5
@@ -97,14 +98,14 @@ diffdf <- function(
     check_df_class = FALSE
 ) {
 
+    BASE <- base
+    COMP <- compare
     assertthat::assert_that(
         assertthat::is.flag(check_df_class),
         !is.na(check_df_class),
         msg = "`check_df_class` must be a length 1 logical"
     )
 
-    BASE <- base
-    COMP <- compare
     KEYS <- keys
     SUPWARN <- suppress_warnings
 
@@ -119,7 +120,8 @@ diffdf <- function(
         value = describe_dataframe(BASE, COMP, BASE_NAME, COMP_NAME),
         message = "Summary of BASE and COMPARE"
     )
-
+    BASE <- copy(as.data.table(BASE))
+    COMP <- copy(as.data.table(COMP))
 
     is_derived <- FALSE
 
@@ -156,17 +158,6 @@ diffdf <- function(
         )
     )
 
-    assertthat::assert_that(
-        has_unique_rows(BASE, KEYS),
-        msg = "BY variables in BASE do not result in unique observations"
-    )
-
-    assertthat::assert_that(
-        has_unique_rows(COMP, KEYS),
-        msg = "BY variables in COMPARE do not result in unique observations"
-    )
-
-
 
     #### Check essential variable properties (class & mode)
 
@@ -209,24 +200,6 @@ diffdf <- function(
     )
 
 
-
-
-    ##### Check Validity of Keys
-
-    BASE_keys <- names(BASE)[names(BASE) %in% KEYS]
-    COMP_keys <- names(COMP)[names(COMP) %in% KEYS]
-
-    assertthat::assert_that(
-        length(BASE_keys) == length(KEYS),
-        msg = "BASE is missing variables specified in KEYS"
-    )
-
-    assertthat::assert_that(
-        length(COMP_keys) == length(KEYS),
-        msg = "COMP is missing variables specified in KEYS"
-    )
-
-
     assert_valid_keys(
         COMPARE, KEYS, "UnsupportedColsBase",
         "The following KEYS in BASE have an unsupported mode (see `?mode()`)"
@@ -244,6 +217,16 @@ diffdf <- function(
         "The following KEYS have different classes between BASE and COMPARE"
     )
 
+    assertthat::assert_that(
+        has_unique_rows(BASE, KEYS),
+        msg = "BY variables in BASE do not result in unique observations"
+    )
+
+    assertthat::assert_that(
+        has_unique_rows(COMP, KEYS),
+        msg = "BY variables in COMPARE do not result in unique observations"
+    )
+
 
     exclude_cols <- c(
         COMPARE[["UnsupportedColsBase"]]$VARIABLE,
@@ -257,13 +240,13 @@ diffdf <- function(
             keep_vars_base <- !(names(BASE) %in% attr(COMPARE, "keys")$value)
             keep_vars_comp <- !(names(COMP) %in% attr(COMPARE, "keys")$value)
         } else {
-            keep_vars_base <- TRUE
-            keep_vars_comp <- TRUE
+            keep_vars_base <- names(BASE)
+            keep_vars_comp <- names(COMP)
         }
         COMPARE[["ColumnOrder"]] <- construct_issue(
             value = identify_column_order_differences(
-                BASE[, keep_vars_base, drop = FALSE],
-                COMP[, keep_vars_comp, drop = FALSE]
+                subset(BASE, select = keep_vars_base),
+                subset(COMP, select = keep_vars_comp)
             ),
             message = "There are differences in the column ordering between BASE and COMPARE !!"
         )

diff --git a/R/identify.R b/R/identify.R
@@ -6,8 +6,11 @@
 #' @param KEYS List of variables that define a unique row within the datasets (strings)
 #' @keywords internal
 identify_extra_rows <- function(DS1, DS2, KEYS) {
+
+    DS1 <- copy(DS1)
+    DS2 <- copy(DS2)
     if (nrow(DS2) == 0 || nrow(DS1) == 0) {
-        return(DS1[, KEYS, drop = FALSE])
+        return(subset(DS1, select = KEYS))
     }
     DS2[["..FLAG.."]] <- "Y"
     dat <- merge(
@@ -16,9 +19,8 @@ identify_extra_rows <- function(DS1, DS2, KEYS) {
         by = KEYS, all.x = TRUE,
         sort = TRUE
     )
-    dat <- dat[do.call("order", dat[KEYS]), ]
-
-    dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE]
+    do.call(setorder, list(dat, KEYS))
+    subset(dat[is.na(dat[["..FLAG.."]])], select = KEYS)
 }
 
 
@@ -228,6 +230,8 @@ identify_differences <- function(
     tolerance = sqrt(.Machine$double.eps),
     scale = NULL
 ) {
+    BASE <- copy(BASE)
+    COMP <- copy(COMP)
 
     matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols))
 
@@ -245,7 +249,7 @@ identify_differences <- function(
     if (nrow(DAT) == 0) {
         return(tibble())
     }
-    DAT <- DAT[do.call("order", DAT[KEYS]), ]
+    do.call(setorder, list(DAT, KEYS))
 
     matching_list <- mapply(
         is_variable_different,

diff --git a/tests/testthat/test-miscellaneous.R b/tests/testthat/test-miscellaneous.R
@@ -199,3 +199,31 @@ test_that("Format Char works on standard data types", {
 test_that("ascii_table can handle all standard datatypes", {
     expect_snapshot(as_ascii_table(TDAT) |> cat())
 })
+
+
+test_that("diffdf doesn't alter existing data.table objects", {
+    # Many operations in data.table are pass by reference
+    # So here we just check to make sure diffdf doesn't modify the
+    # users data which would be unexpected / unwanted behaviour
+    df1 <- data.table(
+        id = c(1, 2, 3, 4, 5),
+        val = rnorm(5),
+        age = factor(runif(5))
+    )
+    attr(df1, "some attr") <- c(1, 2, 3, 4)
+    attr(df1$val, "att2") <- c("A", "B")
+    df1_copy <- copy(df1)
+
+    df2 <- data.table(
+        id = c(1, 2, 99, 4, 5),
+        val = rnorm(5),
+        age = runif(5)
+    )
+    df2_copy <- copy(df2)
+
+    devnull <- diffdf(df1, df2, suppress_warnings = TRUE)
+    devnull <- diffdf(df1, df2, key = "id", suppress_warnings = TRUE)
+
+    expect_equal(df1, df1_copy)
+    expect_equal(df2, df2_copy)
+})
diff --git a/utils/benchmarks/benchmark2.R b/utils/benchmarks/benchmark2.R
@@ -1,120 +1,73 @@
 devtools::load_all()
 
-n <- 1000000
-test_dat <- data_frame( id = 1:n)
 
-
-vars <-letters[1:15]
-
-for ( i in vars){
-    test_dat[[paste0(i,"_num.x")]] <- rnorm(n)
-    test_dat[[paste0(i,"_num.y")]] <- rnorm(n)
-    test_dat[[paste0(i,"_chr.x")]] <- letters[ round(runif(n , 1,25))]
-    test_dat[[paste0(i,"_chr.y")]] <- letters[ round(runif(n , 1,25))]
-    test_dat[[paste0(i,"_fct.x")]] <- factor(test_dat[[paste0(i,"_chr.x")]])
-    test_dat[[paste0(i,"_fct.y")]] <- factor(test_dat[[paste0(i,"_chr.y")]])
-}
-
-vars2 <- c(
-    paste0(vars, "_num"),
-    paste0(vars, "_chr"),
-    paste0(vars, "_fct")
-)
-
-matching_cols = vars2
-KEYS = "id"
-DAT = test_dat
-tolerance = sqrt(.Machine$double.eps)
-scale = NULL
-
-RES5 <- purrr::rerun( 10 , {
-    x <- system.time({
-        matching_list <- mapply(
-            is_variable_different , 
-            matching_cols,
-            MoreArgs = list(
-                keynames = KEYS, 
-                datain = DAT, 
-                tolerance = tolerance ,
-                scale = scale
-            ),
-            SIMPLIFY = FALSE
-        )
-    })
-
-
-    y <- system.time({
-        HOLD <- list() 
-        for ( v in vars2){
-
-            xvar <- paste0(v,'.x')
-            yvar <- paste0(v,'.y')
-
-            keep <- find_difference( test_dat[[xvar]] , test_dat[[yvar]] , tolerance = tolerance , scale = scale)
-
-            HOLD[[v]] <- data_frame(
-                VARIABLE = v,
-                BASE = test_dat[[xvar]][keep],
-                COMPARE = test_dat[[yvar]][keep]
-            )
-
-            for ( i in KEYS){
-                HOLD[[v]][[i]] <- test_dat[[i]][keep]
-            }
-
-            #HOLD[[v]] <- HOLD[[v]][ , c("VARIABLE" , KEYS, "BASE" , "COMPARE")]
-        }
-    })
-
-    data_frame(
-        x = x[[3]] ,
-        y = y[[3]] ,
-        diff = x - y,
-        pcent = y / x
-    )
-
+suppressPackageStartupMessages({
+    library(dplyr)
+    library(stringi)
+    library(lubridate)
+    library(haven)
 })
 
-bind_rows(RES1)$pcent %>% mean  # nrow = 300000 , ncol = 60
-bind_rows(RES2)$pcent %>% mean  # nrow = 300000 , ncol = 15
-bind_rows(RES3)$pcent %>% mean  # nrow = 600000 , ncol = 15
-bind_rows(RES4)$pcent %>% mean
-bind_rows(RES5)$pcent %>% mean
-
-
-HOLD[["e_fct"]]
-matching_list[["e_fct"]]
-
-
 
-
-is_variable_different2 <- function (variablename, keynames, datain, ...) {
-
-
-
-    if ( ! xvar %in% names(datain) | ! yvar %in% names(datain)){
-        stop("Variable does not exist within input dataset")
+generate_test_data <- function(
+    n = 10000,
+    n_col = 4,
+    n_num = n_col,
+    n_int = n_col,
+    n_chr = n_col,
+    n_fct = n_col,
+    n_date = n_col,
+    n_dt = n_col
+) {
+    dat <- tibble(id = 1:n)
+
+    for (i in seq_len(n_num)) {
+        dat[sprintf("num_%s", i)] <- runif(n, -1000000, 1000000)
+    }
+    for (i in seq_len(n_int)) {
+        dat[sprintf("int_%s", i)] <- as.integer(sample(seq(-9999, 9999), n, TRUE))
+    }
+    for (i in seq_len(n_chr)) {
+        possible_chrs <- stringi::stri_rand_strings(1000, 15)
+        dat[sprintf("chr_%s", i)] <- sample(possible_chrs, n, TRUE)
+    }
+    for (i in seq_len(n_fct)) {
+        fct_levels <- c("A", "B", "C", "D", "E", "F", "G", "H")
+        dat[sprintf("fct_%s", i)] <- factor(sample(fct_levels, size = n, replace = TRUE), levels = fct_levels)
+    }
+    for (i in seq_len(n_date)) {
+        dat[sprintf("date_%s", i)] <- ymd("20200101") + days(round(runif(n, -1000, 1000)))
     }
-
-    target  <- datain[[xvar]]
-    current <- datain[[yvar]]
-    outvect <- find_difference(target, current, ...)
-
-    datain[["VARIABLE"]] <- variablename
-
-    names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")
-
-    as.tibble(subset(datain, outvect, select = c("VARIABLE", keynames, "BASE", "COMPARE")))
-
+    for (i in seq_len(n_dt)) {
+        dat[sprintf("dt_%s", i)] <- ymd_hms("2020-01-01T12:00:01") + seconds(round(runif(n, -70000000, 70000000)))
+    }
+    return(dat)
 }
 
+dat1 <- generate_test_data(1000000, n_col = 10) |>
+    sample_frac(1)
 
+dat2 <- generate_test_data(1000000, n_col = 10) |>
+    sample_frac(1)
 
 
+results_new <- replicate(
+    {
+        x <- system.time({
+            diffdf(dat1, dat2, "id", suppress_warnings = TRUE)
+        })
+        Sys.sleep(4)
+        x
+    },
+    simplify = FALSE,
+    n = 6
+)
 
 
+dat1 <- generate_test_data(1000, n_col = 1) |>
+    sample_frac(1)
 
+dat2 <- generate_test_data(1000, n_col = 1) |>
+    sample_frac(1)
 
-
-
-
+diffdf(dat1, dat2, "id")