Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ Depends:
R (>= 3.1.2)
Imports:
tibble,
assertthat
assertthat,
data.table
Suggests:
testthat,
lubridate,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ S3method(print,diffdf)
export(diffdf)
export(diffdf_has_issues)
export(diffdf_issuerows)
import(data.table)
importFrom(tibble,as_tibble)
importFrom(tibble,rownames_to_column)
importFrom(tibble,tibble)
55 changes: 19 additions & 36 deletions R/diffdf.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#' @param check_column_order Should the column ordering be checked? (logical)
#' @param check_df_class Do you want to check for differences in the class
#' between `base` and `compare`? (logical)
#' @import data.table
#' @examples
#' x <- subset(iris, -Species)
#' x[1, 2] <- 5
Expand Down Expand Up @@ -97,14 +98,14 @@ diffdf <- function(
check_df_class = FALSE
) {

BASE <- base
COMP <- compare
assertthat::assert_that(
assertthat::is.flag(check_df_class),
!is.na(check_df_class),
msg = "`check_df_class` must be a length 1 logical"
)

BASE <- base
COMP <- compare
KEYS <- keys
SUPWARN <- suppress_warnings

Expand All @@ -119,7 +120,8 @@ diffdf <- function(
value = describe_dataframe(BASE, COMP, BASE_NAME, COMP_NAME),
message = "Summary of BASE and COMPARE"
)

BASE <- copy(as.data.table(BASE))
COMP <- copy(as.data.table(COMP))

is_derived <- FALSE

Expand Down Expand Up @@ -156,17 +158,6 @@ diffdf <- function(
)
)

assertthat::assert_that(
has_unique_rows(BASE, KEYS),
msg = "BY variables in BASE do not result in unique observations"
)

assertthat::assert_that(
has_unique_rows(COMP, KEYS),
msg = "BY variables in COMPARE do not result in unique observations"
)



#### Check essential variable properties (class & mode)

Expand Down Expand Up @@ -209,24 +200,6 @@ diffdf <- function(
)




##### Check Validity of Keys

BASE_keys <- names(BASE)[names(BASE) %in% KEYS]
COMP_keys <- names(COMP)[names(COMP) %in% KEYS]

assertthat::assert_that(
length(BASE_keys) == length(KEYS),
msg = "BASE is missing variables specified in KEYS"
)

assertthat::assert_that(
length(COMP_keys) == length(KEYS),
msg = "COMP is missing variables specified in KEYS"
)


assert_valid_keys(
COMPARE, KEYS, "UnsupportedColsBase",
"The following KEYS in BASE have an unsupported mode (see `?mode()`)"
Expand All @@ -244,6 +217,16 @@ diffdf <- function(
"The following KEYS have different classes between BASE and COMPARE"
)

assertthat::assert_that(
has_unique_rows(BASE, KEYS),
msg = "BY variables in BASE do not result in unique observations"
)

assertthat::assert_that(
has_unique_rows(COMP, KEYS),
msg = "BY variables in COMPARE do not result in unique observations"
)


exclude_cols <- c(
COMPARE[["UnsupportedColsBase"]]$VARIABLE,
Expand All @@ -257,13 +240,13 @@ diffdf <- function(
keep_vars_base <- !(names(BASE) %in% attr(COMPARE, "keys")$value)
keep_vars_comp <- !(names(COMP) %in% attr(COMPARE, "keys")$value)
} else {
keep_vars_base <- TRUE
keep_vars_comp <- TRUE
keep_vars_base <- names(BASE)
keep_vars_comp <- names(COMP)
}
COMPARE[["ColumnOrder"]] <- construct_issue(
value = identify_column_order_differences(
BASE[, keep_vars_base, drop = FALSE],
COMP[, keep_vars_comp, drop = FALSE]
subset(BASE, select = keep_vars_base),
subset(COMP, select = keep_vars_comp)
),
message = "There are differences in the column ordering between BASE and COMPARE !!"
)
Expand Down
14 changes: 9 additions & 5 deletions R/identify.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@
#' @param KEYS List of variables that define a unique row within the datasets (strings)
#' @keywords internal
identify_extra_rows <- function(DS1, DS2, KEYS) {

DS1 <- copy(DS1)
DS2 <- copy(DS2)
if (nrow(DS2) == 0 || nrow(DS1) == 0) {
return(DS1[, KEYS, drop = FALSE])
return(subset(DS1, select = KEYS))
}
DS2[["..FLAG.."]] <- "Y"
dat <- merge(
Expand All @@ -16,9 +19,8 @@ identify_extra_rows <- function(DS1, DS2, KEYS) {
by = KEYS, all.x = TRUE,
sort = TRUE
)
dat <- dat[do.call("order", dat[KEYS]), ]

dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE]
do.call(setorder, list(dat, KEYS))
subset(dat[is.na(dat[["..FLAG.."]])], select = KEYS)
}


Expand Down Expand Up @@ -228,6 +230,8 @@ identify_differences <- function(
tolerance = sqrt(.Machine$double.eps),
scale = NULL
) {
BASE <- copy(BASE)
COMP <- copy(COMP)

matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols))

Expand All @@ -245,7 +249,7 @@ identify_differences <- function(
if (nrow(DAT) == 0) {
return(tibble())
}
DAT <- DAT[do.call("order", DAT[KEYS]), ]
do.call(setorder, list(DAT, KEYS))

matching_list <- mapply(
is_variable_different,
Expand Down
28 changes: 28 additions & 0 deletions tests/testthat/test-miscellaneous.R
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,31 @@ test_that("Format Char works on standard data types", {
test_that("ascii_table can handle all standard datatypes", {
expect_snapshot(as_ascii_table(TDAT) |> cat())
})


test_that("diffdf doesn't alter existing data.table objects", {
# Many operations in data.table are pass by reference
# So here we just check to make sure diffdf doesn't modify the
# users data which would be unexpected / unwanted behaviour
df1 <- data.table(
id = c(1, 2, 3, 4, 5),
val = rnorm(5),
age = factor(runif(5))
)
attr(df1, "some attr") <- c(1, 2, 3, 4)
attr(df1$val, "att2") <- c("A", "B")
df1_copy <- copy(df1)

df2 <- data.table(
id = c(1, 2, 99, 4, 5),
val = rnorm(5),
age = runif(5)
)
df2_copy <- copy(df2)

devnull <- diffdf(df1, df2, suppress_warnings = TRUE)
devnull <- diffdf(df1, df2, key = "id", suppress_warnings = TRUE)

expect_equal(df1, df1_copy)
expect_equal(df2, df2_copy)
})
161 changes: 57 additions & 104 deletions utils/benchmarks/benchmark2.R
Original file line number Diff line number Diff line change
@@ -1,120 +1,73 @@
devtools::load_all()

n <- 1000000
test_dat <- data_frame( id = 1:n)


vars <-letters[1:15]

for ( i in vars){
test_dat[[paste0(i,"_num.x")]] <- rnorm(n)
test_dat[[paste0(i,"_num.y")]] <- rnorm(n)
test_dat[[paste0(i,"_chr.x")]] <- letters[ round(runif(n , 1,25))]
test_dat[[paste0(i,"_chr.y")]] <- letters[ round(runif(n , 1,25))]
test_dat[[paste0(i,"_fct.x")]] <- factor(test_dat[[paste0(i,"_chr.x")]])
test_dat[[paste0(i,"_fct.y")]] <- factor(test_dat[[paste0(i,"_chr.y")]])
}

vars2 <- c(
paste0(vars, "_num"),
paste0(vars, "_chr"),
paste0(vars, "_fct")
)

matching_cols = vars2
KEYS = "id"
DAT = test_dat
tolerance = sqrt(.Machine$double.eps)
scale = NULL

RES5 <- purrr::rerun( 10 , {
x <- system.time({
matching_list <- mapply(
is_variable_different ,
matching_cols,
MoreArgs = list(
keynames = KEYS,
datain = DAT,
tolerance = tolerance ,
scale = scale
),
SIMPLIFY = FALSE
)
})


y <- system.time({
HOLD <- list()
for ( v in vars2){

xvar <- paste0(v,'.x')
yvar <- paste0(v,'.y')

keep <- find_difference( test_dat[[xvar]] , test_dat[[yvar]] , tolerance = tolerance , scale = scale)

HOLD[[v]] <- data_frame(
VARIABLE = v,
BASE = test_dat[[xvar]][keep],
COMPARE = test_dat[[yvar]][keep]
)

for ( i in KEYS){
HOLD[[v]][[i]] <- test_dat[[i]][keep]
}

#HOLD[[v]] <- HOLD[[v]][ , c("VARIABLE" , KEYS, "BASE" , "COMPARE")]
}
})

data_frame(
x = x[[3]] ,
y = y[[3]] ,
diff = x - y,
pcent = y / x
)

suppressPackageStartupMessages({
library(dplyr)
library(stringi)
library(lubridate)
library(haven)
})

bind_rows(RES1)$pcent %>% mean # nrow = 300000 , ncol = 60
bind_rows(RES2)$pcent %>% mean # nrow = 300000 , ncol = 15
bind_rows(RES3)$pcent %>% mean # nrow = 600000 , ncol = 15
bind_rows(RES4)$pcent %>% mean
bind_rows(RES5)$pcent %>% mean


HOLD[["e_fct"]]
matching_list[["e_fct"]]




is_variable_different2 <- function (variablename, keynames, datain, ...) {



if ( ! xvar %in% names(datain) | ! yvar %in% names(datain)){
stop("Variable does not exist within input dataset")
generate_test_data <- function(
n = 10000,
n_col = 4,
n_num = n_col,
n_int = n_col,
n_chr = n_col,
n_fct = n_col,
n_date = n_col,
n_dt = n_col
) {
dat <- tibble(id = 1:n)

for (i in seq_len(n_num)) {
dat[sprintf("num_%s", i)] <- runif(n, -1000000, 1000000)
}
for (i in seq_len(n_int)) {
dat[sprintf("int_%s", i)] <- as.integer(sample(seq(-9999, 9999), n, TRUE))
}
for (i in seq_len(n_chr)) {
possible_chrs <- stringi::stri_rand_strings(1000, 15)
dat[sprintf("chr_%s", i)] <- sample(possible_chrs, n, TRUE)
}
for (i in seq_len(n_fct)) {
fct_levels <- c("A", "B", "C", "D", "E", "F", "G", "H")
dat[sprintf("fct_%s", i)] <- factor(sample(fct_levels, size = n, replace = TRUE), levels = fct_levels)
}
for (i in seq_len(n_date)) {
dat[sprintf("date_%s", i)] <- ymd("20200101") + days(round(runif(n, -1000, 1000)))
}

target <- datain[[xvar]]
current <- datain[[yvar]]
outvect <- find_difference(target, current, ...)

datain[["VARIABLE"]] <- variablename

names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")

as.tibble(subset(datain, outvect, select = c("VARIABLE", keynames, "BASE", "COMPARE")))

for (i in seq_len(n_dt)) {
dat[sprintf("dt_%s", i)] <- ymd_hms("2020-01-01T12:00:01") + seconds(round(runif(n, -70000000, 70000000)))
}
return(dat)
}

dat1 <- generate_test_data(1000000, n_col = 10) |>
sample_frac(1)

dat2 <- generate_test_data(1000000, n_col = 10) |>
sample_frac(1)


results_new <- replicate(
{
x <- system.time({
diffdf(dat1, dat2, "id", suppress_warnings = TRUE)
})
Sys.sleep(4)
x
},
simplify = FALSE,
n = 6
)


dat1 <- generate_test_data(1000, n_col = 1) |>
sample_frac(1)

dat2 <- generate_test_data(1000, n_col = 1) |>
sample_frac(1)




diffdf(dat1, dat2, "id")