Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
b723136
Allow `data_summary()` to return expressions with > 1 rows
strengejacke Mar 10, 2026
1bcbdcb
typo
strengejacke Mar 10, 2026
d9ae5a7
fixes
strengejacke Mar 10, 2026
2e9af7e
add tests
strengejacke Mar 10, 2026
46d2475
fix for grouped data frames
strengejacke Mar 10, 2026
e1bddd6
update comment
strengejacke Mar 10, 2026
d9f39f0
add examples in docs
strengejacke Mar 10, 2026
c9bb013
fix, add tests
strengejacke Mar 10, 2026
203c67a
address comments
strengejacke Mar 10, 2026
311aeb9
Update R/data_summary.R
strengejacke Mar 10, 2026
0036aef
whitespace
strengejacke Mar 10, 2026
45903fb
comments
strengejacke Mar 10, 2026
ca1440f
rename `strict`
strengejacke Mar 10, 2026
5c9b0ae
use back ticks
strengejacke Mar 10, 2026
b502786
revise
strengejacke Mar 10, 2026
c701604
add missing snapshot
strengejacke Mar 10, 2026
21978b9
...
strengejacke Mar 10, 2026
942ddbc
docs
strengejacke Mar 10, 2026
06fbe01
add test
strengejacke Mar 10, 2026
40c9d14
automatic suffixes
strengejacke Mar 11, 2026
23f057d
add comments
strengejacke Mar 11, 2026
4e2cf82
clarify
strengejacke Mar 11, 2026
afa2943
Update NEWS.md
strengejacke Mar 11, 2026
a709895
Update R/data_summary.R
strengejacke Mar 11, 2026
4031e03
Update R/data_summary.R
strengejacke Mar 11, 2026
da4d2bc
update RD
strengejacke Mar 11, 2026
5d66f1e
allow named list in suffix
strengejacke Mar 12, 2026
d883bb6
wording
strengejacke Mar 12, 2026
c833cad
minor
strengejacke Mar 12, 2026
100ce01
update
strengejacke Mar 12, 2026
8f12ba0
Update R/data_summary.R
strengejacke Mar 12, 2026
e31aa5d
update RD
strengejacke Mar 12, 2026
ee37ab0
remove trailing whitespace
strengejacke Mar 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 1.3.0
Version: 1.3.0.1
Authors@R: c(
person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531")),
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

CHANGES

* `data_summary()` now allows expressions to return more than one summary
value. For each value, a new column is created. Additionally, the optional
`suffix` argument controls the naming of these columns; if `suffix = NULL`,
the names of the returned values are used as suffixes when available,
otherwise numeric suffixes (`_1`, `_2`, ...) are added.

* `standardize()` now works on `fixest` estimations (#665).

# datawizard 1.3.0
Expand Down
238 changes: 202 additions & 36 deletions R/data_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,20 @@
#' @param remove_na Logical. If `TRUE`, missing values are omitted from the
#' grouping variable. If `FALSE` (default), missing values are included as a
#' level in the grouping variable.
#' @param suffix Optional, suffixes to be added to the new variable names,
#' especially useful when a function returns several values (e.g. `quantile()`).
#' Can be:
#' * a character vector: all expressions in `...` must return the same number
#' of values as elements in `suffix`.
#' * a list of named character vectors: the names of elements in `suffix` must
#' match the names of the expressions. It is also allowed to specify suffixes
#' for selected expressions only.
#'
#' The new column names are a combination of the left-hand side (i.e.,
#' the name) of the expression and the related suffixes. If `suffix = NULL` (the
#' default), and a summary expression returns multiple values, either the names
#' of the returned values (if any) or automatically numbered suffixes such as
#' `_1`, `_2`, etc. are used. See 'Examples'.
#' @param ... One or more named expressions that define the new variable name
#' and the function to compute the summary statistic. Example:
#' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided
Expand Down Expand Up @@ -50,15 +64,72 @@
#' last = mpg[length(mpg)],
#' by = c("am", "gear")
#' )
#'
#' # expressions can return more than one value; each value gets its own column
#' d <- data.frame(
#' x = rnorm(100, 1, 1),
#' y = rnorm(100, 2, 2),
#' groups = rep(1:4, each = 25)
#' )
#'
#' # since we have multiple columns for one expression, the names of the
#' # returned summary results are used as suffix by default
#' data_summary(
#' d,
#' quant_x = quantile(x, c(0.25, 0.75)),
#' mean_x = mean(x),
#' quant_y = quantile(y, c(0.25, 0.5, 0.75))
#' )
#'
#' # if a summary function, like `fivenum()`, returns an unnamed vector,
#' # suffixes are automatically numbered
#' data_summary(
#' d,
#' quant_x = quantile(x, c(0.25, 0.75)),
#' mean_x = mean(x),
#' fivenum_y = fivenum(y)
#' )
#'
#' # specify column suffix for expressions, matching by names
#' data_summary(
#' d,
#' quant_x = quantile(x, c(0.25, 0.75)),
#' mean_x = mean(x),
#' quant_y = quantile(y, c(0.25, 0.5, 0.75)),
#' suffix = list(quant_y = c("_Q1", "_Q2", "_Q3"))
#' )
#'
#' # name multiple expression suffixes, grouped by variable
#' data_summary(
#' d,
#' quant_x = quantile(x, c(0.25, 0.75)),
#' mean_x = mean(x),
#' quant_y = quantile(y, c(0.25, 0.5, 0.75)),
#' suffix = list(quant_x = c("Q1", "Q3"), quant_y = c("_Q1", "_Q2", "_Q3")),
#' by = "groups"
#' )
#'
#' @export
data_summary <- function(x, ...) {
  # S3 generic: the method is selected from the class of `x`
  UseMethod("data_summary", x)
}


#' @export
data_summary.matrix <- function(
  x,
  ...,
  by = NULL,
  remove_na = FALSE,
  suffix = NULL
) {
  # Matrices have no dedicated implementation: coerce to a data frame and
  # re-dispatch, forwarding every argument (including `suffix`) unchanged.
  data_summary(
    as.data.frame(x),
    ...,
    by = by,
    remove_na = remove_na,
    suffix = suffix
  )
}


Expand All @@ -72,7 +143,13 @@

#' @rdname data_summary
#' @export
data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) {
data_summary.data.frame <- function(
x,
...,
by = NULL,
remove_na = FALSE,
suffix = NULL
) {
dots <- eval(substitute(alist(...)))

# do we have any expression at all?
Expand All @@ -84,9 +161,10 @@

if (is.null(by)) {
# when we have no grouping, just compute a one-row summary
summarise <- .process_datasummary_dots(dots, x)
out <- data.frame(summarise)
colnames(out) <- vapply(summarise, names, character(1))
summarise <- .process_datasummary_dots(dots, x, suffix)
# coerce to data frame
out <- as.data.frame(t(summarise))
colnames(out) <- names(summarise)
Comment thread
strengejacke marked this conversation as resolved.
Comment thread
strengejacke marked this conversation as resolved.
} else {
# sanity check - is "by" a character string?
if (!is.character(by)) {
Expand Down Expand Up @@ -122,15 +200,23 @@
return(NULL)
}
# summarize data
summarise <- .process_datasummary_dots(dots, s)
summarise <- .process_datasummary_dots(dots, s, suffix)
# coerce to data frame
summarised_data <- data.frame(summarise)
summarised_data <- as.data.frame(t(summarise))
# bind grouping-variables and values
summarised_data <- cbind(s[1, by], summarised_data)
# make sure we have proper column names
colnames(summarised_data) <- c(by, unlist(lapply(summarise, names)))
colnames(summarised_data) <- c(by, names(summarise))
Comment thread
strengejacke marked this conversation as resolved.
Comment thread
strengejacke marked this conversation as resolved.
summarised_data
})
# check for correct number of columns. If one expression returns different
# number of values (which now means, we have different number of columns
# to bind) for each group, tell user
if (!all(lengths(out) == lengths(out)[1])) {
insight::format_error(
"Each expression must return the same number of values for each group. Some of the expressions seem to return varying numbers of values."

Check warning on line 217 in R/data_summary.R

View workflow job for this annotation

GitHub Actions / lint-changed-files / lint-changed-files

file=R/data_summary.R,line=217,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 145 characters.

Check warning on line 217 in R/data_summary.R

View workflow job for this annotation

GitHub Actions / lint / lint

file=R/data_summary.R,line=217,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 145 characters.
)
}
out <- do.call(rbind, out)
}
# sort data
Expand All @@ -143,7 +229,13 @@


#' @export
data_summary.grouped_df <- function(x, ..., by = NULL, remove_na = FALSE) {
data_summary.grouped_df <- function(
x,
...,
by = NULL,
remove_na = FALSE,
suffix = NULL
) {
# extract group variables
grps <- attr(x, "groups", exact = TRUE)
group_variables <- data_remove(grps, ".rows")
Expand All @@ -154,13 +246,13 @@
# remove information specific to grouped df's
attr(x, "groups") <- NULL
class(x) <- "data.frame"
data_summary(x, ..., by = by, remove_na = remove_na)
data_summary(x, ..., by = by, remove_na = remove_na, suffix = suffix)
}


# helper -----------------------------------------------------------------------

.process_datasummary_dots <- function(dots, data) {
.process_datasummary_dots <- function(dots, data, suffix = NULL) {
out <- NULL
if (length(dots)) {
# we check for character vector of expressions, in which case
Expand Down Expand Up @@ -199,39 +291,113 @@
}
}

# sanity check: validate the input for the `suffix` argument
# `suffix` can be NULL, a character vector, or a (named) list
if (!is.null(suffix)) {
# if `suffix` is a character vector, we transform it into a list,
# matching the names of the expressions
if (is.character(suffix)) {
suffix <- rep(list(suffix), length(dots))
names(suffix) <- names(dots)
}
# no list? error
if (!is.list(suffix)) {
insight::format_error(
"Argument `suffix` must be a list of (named) character vectors, where the names match the names of the expressions, e.g.:",

Check warning on line 306 in R/data_summary.R

View workflow job for this annotation

GitHub Actions / lint-changed-files / lint-changed-files

file=R/data_summary.R,line=306,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 133 characters.

Check warning on line 306 in R/data_summary.R

View workflow job for this annotation

GitHub Actions / lint / lint

file=R/data_summary.R,line=306,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 133 characters.
paste0(
"`suffix = list(",
names(dots)[1],
" = c(\"_suffix1\", \"_suffix2\")`."
)
)
}
# not all elements named? error
if (!length(which(nzchar(names(suffix), keepNA = TRUE)))) {
insight::format_error("All elements of `suffix` must have names.")
}
# names of suffix do not match names of expressions? error
if (!all(names(suffix) %in% names(dots))) {
wrong_name <- which(!names(suffix) %in% names(dots))[1]
insight::format_error(
paste0(
"Names of `suffix` must match the names of the expressions. Suffix `",
names(suffix)[wrong_name],
"` has no corresponding expression."
)
)
}
# identical suffixes for one expression? error
identical_suffix <- vapply(
suffix,
function(i) insight::n_unique(i) != length(i),
logical(1)
)
if (any(identical_suffix)) {
insight::format_error(
paste0(
"All suffixes for a single expression must be unique. Suffix for element `",
names(identical_suffix)[which(identical_suffix)][1],
"` has duplicate values."
)
)
}
}

out <- lapply(seq_along(dots), function(i) {
new_variable <- .get_new_dots_variable(dots, i, data)
# check special case here - we want bayestestR::ci to work with
# data summary, to easily create CIs for, say, posterior draws
if (inherits(new_variable, c("bayestestR_ci", "bayestestR_eti"))) {
stats::setNames(new_variable, c("CI", "CI_low", "CI_high"))
} else {
stats::setNames(new_variable, names(dots)[i])
# init
current_suffix <- NULL
# find the suffix entry whose name matches the current expression, if any
matching_names <- which(names(suffix) == names(dots)[i])
# either use suffixes based on matching names, or try to extract
# names from the returned summary expression (saved in "new_variable"),
# if the summary function returned a named vector
if (length(matching_names) > 0) {
current_suffix <- suffix[[matching_names]]
} else if (
length(new_variable) > 1 &&
all(nzchar(names(new_variable), keepNA = TRUE))
) {
current_suffix <- names(new_variable)
}
# if we don't have suffixes for multiple columns, but expression
# returns multiple columns, we get NA column names - we use
# automatically numbered suffixes in this case
if (is.null(current_suffix) && length(new_variable) > 1) {
current_suffix <- paste0("_", seq_along(new_variable))
}

# if number of suffixes does not match the number of returned values
# by the expression, error
if (
!is.null(current_suffix) &&
length(current_suffix) != length(new_variable)
) {
insight::format_error(
paste0(
"Argument `suffix` must have the same length as the result of the corresponding summary expression. `suffix` has ",

Check warning on line 383 in R/data_summary.R

View workflow job for this annotation

GitHub Actions / lint-changed-files / lint-changed-files

file=R/data_summary.R,line=383,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 129 characters.

Check warning on line 383 in R/data_summary.R

View workflow job for this annotation

GitHub Actions / lint / lint

file=R/data_summary.R,line=383,col=121,[line_length_linter] Lines should not be more than 120 characters. This line is 129 characters.
length(current_suffix),
" elements (",
text_concatenate(current_suffix, enclose = "`"),
") for the expression `",
insight::safe_deparse(dots[[i]]),
"`, which returned ",
length(new_variable),
" values."
)
)
}
stats::setNames(new_variable, paste0(names(dots)[i], current_suffix))
Comment thread
strengejacke marked this conversation as resolved.
}
})
}

# check for correct length of output - must be a single value!
# Exception: bayestestR::ci()
wrong_length <- !sapply(
out,
inherits,
what = c("bayestestR_ci", "bayestestR_eti")
) &
lengths(out) != 1 # nolint
if (any(wrong_length)) {
insight::format_error(
paste0(
"Each expression must return a single value. Following expression",
ifelse(sum(wrong_length) > 1, "s", " "),
" returned more than one value: ",
text_concatenate(
vapply(dots[wrong_length], insight::safe_deparse, character(1)),
enclose = "\""
)
)
)
}

out
unlist(out)
Comment thread
strengejacke marked this conversation as resolved.
}


Expand Down
Loading
Loading