Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 17 additions & 91 deletions R/read_azure_files.R
Original file line number Diff line number Diff line change
@@ -1,35 +1,16 @@
#' Read a parquet file from Azure storage
#'
#' @param container An Azure container object, as returned by [get_container]
#' @param file The name of the file to be read, as a string. NB The file
#' extension does not need to be included (though it can be). The function
#' will error if multiple files are somehow matched.
#' @param path The path to the directory where `file` is located, as a string.
#' Only needed if `file` does not already contain its full path. If file is
#' just a file name with no path, then provide the path to the directory here.
#' This must be the full path to the file location, as the function will not
#' search into subdirectories recursively. Set to `"/"` (the root of the
#' container) by default.
#' @param info Boolean. Whether to print user feedback about the file that is
#' being read. Useful for checking the function is doing what is expected, but
#' can be turned off with `FALSE`. Can be set persistently with the option
#' "azkit.info". If `NULL` then it will default to the value of
#' [rlang::is_interactive] (that is, `TRUE` for interactive sessions).
#' @param file The full path to the file to be read, as a string.
#' @param ... optional arguments to be passed through to [arrow::read_parquet]
#' @returns A tibble
#' @examples \dontrun{
#' # provide the full path to the file, including the '.parquet' extension
#' read_azure_parquet(cont, "data/folder/path/1.parquet")
#' read_azure_parquet(cont, "storage/parquet/2025/06/29/case_details.parquet")
#' }
#' @export
read_azure_parquet <- function(container, file, ...) {
  # Download the blob into memory: `dest = NULL` tells download_blob to
  # return the contents as a raw vector instead of writing them to disk,
  # so the parquet data can be parsed without touching the filesystem
  blob_raw <- AzureStor::download_blob(container, src = file, dest = NULL)
  arrow::read_parquet(blob_raw, ...)
}

Expand All @@ -41,10 +22,9 @@ read_azure_parquet <- function(container, file, path = "/", info = NULL, ...) {
#' [yyjsonr::read_json_raw]
#' @returns A list
#' @export
read_azure_json <- function(container, file, ...) {
  # Download the blob into memory: `dest = NULL` tells download_blob to
  # return the contents as a raw vector instead of writing them to disk,
  # which yyjsonr can then parse directly
  blob_raw <- AzureStor::download_blob(container, src = file, dest = NULL)
  yyjsonr::read_json_raw(blob_raw, ...)
}

Expand All @@ -56,13 +36,12 @@ read_azure_json <- function(container, file, path = "/", info = NULL, ...) {
#' [yyjsonr::read_json_file]
#' @returns A list
#' @export
read_azure_jsongz <- function(container, file, path = "/", info = NULL, ...) {
full_path <- check_blob_exists(container, file, "json.gz", info, path)
read_azure_jsongz <- function(container, file, ...) {
  # Gzipped JSON goes via a temp file on disk, as yyjsonr reads compressed
  # input from a file path rather than from a raw vector. The tempfile is
  # cleaned up automatically when the calling frame exits (withr).
  # NB `fileext` is appended verbatim by tempfile(), so it must include the
  # leading dot — without it the file would be named e.g. "foojson.gz"
  dl <- withr::local_tempfile(
    pattern = tools::file_path_sans_ext(basename(file), compression = TRUE),
    fileext = ".json.gz"
  )
  AzureStor::download_blob(container, src = file, dest = dl)
  yyjsonr::read_json_file(dl, ...)
}

Expand All @@ -78,12 +57,11 @@ read_azure_jsongz <- function(container, file, path = "/", info = NULL, ...) {
# If nothing is provided here, the compression type will be set to "none".
#' @returns The data object that was stored in the rds file
#' @export
read_azure_rds <- function(container, file, ...) {
  # Default the (de)compression `type` to "none" — the standard on SU
  # Azure — unless the caller supplied their own via `...`:
  # `.homonyms = "first"` keeps the caller's value when both are present
  args <- rlang::dots_list(..., type = "none", .homonyms = "first")
  rlang::exec(AzureStor::storage_load_rds, container, file, !!!args)
}


Expand All @@ -93,71 +71,19 @@ read_azure_rds <- function(container, file, path = "/", info = NULL, ...) {
#' @param ... optional arguments to be passed through to [readr::read_delim]
#' @returns A tibble
#' @export
read_azure_csv <- function(container, file, ...) {
  # Thin wrapper: AzureStor performs the download and the CSV parsing in
  # a single step, forwarding `...` on to the underlying reader
  AzureStor::storage_read_csv(container, file = file, ...)
}


#' Read any file from Azure storage
#'
#' @inheritParams read_azure_parquet
#' @param ext If a custom extension needs to be supplied, you can specify it
#' here. If `NULL`, the default, the extension of `file` will be used
#' @param ... optional arguments to be passed through to
#' [AzureStor::download_blob]
#' @returns A raw data stream
#' @export
read_azure_file <- function(
container,
file,
path = "/",
info = NULL,
ext = NULL,
...
) {
ext <- ext %||% tools::file_ext(file)
check_blob_exists(container, file, ext, info, path) |>
# using `dest = NULL` means pass the data through as a raw vector
AzureStor::download_blob(container, src = _, dest = NULL, ...)
}


#' Ensures that the filepath for the file to read exists
#'
#' @inheritParams read_azure_parquet
#' @param ext The standard file extension for the file type, e.g. "json"
#' @keywords internal
check_blob_exists <- function(container, file, ext, info, path) {
stopifnot("no container found" = inherits(container, "blob_container"))
path <- if (path %in% c("", "/")) "" else path
stopifnot("path not found" = AzureStor::blob_dir_exists(container, path))
dir_name <- if (dirname(file) == ".") "" else dirname(file)
# Potentially the user could provide a partial file path in `path` and a
# further sub-directory as part of `file`. This handles that eventuality,
# though this usage pattern should be quite rare!
dpath <- file.path(path, dir_name)
fname <- basename(file)
if (nzchar(ext) && !gregg(fname, "\\.{ext}$")) {
fname <- glue::glue("{fname}.{ext}")
}
# remove duplicate slashes and any initial slashes
file_path <- sub("^/", "", gsub("/+", "/", file.path(dpath, fname)))

filepath_out <- AzureStor::list_blobs(container, dpath, recursive = FALSE) |>
dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |>
dplyr::pull("name")

msg1 <- ct_error_msg("no matching {ext} file found")
msg2 <- cst_error_msg("multiple matching {ext} files found")
check_that(filepath_out, \(x) length(x) > 0, msg1) # check length > 0
check_scalar_type(filepath_out, "character", msg2) # check length == 1

info_option <- getOption("azkit.info")
stopifnot(rlang::is_scalar_logical(info) || is.null(info))
stopifnot(rlang::is_scalar_logical(info_option) || is.null(info_option))
if (info %||% info_option %||% rlang::is_interactive()) {
cli::cli_alert_info("File {.val {filepath_out}} will be read in")
}
filepath_out
read_azure_file <- function(container, file, ...) {
  # Returning the raw vector (via `dest = NULL`) lets callers decide how
  # to parse or persist the downloaded bytes themselves
  raw_out <- AzureStor::download_blob(container, src = file, dest = NULL, ...)
  raw_out
}
34 changes: 0 additions & 34 deletions man/check_blob_exists.Rd

This file was deleted.

19 changes: 2 additions & 17 deletions man/read_azure_csv.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 2 additions & 20 deletions man/read_azure_file.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 2 additions & 17 deletions man/read_azure_json.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 2 additions & 17 deletions man/read_azure_jsongz.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 2 additions & 22 deletions man/read_azure_parquet.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 2 additions & 17 deletions man/read_azure_rds.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading