From ce06351c3a6803deb3d2b8732ce79ad75cbe91e0 Mon Sep 17 00:00:00 2001 From: Fran Barton Date: Thu, 26 Mar 2026 15:11:27 +0000 Subject: [PATCH] :fire: Remove `path` variable from `read_azure_*` functions Should close #98 and closes #95 --- R/read_azure_files.R | 108 ++++++-------------------------------- man/check_blob_exists.Rd | 34 ------------ man/read_azure_csv.Rd | 19 +------ man/read_azure_file.Rd | 22 +------- man/read_azure_json.Rd | 19 +------ man/read_azure_jsongz.Rd | 19 +------ man/read_azure_parquet.Rd | 24 +-------- man/read_azure_rds.Rd | 19 +------ 8 files changed, 29 insertions(+), 235 deletions(-) delete mode 100644 man/check_blob_exists.Rd diff --git a/R/read_azure_files.R b/R/read_azure_files.R index 70c5d5e..5c89b43 100644 --- a/R/read_azure_files.R +++ b/R/read_azure_files.R @@ -1,35 +1,16 @@ #' Read a parquet file from Azure storage #' #' @param container An Azure container object, as returned by [get_container] -#' @param file The name of the file to be read, as a string. NB The file -#' extension does not need to be included (though it can be). The function -#' will error if multiple files are somehow matched. -#' @param path The path to the directory where `file` is located, as a string. -#' Only needed if `file` does not already contain its full path. If file is -#' just a file name with no path, then provide the path to the directory here. -#' This must be the full path to the file location, as the function will not -#' search into subdirectories recursively. Set to `"/"` (the root of the -#' container) by default. -#' @param info Boolean. Whether to print user feedback about the file that is -#' being read. Useful for checking the function is doing what is expected, but -#' can be turned off with `FALSE`. Can be set persistently with the option -#' "azkit.info". If `NULL` then it will default to the value of -#' [rlang::is_interactive] (that is, `TRUE` for interactive sessions). +#' @param file string The path to the file to be read. #' @param ... optional arguments to be passed through to [arrow::read_parquet] #' @returns A tibble #' @examples \dontrun{ -#' # if a full filepath is available then path can be ignored #' read_azure_parquet(cont, "data/folder/path/1.parquet") -#' # you can provide a filename without the '.parquet' extension -#' # if you wish to use this partial file name matching it is probably easier -#' # to provide a 'path' -#' read_azure_parquet(cont, "case_details", "storage/parquet/2025/06/29") #' } #' @export -read_azure_parquet <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "parquet", info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL) |> +read_azure_parquet <- function(container, file, ...) { + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, file, dest = NULL) |> arrow::read_parquet(...) } @@ -41,10 +22,9 @@ read_azure_parquet <- function(container, file, path = "/", info = NULL, ...) { #' [yyjsonr::read_json_raw] #' @returns A list #' @export -read_azure_json <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "json", info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL) |> +read_azure_json <- function(container, file, ...) { + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, file, dest = NULL) |> yyjsonr::read_json_raw(...) } @@ -56,13 +36,12 @@ read_azure_json <- function(container, file, path = "/", info = NULL, ...) { #' [yyjsonr::read_json_file] #' @returns A list #' @export -read_azure_jsongz <- function(container, file, path = "/", info = NULL, ...) { - full_path <- check_blob_exists(container, file, "json.gz", info, path) +read_azure_jsongz <- function(container, file, ...) { dl <- withr::local_tempfile( - pattern = tools::file_path_sans_ext(basename(full_path), TRUE), + pattern = tools::file_path_sans_ext(basename(file), TRUE), fileext = "json.gz" ) - AzureStor::download_blob(container, src = full_path, dest = dl) + AzureStor::download_blob(container, file, dest = dl) yyjsonr::read_json_file(dl, ...) } @@ -78,12 +57,11 @@ read_azure_jsongz <- function(container, file, path = "/", info = NULL, ...) { # If nothing is provided here, the compression type will be set to "none". #' @returns The data object that was stored in the rds file #' @export -read_azure_rds <- function(container, file, path = "/", info = NULL, ...) { +read_azure_rds <- function(container, file, ...) { # If the user doesn't specify a (de)compression type with `type` in `...`, we # will set a `type` of "none", as this seems to be the standard on SU Azure dots <- rlang::dots_list(..., type = "none", .homonyms = "first") - blob <- check_blob_exists(container, file, "rds", info, path) - rlang::inject(AzureStor::storage_load_rds(container, blob, !!!dots)) + rlang::inject(AzureStor::storage_load_rds(container, file, !!!dots)) } @@ -93,71 +71,19 @@ read_azure_rds <- function(container, file, path = "/", info = NULL, ...) { #' @param ... optional arguments to be passed through to [readr::read_delim] #' @returns A tibble #' @export -read_azure_csv <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "csv", info, path) |> - AzureStor::storage_read_csv(container, file = _, ...) +read_azure_csv <- function(container, file, ...) { + AzureStor::storage_read_csv(container, file, ...) } #' Read any file from Azure storage #' #' @inheritParams read_azure_parquet -#' @param ext If a custom extension needs to be supplied, you can specify it -#' here. If `NULL`, the default, the extension of `file` will be used #' @param ... optional arguments to be passed through to #' [AzureStor::download_blob] #' @returns A raw data stream #' @export -read_azure_file <- function( - container, - file, - path = "/", - info = NULL, - ext = NULL, - ... -) { - ext <- ext %||% tools::file_ext(file) - check_blob_exists(container, file, ext, info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL, ...) -} - - -#' Ensures that the filepath for the file to read exists -#' -#' @inheritParams read_azure_parquet -#' @param ext The standard file extension for the file type, e.g. "json" -#' @keywords internal -check_blob_exists <- function(container, file, ext, info, path) { - stopifnot("no container found" = inherits(container, "blob_container")) - path <- if (path %in% c("", "/")) "" else path - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - dir_name <- if (dirname(file) == ".") "" else dirname(file) - # Potentially the user could provide a partial file path in `path` and a - # further sub-directory as part of `file`. This handles that eventuality, - # though this usage pattern should be quite rare! - dpath <- file.path(path, dir_name) - fname <- basename(file) - if (nzchar(ext) && !gregg(fname, "\\.{ext}$")) { - fname <- glue::glue("{fname}.{ext}") - } - # remove duplicate slashes and any initial slashes - file_path <- sub("^/", "", gsub("/+", "/", file.path(dpath, fname))) - - filepath_out <- AzureStor::list_blobs(container, dpath, recursive = FALSE) |> - dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> - dplyr::pull("name") - - msg1 <- ct_error_msg("no matching {ext} file found") - msg2 <- cst_error_msg("multiple matching {ext} files found") - check_that(filepath_out, \(x) length(x) > 0, msg1) # check length > 0 - check_scalar_type(filepath_out, "character", msg2) # check length == 1 - - info_option <- getOption("azkit.info") - stopifnot(rlang::is_scalar_logical(info) || is.null(info)) - stopifnot(rlang::is_scalar_logical(info_option) || is.null(info_option)) - if (info %||% info_option %||% rlang::is_interactive()) { - cli::cli_alert_info("File {.val {filepath_out}} will be read in") - } - filepath_out +read_azure_file <- function(container, file, ...) { + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, file, dest = NULL, ...) } diff --git a/man/check_blob_exists.Rd b/man/check_blob_exists.Rd deleted file mode 100644 index 472231a..0000000 --- a/man/check_blob_exists.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read_azure_files.R -\name{check_blob_exists} -\alias{check_blob_exists} -\title{Ensures that the filepath for the file to read exists} -\usage{ -check_blob_exists(container, file, ext, info, path) -} -\arguments{ -\item{container}{An Azure container object, as returned by \link{get_container}} - -\item{file}{The name of the file to be read, as a string. NB The file -extension does not need to be included (though it can be). The function -will error if multiple files are somehow matched.} - -\item{ext}{The standard file extension for the file type, e.g. "json"} - -\item{info}{Boolean. Whether to print user feedback about the file that is -being read. Useful for checking the function is doing what is expected, but -can be turned off with \code{FALSE}. Can be set persistently with the option -"azkit.info". If \code{NULL} then it will default to the value of -\link[rlang:is_interactive]{rlang::is_interactive} (that is, \code{TRUE} for interactive sessions).} - -\item{path}{The path to the directory where \code{file} is located, as a string. -Only needed if \code{file} does not already contain its full path. If file is -just a file name with no path, then provide the path to the directory here. -This must be the full path to the file location, as the function will not -search into subdirectories recursively. Set to \code{"/"} (the root of the -container) by default.} -} -\description{ -Ensures that the filepath for the file to read exists -} -\keyword{internal} diff --git a/man/read_azure_csv.Rd b/man/read_azure_csv.Rd index 29f55ed..d4ef7da 100644 --- a/man/read_azure_csv.Rd +++ b/man/read_azure_csv.Rd @@ -4,27 +4,12 @@ \alias{read_azure_csv} \title{Read a csv file from Azure storage} \usage{ -read_azure_csv(container, file, path = "/", info = NULL, ...) +read_azure_csv(container, file, ...) } \arguments{ \item{container}{An Azure container object, as returned by \link{get_container}} -\item{file}{The name of the file to be read, as a string. NB The file -extension does not need to be included (though it can be). The function -will error if multiple files are somehow matched.} - -\item{path}{The path to the directory where \code{file} is located, as a string. -Only needed if \code{file} does not already contain its full path. If file is -just a file name with no path, then provide the path to the directory here. -This must be the full path to the file location, as the function will not -search into subdirectories recursively. Set to \code{"/"} (the root of the -container) by default.} - -\item{info}{Boolean. Whether to print user feedback about the file that is -being read. Useful for checking the function is doing what is expected, but -can be turned off with \code{FALSE}. Can be set persistently with the option -"azkit.info". If \code{NULL} then it will default to the value of -\link[rlang:is_interactive]{rlang::is_interactive} (that is, \code{TRUE} for interactive sessions).} +\item{file}{string The path to the file to be read.} \item{...}{optional arguments to be passed through to \link[readr:read_delim]{readr::read_delim}} } diff --git a/man/read_azure_file.Rd b/man/read_azure_file.Rd index eaa51ea..b61ce53 100644 --- a/man/read_azure_file.Rd +++ b/man/read_azure_file.Rd @@ -4,30 +4,12 @@ \alias{read_azure_file} \title{Read any file from Azure storage} \usage{ -read_azure_file(container, file, path = "/", info = NULL, ext = NULL, ...) +read_azure_file(container, file, ...) } \arguments{ \item{container}{An Azure container object, as returned by \link{get_container}} -\item{file}{The name of the file to be read, as a string. NB The file -extension does not need to be included (though it can be). The function -will error if multiple files are somehow matched.} - -\item{path}{The path to the directory where \code{file} is located, as a string. -Only needed if \code{file} does not already contain its full path. If file is -just a file name with no path, then provide the path to the directory here. -This must be the full path to the file location, as the function will not -search into subdirectories recursively. Set to \code{"/"} (the root of the -container) by default.} - -\item{info}{Boolean. Whether to print user feedback about the file that is -being read. Useful for checking the function is doing what is expected, but -can be turned off with \code{FALSE}. Can be set persistently with the option -"azkit.info". If \code{NULL} then it will default to the value of -\link[rlang:is_interactive]{rlang::is_interactive} (that is, \code{TRUE} for interactive sessions).} - -\item{ext}{If a custom extension needs to be supplied, you can specify it -here. If \code{NULL}, the default, the extension of \code{file} will be used} +\item{file}{string The path to the file to be read.} \item{...}{optional arguments to be passed through to \link[AzureStor:blob]{AzureStor::download_blob}} diff --git a/man/read_azure_json.Rd b/man/read_azure_json.Rd index 2df1630..0f57df3 100644 --- a/man/read_azure_json.Rd +++ b/man/read_azure_json.Rd @@ -4,27 +4,12 @@ \alias{read_azure_json} \title{Read a json file from Azure storage} \usage{ -read_azure_json(container, file, path = "/", info = NULL, ...) +read_azure_json(container, file, ...) } \arguments{ \item{container}{An Azure container object, as returned by \link{get_container}} -\item{file}{The name of the file to be read, as a string. NB The file -extension does not need to be included (though it can be). The function -will error if multiple files are somehow matched.} - -\item{path}{The path to the directory where \code{file} is located, as a string. -Only needed if \code{file} does not already contain its full path. If file is -just a file name with no path, then provide the path to the directory here. -This must be the full path to the file location, as the function will not -search into subdirectories recursively. Set to \code{"/"} (the root of the -container) by default.} - -\item{info}{Boolean. Whether to print user feedback about the file that is -being read. Useful for checking the function is doing what is expected, but -can be turned off with \code{FALSE}. Can be set persistently with the option -"azkit.info". If \code{NULL} then it will default to the value of -\link[rlang:is_interactive]{rlang::is_interactive} (that is, \code{TRUE} for interactive sessions).} +\item{file}{string The path to the file to be read.} \item{...}{optional arguments to be passed through to \link[yyjsonr:read_json_raw]{yyjsonr::read_json_raw}} diff --git a/man/read_azure_jsongz.Rd b/man/read_azure_jsongz.Rd index b44a541..f4c4251 100644 --- a/man/read_azure_jsongz.Rd +++ b/man/read_azure_jsongz.Rd @@ -4,27 +4,12 @@ \alias{read_azure_jsongz} \title{Read a json.gz file from Azure storage} \usage{ -read_azure_jsongz(container, file, path = "/", info = NULL, ...) +read_azure_jsongz(container, file, ...) } \arguments{ \item{container}{An Azure container object, as returned by \link{get_container}} -\item{file}{The name of the file to be read, as a string. NB The file -extension does not need to be included (though it can be). The function -will error if multiple files are somehow matched.} - -\item{path}{The path to the directory where \code{file} is located, as a string. -Only needed if \code{file} does not already contain its full path. If file is -just a file name with no path, then provide the path to the directory here. -This must be the full path to the file location, as the function will not -search into subdirectories recursively. Set to \code{"/"} (the root of the -container) by default.} - -\item{info}{Boolean. Whether to print user feedback about the file that is -being read. Useful for checking the function is doing what is expected, but -can be turned off with \code{FALSE}. Can be set persistently with the option -"azkit.info". If \code{NULL} then it will default to the value of -\link[rlang:is_interactive]{rlang::is_interactive} (that is, \code{TRUE} for interactive sessions).} +\item{file}{string The path to the file to be read.} \item{...}{optional arguments to be passed through to \link[yyjsonr:read_json_file]{yyjsonr::read_json_file}} diff --git a/man/read_azure_parquet.Rd b/man/read_azure_parquet.Rd index 2bee797..bb47808 100644 --- a/man/read_azure_parquet.Rd +++ b/man/read_azure_parquet.Rd @@ -4,27 +4,12 @@ \alias{read_azure_parquet} \title{Read a parquet file from Azure storage} \usage{ -read_azure_parquet(container, file, path = "/", info = NULL, ...) +read_azure_parquet(container, file, ...) } \arguments{ \item{container}{An Azure container object, as returned by \link{get_container}} -\item{file}{The name of the file to be read, as a string. NB The file -extension does not need to be included (though it can be). The function -will error if multiple files are somehow matched.} - -\item{path}{The path to the directory where \code{file} is located, as a string. -Only needed if \code{file} does not already contain its full path. If file is -just a file name with no path, then provide the path to the directory here. -This must be the full path to the file location, as the function will not -search into subdirectories recursively. Set to \code{"/"} (the root of the -container) by default.} - -\item{info}{Boolean. Whether to print user feedback about the file that is -being read. Useful for checking the function is doing what is expected, but -can be turned off with \code{FALSE}. Can be set persistently with the option -"azkit.info". If \code{NULL} then it will default to the value of -\link[rlang:is_interactive]{rlang::is_interactive} (that is, \code{TRUE} for interactive sessions).} +\item{file}{string The path to the file to be read.} \item{...}{optional arguments to be passed through to \link[arrow:read_parquet]{arrow::read_parquet}} } @@ -36,11 +21,6 @@ Read a parquet file from Azure storage } \examples{ \dontrun{ - # if a full filepath is available then path can be ignored read_azure_parquet(cont, "data/folder/path/1.parquet") - # you can provide a filename without the '.parquet' extension - # if you wish to use this partial file name matching it is probably easier - # to provide a 'path' - read_azure_parquet(cont, "case_details", "storage/parquet/2025/06/29") } } diff --git a/man/read_azure_rds.Rd b/man/read_azure_rds.Rd index 8f3e0e4..d74d13a 100644 --- a/man/read_azure_rds.Rd +++ b/man/read_azure_rds.Rd @@ -4,27 +4,12 @@ \alias{read_azure_rds} \title{Read an rds file from Azure storage} \usage{ -read_azure_rds(container, file, path = "/", info = NULL, ...) +read_azure_rds(container, file, ...) } \arguments{ \item{container}{An Azure container object, as returned by \link{get_container}} -\item{file}{The name of the file to be read, as a string. NB The file -extension does not need to be included (though it can be). The function -will error if multiple files are somehow matched.} - -\item{path}{The path to the directory where \code{file} is located, as a string. -Only needed if \code{file} does not already contain its full path. If file is -just a file name with no path, then provide the path to the directory here. -This must be the full path to the file location, as the function will not -search into subdirectories recursively. Set to \code{"/"} (the root of the -container) by default.} - -\item{info}{Boolean. Whether to print user feedback about the file that is -being read. Useful for checking the function is doing what is expected, but -can be turned off with \code{FALSE}. Can be set persistently with the option -"azkit.info". If \code{NULL} then it will default to the value of -\link[rlang:is_interactive]{rlang::is_interactive} (that is, \code{TRUE} for interactive sessions).} +\item{file}{string The path to the file to be read.} \item{...}{optional arguments to be passed through to \link[AzureStor:storage_save]{AzureStor::storage_load_rds}. For example, a compression type (one of