Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
^CODEOWNERS$
^\.github$
^\.lintr$
^[.]?air[.]toml$
592 changes: 317 additions & 275 deletions R/get_auth_token.R

Large diffs are not rendered by default.

166 changes: 83 additions & 83 deletions R/get_container.R
Original file line number Diff line number Diff line change
@@ -1,83 +1,83 @@
#' Get Azure storage container
#'
#' Returns the blob container called `container_name` from an Azure storage
#' endpoint. The default endpoint URL is read from the environment variable
#' "AZ_STORAGE_EP", which should therefore be set.
#' Use [list_container_names] to get a list of available container names.
#'
#' @param container_name Name of the container as a string. `NULL` by default,
#'  which means the function will look instead for a container name stored in
#'  the environment variable "AZ_CONTAINER"
#' @param token An Azure authentication token. If left as `NULL`, a token
#'  returned by [get_auth_token] will be used
#' @param endpoint_url An Azure endpoint URL. If left as `NULL`, the default,
#'  the value of the environment variable "AZ_STORAGE_EP" will be used
#' @param ... arguments to be passed through to [get_auth_token], if a token is
#'  not already supplied
#' @returns An Azure blob container (list object of class "blob_container")
#' @export
get_container <- function(
  container_name = NULL,
  token = NULL,
  endpoint_url = NULL,
  ...
) {
  # Both "empty value" errors share the same closing hint.
  envvar_hint <- "Did you forget to set an environment variable?"
  name_msg <- paste0("{.var container_name} is empty. ", envvar_hint)
  url_msg <- paste0("{.var endpoint_url} is empty. ", envvar_hint)

  # Explicit arguments win; otherwise fall back to the environment variables.
  # Validation order matters: container name, then endpoint URL, then token.
  container_name <- check_nzchar(
    container_name %||% check_envvar("AZ_CONTAINER"),
    name_msg
  )
  endpoint_url <- check_nzchar(
    endpoint_url %||% check_envvar("AZ_STORAGE_EP"),
    url_msg
  )
  token <- token %||% get_auth_token(...)

  endpoint <- get_azure_endpoint(token, endpoint_url)
  AzureStor::blob_container(endpoint, container_name)
}
#' Return a list of container names that are found at the endpoint
#'
#' Stops with the error "no containers found" if the endpoint lists no
#' containers at all.
#'
#' @inheritParams get_container
#' @returns A character vector of all container names found
#' @export
list_container_names <- function(token = NULL, endpoint_url = NULL, ...) {
  token <- token %||% get_auth_token(...)
  containers <- get_azure_endpoint(token, endpoint_url) |>
    AzureStor::list_blob_containers()
  stopifnot("no containers found" = length(containers) >= 1L)
  # The listing is named by container; only the names are returned.
  names(containers)
}
#' Return an Azure "blob_endpoint"
#'
#' Unless `endpoint_url` is supplied, the endpoint returned is the one named in
#' the environment variable "AZ_STORAGE_EP"
#'
#' @inheritParams get_container
#' @returns An Azure blob endpoint (object of class "blob_endpoint")
#' @keywords internal
get_azure_endpoint <- function(token = NULL, endpoint_url = NULL, ...) {
  # Only fetch a token when the caller has not supplied one.
  if (is.null(token)) {
    token <- get_auth_token(...)
  }
  # Only consult the environment when no URL was passed in.
  if (is.null(endpoint_url)) {
    endpoint_url <- check_envvar("AZ_STORAGE_EP")
  }
  AzureStor::blob_endpoint(endpoint_url, token = token)
}
#' Check that an environment variable exists
#'
#' Looks up the environment variable named in `x`. If it is found, its value
#' (as given by `Sys.getenv(x)`) is returned; if not, a helpful error is
#' raised instead.
#'
#' @param x the *name* of the environment variable to be found and checked
#' @returns the value of the environment variable named in `x`
#' @export
check_envvar <- function(x) {
  unset_msg <- cst_error_msg(
    "The environment variable {.envvar {x}} is not set"
  )
  # An unset variable comes back as NA rather than "", so that the check below
  # can tell "not set" apart from "set but empty".
  check_scalar_type(Sys.getenv(x, NA_character_), "string", unset_msg)
}
#' Get Azure storage container
#'
#' Returns the blob container called `container_name` from an Azure storage
#' endpoint. The default endpoint URL is read from the environment variable
#' "AZ_STORAGE_EP", which should therefore be set.
#' Use [list_container_names] to get a list of available container names.
#'
#' @param container_name Name of the container as a string. `NULL` by default,
#'  which means the function will look instead for a container name stored in
#'  the environment variable "AZ_CONTAINER"
#' @param token An Azure authentication token. If left as `NULL`, a token
#'  returned by [get_auth_token] will be used
#' @param endpoint_url An Azure endpoint URL. If left as `NULL`, the default,
#'  the value of the environment variable "AZ_STORAGE_EP" will be used
#' @param ... arguments to be passed through to [get_auth_token], if a token is
#'  not already supplied
#' @returns An Azure blob container (list object of class "blob_container")
#' @export
get_container <- function(
  container_name = NULL,
  token = NULL,
  endpoint_url = NULL,
  ...
) {
  # Both "empty value" errors share the same closing hint.
  envvar_hint <- "Did you forget to set an environment variable?"
  name_msg <- paste0("{.var container_name} is empty. ", envvar_hint)
  url_msg <- paste0("{.var endpoint_url} is empty. ", envvar_hint)

  # Explicit arguments win; otherwise fall back to the environment variables.
  # Validation order matters: container name, then endpoint URL, then token.
  container_name <- check_nzchar(
    container_name %||% check_envvar("AZ_CONTAINER"),
    name_msg
  )
  endpoint_url <- check_nzchar(
    endpoint_url %||% check_envvar("AZ_STORAGE_EP"),
    url_msg
  )
  token <- token %||% get_auth_token(...)

  endpoint <- get_azure_endpoint(token, endpoint_url)
  AzureStor::blob_container(endpoint, container_name)
}


#' Return a list of container names that are found at the endpoint
#'
#' Stops with the error "no containers found" if the endpoint lists no
#' containers at all.
#'
#' @inheritParams get_container
#' @returns A character vector of all container names found
#' @export
list_container_names <- function(token = NULL, endpoint_url = NULL, ...) {
  token <- token %||% get_auth_token(...)
  containers <- get_azure_endpoint(token, endpoint_url) |>
    AzureStor::list_blob_containers()
  stopifnot("no containers found" = length(containers) >= 1L)
  # The listing is named by container; only the names are returned.
  names(containers)
}


#' Return an Azure "blob_endpoint"
#'
#' Unless `endpoint_url` is supplied, the endpoint returned is the one named in
#' the environment variable "AZ_STORAGE_EP"
#'
#' @inheritParams get_container
#' @returns An Azure blob endpoint (object of class "blob_endpoint")
#' @keywords internal
get_azure_endpoint <- function(token = NULL, endpoint_url = NULL, ...) {
  # Only fetch a token when the caller has not supplied one.
  if (is.null(token)) {
    token <- get_auth_token(...)
  }
  # Only consult the environment when no URL was passed in.
  if (is.null(endpoint_url)) {
    endpoint_url <- check_envvar("AZ_STORAGE_EP")
  }
  AzureStor::blob_endpoint(endpoint_url, token = token)
}


#' Check that an environment variable exists
#'
#' Looks up the environment variable named in `x`. If it is found, its value
#' (as given by `Sys.getenv(x)`) is returned; if not, a helpful error is
#' raised instead.
#'
#' @param x the *name* of the environment variable to be found and checked
#' @returns the value of the environment variable named in `x`
#' @export
check_envvar <- function(x) {
  unset_msg <- cst_error_msg(
    "The environment variable {.envvar {x}} is not set"
  )
  # An unset variable comes back as NA rather than "", so that the check below
  # can tell "not set" apart from "set but empty".
  check_scalar_type(Sys.getenv(x, NA_character_), "string", unset_msg)
}
114 changes: 57 additions & 57 deletions R/list_files.R
Original file line number Diff line number Diff line change
@@ -1,57 +1,57 @@
#' List files in a container
#'
#' Lists all files found in a container, recursively unless told otherwise.
#' The search can be limited to one 'subdirectory' of the container, and/or to
#' files with a given extension. It is assumed that every file name ends with
#' a ".ext" extension of some sort.
#'
#' Filtering is by file extension only; filtering by file name is not
#' supported.
#'
#' The character vector returned holds the full paths to the files, so it can
#' be handed on to a `read_azure_*` function, or filtered further by you. Use
#' [basename()] if you only want the file names without the folder path.
#'
#' If no files are found, an informational message is shown (in interactive
#' sessions only) and an empty character vector is returned invisibly.
#'
#' @inheritParams read_azure_parquet
#' @param path (optional) subdirectory of the container to list files within.
#'  `""` (the root folder of the container) by default
#' @param ext (optional) A string giving the extension of a particular file type
#'  you want to restrict the list to. No need to include the initial ".". The
#'  default, `""`, means no filtering by file extension will be applied. Can be
#'  a regular expression.
#' @param recursive A Boolean value: whether to list files recursively. `TRUE`
#'  by default
#'
#' @importFrom rlang .data
#' @returns A vector of file names, or an empty character vector if none found
#' @examples \dontrun{
#' list_files(get_container("example"), ext = "csv")
#' }
#' @export
list_files <- function(container, path = "", ext = "", recursive = TRUE) {
  stopifnot(rlang::is_character(c(path, ext), 2))
  stopifnot(rlang::is_bool(recursive))
  pnf_msg <- ct_error_msg("Path {.val {path}} not found")
  check_that(path, \(x) AzureStor::blob_dir_exists(container, x), pnf_msg)

  tbl <- AzureStor::list_blobs(container, path, recursive = recursive)
  if (nrow(tbl) > 0) {
    # Drop any leading dots from `ext`; with no extension given, match any.
    # NOTE(review): `ext_rx` is referenced by name inside the pattern template
    # passed to gregg(), so presumably it must keep this name - confirm.
    ext_rx <- if (nzchar(ext)) sub("^\\.+", "", ext) else ".*" # nolint
    tbl <- dplyr::filter(
      tbl,
      !.data[["isdir"]] & gregg(.data[["name"]], "\\.{ext_rx}$")
    )
  }

  # Something survived the listing and the filter: return the file paths.
  if (nrow(tbl) > 0) {
    return(tbl[["name"]])
  }

  # A zero-row tbl arises either because nothing was listed under `path`, or
  # because the filter removed every row. Both routes are reported alike.
  # fix_path() adds a leading "/" to a path lacking one, then blanks out a
  # path made up solely of slashes.
  fix_path <- \(p) sub("^/+$", "", sub("^([^/])(.*)", "/\\1\\2", p)) # nolint
  ext <- if (nzchar(ext)) paste0(" ", ext)
  msg <- "No{ext} files found in {.val [{container$name}]:{fix_path(path)}}"
  if (rlang::is_interactive()) {
    cli::cli_alert_info(msg)
  }
  invisible(character(0))
}
#' List files in a container
#'
#' Lists all files found in a container, recursively unless told otherwise.
#' The search can be limited to one 'subdirectory' of the container, and/or to
#' files with a given extension. It is assumed that every file name ends with
#' a ".ext" extension of some sort.
#'
#' Filtering is by file extension only; filtering by file name is not
#' supported.
#'
#' The character vector returned holds the full paths to the files, so it can
#' be handed on to a `read_azure_*` function, or filtered further by you. Use
#' [basename()] if you only want the file names without the folder path.
#'
#' If no files are found, an informational message is shown (in interactive
#' sessions only) and an empty character vector is returned invisibly.
#'
#' @inheritParams read_azure_parquet
#' @param path (optional) subdirectory of the container to list files within.
#'  `""` (the root folder of the container) by default
#' @param ext (optional) A string giving the extension of a particular file type
#'  you want to restrict the list to. No need to include the initial ".". The
#'  default, `""`, means no filtering by file extension will be applied. Can be
#'  a regular expression.
#' @param recursive A Boolean value: whether to list files recursively. `TRUE`
#'  by default
#'
#' @importFrom rlang .data
#' @returns A vector of file names, or an empty character vector if none found
#' @examples \dontrun{
#' list_files(get_container("example"), ext = "csv")
#' }
#' @export
list_files <- function(container, path = "", ext = "", recursive = TRUE) {
  stopifnot(rlang::is_character(c(path, ext), 2))
  stopifnot(rlang::is_bool(recursive))
  pnf_msg <- ct_error_msg("Path {.val {path}} not found")
  check_that(path, \(x) AzureStor::blob_dir_exists(container, x), pnf_msg)

  tbl <- AzureStor::list_blobs(container, path, recursive = recursive)
  if (nrow(tbl) > 0) {
    # Drop any leading dots from `ext`; with no extension given, match any.
    # NOTE(review): `ext_rx` is referenced by name inside the pattern template
    # passed to gregg(), so presumably it must keep this name - confirm.
    ext_rx <- if (nzchar(ext)) sub("^\\.+", "", ext) else ".*" # nolint
    tbl <- dplyr::filter(
      tbl,
      !.data[["isdir"]] & gregg(.data[["name"]], "\\.{ext_rx}$")
    )
  }

  # Something survived the listing and the filter: return the file paths.
  if (nrow(tbl) > 0) {
    return(tbl[["name"]])
  }

  # A zero-row tbl arises either because nothing was listed under `path`, or
  # because the filter removed every row. Both routes are reported alike.
  # fix_path() adds a leading "/" to a path lacking one, then blanks out a
  # path made up solely of slashes.
  fix_path <- \(p) sub("^/+$", "", sub("^([^/])(.*)", "/\\1\\2", p)) # nolint
  ext <- if (nzchar(ext)) paste0(" ", ext)
  msg <- "No{ext} files found in {.val [{container$name}]:{fix_path(path)}}"
  if (rlang::is_interactive()) {
    cli::cli_alert_info(msg)
  }
  invisible(character(0))
}
Loading
Loading