From 1ffb07d466df74c4b3a1aa9d4beb04509746b7b8 Mon Sep 17 00:00:00 2001 From: Tom Jemmett Date: Fri, 13 Mar 2026 15:47:09 +0000 Subject: [PATCH 1/2] runs air format . --- R/get_auth_token.R | 273 ++++++------ R/get_container.R | 61 ++- R/helpers.R | 125 +++--- R/list_files.R | 46 +- R/read_azure_files.R | 126 +++--- R/read_azure_table.R | 44 +- tests/testthat/test-azkit_helpers.R | 138 +++--- tests/testthat/test-get_auth_token.R | 51 ++- tests/testthat/test-get_container.R | 64 +-- tests/testthat/test-list_files.R | 137 +++--- tests/testthat/test-read_azure_files.R | 590 ++++++++++++------------- 11 files changed, 814 insertions(+), 841 deletions(-) diff --git a/R/get_auth_token.R b/R/get_auth_token.R index 9e8a0b8..8f84bd3 100644 --- a/R/get_auth_token.R +++ b/R/get_auth_token.R @@ -61,162 +61,158 @@ #' } #' @export get_auth_token <- function( - resource = generate_resource(), - tenant = "common", - client_id = NULL, - auth_method = "authorization_code", - aad_version = 1, - force_refresh = FALSE, - ... + resource = generate_resource(), + tenant = "common", + client_id = NULL, + auth_method = "authorization_code", + aad_version = 1, + force_refresh = FALSE, + ... ) { - aad_msg <- "Invalid {.arg aad_version} variable supplied (must be 1 or 2)" - aad_version <- check_that(aad_version, \(x) x %in% seq(2), aad_msg) + aad_msg <- "Invalid {.arg aad_version} variable supplied (must be 1 or 2)" + aad_version <- check_that(aad_version, \(x) x %in% seq(2), aad_msg) - safely_get_token <- \(...) purrr::safely(AzureAuth::get_azure_token)(...) - get_azure_token <- purrr::partial( - safely_get_token, - resource = resource, - version = aad_version - ) - possibly_get_mtk <- \(...) purrr::possibly(AzureAuth::get_managed_token)(...) + safely_get_token <- \(...) purrr::safely(AzureAuth::get_azure_token)(...) + get_azure_token <- purrr::partial( + safely_get_token, + resource = resource, + version = aad_version + ) + possibly_get_mtk <- \(...) purrr::possibly(AzureAuth::get_managed_token)(...) - dots <- rlang::list2(...) - # If the user specifies force_refresh = TRUE we turn off `use_cache`, - # otherwise we leave `use_cache` as it is (or as `NULL`, its default value) - use_cached <- !force_refresh && (dots[["use_cache"]] %||% TRUE) - dots <- rlang::dots_list(!!!dots, use_cache = use_cached, .homonyms = "last") + dots <- rlang::list2(...) + # If the user specifies force_refresh = TRUE we turn off `use_cache`, + # otherwise we leave `use_cache` as it is (or as `NULL`, its default value) + use_cached <- !force_refresh && (dots[["use_cache"]] %||% TRUE) + dots <- rlang::dots_list(!!!dots, use_cache = use_cached, .homonyms = "last") - # We have 4 approaches to get a token, depending on the context - # 1. Use environment variables if all three are set - token_resp <- rlang::inject(try_token_from_vars(get_azure_token, !!!dots)) - token <- token_resp[["result"]] - token_error <- token_resp[["error"]] + # We have 4 approaches to get a token, depending on the context + # 1. Use environment variables if all three are set + token_resp <- rlang::inject(try_token_from_vars(get_azure_token, !!!dots)) + token <- token_resp[["result"]] + token_error <- token_resp[["error"]] - # 2. Try to get a managed token (for example on Azure VM, App Service) - if (is.null(token)) { - token <- rlang::inject(possibly_get_mtk(resource, !!!dots)) - } + # 2. Try to get a managed token (for example on Azure VM, App Service) + if (is.null(token)) { + token <- rlang::inject(possibly_get_mtk(resource, !!!dots)) + } - # 3. 
If neither of those has worked, try to get an already stored user token - # (unless `force_refresh` is on, in which case skip to option 4) - if (is.null(token) && use_cached) { - token <- match_cached_token(resource, tenant, aad_version) - } + # 3. If neither of those has worked, try to get an already stored user token + # (unless `force_refresh` is on, in which case skip to option 4) + if (is.null(token) && use_cached) { + token <- match_cached_token(resource, tenant, aad_version) + } - # 4. If we still don't have a token, try to get a new one via reauthentication - if (is.null(token)) { - if (!force_refresh) { - cli::cli_alert_info("No matching cached token found: fetching new token") - } - client_id <- client_id %||% get_client_id() - token_resp <- rlang::inject( - get_azure_token( - tenant = tenant, - app = client_id, - auth_type = auth_method, - !!!dots - ) - ) - token <- token_resp[["result"]] - token_error <- token_error %||% token_resp[["error"]] - } + # 4. If we still don't have a token, try to get a new one via reauthentication + if (is.null(token)) { + if (!force_refresh) { + cli::cli_alert_info("No matching cached token found: fetching new token") + } + client_id <- client_id %||% get_client_id() + token_resp <- rlang::inject( + get_azure_token( + tenant = tenant, + app = client_id, + auth_type = auth_method, + !!!dots + ) + ) + token <- token_resp[["result"]] + token_error <- token_error %||% token_resp[["error"]] + } - # Give some helpful feedback if the steps above have not succeeded - if (is.null(token) || length(token) == 0) { - cli::cli_alert_info("No authentication token was obtained.") - cli::cli_alert_info("Please check any variables you have supplied.") - cli::cli_alert_info( - "Alternatively, running {.fn AzureRMR::get_azure_login} or + # Give some helpful feedback if the steps above have not succeeded + if (is.null(token) || length(token) == 0) { + cli::cli_alert_info("No authentication token was obtained.") + cli::cli_alert_info("Please check any variables you have supplied.") + cli::cli_alert_info( + "Alternatively, running {.fn AzureRMR::get_azure_login} or {.fn AzureRMR::list_azure_tokens} may shed some light on the problem." - ) - error_msg <- "{.fn get_auth_token}: No authentication token was obtained." - cli::cli_abort(as.character(token_error %||% error_msg)) - } else { - if (aad_version == 2) { - check_that(token, AzureAuth::is_azure_v2_token, "Invalid token returned") - } else { - check_that(token, AzureAuth::is_azure_v1_token, "Invalid token returned") - } - } + ) + error_msg <- "{.fn get_auth_token}: No authentication token was obtained." + cli::cli_abort(as.character(token_error %||% error_msg)) + } else { + if (aad_version == 2) { + check_that(token, AzureAuth::is_azure_v2_token, "Invalid token returned") + } else { + check_that(token, AzureAuth::is_azure_v1_token, "Invalid token returned") + } + } } - #' Get token via app and secret environment variables #' Sub-routine for `get_auth_token()` #' @keywords internal #' @returns A list with elements `result` and `error`. If this method is #' successful, the `result` element will contain a token. try_token_from_vars <- function(get_token_fun, ...) 
{ - tenant_id_env <- Sys.getenv("AZ_TENANT_ID") - client_id_env <- Sys.getenv("AZ_CLIENT_ID") - client_secret <- Sys.getenv("AZ_APP_SECRET") + tenant_id_env <- Sys.getenv("AZ_TENANT_ID") + client_id_env <- Sys.getenv("AZ_CLIENT_ID") + client_secret <- Sys.getenv("AZ_APP_SECRET") - if (all(nzchar(c(tenant_id_env, client_id_env, client_secret)))) { - rlang::inject( - get_token_fun( - tenant = tenant_id_env, - app = client_id_env, - password = client_secret, - ... - ) - ) - } else { - list(result = NULL, error = NULL) - } + if (all(nzchar(c(tenant_id_env, client_id_env, client_secret)))) { + rlang::inject( + get_token_fun( + tenant = tenant_id_env, + app = client_id_env, + password = client_secret, + ... + ) + ) + } else { + list(result = NULL, error = NULL) + } } - #' Find an already cached token that matches desired parameters #' Sub-routine for `get_auth_token()` #' @keywords internal #' @returns A token from local cache, or NULL if none matches match_cached_token <- function(resource, tenant, aad_version) { - # list tokens already locally cached - local_tokens <- AzureAuth::list_azure_tokens() - if (length(local_tokens) > 0) { - resources <- purrr::map(local_tokens, "resource") - scopes <- purrr::map(local_tokens, list("scope", 1)) - resources <- purrr::map2_chr(resources, scopes, `%||%`) - tenants <- purrr::map_chr(local_tokens, "tenant") - versions <- purrr::map_int(local_tokens, "version") + # list tokens already locally cached + local_tokens <- AzureAuth::list_azure_tokens() + if (length(local_tokens) > 0) { + resources <- purrr::map(local_tokens, "resource") + scopes <- purrr::map(local_tokens, list("scope", 1)) + resources <- purrr::map2_chr(resources, scopes, `%||%`) + tenants <- purrr::map_chr(local_tokens, "tenant") + versions <- purrr::map_int(local_tokens, "version") - resource_index <- gregg(resources, "^{resource[[1]]}") - tenant_index <- tenants == tenant - version_index <- versions == aad_version + resource_index <- gregg(resources, "^{resource[[1]]}") + tenant_index <- tenants == tenant + version_index <- versions == aad_version - # return a token matching `resource`, `tenant` and `version`, if any - token_index <- which(resource_index & tenant_index & version_index)[1] - if (!is.na(token_index)) local_tokens[[token_index]] else NULL - } else { - NULL - } + # return a token matching `resource`, `tenant` and `version`, if any + token_index <- which(resource_index & tenant_index & version_index)[1] + if (!is.na(token_index)) local_tokens[[token_index]] else NULL + } else { + NULL + } } - #' Sub-routine for `get_auth_token()` #' #' Pulled out mainly to tidy up the main function code a bit #' @keywords internal #' @returns A string (the client ID) get_client_id <- function() { - pluck_client_id <- function() { - suppressMessages(AzureRMR::get_azure_login()) |> - purrr::pluck("token", "client", "client_id") - } - possibly_pluck_client_id <- \(...) purrr::possibly(pluck_client_id)(...) - azure_cli_default_client_id <- "04b07795-8ddb-461a-bbee-02f9e1bf7b46" + pluck_client_id <- function() { + suppressMessages(AzureRMR::get_azure_login()) |> + purrr::pluck("token", "client", "client_id") + } + possibly_pluck_client_id <- \(...) purrr::possibly(pluck_client_id)(...) + azure_cli_default_client_id <- "04b07795-8ddb-461a-bbee-02f9e1bf7b46" - client_id <- possibly_pluck_client_id() - # if that fails, prompt the user to log in, then try again... 
- if (is.null(client_id)) { - AzureRMR::create_azure_login() - # ...using the default Azure CLI client ID as a final fallback - client_id <- possibly_pluck_client_id() %||% azure_cli_default_client_id - } - client_id + client_id <- possibly_pluck_client_id() + # if that fails, prompt the user to log in, then try again... + if (is.null(client_id)) { + AzureRMR::create_azure_login() + # ...using the default Azure CLI client ID as a final fallback + client_id <- possibly_pluck_client_id() %||% azure_cli_default_client_id + } + client_id } - #' Generate appropriate values for the `resource` parameter in [get_auth_token] #' #' A helper function to generate appropriate values. Ensure that the `version` @@ -237,30 +233,29 @@ get_client_id <- function() { #' @returns A scalar character, or (in most v2 situations) a character vector #' @export generate_resource <- function( - version = 1, - url = "https://storage.azure.com", - path = "/.default", - authorise = TRUE, - refresh = TRUE + version = 1, + url = "https://storage.azure.com", + path = "/.default", + authorise = TRUE, + refresh = TRUE ) { - stopifnot("version must be 1 or 2" = version %in% seq(2)) - scopes <- if (refresh) c("openid", "offline_access") else "openid" - if (authorise) { - if (version == 2) { - c(paste0(url, path), scopes) - } else { - url - } - } else { - if (version == 2) { - scopes - } else { - "" - } - } + stopifnot("version must be 1 or 2" = version %in% seq(2)) + scopes <- if (refresh) c("openid", "offline_access") else "openid" + if (authorise) { + if (version == 2) { + c(paste0(url, path), scopes) + } else { + url + } + } else { + if (version == 2) { + scopes + } else { + "" + } + } } - #' Use a token's internal `refresh()` method to refresh it #' #' This method avoids the need to refresh by re-authenticating online. It seems diff --git a/R/get_container.R b/R/get_container.R index 31a5d3d..74cb791 100644 --- a/R/get_container.R +++ b/R/get_container.R @@ -16,44 +16,42 @@ #' @returns An Azure blob container (list object of class "blob_container") #' @export get_container <- function( - container_name = NULL, - token = NULL, - endpoint_url = NULL, - ... + container_name = NULL, + token = NULL, + endpoint_url = NULL, + ... ) { - msg1 <- paste0( - "{.var container_name} is empty. ", - "Did you forget to set an environment variable?" - ) - msg2 <- paste0( - "{.var endpoint_url} is empty. ", - "Did you forget to set an environment variable?" - ) - container_name <- (container_name %||% check_envvar("AZ_CONTAINER")) |> - check_nzchar(msg1) - endpoint_url <- (endpoint_url %||% check_envvar("AZ_STORAGE_EP")) |> - check_nzchar(msg2) - token <- token %||% get_auth_token(...) + msg1 <- paste0( + "{.var container_name} is empty. ", + "Did you forget to set an environment variable?" + ) + msg2 <- paste0( + "{.var endpoint_url} is empty. ", + "Did you forget to set an environment variable?" + ) + container_name <- (container_name %||% check_envvar("AZ_CONTAINER")) |> + check_nzchar(msg1) + endpoint_url <- (endpoint_url %||% check_envvar("AZ_STORAGE_EP")) |> + check_nzchar(msg2) + token <- token %||% get_auth_token(...) 
- get_azure_endpoint(token, endpoint_url) |> - AzureStor::blob_container(container_name) + get_azure_endpoint(token, endpoint_url) |> + AzureStor::blob_container(container_name) } - #' Return a list of container names that are found at the endpoint #' #' @inheritParams get_container #' @returns A character vector of all container names found #' @export list_container_names <- function(token = NULL, endpoint_url = NULL, ...) { - token <- token %||% get_auth_token(...) - endpoint <- get_azure_endpoint(token, endpoint_url) - container_list <- AzureStor::list_blob_containers(endpoint) - stopifnot("no containers found" = length(container_list) >= 1L) - names(container_list) + token <- token %||% get_auth_token(...) + endpoint <- get_azure_endpoint(token, endpoint_url) + container_list <- AzureStor::list_blob_containers(endpoint) + stopifnot("no containers found" = length(container_list) >= 1L) + names(container_list) } - #' Return an Azure "blob_endpoint" #' #' This function will return the endpoint specified in the environment variable @@ -63,12 +61,11 @@ list_container_names <- function(token = NULL, endpoint_url = NULL, ...) { #' @returns An Azure blob endpoint (object of class "blob_endpoint") #' @keywords internal get_azure_endpoint <- function(token = NULL, endpoint_url = NULL, ...) { - token <- token %||% get_auth_token(...) - endpoint_url <- endpoint_url %||% check_envvar("AZ_STORAGE_EP") - AzureStor::blob_endpoint(endpoint_url, token = token) + token <- token %||% get_auth_token(...) + endpoint_url <- endpoint_url %||% check_envvar("AZ_STORAGE_EP") + AzureStor::blob_endpoint(endpoint_url, token = token) } - #' Check that an environment variable exists #' #' The function prints a helpful error if the variable is not found, else @@ -78,6 +75,6 @@ get_azure_endpoint <- function(token = NULL, endpoint_url = NULL, ...) 
{ #' @returns the value of the environment variable named in `x` #' @export check_envvar <- function(x) { - cst_msg <- cst_error_msg("The environment variable {.envvar {x}} is not set") - check_scalar_type(Sys.getenv(x, NA_character_), "string", cst_msg) + cst_msg <- cst_error_msg("The environment variable {.envvar {x}} is not set") + check_scalar_type(Sys.getenv(x, NA_character_), "string", cst_msg) } diff --git a/R/helpers.R b/R/helpers.R index 18afcd0..b9b5cfe 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -13,14 +13,13 @@ #' @seealso [check_vec] #' @export check_that <- function(x, predicate, message, pf = parent.frame()) { - if (predicate(x)) { - x - } else { - cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) - } + if (predicate(x)) { + x + } else { + cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) + } } - #' @export ct_error_msg <- \(text) paste0("{.fn check_that}: ", text) @@ -48,25 +47,24 @@ ct_error_msg <- \(text) paste0("{.fn check_that}: ", text) #' @seealso [check_scalar_type()] #' @export check_vec <- function( - x, - predicate, - message, - which = c("every", "some", "none"), - pf = parent.frame() + x, + predicate, + message, + which = c("every", "some", "none"), + pf = parent.frame() ) { - w <- rlang::arg_match(which) - test_call <- rlang::call2(w, .x = x, .p = predicate, .ns = "purrr") - if (eval(test_call)) { - x - } else { - cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) - } + w <- rlang::arg_match(which) + test_call <- rlang::call2(w, .x = x, .p = predicate, .ns = "purrr") + if (eval(test_call)) { + x + } else { + cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) + } } #' @export cv_error_msg <- \(text) paste0("{.fn check_vec}: ", text) - #' An alternative to stopifnot/assert_that etc #' #' This function makes it easy to use the `is_scalar_*` functions from `{rlang}` @@ -80,38 +78,37 @@ cv_error_msg <- \(text) paste0("{.fn check_vec}: ", text) #' @param type A string defining the R object type that `x` is checked to be #' @export check_scalar_type <- function( - x, - type, - message, - pf = parent.frame() + x, + type, + message, + pf = parent.frame() ) { - opts <- c( - "character", - "logical", - "integer", - "double", - "string", - "bool", - "list", - "bytes", - "raw", - "vector", - "complex" - ) - t <- rlang::arg_match(type, opts) - t <- if (t %in% c("string", "bool")) t else paste0("scalar_", t) - test_call <- rlang::call2(paste0("is_", t), x = x, .ns = "rlang") - if (eval(test_call)) { - x - } else { - cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) - } + opts <- c( + "character", + "logical", + "integer", + "double", + "string", + "bool", + "list", + "bytes", + "raw", + "vector", + "complex" + ) + t <- rlang::arg_match(type, opts) + t <- if (t %in% c("string", "bool")) t else paste0("scalar_", t) + test_call <- rlang::call2(paste0("is_", t), x = x, .ns = "rlang") + if (eval(test_call)) { + x + } else { + cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) + } } #' @export cst_error_msg <- \(text) paste0("{.fn check_scalar_type}: ", text) - #' Check if a supplied non-NULL value is a string with >0 characters #' #' Will error if x is equal to `""`, or if it is otherwise missing or invalid. @@ -124,20 +121,19 @@ cst_error_msg <- \(text) paste0("{.fn check_scalar_type}: ", text) #' makes it easier to include informative values in the message. 
#' @export check_nzchar <- function(x, message, pf = parent.frame()) { - if (is.null(x)) { - NULL - } - cnz <- "check_nzchar" # nolint - check_scalar_type(x, "string", "{.fn {cnz}}: {.var x} is not a string") - if (nzchar(x)) { - x - } else { - message <- paste0("{.fn {cnz}}: ", message) - cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) - } + if (is.null(x)) { + NULL + } + cnz <- "check_nzchar" # nolint + check_scalar_type(x, "string", "{.fn {cnz}}: {.var x} is not a string") + if (nzchar(x)) { + x + } else { + message <- paste0("{.fn {cnz}}: ", message) + cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) + } } - #' grepl a glued regex #' #' Use \{glue\} expressions in grepl (and put the arguments the right way round) @@ -151,16 +147,15 @@ check_nzchar <- function(x, message, pf = parent.frame()) { #' @keywords internal gregg <- \(x, rx, ..., g = parent.frame()) grepl(glue::glue_data(g, rx), x, ...) - #' Check that a container looks like a real container #' @inheritParams read_azure_parquet #' @export check_container_class <- function(container) { - if (inherits(container, "blob_container")) { - container - } else { - ccc <- "check_container_class" # nolint - cc <- rlang::caller_call() - cli::cli_abort("{.fn {ccc}}: This is not a valid blob container", call = cc) - } + if (inherits(container, "blob_container")) { + container + } else { + ccc <- "check_container_class" # nolint + cc <- rlang::caller_call() + cli::cli_abort("{.fn {ccc}}: This is not a valid blob container", call = cc) + } } diff --git a/R/list_files.R b/R/list_files.R index cfff3f3..c5b48f5 100644 --- a/R/list_files.R +++ b/R/list_files.R @@ -29,29 +29,29 @@ #' } #' @export list_files <- function(container, path = "", ext = "", recursive = TRUE) { - stopifnot(rlang::is_character(c(path, ext), 2)) - stopifnot(rlang::is_bool(recursive)) - pnf_msg <- ct_error_msg("Path {.val {path}} not found") - check_that(path, \(x) AzureStor::blob_dir_exists(container, x), pnf_msg) + stopifnot(rlang::is_character(c(path, ext), 2)) + stopifnot(rlang::is_bool(recursive)) + pnf_msg <- ct_error_msg("Path {.val {path}} not found") + check_that(path, \(x) AzureStor::blob_dir_exists(container, x), pnf_msg) - tbl <- AzureStor::list_blobs(container, path, recursive = recursive) - if (nrow(tbl) > 0) { - ext_rx <- if (nzchar(ext)) sub("^\\.+", "", ext) else ".*" # nolint - tbl <- tbl |> - dplyr::filter(!.data[["isdir"]] & gregg(.data[["name"]], "\\.{ext_rx}$")) - } + tbl <- AzureStor::list_blobs(container, path, recursive = recursive) + if (nrow(tbl) > 0) { + ext_rx <- if (nzchar(ext)) sub("^\\.+", "", ext) else ".*" # nolint + tbl <- tbl |> + dplyr::filter(!.data[["isdir"]] & gregg(.data[["name"]], "\\.{ext_rx}$")) + } - # A zero-row tbl can result if `path` is initially empty, or via the filter - # step above. We handle this the same way, no matter which route led here. - if (nrow(tbl) == 0) { - fix_path <- \(p) sub("^/+$", "", sub("^([^/])(.*)", "/\\1\\2", p)) # nolint - ext <- if (nzchar(ext)) paste0(" ", ext) - msg <- "No{ext} files found in {.val [{container$name}]:{fix_path(path)}}" - if (rlang::is_interactive()) { - cli::cli_alert_info(msg) - } - invisible(character(0)) - } else { - tbl[["name"]] - } + # A zero-row tbl can result if `path` is initially empty, or via the filter + # step above. We handle this the same way, no matter which route led here. 
+ if (nrow(tbl) == 0) { + fix_path <- \(p) sub("^/+$", "", sub("^([^/])(.*)", "/\\1\\2", p)) # nolint + ext <- if (nzchar(ext)) paste0(" ", ext) + msg <- "No{ext} files found in {.val [{container$name}]:{fix_path(path)}}" + if (rlang::is_interactive()) { + cli::cli_alert_info(msg) + } + invisible(character(0)) + } else { + tbl[["name"]] + } } diff --git a/R/read_azure_files.R b/R/read_azure_files.R index e76183b..b24c1e6 100644 --- a/R/read_azure_files.R +++ b/R/read_azure_files.R @@ -27,13 +27,12 @@ #' } #' @export read_azure_parquet <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "parquet", info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL) |> - arrow::read_parquet(...) + check_blob_exists(container, file, "parquet", info, path) |> + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, src = _, dest = NULL) |> + arrow::read_parquet(...) } - #' Read a json file from Azure storage #' #' @inheritParams read_azure_parquet @@ -42,13 +41,12 @@ read_azure_parquet <- function(container, file, path = "/", info = NULL, ...) { #' @returns A list #' @export read_azure_json <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "json", info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL) |> - yyjsonr::read_json_raw(...) + check_blob_exists(container, file, "json", info, path) |> + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, src = _, dest = NULL) |> + yyjsonr::read_json_raw(...) } - #' Read a json.gz file from Azure storage #' #' @inheritParams read_azure_parquet @@ -57,16 +55,15 @@ read_azure_json <- function(container, file, path = "/", info = NULL, ...) { #' @returns A list #' @export read_azure_jsongz <- function(container, file, path = "/", info = NULL, ...) { - full_path <- check_blob_exists(container, file, "json.gz", info, path) - dl <- withr::local_tempfile( - pattern = tools::file_path_sans_ext(basename(full_path), TRUE), - fileext = "json.gz" - ) - AzureStor::download_blob(container, src = full_path, dest = dl) - yyjsonr::read_json_file(dl, ...) + full_path <- check_blob_exists(container, file, "json.gz", info, path) + dl <- withr::local_tempfile( + pattern = tools::file_path_sans_ext(basename(full_path), TRUE), + fileext = "json.gz" + ) + AzureStor::download_blob(container, src = full_path, dest = dl) + yyjsonr::read_json_file(dl, ...) } - #' Read an rds file from Azure storage #' #' @inheritParams read_azure_parquet @@ -79,14 +76,13 @@ read_azure_jsongz <- function(container, file, path = "/", info = NULL, ...) { #' @returns The data object that was stored in the rds file #' @export read_azure_rds <- function(container, file, path = "/", info = NULL, ...) 
{ - # If the user doesn't specify a (de)compression type with `type` in `...`, we - # will set a `type` of "none", as this seems to be the standard on SU Azure - dots <- rlang::dots_list(..., type = "none", .homonyms = "first") - blob <- check_blob_exists(container, file, "rds", info, path) - rlang::inject(AzureStor::storage_load_rds(container, blob, !!!dots)) + # If the user doesn't specify a (de)compression type with `type` in `...`, we + # will set a `type` of "none", as this seems to be the standard on SU Azure + dots <- rlang::dots_list(..., type = "none", .homonyms = "first") + blob <- check_blob_exists(container, file, "rds", info, path) + rlang::inject(AzureStor::storage_load_rds(container, blob, !!!dots)) } - #' Read a csv file from Azure storage #' #' @inheritParams read_azure_parquet @@ -94,11 +90,10 @@ read_azure_rds <- function(container, file, path = "/", info = NULL, ...) { #' @returns A tibble #' @export read_azure_csv <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "csv", info, path) |> - AzureStor::storage_read_csv(container, file = _, ...) + check_blob_exists(container, file, "csv", info, path) |> + AzureStor::storage_read_csv(container, file = _, ...) } - #' Read any file from Azure storage #' #' @inheritParams read_azure_parquet @@ -109,55 +104,54 @@ read_azure_csv <- function(container, file, path = "/", info = NULL, ...) { #' @returns A raw data stream #' @export read_azure_file <- function( - container, - file, - path = "/", - info = NULL, - ext = NULL, - ... + container, + file, + path = "/", + info = NULL, + ext = NULL, + ... ) { - ext <- ext %||% tools::file_ext(file) - check_blob_exists(container, file, ext, info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL, ...) + ext <- ext %||% tools::file_ext(file) + check_blob_exists(container, file, ext, info, path) |> + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, src = _, dest = NULL, ...) } - #' Ensures that the filepath for the file to read exists #' #' @inheritParams read_azure_parquet #' @param ext The standard file extension for the file type, e.g. "json" #' @keywords internal check_blob_exists <- function(container, file, ext, info, path) { - stopifnot("no container found" = inherits(container, "blob_container")) - path <- if (path %in% c("", "/")) "" else path - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - dir_name <- if (dirname(file) == ".") "" else dirname(file) - # Potentially the user could provide a partial file path in `path` and a - # further sub-directory as part of `file`. This handles that eventuality, - # though this usage pattern should be quite rare! - dpath <- file.path(path, dir_name) - fname <- basename(file) - if (nzchar(ext) && !gregg(fname, "\\.{ext}$")) { - fname <- glue::glue("{fname}.{ext}") - } - # remove duplicate slashes and any initial slashes - file_path <- sub("^/", "", gsub("/+", "/", file.path(dpath, fname))) + stopifnot("no container found" = inherits(container, "blob_container")) + path <- if (path %in% c("", "/")) "" else path + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + dir_name <- if (dirname(file) == ".") "" else dirname(file) + # Potentially the user could provide a partial file path in `path` and a + # further sub-directory as part of `file`. 
This handles that eventuality, + # though this usage pattern should be quite rare! + dpath <- file.path(path, dir_name) + fname <- basename(file) + if (nzchar(ext) && !gregg(fname, "\\.{ext}$")) { + fname <- glue::glue("{fname}.{ext}") + } + # remove duplicate slashes and any initial slashes + file_path <- sub("^/", "", gsub("/+", "/", file.path(dpath, fname))) - filepath_out <- AzureStor::list_blobs(container, dpath, recursive = FALSE) |> - dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> - dplyr::pull("name") + filepath_out <- AzureStor::list_blobs(container, dpath, recursive = FALSE) |> + dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> + dplyr::pull("name") - msg1 <- ct_error_msg("no matching {ext} file found") - msg2 <- cst_error_msg("multiple matching {ext} files found") - check_that(filepath_out, \(x) length(x) > 0, msg1) # check length > 0 - check_scalar_type(filepath_out, "character", msg2) # check length == 1 + msg1 <- ct_error_msg("no matching {ext} file found") + msg2 <- cst_error_msg("multiple matching {ext} files found") + check_that(filepath_out, \(x) length(x) > 0, msg1) # check length > 0 + check_scalar_type(filepath_out, "character", msg2) # check length == 1 - info_option <- getOption("azkit.info") - stopifnot(rlang::is_scalar_logical(info) || is.null(info)) - stopifnot(rlang::is_scalar_logical(info_option) || is.null(info_option)) - if (info %||% info_option %||% rlang::is_interactive()) { - cli::cli_alert_info("File {.val {filepath_out}} will be read in") - } - filepath_out + info_option <- getOption("azkit.info") + stopifnot(rlang::is_scalar_logical(info) || is.null(info)) + stopifnot(rlang::is_scalar_logical(info_option) || is.null(info_option)) + if (info %||% info_option %||% rlang::is_interactive()) { + cli::cli_alert_info("File {.val {filepath_out}} will be read in") + } + filepath_out } diff --git a/R/read_azure_table.R b/R/read_azure_table.R index 5a85452..db9559d 100644 --- a/R/read_azure_table.R +++ b/R/read_azure_table.R @@ -11,29 +11,29 @@ #' @returns A tibble #' @export read_azure_table <- function( - table_name = NULL, - table_endpoint = NULL, - token = NULL, - ... + table_name = NULL, + table_endpoint = NULL, + token = NULL, + ... ) { - table_name <- table_name %||% check_envvar("AZ_TABLE_NAME") - table_ep <- table_endpoint %||% check_envvar("AZ_TABLE_EP") - token <- token %||% get_auth_token(...) - access_token <- token |> - purrr::pluck("credentials", "access_token") - headers <- list("2025-11-05", "application/json;odata=nometadata") |> - purrr::set_names(c("x-ms-version", "Accept")) + table_name <- table_name %||% check_envvar("AZ_TABLE_NAME") + table_ep <- table_endpoint %||% check_envvar("AZ_TABLE_EP") + token <- token %||% get_auth_token(...) 
+ access_token <- token |> + purrr::pluck("credentials", "access_token") + headers <- list("2025-11-05", "application/json;odata=nometadata") |> + purrr::set_names(c("x-ms-version", "Accept")) - resp <- httr2::request(table_ep) |> - httr2::req_url_path_append(table_name) |> - httr2::req_auth_bearer_token(access_token) |> - httr2::req_headers(!!!headers) |> - httr2::req_perform() |> - httr2::resp_check_status() + resp <- httr2::request(table_ep) |> + httr2::req_url_path_append(table_name) |> + httr2::req_auth_bearer_token(access_token) |> + httr2::req_headers(!!!headers) |> + httr2::req_perform() |> + httr2::resp_check_status() - resp |> - httr2::resp_body_json() |> - purrr::pluck("value") |> - purrr::map(tibble::as_tibble) |> - purrr::list_rbind() + resp |> + httr2::resp_body_json() |> + purrr::pluck("value") |> + purrr::map(tibble::as_tibble) |> + purrr::list_rbind() } diff --git a/tests/testthat/test-azkit_helpers.R b/tests/testthat/test-azkit_helpers.R index e336d89..7b0fa68 100644 --- a/tests/testthat/test-azkit_helpers.R +++ b/tests/testthat/test-azkit_helpers.R @@ -1,81 +1,79 @@ test_that("check_vec uses {cli} formatting and glue variables", { - var <- "test" - expect_identical(check_vec(letters, nzchar), letters) - check_vec(letters, nzchar, "{.var {var}}", "none") |> - expect_error(class = "rlang_error") - check_vec(letters, nzchar, "{.var {var}}", "none") |> - expect_error("`test`", class = "rlang_error") + var <- "test" + expect_identical(check_vec(letters, nzchar), letters) + check_vec(letters, nzchar, "{.var {var}}", "none") |> + expect_error(class = "rlang_error") + check_vec(letters, nzchar, "{.var {var}}", "none") |> + expect_error("`test`", class = "rlang_error") }) - test_that("I understand how rlang::abort works", { - # Previously I was generating errors with `cli::cli_abort()` but maybe it - # makes more sense to use `rlang::abort()` if it handles glue vars in its - # messages OK - var <- "test" - message <- "{.var {var}} error" - # will only succeed if devtools::load_all() has been run: - # expect_error(rlang::abort(message), "`test` error") # nolint - expect_error(rlang::abort(message), class = "rlang_error") - rlang::local_use_cli(inline = TRUE) - expect_error(rlang::abort(message), "`test` error", class = "rlang_error") + # Previously I was generating errors with `cli::cli_abort()` but maybe it + # makes more sense to use `rlang::abort()` if it handles glue vars in its + # messages OK + var <- "test" + message <- "{.var {var}} error" + # will only succeed if devtools::load_all() has been run: + # expect_error(rlang::abort(message), "`test` error") # nolint + expect_error(rlang::abort(message), class = "rlang_error") + rlang::local_use_cli(inline = TRUE) + expect_error(rlang::abort(message), "`test` error", class = "rlang_error") - check_var <- function(var, message) { - if (var != "test") { - rlang::abort(message, class = "azkit") - } else { - var - } - } - expect_equal(check_var("test", "Error with {.var {var}}"), var) - check_var("tst", "Error with {.var {var}}") |> - expect_error("Error with `tst`", class = "rlang_error") - check_var("tst", "Error with {.var {var}}") |> - expect_error("Error with `tst`", class = "azkit") + check_var <- function(var, message) { + if (var != "test") { + rlang::abort(message, class = "azkit") + } else { + var + } + } + expect_equal(check_var("test", "Error with {.var {var}}"), var) + check_var("tst", "Error with {.var {var}}") |> + expect_error("Error with `tst`", class = "rlang_error") + check_var("tst", "Error with {.var {var}}") 
|> + expect_error("Error with `tst`", class = "azkit") }) - test_that("check_scalar_type builds functions correctly", { - typ <- "character" - typ <- if (typ %in% c("string", "bool")) typ else paste0("scalar_", typ) - expect_equal(typ, "scalar_character") - x <- "a" - test_call <- rlang::call2(paste0("is_", typ), x = x, .ns = "rlang") - expect_identical( - as.character(test_call), - c("rlang::is_scalar_character", "a") - ) - expect_true(eval(test_call)) - test_call <- rlang::call_modify(test_call, x = NA_character_) - expect_true(eval(test_call)) - test_call <- rlang::call_modify(test_call, x = NA) - expect_false(eval(test_call)) - test_call <- rlang::call_modify(test_call, x = 9) - expect_false(eval(test_call)) + typ <- "character" + typ <- if (typ %in% c("string", "bool")) typ else paste0("scalar_", typ) + expect_equal(typ, "scalar_character") + x <- "a" + test_call <- rlang::call2(paste0("is_", typ), x = x, .ns = "rlang") + expect_identical( + as.character(test_call), + c("rlang::is_scalar_character", "a") + ) + expect_true(eval(test_call)) + test_call <- rlang::call_modify(test_call, x = NA_character_) + expect_true(eval(test_call)) + test_call <- rlang::call_modify(test_call, x = NA) + expect_false(eval(test_call)) + test_call <- rlang::call_modify(test_call, x = 9) + expect_false(eval(test_call)) - typ <- "string" - typ <- if (typ %in% c("string", "bool")) typ else paste0("scalar_", typ) - expect_equal(typ, "string") - x <- "a" - test_call <- rlang::call2(paste0("is_", typ), x = x, .ns = "rlang") - expect_identical( - as.character(test_call), - c("rlang::is_string", "a") - ) - expect_true(eval(test_call)) - test_call <- rlang::call_modify(test_call, x = NA_character_) - expect_false(eval(test_call)) - test_call <- rlang::call_modify(test_call, x = 9) - expect_false(eval(test_call)) + typ <- "string" + typ <- if (typ %in% c("string", "bool")) typ else paste0("scalar_", typ) + expect_equal(typ, "string") + x <- "a" + test_call <- rlang::call2(paste0("is_", typ), x = x, .ns = "rlang") + expect_identical( + as.character(test_call), + c("rlang::is_string", "a") + ) + expect_true(eval(test_call)) + test_call <- rlang::call_modify(test_call, x = NA_character_) + expect_false(eval(test_call)) + test_call <- rlang::call_modify(test_call, x = 9) + expect_false(eval(test_call)) - typ <- "integer" - typ <- if (typ %in% c("string", "bool")) typ else paste0("scalar_", typ) - expect_equal(typ, "scalar_integer") - x <- 2L - test_call <- rlang::call2(paste0("is_", typ), x = x, .ns = "rlang") - expect_true(eval(test_call)) - test_call <- rlang::call_modify(test_call, x = NA_integer_) - expect_true(eval(test_call)) - test_call <- rlang::call_modify(test_call, x = 9) - expect_false(eval(test_call)) + typ <- "integer" + typ <- if (typ %in% c("string", "bool")) typ else paste0("scalar_", typ) + expect_equal(typ, "scalar_integer") + x <- 2L + test_call <- rlang::call2(paste0("is_", typ), x = x, .ns = "rlang") + expect_true(eval(test_call)) + test_call <- rlang::call_modify(test_call, x = NA_integer_) + expect_true(eval(test_call)) + test_call <- rlang::call_modify(test_call, x = 9) + expect_false(eval(test_call)) }) diff --git a/tests/testthat/test-get_auth_token.R b/tests/testthat/test-get_auth_token.R index c359569..5f7d80f 100644 --- a/tests/testthat/test-get_auth_token.R +++ b/tests/testthat/test-get_auth_token.R @@ -1,32 +1,31 @@ test_that("possibly manages failure by returning NULL", { - possibly_get_mtk <- \(...) purrr::possibly(AzureAuth::get_managed_token)(...) 
- managed_resource <- "https://management.azure.com" - expect_null(possibly_get_mtk(managed_resource)) + possibly_get_mtk <- \(...) purrr::possibly(AzureAuth::get_managed_token)(...) + managed_resource <- "https://management.azure.com" + expect_null(possibly_get_mtk(managed_resource)) }) - test_that("generate_resource() behaves itself", { - generate_resource(version = 3) |> - expect_error() - base_url <- "https://storage.azure.com" - def_url <- paste0(base_url, "/.default") - generate_resource() |> - expect_equal(base_url) - generate_resource(refresh = FALSE) |> - expect_equal(base_url) - generate_resource(authorise = FALSE) |> - expect_equal("") - generate_resource(authorise = FALSE, refresh = FALSE) |> - expect_equal("") + generate_resource(version = 3) |> + expect_error() + base_url <- "https://storage.azure.com" + def_url <- paste0(base_url, "/.default") + generate_resource() |> + expect_equal(base_url) + generate_resource(refresh = FALSE) |> + expect_equal(base_url) + generate_resource(authorise = FALSE) |> + expect_equal("") + generate_resource(authorise = FALSE, refresh = FALSE) |> + expect_equal("") - def1 <- c(def_url, "openid", "offline_access") - generate_resource(version = 2) |> - expect_equal(def1) - def2 <- c(def_url, "openid") - generate_resource(version = 2, refresh = FALSE) |> - expect_equal(def2) - generate_resource(version = 2, authorise = FALSE) |> - expect_equal(c("openid", "offline_access")) - generate_resource(version = 2, authorise = FALSE, refresh = FALSE) |> - expect_equal("openid") + def1 <- c(def_url, "openid", "offline_access") + generate_resource(version = 2) |> + expect_equal(def1) + def2 <- c(def_url, "openid") + generate_resource(version = 2, refresh = FALSE) |> + expect_equal(def2) + generate_resource(version = 2, authorise = FALSE) |> + expect_equal(c("openid", "offline_access")) + generate_resource(version = 2, authorise = FALSE, refresh = FALSE) |> + expect_equal("openid") }) diff --git a/tests/testthat/test-get_container.R b/tests/testthat/test-get_container.R index 7ea20fc..db5b367 100644 --- a/tests/testthat/test-get_container.R +++ b/tests/testthat/test-get_container.R @@ -1,41 +1,41 @@ test_that("simple failing tests for missing env vars", { - skip_on_ci() - withr::with_envvar(c(AZ_CONTAINER = ""), get_container()) |> - expect_error("`AZ_CONTAINER` is not set", class = "rlang_error") + skip_on_ci() + withr::with_envvar(c(AZ_CONTAINER = ""), get_container()) |> + expect_error("`AZ_CONTAINER` is not set", class = "rlang_error") - c(AZ_CONTAINER = "results", AZ_STORAGE_EP = "") |> - withr::with_envvar(get_container()) |> - expect_error("`AZ_STORAGE_EP` is not set", class = "rlang_error") + c(AZ_CONTAINER = "results", AZ_STORAGE_EP = "") |> + withr::with_envvar(get_container()) |> + expect_error("`AZ_STORAGE_EP` is not set", class = "rlang_error") }) test_that("basic success", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (ie locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - token <- get_auth_token() # should return NULL if unsuccessful - expect_false(is.null(token)) - expect_s3_class(token, "AzureToken") + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (ie locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + token <- get_auth_token() # should return NULL if unsuccessful + expect_false(is.null(token)) + expect_s3_class(token, "AzureToken") - # explore behaviour with blob_endpoint - ep <- AzureStor::blob_endpoint(endpoint_uri, token = token) |> - 
expect_no_error() - expect_s3_class(ep, "blob_endpoint") - cont <- expect_no_error(AzureStor::blob_container(ep, "supporting-data")) - expect_s3_class(cont, "blob_container") + # explore behaviour with blob_endpoint + ep <- AzureStor::blob_endpoint(endpoint_uri, token = token) |> + expect_no_error() + expect_s3_class(ep, "blob_endpoint") + cont <- expect_no_error(AzureStor::blob_container(ep, "supporting-data")) + expect_s3_class(cont, "blob_container") - expect_error(AzureStor::list_adls_files(cont)) - expect_no_error(AzureStor::list_blobs(cont)) + expect_error(AzureStor::list_adls_files(cont)) + expect_no_error(AzureStor::list_blobs(cont)) - # compare behaviour with adls_endpoint instead (it's the same) - ep <- AzureStor::adls_endpoint(endpoint_uri, token = token) - expect_s3_class(ep, "adls_endpoint") - fs <- AzureStor::adls_filesystem(ep, "results") |> - expect_no_error() - expect_s3_class(fs, "adls_filesystem") - path <- "/archive/dev/synthetic" - # Expected this to succeed with adls_endpoint but it doesn't - expect_error(AzureStor::list_adls_files(fs, path)) - # list_blobs works just like it does with the blob container - expect_no_error(AzureStor::list_blobs(fs, path)) - } + # compare behaviour with adls_endpoint instead (it's the same) + ep <- AzureStor::adls_endpoint(endpoint_uri, token = token) + expect_s3_class(ep, "adls_endpoint") + fs <- AzureStor::adls_filesystem(ep, "results") |> + expect_no_error() + expect_s3_class(fs, "adls_filesystem") + path <- "/archive/dev/synthetic" + # Expected this to succeed with adls_endpoint but it doesn't + expect_error(AzureStor::list_adls_files(fs, path)) + # list_blobs works just like it does with the blob container + expect_no_error(AzureStor::list_blobs(fs, path)) + } }) diff --git a/tests/testthat/test-list_files.R b/tests/testthat/test-list_files.R index 882b653..f03a965 100644 --- a/tests/testthat/test-list_files.R +++ b/tests/testthat/test-list_files.R @@ -1,82 +1,81 @@ test_that("function behaves as expected", { - skip_on_ci() - cont <- expect_no_error(get_container("supporting-data")) - res <- expect_no_error(AzureStor::list_blobs(cont)) - expect_equal(nrow(res), 15L) - res2 <- dplyr::filter(res, !dplyr::if_any("isdir")) - expect_equal(nrow(res2), 14L) - file_ext <- NULL - file_ext <- file_ext %||% ".*" - res3 <- res2 |> - dplyr::filter(dplyr::if_any("name", \(x) gregg(x, "\\.{file_ext}$"))) - expect_equal(nrow(res2), nrow(res3)) # because nothing filtered out yet - file_ext <- "json" - res4 <- res2 |> - dplyr::filter(dplyr::if_any("name", \(x) gregg(x, "\\.{file_ext}$"))) - expect_equal(nrow(res4), 6L) + skip_on_ci() + cont <- expect_no_error(get_container("supporting-data")) + res <- expect_no_error(AzureStor::list_blobs(cont)) + expect_equal(nrow(res), 15L) + res2 <- dplyr::filter(res, !dplyr::if_any("isdir")) + expect_equal(nrow(res2), 14L) + file_ext <- NULL + file_ext <- file_ext %||% ".*" + res3 <- res2 |> + dplyr::filter(dplyr::if_any("name", \(x) gregg(x, "\\.{file_ext}$"))) + expect_equal(nrow(res2), nrow(res3)) # because nothing filtered out yet + file_ext <- "json" + res4 <- res2 |> + dplyr::filter(dplyr::if_any("name", \(x) gregg(x, "\\.{file_ext}$"))) + expect_equal(nrow(res4), 6L) }) test_that("we can evolve list_files()", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (ie locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - cont <- expect_no_error(get_container("supporting-data")) + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable 
is set (ie locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + cont <- expect_no_error(get_container("supporting-data")) - list_files <- \(container) AzureStor::list_blobs(container) - expect_equal(nrow(expect_no_error(list_files(cont))), 15L) + list_files <- \(container) AzureStor::list_blobs(container) + expect_equal(nrow(expect_no_error(list_files(cont))), 15L) - list_files <- function(container, recursive = TRUE) { - AzureStor::list_blobs(container, recursive = recursive) |> - dplyr::filter(!dplyr::if_any("isdir")) |> - dplyr::pull("name") - } - expect_length(expect_no_error(list_files(cont)), 14L) + list_files <- function(container, recursive = TRUE) { + AzureStor::list_blobs(container, recursive = recursive) |> + dplyr::filter(!dplyr::if_any("isdir")) |> + dplyr::pull("name") + } + expect_length(expect_no_error(list_files(cont)), 14L) - list_files <- function(container, file_ext = ".*", recursive = TRUE) { - AzureStor::list_blobs(container, recursive = recursive) |> - dplyr::filter( - !dplyr::if_any("isdir") & - dplyr::if_any("name", \(x) gregg(x, "\\.{file_ext}$")) - ) |> - dplyr::pull("name") - } - expect_length(expect_no_error(list_files(cont, "json", FALSE)), 6L) - expect_true(rlang::is_bare_character(list_files(cont, "json"))) + list_files <- function(container, file_ext = ".*", recursive = TRUE) { + AzureStor::list_blobs(container, recursive = recursive) |> + dplyr::filter( + !dplyr::if_any("isdir") & + dplyr::if_any("name", \(x) gregg(x, "\\.{file_ext}$")) + ) |> + dplyr::pull("name") + } + expect_length(expect_no_error(list_files(cont, "json", FALSE)), 6L) + expect_true(rlang::is_bare_character(list_files(cont, "json"))) - list_files <- function(container, path = "", ext = ".*", recursive = TRUE) { - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - tbl <- AzureStor::list_blobs(container, dir = path, recursive = recursive) - if (nrow(tbl) == 0) { - character(0) - } else { - tbl |> - dplyr::filter( - !dplyr::if_any("isdir") & - dplyr::if_any("name", \(x) gregg(x, "\\.{ext}$")) - ) |> - dplyr::pull("name") - } - } - expect_length(list_files(cont), 14L) - expect_length(list_files(cont, ext = "json"), 6L) - # root folder can be specified as "" or "/", it doesn't matter - expect_length(list_files(cont, "/", "json"), 6L) - # no files with this extension present - but should not error - expect_length(expect_no_error(list_files(cont, ext = "xlsx")), 0L) + list_files <- function(container, path = "", ext = ".*", recursive = TRUE) { + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + tbl <- AzureStor::list_blobs(container, dir = path, recursive = recursive) + if (nrow(tbl) == 0) { + character(0) + } else { + tbl |> + dplyr::filter( + !dplyr::if_any("isdir") & + dplyr::if_any("name", \(x) gregg(x, "\\.{ext}$")) + ) |> + dplyr::pull("name") + } + } + expect_length(list_files(cont), 14L) + expect_length(list_files(cont, ext = "json"), 6L) + # root folder can be specified as "" or "/", it doesn't matter + expect_length(list_files(cont, "/", "json"), 6L) + # no files with this extension present - but should not error + expect_length(expect_no_error(list_files(cont, ext = "xlsx")), 0L) - expect_no_error(list_files(cont, "QA")) # folder QA should exist - expect_no_error(list_files(cont, "/QA")) # /QA also works - expect_length(list_files(cont, "QA"), 0) - # folder does not exist - expect_error(list_files(cont, "BLAH"), "path not found") - } + expect_no_error(list_files(cont, "QA")) # folder QA should exist + 
expect_no_error(list_files(cont, "/QA")) # /QA also works + expect_length(list_files(cont, "QA"), 0) + # folder does not exist + expect_error(list_files(cont, "BLAH"), "path not found") + } }) - test_that("edited path starts with '/' unless it's the root folder", { - fix_path <- \(path) sub("^/+$", "", sub("^([^/])(.*)", "/\\1\\2", path)) - expect_equal(fix_path("/"), "") - expect_equal(fix_path(""), "") - expect_equal(fix_path("/QA"), "/QA") - expect_equal(fix_path("QA"), "/QA") + fix_path <- \(path) sub("^/+$", "", sub("^([^/])(.*)", "/\\1\\2", path)) + expect_equal(fix_path("/"), "") + expect_equal(fix_path(""), "") + expect_equal(fix_path("/QA"), "/QA") + expect_equal(fix_path("QA"), "/QA") }) diff --git a/tests/testthat/test-read_azure_files.R b/tests/testthat/test-read_azure_files.R index 1f9e0fc..c5d08d7 100644 --- a/tests/testthat/test-read_azure_files.R +++ b/tests/testthat/test-read_azure_files.R @@ -1,343 +1,339 @@ test_that("basic success", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (ie locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - token <- get_auth_token() - expect_false(is.null(token)) - expect_s3_class(token, "AzureToken") + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (ie locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + token <- get_auth_token() + expect_false(is.null(token)) + expect_s3_class(token, "AzureToken") - # explore behaviour with blob_endpoint - ep <- AzureStor::blob_endpoint(endpoint_uri, token = token) |> - expect_no_error() - expect_s3_class(ep, "blob_endpoint") - inputs_data <- AzureStor::blob_container(ep, "inputs-data") |> - expect_no_error() - expect_s3_class(inputs_data, "blob_container") - files <- AzureStor::list_blobs(inputs_data, "dev", recursive = FALSE) |> - dplyr::filter(grepl("\\.parquet$", .data[["name"]])) |> - dplyr::pull("name") - name <- "baseline" - pqt_file <- grep(name, files, value = TRUE) - expect_length(pqt_file, 1) - expect_true(AzureStor::blob_exists(inputs_data, pqt_file)) - out <- AzureStor::download_blob(inputs_data, pqt_file, dest = NULL) |> - arrow::read_parquet() - expect_s3_class(out, "tbl_df") - expect_length(out, 6) # ncol - } + # explore behaviour with blob_endpoint + ep <- AzureStor::blob_endpoint(endpoint_uri, token = token) |> + expect_no_error() + expect_s3_class(ep, "blob_endpoint") + inputs_data <- AzureStor::blob_container(ep, "inputs-data") |> + expect_no_error() + expect_s3_class(inputs_data, "blob_container") + files <- AzureStor::list_blobs(inputs_data, "dev", recursive = FALSE) |> + dplyr::filter(grepl("\\.parquet$", .data[["name"]])) |> + dplyr::pull("name") + name <- "baseline" + pqt_file <- grep(name, files, value = TRUE) + expect_length(pqt_file, 1) + expect_true(AzureStor::blob_exists(inputs_data, pqt_file)) + out <- AzureStor::download_blob(inputs_data, pqt_file, dest = NULL) |> + arrow::read_parquet() + expect_s3_class(out, "tbl_df") + expect_length(out, 6) # ncol + } }) - test_that("understand some new errors in check_blob_exists", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. 
locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - inp <- expect_no_error(get_container("inputs-data")) - path <- "dev" - file <- "baseline" - ext <- "parquet" - path <- if (path %in% c("", "/")) "" else path - expect_equal(path, "dev") - dir_name <- if (dirname(file) == ".") "" else dirname(file) - expect_equal(dir_name, "") - p2 <- file.path(path, dir_name) - expect_equal(p2, "dev") - file_name <- paste0(basename(file), ".", ext) - expect_equal(file_name, "baseline.parquet") - file_path <- sub("^/", "", sub("/+", "/", file.path(p2, file_name))) - expect_equal(file_path, "dev/baseline.parquet") - dir_list <- AzureStor::list_blobs(inp, p2, recursive = FALSE) - file_name_out <- dir_list |> - dplyr::filter(dplyr::if_any("name", \(x) x == file_path)) |> - dplyr::pull("name") - expect_equal(file_name_out, "dev/baseline.parquet") - expect_no_error(check_blob_exists(inp, file, ext, FALSE, path)) + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + inp <- expect_no_error(get_container("inputs-data")) + path <- "dev" + file <- "baseline" + ext <- "parquet" + path <- if (path %in% c("", "/")) "" else path + expect_equal(path, "dev") + dir_name <- if (dirname(file) == ".") "" else dirname(file) + expect_equal(dir_name, "") + p2 <- file.path(path, dir_name) + expect_equal(p2, "dev") + file_name <- paste0(basename(file), ".", ext) + expect_equal(file_name, "baseline.parquet") + file_path <- sub("^/", "", sub("/+", "/", file.path(p2, file_name))) + expect_equal(file_path, "dev/baseline.parquet") + dir_list <- AzureStor::list_blobs(inp, p2, recursive = FALSE) + file_name_out <- dir_list |> + dplyr::filter(dplyr::if_any("name", \(x) x == file_path)) |> + dplyr::pull("name") + expect_equal(file_name_out, "dev/baseline.parquet") + expect_no_error(check_blob_exists(inp, file, ext, FALSE, path)) - # check still works if full filepath is passed to file arg - path <- "" - file <- "dev/baseline.parquet" - path <- if (path %in% c("", "/")) "" else path - expect_equal(path, "") - dir_name <- if (dirname(file) == ".") "" else dirname(file) - expect_equal(dir_name, "dev") - p2 <- glue::glue("{path}/{dir_name}") - expect_equal(p2, "/dev") - file_name <- basename(file) - expect_equal(file_name, "baseline.parquet") - file_path <- sub("^/", "", sub("/+", "/", glue::glue("{p2}/{file_name}"))) - expect_equal(file_path, "dev/baseline.parquet") - dir_list <- AzureStor::list_blobs(inp, p2, recursive = FALSE) - file_name_out <- dir_list |> - dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> - dplyr::pull("name") - expect_equal(file_name_out, file) - expect_no_error(check_blob_exists(inp, file, ext, FALSE, path)) - } + # check still works if full filepath is passed to file arg + path <- "" + file <- "dev/baseline.parquet" + path <- if (path %in% c("", "/")) "" else path + expect_equal(path, "") + dir_name <- if (dirname(file) == ".") "" else dirname(file) + expect_equal(dir_name, "dev") + p2 <- glue::glue("{path}/{dir_name}") + expect_equal(p2, "/dev") + file_name <- basename(file) + expect_equal(file_name, "baseline.parquet") + file_path <- sub("^/", "", sub("/+", "/", glue::glue("{p2}/{file_name}"))) + expect_equal(file_path, "dev/baseline.parquet") + dir_list <- AzureStor::list_blobs(inp, p2, recursive = FALSE) + file_name_out <- dir_list |> + dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> + dplyr::pull("name") + expect_equal(file_name_out, file) + 
expect_no_error(check_blob_exists(inp, file, ext, FALSE, path)) + } }) - test_that("whole read_parquet function works", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - inputs_container <- expect_no_error(get_container("inputs-data")) - expect_no_error(read_azure_parquet( - inputs_container, - "baseline", - path = "dev" - )) - out1 <- read_azure_parquet(inputs_container, "baseline", path = "dev") - expect_s3_class(out1, "tbl_df") - # check that it works with the file extension included - out2 <- read_azure_parquet( - inputs_container, - "baseline.parquet", - path = "dev" - ) - expect_length(out2, 6) # ncol + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + inputs_container <- expect_no_error(get_container("inputs-data")) + expect_no_error( + read_azure_parquet( + inputs_container, + "baseline", + path = "dev" + ) + ) + out1 <- read_azure_parquet(inputs_container, "baseline", path = "dev") + expect_s3_class(out1, "tbl_df") + # check that it works with the file extension included + out2 <- read_azure_parquet( + inputs_container, + "baseline.parquet", + path = "dev" + ) + expect_length(out2, 6) # ncol - res <- get_container(Sys.getenv("AZ_RESULTS_CONTAINER")) - pqt_file <- Sys.getenv("TEST_PARQUET_FILE") - path <- "/" - file_ext <- "parquet" + res <- get_container(Sys.getenv("AZ_RESULTS_CONTAINER")) + pqt_file <- Sys.getenv("TEST_PARQUET_FILE") + path <- "/" + file_ext <- "parquet" - # experiment with changes to code: - path <- if (path %in% c("", "/")) "" else path - expect_equal(path, "") - dir_name <- if (dirname(pqt_file) == ".") "" else dirname(pqt_file) - dpath <- glue::glue("{path}/{dir_name}") - file_name <- sub(glue::glue("\\.{file_ext}$"), "", basename(pqt_file)) + # experiment with changes to code: + path <- if (path %in% c("", "/")) "" else path + expect_equal(path, "") + dir_name <- if (dirname(pqt_file) == ".") "" else dirname(pqt_file) + dpath <- glue::glue("{path}/{dir_name}") + file_name <- sub(glue::glue("\\.{file_ext}$"), "", basename(pqt_file)) - file_name <- if (gregg(basename(pqt_file), "\\.{file_ext}$")) { - basename(pqt_file) - } else { - glue::glue("{basename(pqt_file)}.{file_ext}") - } - # remove duplicate slashes and any initial slashes - fpath <- sub("^/", "", sub("/+", "/", glue::glue("{dpath}/{file_name}"))) - expect_equal(fpath, pqt_file) - # now function should run without error - expect_no_error(check_blob_exists(res, pqt_file, "parquet", FALSE, "/")) - # we want this to error if the file_ext doesn't match the file - expect_error(check_blob_exists(res, pqt_file, "rds", FALSE, "/")) - } + file_name <- if (gregg(basename(pqt_file), "\\.{file_ext}$")) { + basename(pqt_file) + } else { + glue::glue("{basename(pqt_file)}.{file_ext}") + } + # remove duplicate slashes and any initial slashes + fpath <- sub("^/", "", sub("/+", "/", glue::glue("{dpath}/{file_name}"))) + expect_equal(fpath, pqt_file) + # now function should run without error + expect_no_error(check_blob_exists(res, pqt_file, "parquet", FALSE, "/")) + # we want this to error if the file_ext doesn't match the file + expect_error(check_blob_exists(res, pqt_file, "rds", FALSE, "/")) + } }) - test_that("read_azure_json basically works", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. 
locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - download_azure_blob <- function(container, file, file_ext, path = "") { - check_blob_exists(container, file, file_ext, FALSE, path) |> - AzureStor::download_blob(container, src = _, dest = NULL) - } - expect_no_error(support_container <- get_container("supporting-data")) - raw_out <- support_container |> - download_azure_blob("providers", "json") - expect_type(raw_out, "raw") - expect_no_error(yyjsonr::read_json_raw(raw_out)) - out <- yyjsonr::read_json_raw(raw_out) - expect_type(out, "character") - expect_length(out, 138) - } + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + download_azure_blob <- function(container, file, file_ext, path = "") { + check_blob_exists(container, file, file_ext, FALSE, path) |> + AzureStor::download_blob(container, src = _, dest = NULL) + } + expect_no_error(support_container <- get_container("supporting-data")) + raw_out <- support_container |> + download_azure_blob("providers", "json") + expect_type(raw_out, "raw") + expect_no_error(yyjsonr::read_json_raw(raw_out)) + out <- yyjsonr::read_json_raw(raw_out) + expect_type(out, "character") + expect_length(out, 138) + } }) test_that("dirname and basename logic works", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - expect_no_error(support_container <- get_container("supporting-data")) - file <- "neecom_table.rds" - path <- "" - expect_no_error(AzureStor::blob_exists(support_container, file)) - file2 <- paste0("/", file) - file3 <- paste0("//", file) - expect_no_error(AzureStor::blob_exists(support_container, file2)) - expect_no_error(AzureStor::blob_exists(support_container, file3)) - expect_equal(dirname(file), ".") - file <- paste0(path, "/", file) - expect_equal(dirname(file), "/") - path <- "/" - file <- paste0(path, "/", file) - expect_equal(dirname(file), "/") + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. 
locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + expect_no_error(support_container <- get_container("supporting-data")) + file <- "neecom_table.rds" + path <- "" + expect_no_error(AzureStor::blob_exists(support_container, file)) + file2 <- paste0("/", file) + file3 <- paste0("//", file) + expect_no_error(AzureStor::blob_exists(support_container, file2)) + expect_no_error(AzureStor::blob_exists(support_container, file3)) + expect_equal(dirname(file), ".") + file <- paste0(path, "/", file) + expect_equal(dirname(file), "/") + path <- "/" + file <- paste0(path, "/", file) + expect_equal(dirname(file), "/") - new_path <- sub("^\\.$", "/", dirname(file)) - expect_equal(new_path, "/") - path <- "QA" - file <- "none.txt" - file <- paste0(path, "/", file) - new_path <- sub("^\\.$", "/", dirname(file)) - expect_equal(new_path, "QA") + new_path <- sub("^\\.$", "/", dirname(file)) + expect_equal(new_path, "/") + path <- "QA" + file <- "none.txt" + file <- paste0(path, "/", file) + new_path <- sub("^\\.$", "/", dirname(file)) + expect_equal(new_path, "QA") - res <- get_container("results") - path <- "prod/dev/national/national" - file <- "test-2025" - file_ext <- "json.gz" - filepath <- sub("^/+", "", paste0(path, "/", file)) - expect_equal(filepath, "prod/dev/national/national/test-2025") + res <- get_container("results") + path <- "prod/dev/national/national" + file <- "test-2025" + file_ext <- "json.gz" + filepath <- sub("^/+", "", paste0(path, "/", file)) + expect_equal(filepath, "prod/dev/national/national/test-2025") - path <- "/" - filepath <- sub("^/+", "", paste0(path, "/", file)) - expect_equal(dirname(filepath), ".") + path <- "/" + filepath <- sub("^/+", "", paste0(path, "/", file)) + expect_equal(dirname(filepath), ".") - path <- "prod/dev/national/national" - filepath <- sub("^/+", "", paste0(path, "/", file)) - expect_equal(filepath, "prod/dev/national/national/test-2025") - path <- sub("^\\.$", "/", dirname(filepath)) + path <- "prod/dev/national/national" + filepath <- sub("^/+", "", paste0(path, "/", file)) + expect_equal(filepath, "prod/dev/national/national/test-2025") + path <- sub("^\\.$", "/", dirname(filepath)) - filepath_out <- AzureStor::list_blobs(res, path, recursive = FALSE) |> - dplyr::filter( - !dplyr::if_any("isdir") & - dplyr::if_any("name", \(x) { - gregg(x, "\\.{file_ext}$") & gregg(x, "^{filepath}") - }) - ) |> - dplyr::pull("name") - expect_match(filepath_out, "prod/dev/national/national/test-2025.*json.gz") - } + filepath_out <- AzureStor::list_blobs(res, path, recursive = FALSE) |> + dplyr::filter( + !dplyr::if_any("isdir") & + dplyr::if_any("name", \(x) { + gregg(x, "\\.{file_ext}$") & gregg(x, "^{filepath}") + }) + ) |> + dplyr::pull("name") + expect_match(filepath_out, "prod/dev/national/national/test-2025.*json.gz") + } }) test_that("tdd of check_blob_exists", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - expect_no_error(support_container <- get_container("supporting-data")) - check_blob_exists <- function(container, file, path = "/") { - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - } - expect_no_error(check_blob_exists(support_container, "file")) + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. 
locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + expect_no_error(support_container <- get_container("supporting-data")) + check_blob_exists <- function(container, file, path = "/") { + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + } + expect_no_error(check_blob_exists(support_container, "file")) - check_blob_exists <- function(container, file, path = "/") { - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - file_ext <- "json" - AzureStor::list_blobs(container, path, recursive = FALSE) |> - dplyr::filter( - !dplyr::if_any("isdir") & - dplyr::if_any("name", \(x) { - grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) - }) - ) |> - dplyr::pull("name") - } - expect_no_error(check_blob_exists(support_container, "sites")) - expect_length(check_blob_exists(support_container, "sites"), 2) + check_blob_exists <- function(container, file, path = "/") { + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + file_ext <- "json" + AzureStor::list_blobs(container, path, recursive = FALSE) |> + dplyr::filter( + !dplyr::if_any("isdir") & + dplyr::if_any("name", \(x) { + grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) + }) + ) |> + dplyr::pull("name") + } + expect_no_error(check_blob_exists(support_container, "sites")) + expect_length(check_blob_exists(support_container, "sites"), 2) - check_blob_exists <- function(container, file, path = "/") { - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - file_ext <- "json" - filepath <- AzureStor::list_blobs(container, path, recursive = FALSE) |> - dplyr::filter( - !dplyr::if_any("isdir") & - dplyr::if_any("name", \(x) { - grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) - }) - ) |> - dplyr::pull("name") - stop_msg1 <- glue::glue("no matching {file_ext} file found") - stop_msg2 <- glue::glue("multiple matching {file_ext} files found") - check_that(filepath, \(x) length(x) > 0, stop_msg1) # check length > 0 - check_scalar_type(filepath, "character", stop_msg2) # check length == 1 - } - expect_error(check_blob_exists(support_container, "unmatched"), "matching") - expect_error(check_blob_exists(support_container, "sites"), "multiple") + check_blob_exists <- function(container, file, path = "/") { + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + file_ext <- "json" + filepath <- AzureStor::list_blobs(container, path, recursive = FALSE) |> + dplyr::filter( + !dplyr::if_any("isdir") & + dplyr::if_any("name", \(x) { + grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) + }) + ) |> + dplyr::pull("name") + stop_msg1 <- glue::glue("no matching {file_ext} file found") + stop_msg2 <- glue::glue("multiple matching {file_ext} files found") + check_that(filepath, \(x) length(x) > 0, stop_msg1) # check length > 0 + check_scalar_type(filepath, "character", stop_msg2) # check length == 1 + } + expect_error(check_blob_exists(support_container, "unmatched"), "matching") + expect_error(check_blob_exists(support_container, "sites"), "multiple") - check_blob_exists <- function(container, file, path = "/") { - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - file_ext <- "json" - filepath <- AzureStor::list_blobs(container, path, recursive = FALSE) |> - dplyr::filter( - !dplyr::if_any("isdir") & - dplyr::if_any("name", \(x) { - grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) - }) - ) |> - dplyr::pull("name") - stop_msg1 <- glue::glue("no matching {file_ext} file found") - stop_msg2 <- 
glue::glue("multiple matching {file_ext} files found") - check_that(filepath, \(x) length(x) > 0, stop_msg1) # check length > 0 - check_scalar_type(filepath, "character", stop_msg2) # check length == 1 - filepath - } - expect_no_error(check_blob_exists(support_container, "providers")) - expect_no_error(check_blob_exists(support_container, "providers.json")) - } + check_blob_exists <- function(container, file, path = "/") { + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + file_ext <- "json" + filepath <- AzureStor::list_blobs(container, path, recursive = FALSE) |> + dplyr::filter( + !dplyr::if_any("isdir") & + dplyr::if_any("name", \(x) { + grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) + }) + ) |> + dplyr::pull("name") + stop_msg1 <- glue::glue("no matching {file_ext} file found") + stop_msg2 <- glue::glue("multiple matching {file_ext} files found") + check_that(filepath, \(x) length(x) > 0, stop_msg1) # check length > 0 + check_scalar_type(filepath, "character", stop_msg2) # check length == 1 + filepath + } + expect_no_error(check_blob_exists(support_container, "providers")) + expect_no_error(check_blob_exists(support_container, "providers.json")) + } }) - # parquet and json read functions need `download_blob` first before reading in. # However there are 'native' {AzureStor} functions for csv and rds files, so we # should use those instead. Requiring a slightly different (simpler) workflow. test_that("read_azure_csv basically works", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - download_azure_blob <- function(container, file, file_ext, path = "") { - check_blob_exists(container, file, file_ext, FALSE, path) |> - AzureStor::download_blob(container, src = _, dest = NULL) - } - expect_no_error(support_container <- get_container("supporting-data")) - support_container |> - # should construct the file name OK - download_azure_blob("mitigator-lookup", "csv") |> - expect_no_error() - support_container |> - download_azure_blob("mitigator-lookup.csv", "csv") |> - expect_no_error() - raw_out <- support_container |> - download_azure_blob("mitigator-lookup.csv", "csv") - expect_type(raw_out, "raw") - expect_no_error(readr::read_csv(raw_out, show_col_types = FALSE)) - dat <- readr::read_csv(raw_out, show_col_types = FALSE) - expect_type(dat, "list") - expect_s3_class(dat, "tbl_df") - } + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + download_azure_blob <- function(container, file, file_ext, path = "") { + check_blob_exists(container, file, file_ext, FALSE, path) |> + AzureStor::download_blob(container, src = _, dest = NULL) + } + expect_no_error(support_container <- get_container("supporting-data")) + support_container |> + # should construct the file name OK + download_azure_blob("mitigator-lookup", "csv") |> + expect_no_error() + support_container |> + download_azure_blob("mitigator-lookup.csv", "csv") |> + expect_no_error() + raw_out <- support_container |> + download_azure_blob("mitigator-lookup.csv", "csv") + expect_type(raw_out, "raw") + expect_no_error(readr::read_csv(raw_out, show_col_types = FALSE)) + dat <- readr::read_csv(raw_out, show_col_types = FALSE) + expect_type(dat, "list") + expect_s3_class(dat, "tbl_df") + } }) - test_that("... 
parameters are passed through", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - download_azure_blob <- function(container, file, file_ext, path = "") { - check_blob_exists(container, file, file_ext, FALSE, path) |> - AzureStor::download_blob(container, src = _, dest = NULL) - } - expect_no_error(support_container <- get_container("supporting-data")) - col_types <- "ccc------" - csv_out1 <- support_container |> - download_azure_blob("mitigator-lookup.csv", "csv") |> - readr::read_csv(col_types = col_types) |> - expect_no_error() - csv_out2 <- support_container |> - AzureStor::storage_read_csv( - "mitigator-lookup.csv", - col_types = col_types - ) |> - expect_no_error() - expect_identical(csv_out1, csv_out2) - expect_length(csv_out1, 3) # ncol - } + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + download_azure_blob <- function(container, file, file_ext, path = "") { + check_blob_exists(container, file, file_ext, FALSE, path) |> + AzureStor::download_blob(container, src = _, dest = NULL) + } + expect_no_error(support_container <- get_container("supporting-data")) + col_types <- "ccc------" + csv_out1 <- support_container |> + download_azure_blob("mitigator-lookup.csv", "csv") |> + readr::read_csv(col_types = col_types) |> + expect_no_error() + csv_out2 <- support_container |> + AzureStor::storage_read_csv( + "mitigator-lookup.csv", + col_types = col_types + ) |> + expect_no_error() + expect_identical(csv_out1, csv_out2) + expect_length(csv_out1, 3) # ncol + } }) test_that("read functions all work a bit at least", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - res <- get_container(Sys.getenv("AZ_RESULTS_CONTAINER")) - expect_no_error(read_azure_parquet(res, Sys.getenv("TEST_PARQUET_FILE"))) - supp <- get_container(Sys.getenv("AZ_SUPPORT_CONTAINER")) - expect_no_error(read_azure_json(supp, Sys.getenv("TEST_JSON_FILE"))) - expect_no_error(read_azure_rds(supp, Sys.getenv("TEST_RDS_FILE"))) - read_azure_csv(supp, Sys.getenv("TEST_CSV_FILE"), show_col_types = FALSE) |> - expect_no_error() - } + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. 
locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + res <- get_container(Sys.getenv("AZ_RESULTS_CONTAINER")) + expect_no_error(read_azure_parquet(res, Sys.getenv("TEST_PARQUET_FILE"))) + supp <- get_container(Sys.getenv("AZ_SUPPORT_CONTAINER")) + expect_no_error(read_azure_json(supp, Sys.getenv("TEST_JSON_FILE"))) + expect_no_error(read_azure_rds(supp, Sys.getenv("TEST_RDS_FILE"))) + read_azure_csv(supp, Sys.getenv("TEST_CSV_FILE"), show_col_types = FALSE) |> + expect_no_error() + } }) - test_that("check that sub-functions inherit options in R!", { - yes <- \() getOption("do_it") %||% "nope" - wrap <- \() yes() - expect_identical(withr::with_options(list(do_it = "yes"), wrap()), "yes") + yes <- \() getOption("do_it") %||% "nope" + wrap <- \() yes() + expect_identical(withr::with_options(list(do_it = "yes"), wrap()), "yes") }) From daf5698e20dae176e5c799ed8800ed880e7c53d9 Mon Sep 17 00:00:00 2001 From: Tom Jemmett Date: Fri, 13 Mar 2026 16:12:26 +0000 Subject: [PATCH 2/2] fix lintr warnings about using tabs not spaces --- R/get_auth_token.R | 540 ++++++++++---------- R/get_container.R | 160 +++--- R/helpers.R | 120 ++--- R/list_files.R | 114 ++--- R/read_azure_files.R | 314 ++++++------ R/read_azure_table.R | 78 +-- azkit.Rproj | 44 +- azkit.code-workspace | 114 ++--- inst/azkit.svg | 4 +- inst/azkit_hex.svg | 4 +- tests/testthat/test-azkit_helpers.R | 136 ++--- tests/testthat/test-get_auth_token.R | 50 +- tests/testthat/test-get_container.R | 64 +-- tests/testthat/test-list_files.R | 136 ++--- tests/testthat/test-read_azure_files.R | 678 ++++++++++++------------- 15 files changed, 1278 insertions(+), 1278 deletions(-) diff --git a/R/get_auth_token.R b/R/get_auth_token.R index 8f84bd3..cf8044e 100644 --- a/R/get_auth_token.R +++ b/R/get_auth_token.R @@ -1,270 +1,270 @@ -#' Get Azure authentication token -#' -#' This function retrieves an Azure token for a specified resource. -#' -#' If the environment variables `AZ_TENANT_ID`, `AZ_CLIENT_ID` and -#' `AZ_APP_SECRET` are all set, it will try to use these to return a token. -#' -#' Otherwise it will try to get a managed token from a managed resource such as -#' Azure VM or Azure App Service. -#' -#' If neither of these approaches has returned a token, it will try to retrieve -#' a user token using the provided parameters, requiring the user to have -#' authenticated using their device. If `force_refresh` is set to `TRUE`, a -#' fresh web authentication process should be launched. Otherwise it will -#' attempt to use a cached token matching the given `resource`, `tenant` and -#' `aad_version`. -#' -#' @param resource For v1, a simple URL such as `"https://storage.azure.com/"` -#' should be supplied.For v2, a vector specifying the URL of the Azure resource -#' for which the token is requested as well as any desired scopes. See -#' [AzureAuth::get_azure_token] for details. Use [generate_resource] -#' to help provide an appropriate string or vector. The values default to -#' `c("https://storage.azure.com/.default", "openid", "offline_access")`. -#' If setting version to 2, ensure that the `aad_version` argument is also set -#' to 2. Both are set to use AAD version 1 by default. -#' @param tenant A string specifying the Azure tenant. Defaults to -#' `"common"`. See [AzureAuth::get_azure_token] for other values. -#' @param client_id A string specifying the application ID (client ID). 
If -#' `NULL`, (the default) the function attempts to obtain the client ID from the -#' Azure Resource Manager token, or prompts the user to log in to obtain it. -#' @param auth_method A string specifying the authentication method. Defaults to -#' `"authorization_code"`. See [AzureAuth::get_azure_token] for other values. -#' @param aad_version Numeric. The AAD version, either 1 or 2 (1 by default) -#' @param force_refresh Boolean: whether to use a stored token if available -#' (`FALSE`, the default), or try to obtain a new one from Azure (`TRUE`). -#' This may be useful if you wish to generate a new token with the same -#' `resource` value as an existing token, but a different `tenant` or -#' `auth_method`. Note that you can also try using [refresh_token] which will -#' cause an existing token to refresh itself, without obtaining a new token -#' from Azure via online reauthentication -#' @param ... Optional arguments (`token_args` or `use_cache`) to be passed on -#' to [AzureAuth::get_managed_token] or [AzureAuth::get_azure_token]. -#' -#' @returns An Azure token object -#' @examples -#' \dontrun{ -#' # Get a token for the default resource -#' token <- get_auth_token() -#' -#' # Force generation of a new token via online reauthentication -#' token <- get_auth_token(force_refresh = TRUE) -#' -#' # Get a token for a specific resource and tenant -#' token <- get_auth_token( -#' resource = "https://graph.microsoft.com", -#' tenant = "my-tenant-id" -#' ) -#' -#' # Get a token using a specific app ID -#' token <- get_auth_token(client_id = "my-app-id") -#' } -#' @export -get_auth_token <- function( - resource = generate_resource(), - tenant = "common", - client_id = NULL, - auth_method = "authorization_code", - aad_version = 1, - force_refresh = FALSE, - ... -) { - aad_msg <- "Invalid {.arg aad_version} variable supplied (must be 1 or 2)" - aad_version <- check_that(aad_version, \(x) x %in% seq(2), aad_msg) - - safely_get_token <- \(...) purrr::safely(AzureAuth::get_azure_token)(...) - get_azure_token <- purrr::partial( - safely_get_token, - resource = resource, - version = aad_version - ) - possibly_get_mtk <- \(...) purrr::possibly(AzureAuth::get_managed_token)(...) - - dots <- rlang::list2(...) - # If the user specifies force_refresh = TRUE we turn off `use_cache`, - # otherwise we leave `use_cache` as it is (or as `NULL`, its default value) - use_cached <- !force_refresh && (dots[["use_cache"]] %||% TRUE) - dots <- rlang::dots_list(!!!dots, use_cache = use_cached, .homonyms = "last") - - # We have 4 approaches to get a token, depending on the context - # 1. Use environment variables if all three are set - token_resp <- rlang::inject(try_token_from_vars(get_azure_token, !!!dots)) - token <- token_resp[["result"]] - token_error <- token_resp[["error"]] - - # 2. Try to get a managed token (for example on Azure VM, App Service) - if (is.null(token)) { - token <- rlang::inject(possibly_get_mtk(resource, !!!dots)) - } - - # 3. If neither of those has worked, try to get an already stored user token - # (unless `force_refresh` is on, in which case skip to option 4) - if (is.null(token) && use_cached) { - token <- match_cached_token(resource, tenant, aad_version) - } - - # 4. 
If we still don't have a token, try to get a new one via reauthentication - if (is.null(token)) { - if (!force_refresh) { - cli::cli_alert_info("No matching cached token found: fetching new token") - } - client_id <- client_id %||% get_client_id() - token_resp <- rlang::inject( - get_azure_token( - tenant = tenant, - app = client_id, - auth_type = auth_method, - !!!dots - ) - ) - token <- token_resp[["result"]] - token_error <- token_error %||% token_resp[["error"]] - } - - # Give some helpful feedback if the steps above have not succeeded - if (is.null(token) || length(token) == 0) { - cli::cli_alert_info("No authentication token was obtained.") - cli::cli_alert_info("Please check any variables you have supplied.") - cli::cli_alert_info( - "Alternatively, running {.fn AzureRMR::get_azure_login} or - {.fn AzureRMR::list_azure_tokens} may shed some light on the problem." - ) - error_msg <- "{.fn get_auth_token}: No authentication token was obtained." - cli::cli_abort(as.character(token_error %||% error_msg)) - } else { - if (aad_version == 2) { - check_that(token, AzureAuth::is_azure_v2_token, "Invalid token returned") - } else { - check_that(token, AzureAuth::is_azure_v1_token, "Invalid token returned") - } - } -} - -#' Get token via app and secret environment variables -#' Sub-routine for `get_auth_token()` -#' @keywords internal -#' @returns A list with elements `result` and `error`. If this method is -#' successful, the `result` element will contain a token. -try_token_from_vars <- function(get_token_fun, ...) { - tenant_id_env <- Sys.getenv("AZ_TENANT_ID") - client_id_env <- Sys.getenv("AZ_CLIENT_ID") - client_secret <- Sys.getenv("AZ_APP_SECRET") - - if (all(nzchar(c(tenant_id_env, client_id_env, client_secret)))) { - rlang::inject( - get_token_fun( - tenant = tenant_id_env, - app = client_id_env, - password = client_secret, - ... - ) - ) - } else { - list(result = NULL, error = NULL) - } -} - -#' Find an already cached token that matches desired parameters -#' Sub-routine for `get_auth_token()` -#' @keywords internal -#' @returns A token from local cache, or NULL if none matches -match_cached_token <- function(resource, tenant, aad_version) { - # list tokens already locally cached - local_tokens <- AzureAuth::list_azure_tokens() - if (length(local_tokens) > 0) { - resources <- purrr::map(local_tokens, "resource") - scopes <- purrr::map(local_tokens, list("scope", 1)) - resources <- purrr::map2_chr(resources, scopes, `%||%`) - tenants <- purrr::map_chr(local_tokens, "tenant") - versions <- purrr::map_int(local_tokens, "version") - - resource_index <- gregg(resources, "^{resource[[1]]}") - tenant_index <- tenants == tenant - version_index <- versions == aad_version - - # return a token matching `resource`, `tenant` and `version`, if any - token_index <- which(resource_index & tenant_index & version_index)[1] - if (!is.na(token_index)) local_tokens[[token_index]] else NULL - } else { - NULL - } -} - -#' Sub-routine for `get_auth_token()` -#' -#' Pulled out mainly to tidy up the main function code a bit -#' @keywords internal -#' @returns A string (the client ID) -get_client_id <- function() { - pluck_client_id <- function() { - suppressMessages(AzureRMR::get_azure_login()) |> - purrr::pluck("token", "client", "client_id") - } - possibly_pluck_client_id <- \(...) purrr::possibly(pluck_client_id)(...) - azure_cli_default_client_id <- "04b07795-8ddb-461a-bbee-02f9e1bf7b46" - - client_id <- possibly_pluck_client_id() - # if that fails, prompt the user to log in, then try again... 
- if (is.null(client_id)) { - AzureRMR::create_azure_login() - # ...using the default Azure CLI client ID as a final fallback - client_id <- possibly_pluck_client_id() %||% azure_cli_default_client_id - } - client_id -} - -#' Generate appropriate values for the `resource` parameter in [get_auth_token] -#' -#' A helper function to generate appropriate values. Ensure that the `version` -#' argument matches the `aad_version` argument to [get_auth_token]. -#' It's unlikely that you will ever want to set `authorise` to `FALSE` but it's -#' here as an option since [AzureAuth::get_azure_token] supports it. Similarly, -#' you are likely to want to keep `refresh` turned on (this argument has no -#' effect on v1 tokens, it only applies to v2). -#' -#' @param version numeric. The AAD version, either 1 or 2 (1 by default) -#' @param url The URL of the Azure resource host -#' @param path For v2, the path designating the access scope -#' @param authorise Boolean, whether to return a token with authorisation scope, -#' (TRUE, the default) or one that just provides authentication. You are -#' unlikely to want to turn this off -#' @param refresh Boolean, applies to v2 tokens only, whether to return a token -#' that has a refresh token also supplied. -#' @returns A scalar character, or (in most v2 situations) a character vector -#' @export -generate_resource <- function( - version = 1, - url = "https://storage.azure.com", - path = "/.default", - authorise = TRUE, - refresh = TRUE -) { - stopifnot("version must be 1 or 2" = version %in% seq(2)) - scopes <- if (refresh) c("openid", "offline_access") else "openid" - if (authorise) { - if (version == 2) { - c(paste0(url, path), scopes) - } else { - url - } - } else { - if (version == 2) { - scopes - } else { - "" - } - } -} - -#' Use a token's internal `refresh()` method to refresh it -#' -#' This method avoids the need to refresh by re-authenticating online. It seems -#' like this only works with v1 tokens. v2 tokens always seem to refresh by -#' re-authenticating with Azure online. But v2 tokens _ought_ to refresh -#' automatically and not need manual refreshing. To instead generate a -#' completely fresh token, pass `use_cache = FALSE` or `force_refresh = TRUE` -#' to [get_auth_token]. -#' @param token An Azure authentication token -#' @returns An Azure authentication token -#' @export -refresh_token <- \(token) token$refresh() +#' Get Azure authentication token +#' +#' This function retrieves an Azure token for a specified resource. +#' +#' If the environment variables `AZ_TENANT_ID`, `AZ_CLIENT_ID` and +#' `AZ_APP_SECRET` are all set, it will try to use these to return a token. +#' +#' Otherwise it will try to get a managed token from a managed resource such as +#' Azure VM or Azure App Service. +#' +#' If neither of these approaches has returned a token, it will try to retrieve +#' a user token using the provided parameters, requiring the user to have +#' authenticated using their device. If `force_refresh` is set to `TRUE`, a +#' fresh web authentication process should be launched. Otherwise it will +#' attempt to use a cached token matching the given `resource`, `tenant` and +#' `aad_version`. +#' +#' @param resource For v1, a simple URL such as `"https://storage.azure.com/"` +#' should be supplied.For v2, a vector specifying the URL of the Azure resource +#' for which the token is requested as well as any desired scopes. See +#' [AzureAuth::get_azure_token] for details. Use [generate_resource] +#' to help provide an appropriate string or vector. 
The values default to +#' `c("https://storage.azure.com/.default", "openid", "offline_access")`. +#' If setting version to 2, ensure that the `aad_version` argument is also set +#' to 2. Both are set to use AAD version 1 by default. +#' @param tenant A string specifying the Azure tenant. Defaults to +#' `"common"`. See [AzureAuth::get_azure_token] for other values. +#' @param client_id A string specifying the application ID (client ID). If +#' `NULL`, (the default) the function attempts to obtain the client ID from the +#' Azure Resource Manager token, or prompts the user to log in to obtain it. +#' @param auth_method A string specifying the authentication method. Defaults to +#' `"authorization_code"`. See [AzureAuth::get_azure_token] for other values. +#' @param aad_version Numeric. The AAD version, either 1 or 2 (1 by default) +#' @param force_refresh Boolean: whether to use a stored token if available +#' (`FALSE`, the default), or try to obtain a new one from Azure (`TRUE`). +#' This may be useful if you wish to generate a new token with the same +#' `resource` value as an existing token, but a different `tenant` or +#' `auth_method`. Note that you can also try using [refresh_token] which will +#' cause an existing token to refresh itself, without obtaining a new token +#' from Azure via online reauthentication +#' @param ... Optional arguments (`token_args` or `use_cache`) to be passed on +#' to [AzureAuth::get_managed_token] or [AzureAuth::get_azure_token]. +#' +#' @returns An Azure token object +#' @examples +#' \dontrun{ +#' # Get a token for the default resource +#' token <- get_auth_token() +#' +#' # Force generation of a new token via online reauthentication +#' token <- get_auth_token(force_refresh = TRUE) +#' +#' # Get a token for a specific resource and tenant +#' token <- get_auth_token( +#' resource = "https://graph.microsoft.com", +#' tenant = "my-tenant-id" +#' ) +#' +#' # Get a token using a specific app ID +#' token <- get_auth_token(client_id = "my-app-id") +#' } +#' @export +get_auth_token <- function( + resource = generate_resource(), + tenant = "common", + client_id = NULL, + auth_method = "authorization_code", + aad_version = 1, + force_refresh = FALSE, + ... +) { + aad_msg <- "Invalid {.arg aad_version} variable supplied (must be 1 or 2)" + aad_version <- check_that(aad_version, \(x) x %in% seq(2), aad_msg) + + safely_get_token <- \(...) purrr::safely(AzureAuth::get_azure_token)(...) + get_azure_token <- purrr::partial( + safely_get_token, + resource = resource, + version = aad_version + ) + possibly_get_mtk <- \(...) purrr::possibly(AzureAuth::get_managed_token)(...) + + dots <- rlang::list2(...) + # If the user specifies force_refresh = TRUE we turn off `use_cache`, + # otherwise we leave `use_cache` as it is (or as `NULL`, its default value) + use_cached <- !force_refresh && (dots[["use_cache"]] %||% TRUE) + dots <- rlang::dots_list(!!!dots, use_cache = use_cached, .homonyms = "last") + + # We have 4 approaches to get a token, depending on the context + # 1. Use environment variables if all three are set + token_resp <- rlang::inject(try_token_from_vars(get_azure_token, !!!dots)) + token <- token_resp[["result"]] + token_error <- token_resp[["error"]] + + # 2. Try to get a managed token (for example on Azure VM, App Service) + if (is.null(token)) { + token <- rlang::inject(possibly_get_mtk(resource, !!!dots)) + } + + # 3. 
If neither of those has worked, try to get an already stored user token + # (unless `force_refresh` is on, in which case skip to option 4) + if (is.null(token) && use_cached) { + token <- match_cached_token(resource, tenant, aad_version) + } + + # 4. If we still don't have a token, try to get a new one via reauthentication + if (is.null(token)) { + if (!force_refresh) { + cli::cli_alert_info("No matching cached token found: fetching new token") + } + client_id <- client_id %||% get_client_id() + token_resp <- rlang::inject( + get_azure_token( + tenant = tenant, + app = client_id, + auth_type = auth_method, + !!!dots + ) + ) + token <- token_resp[["result"]] + token_error <- token_error %||% token_resp[["error"]] + } + + # Give some helpful feedback if the steps above have not succeeded + if (is.null(token) || length(token) == 0) { + cli::cli_alert_info("No authentication token was obtained.") + cli::cli_alert_info("Please check any variables you have supplied.") + cli::cli_alert_info( + "Alternatively, running {.fn AzureRMR::get_azure_login} or + {.fn AzureRMR::list_azure_tokens} may shed some light on the problem." + ) + error_msg <- "{.fn get_auth_token}: No authentication token was obtained." + cli::cli_abort(as.character(token_error %||% error_msg)) + } else { + if (aad_version == 2) { + check_that(token, AzureAuth::is_azure_v2_token, "Invalid token returned") + } else { + check_that(token, AzureAuth::is_azure_v1_token, "Invalid token returned") + } + } +} + +#' Get token via app and secret environment variables +#' Sub-routine for `get_auth_token()` +#' @keywords internal +#' @returns A list with elements `result` and `error`. If this method is +#' successful, the `result` element will contain a token. +try_token_from_vars <- function(get_token_fun, ...) { + tenant_id_env <- Sys.getenv("AZ_TENANT_ID") + client_id_env <- Sys.getenv("AZ_CLIENT_ID") + client_secret <- Sys.getenv("AZ_APP_SECRET") + + if (all(nzchar(c(tenant_id_env, client_id_env, client_secret)))) { + rlang::inject( + get_token_fun( + tenant = tenant_id_env, + app = client_id_env, + password = client_secret, + ... 
+ ) + ) + } else { + list(result = NULL, error = NULL) + } +} + +#' Find an already cached token that matches desired parameters +#' Sub-routine for `get_auth_token()` +#' @keywords internal +#' @returns A token from local cache, or NULL if none matches +match_cached_token <- function(resource, tenant, aad_version) { + # list tokens already locally cached + local_tokens <- AzureAuth::list_azure_tokens() + if (length(local_tokens) > 0) { + resources <- purrr::map(local_tokens, "resource") + scopes <- purrr::map(local_tokens, list("scope", 1)) + resources <- purrr::map2_chr(resources, scopes, `%||%`) + tenants <- purrr::map_chr(local_tokens, "tenant") + versions <- purrr::map_int(local_tokens, "version") + + resource_index <- gregg(resources, "^{resource[[1]]}") + tenant_index <- tenants == tenant + version_index <- versions == aad_version + + # return a token matching `resource`, `tenant` and `version`, if any + token_index <- which(resource_index & tenant_index & version_index)[1] + if (!is.na(token_index)) local_tokens[[token_index]] else NULL + } else { + NULL + } +} + +#' Sub-routine for `get_auth_token()` +#' +#' Pulled out mainly to tidy up the main function code a bit +#' @keywords internal +#' @returns A string (the client ID) +get_client_id <- function() { + pluck_client_id <- function() { + suppressMessages(AzureRMR::get_azure_login()) |> + purrr::pluck("token", "client", "client_id") + } + possibly_pluck_client_id <- \(...) purrr::possibly(pluck_client_id)(...) + azure_cli_default_client_id <- "04b07795-8ddb-461a-bbee-02f9e1bf7b46" + + client_id <- possibly_pluck_client_id() + # if that fails, prompt the user to log in, then try again... + if (is.null(client_id)) { + AzureRMR::create_azure_login() + # ...using the default Azure CLI client ID as a final fallback + client_id <- possibly_pluck_client_id() %||% azure_cli_default_client_id + } + client_id +} + +#' Generate appropriate values for the `resource` parameter in [get_auth_token] +#' +#' A helper function to generate appropriate values. Ensure that the `version` +#' argument matches the `aad_version` argument to [get_auth_token]. +#' It's unlikely that you will ever want to set `authorise` to `FALSE` but it's +#' here as an option since [AzureAuth::get_azure_token] supports it. Similarly, +#' you are likely to want to keep `refresh` turned on (this argument has no +#' effect on v1 tokens, it only applies to v2). +#' +#' @param version numeric. The AAD version, either 1 or 2 (1 by default) +#' @param url The URL of the Azure resource host +#' @param path For v2, the path designating the access scope +#' @param authorise Boolean, whether to return a token with authorisation scope, +#' (TRUE, the default) or one that just provides authentication. You are +#' unlikely to want to turn this off +#' @param refresh Boolean, applies to v2 tokens only, whether to return a token +#' that has a refresh token also supplied. 
+#' @returns A scalar character, or (in most v2 situations) a character vector +#' @export +generate_resource <- function( + version = 1, + url = "https://storage.azure.com", + path = "/.default", + authorise = TRUE, + refresh = TRUE +) { + stopifnot("version must be 1 or 2" = version %in% seq(2)) + scopes <- if (refresh) c("openid", "offline_access") else "openid" + if (authorise) { + if (version == 2) { + c(paste0(url, path), scopes) + } else { + url + } + } else { + if (version == 2) { + scopes + } else { + "" + } + } +} + +#' Use a token's internal `refresh()` method to refresh it +#' +#' This method avoids the need to refresh by re-authenticating online. It seems +#' like this only works with v1 tokens. v2 tokens always seem to refresh by +#' re-authenticating with Azure online. But v2 tokens _ought_ to refresh +#' automatically and not need manual refreshing. To instead generate a +#' completely fresh token, pass `use_cache = FALSE` or `force_refresh = TRUE` +#' to [get_auth_token]. +#' @param token An Azure authentication token +#' @returns An Azure authentication token +#' @export +refresh_token <- \(token) token$refresh() diff --git a/R/get_container.R b/R/get_container.R index 74cb791..249276f 100644 --- a/R/get_container.R +++ b/R/get_container.R @@ -1,80 +1,80 @@ -#' Get Azure storage container -#' -#' The environment variable "AZ_STORAGE_EP" should be set. This provides the URL -#' for the default Azure storage endpoint. -#' Use [list_container_names] to get a list of available container names. -#' -#' @param container_name Name of the container as a string. `NULL` by default, -#' which means the function will look instead for a container name stored in -#' the environment variable "AZ_CONTAINER" -#' @param token An Azure authentication token. If left as `NULL`, a token -#' returned by [get_auth_token] will be used -#' @param endpoint_url An Azure endpoint URL. If left as `NULL`, the default, -#' the value of the environment variable "AZ_STORAGE_EP" will be used -#' @param ... arguments to be passed through to [get_auth_token], if a token is -#' not already supplied -#' @returns An Azure blob container (list object of class "blob_container") -#' @export -get_container <- function( - container_name = NULL, - token = NULL, - endpoint_url = NULL, - ... -) { - msg1 <- paste0( - "{.var container_name} is empty. ", - "Did you forget to set an environment variable?" - ) - msg2 <- paste0( - "{.var endpoint_url} is empty. ", - "Did you forget to set an environment variable?" - ) - container_name <- (container_name %||% check_envvar("AZ_CONTAINER")) |> - check_nzchar(msg1) - endpoint_url <- (endpoint_url %||% check_envvar("AZ_STORAGE_EP")) |> - check_nzchar(msg2) - token <- token %||% get_auth_token(...) - - get_azure_endpoint(token, endpoint_url) |> - AzureStor::blob_container(container_name) -} - -#' Return a list of container names that are found at the endpoint -#' -#' @inheritParams get_container -#' @returns A character vector of all container names found -#' @export -list_container_names <- function(token = NULL, endpoint_url = NULL, ...) { - token <- token %||% get_auth_token(...) 
- endpoint <- get_azure_endpoint(token, endpoint_url) - container_list <- AzureStor::list_blob_containers(endpoint) - stopifnot("no containers found" = length(container_list) >= 1L) - names(container_list) -} - -#' Return an Azure "blob_endpoint" -#' -#' This function will return the endpoint specified in the environment variable -#' "AZ_STORAGE_EP" by default -#' -#' @inheritParams get_container -#' @returns An Azure blob endpoint (object of class "blob_endpoint") -#' @keywords internal -get_azure_endpoint <- function(token = NULL, endpoint_url = NULL, ...) { - token <- token %||% get_auth_token(...) - endpoint_url <- endpoint_url %||% check_envvar("AZ_STORAGE_EP") - AzureStor::blob_endpoint(endpoint_url, token = token) -} - -#' Check that an environment variable exists -#' -#' The function prints a helpful error if the variable is not found, else -#' it returns the value of `Sys.getenv(x)` -#' -#' @param x the *name* of the environment variable to be found and checked -#' @returns the value of the environment variable named in `x` -#' @export -check_envvar <- function(x) { - cst_msg <- cst_error_msg("The environment variable {.envvar {x}} is not set") - check_scalar_type(Sys.getenv(x, NA_character_), "string", cst_msg) -} +#' Get Azure storage container +#' +#' The environment variable "AZ_STORAGE_EP" should be set. This provides the URL +#' for the default Azure storage endpoint. +#' Use [list_container_names] to get a list of available container names. +#' +#' @param container_name Name of the container as a string. `NULL` by default, +#' which means the function will look instead for a container name stored in +#' the environment variable "AZ_CONTAINER" +#' @param token An Azure authentication token. If left as `NULL`, a token +#' returned by [get_auth_token] will be used +#' @param endpoint_url An Azure endpoint URL. If left as `NULL`, the default, +#' the value of the environment variable "AZ_STORAGE_EP" will be used +#' @param ... arguments to be passed through to [get_auth_token], if a token is +#' not already supplied +#' @returns An Azure blob container (list object of class "blob_container") +#' @export +get_container <- function( + container_name = NULL, + token = NULL, + endpoint_url = NULL, + ... +) { + msg1 <- paste0( + "{.var container_name} is empty. ", + "Did you forget to set an environment variable?" + ) + msg2 <- paste0( + "{.var endpoint_url} is empty. ", + "Did you forget to set an environment variable?" + ) + container_name <- (container_name %||% check_envvar("AZ_CONTAINER")) |> + check_nzchar(msg1) + endpoint_url <- (endpoint_url %||% check_envvar("AZ_STORAGE_EP")) |> + check_nzchar(msg2) + token <- token %||% get_auth_token(...) + + get_azure_endpoint(token, endpoint_url) |> + AzureStor::blob_container(container_name) +} + +#' Return a list of container names that are found at the endpoint +#' +#' @inheritParams get_container +#' @returns A character vector of all container names found +#' @export +list_container_names <- function(token = NULL, endpoint_url = NULL, ...) { + token <- token %||% get_auth_token(...) 
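+  # nb: the dots (e.g. tenant or aad_version) are only forwarded to
+  # get_auth_token() above when no token has been supplied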
+ endpoint <- get_azure_endpoint(token, endpoint_url) + container_list <- AzureStor::list_blob_containers(endpoint) + stopifnot("no containers found" = length(container_list) >= 1L) + names(container_list) +} + +#' Return an Azure "blob_endpoint" +#' +#' This function will return the endpoint specified in the environment variable +#' "AZ_STORAGE_EP" by default +#' +#' @inheritParams get_container +#' @returns An Azure blob endpoint (object of class "blob_endpoint") +#' @keywords internal +get_azure_endpoint <- function(token = NULL, endpoint_url = NULL, ...) { + token <- token %||% get_auth_token(...) + endpoint_url <- endpoint_url %||% check_envvar("AZ_STORAGE_EP") + AzureStor::blob_endpoint(endpoint_url, token = token) +} + +#' Check that an environment variable exists +#' +#' The function prints a helpful error if the variable is not found, else +#' it returns the value of `Sys.getenv(x)` +#' +#' @param x the *name* of the environment variable to be found and checked +#' @returns the value of the environment variable named in `x` +#' @export +check_envvar <- function(x) { + cst_msg <- cst_error_msg("The environment variable {.envvar {x}} is not set") + check_scalar_type(Sys.getenv(x, NA_character_), "string", cst_msg) +} diff --git a/R/helpers.R b/R/helpers.R index b9b5cfe..05b7392 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -13,11 +13,11 @@ #' @seealso [check_vec] #' @export check_that <- function(x, predicate, message, pf = parent.frame()) { - if (predicate(x)) { - x - } else { - cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) - } + if (predicate(x)) { + x + } else { + cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) + } } #' @export @@ -47,19 +47,19 @@ ct_error_msg <- \(text) paste0("{.fn check_that}: ", text) #' @seealso [check_scalar_type()] #' @export check_vec <- function( - x, - predicate, - message, - which = c("every", "some", "none"), - pf = parent.frame() + x, + predicate, + message, + which = c("every", "some", "none"), + pf = parent.frame() ) { - w <- rlang::arg_match(which) - test_call <- rlang::call2(w, .x = x, .p = predicate, .ns = "purrr") - if (eval(test_call)) { - x - } else { - cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) - } + w <- rlang::arg_match(which) + test_call <- rlang::call2(w, .x = x, .p = predicate, .ns = "purrr") + if (eval(test_call)) { + x + } else { + cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) + } } #' @export @@ -78,32 +78,32 @@ cv_error_msg <- \(text) paste0("{.fn check_vec}: ", text) #' @param type A string defining the R object type that `x` is checked to be #' @export check_scalar_type <- function( - x, - type, - message, - pf = parent.frame() + x, + type, + message, + pf = parent.frame() ) { - opts <- c( - "character", - "logical", - "integer", - "double", - "string", - "bool", - "list", - "bytes", - "raw", - "vector", - "complex" - ) - t <- rlang::arg_match(type, opts) - t <- if (t %in% c("string", "bool")) t else paste0("scalar_", t) - test_call <- rlang::call2(paste0("is_", t), x = x, .ns = "rlang") - if (eval(test_call)) { - x - } else { - cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) - } + opts <- c( + "character", + "logical", + "integer", + "double", + "string", + "bool", + "list", + "bytes", + "raw", + "vector", + "complex" + ) + t <- rlang::arg_match(type, opts) + t <- if (t %in% c("string", "bool")) t else paste0("scalar_", t) + test_call <- rlang::call2(paste0("is_", t), x = x, .ns = "rlang") + if (eval(test_call)) { + x + } else { + 
cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) + } } #' @export @@ -121,17 +121,17 @@ cst_error_msg <- \(text) paste0("{.fn check_scalar_type}: ", text) #' makes it easier to include informative values in the message. #' @export check_nzchar <- function(x, message, pf = parent.frame()) { - if (is.null(x)) { - NULL - } - cnz <- "check_nzchar" # nolint - check_scalar_type(x, "string", "{.fn {cnz}}: {.var x} is not a string") - if (nzchar(x)) { - x - } else { - message <- paste0("{.fn {cnz}}: ", message) - cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) - } + if (is.null(x)) { + NULL + } + cnz <- "check_nzchar" # nolint + check_scalar_type(x, "string", "{.fn {cnz}}: {.var x} is not a string") + if (nzchar(x)) { + x + } else { + message <- paste0("{.fn {cnz}}: ", message) + cli::cli_abort(message, call = rlang::caller_call(), .envir = pf) + } } #' grepl a glued regex @@ -151,11 +151,11 @@ gregg <- \(x, rx, ..., g = parent.frame()) grepl(glue::glue_data(g, rx), x, ...) #' @inheritParams read_azure_parquet #' @export check_container_class <- function(container) { - if (inherits(container, "blob_container")) { - container - } else { - ccc <- "check_container_class" # nolint - cc <- rlang::caller_call() - cli::cli_abort("{.fn {ccc}}: This is not a valid blob container", call = cc) - } + if (inherits(container, "blob_container")) { + container + } else { + ccc <- "check_container_class" # nolint + cc <- rlang::caller_call() + cli::cli_abort("{.fn {ccc}}: This is not a valid blob container", call = cc) + } } diff --git a/R/list_files.R b/R/list_files.R index c5b48f5..b15e5ce 100644 --- a/R/list_files.R +++ b/R/list_files.R @@ -1,57 +1,57 @@ -#' List files in a container -#' -#' Recursively (or not, if desired) lists all files found in a container. Search -#' can be restricted to a particular 'subdirectory' of the container, and/or -#' to files with a specific extension. The function assumes that all file names -#' end with a ".ext" extension of some sort. -#' -#' The function does not support filtering by file name, only by file extension. -#' -#' The returned file list (character vector) contains the full paths to the -#' files, ready to be passed perhaps to a `read_azure_*` function, or further -#' filtered by you. If you just want the names of the files without the folder -#' path, use [basename()] to extract these. -#' -#' @inheritParams read_azure_parquet -#' @param path (optional) subdirectory of the container to list files within. -#' `""` (the root folder of the container) by default -#' @param ext (optional) A string giving the extension of a particular file type -#' you want to restrict the list to. No need to include the initial ".". The -#' default, `""`, means no filtering by file extension will be applied. Can be -#' a regular expression. -#' @param recursive A Boolean value: whether to list files recursively. 
`TRUE` -#' by default -#' -#' @importFrom rlang .data -#' @returns A vector of file names, or an empty character vector if none found -#' @examples \dontrun{ -#' list_files(get_container("example"), ext = "csv") -#' } -#' @export -list_files <- function(container, path = "", ext = "", recursive = TRUE) { - stopifnot(rlang::is_character(c(path, ext), 2)) - stopifnot(rlang::is_bool(recursive)) - pnf_msg <- ct_error_msg("Path {.val {path}} not found") - check_that(path, \(x) AzureStor::blob_dir_exists(container, x), pnf_msg) - - tbl <- AzureStor::list_blobs(container, path, recursive = recursive) - if (nrow(tbl) > 0) { - ext_rx <- if (nzchar(ext)) sub("^\\.+", "", ext) else ".*" # nolint - tbl <- tbl |> - dplyr::filter(!.data[["isdir"]] & gregg(.data[["name"]], "\\.{ext_rx}$")) - } - - # A zero-row tbl can result if `path` is initially empty, or via the filter - # step above. We handle this the same way, no matter which route led here. - if (nrow(tbl) == 0) { - fix_path <- \(p) sub("^/+$", "", sub("^([^/])(.*)", "/\\1\\2", p)) # nolint - ext <- if (nzchar(ext)) paste0(" ", ext) - msg <- "No{ext} files found in {.val [{container$name}]:{fix_path(path)}}" - if (rlang::is_interactive()) { - cli::cli_alert_info(msg) - } - invisible(character(0)) - } else { - tbl[["name"]] - } -} +#' List files in a container +#' +#' Recursively (or not, if desired) lists all files found in a container. Search +#' can be restricted to a particular 'subdirectory' of the container, and/or +#' to files with a specific extension. The function assumes that all file names +#' end with a ".ext" extension of some sort. +#' +#' The function does not support filtering by file name, only by file extension. +#' +#' The returned file list (character vector) contains the full paths to the +#' files, ready to be passed perhaps to a `read_azure_*` function, or further +#' filtered by you. If you just want the names of the files without the folder +#' path, use [basename()] to extract these. +#' +#' @inheritParams read_azure_parquet +#' @param path (optional) subdirectory of the container to list files within. +#' `""` (the root folder of the container) by default +#' @param ext (optional) A string giving the extension of a particular file type +#' you want to restrict the list to. No need to include the initial ".". The +#' default, `""`, means no filtering by file extension will be applied. Can be +#' a regular expression. +#' @param recursive A Boolean value: whether to list files recursively. `TRUE` +#' by default +#' +#' @importFrom rlang .data +#' @returns A vector of file names, or an empty character vector if none found +#' @examples \dontrun{ +#' list_files(get_container("example"), ext = "csv") +#' } +#' @export +list_files <- function(container, path = "", ext = "", recursive = TRUE) { + stopifnot(rlang::is_character(c(path, ext), 2)) + stopifnot(rlang::is_bool(recursive)) + pnf_msg <- ct_error_msg("Path {.val {path}} not found") + check_that(path, \(x) AzureStor::blob_dir_exists(container, x), pnf_msg) + + tbl <- AzureStor::list_blobs(container, path, recursive = recursive) + if (nrow(tbl) > 0) { + ext_rx <- if (nzchar(ext)) sub("^\\.+", "", ext) else ".*" # nolint + tbl <- tbl |> + dplyr::filter(!.data[["isdir"]] & gregg(.data[["name"]], "\\.{ext_rx}$")) + } + + # A zero-row tbl can result if `path` is initially empty, or via the filter + # step above. We handle this the same way, no matter which route led here. 
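+  # e.g. (hypothetical container `cont`): list_files(cont, "dev", ext = "csv")
+  # returns character(0) invisibly, with an informational message in
+  # interactive sessions, when "dev" exists but holds no csv files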
+ if (nrow(tbl) == 0) { + fix_path <- \(p) sub("^/+$", "", sub("^([^/])(.*)", "/\\1\\2", p)) # nolint + ext <- if (nzchar(ext)) paste0(" ", ext) + msg <- "No{ext} files found in {.val [{container$name}]:{fix_path(path)}}" + if (rlang::is_interactive()) { + cli::cli_alert_info(msg) + } + invisible(character(0)) + } else { + tbl[["name"]] + } +} diff --git a/R/read_azure_files.R b/R/read_azure_files.R index b24c1e6..7b58120 100644 --- a/R/read_azure_files.R +++ b/R/read_azure_files.R @@ -1,157 +1,157 @@ -#' Read a parquet file from Azure storage -#' -#' @param container An Azure container object, as returned by [get_container] -#' @param file The name of the file to be read, as a string. NB The file -#' extension does not need to be included (though it can be). The function -#' will error if multiple files are somehow matched. -#' @param path The path to the directory where `file` is located, as a string. -#' Only needed if `file` does not already contain its full path. If file is -#' just a file name with no path, then provide the path to the directory here. -#' This must be the full path to the file location, as the function will not -#' search into subdirectories recursively. Set to `"/"` (the root of the -#' container) by default. -#' @param info Boolean. Whether to print user feedback about the file that is -#' being read. Useful for checking the function is doing what is expected, but -#' can be turned off with `FALSE`. Can be set persistently with the option -#' "azkit.info". If `NULL` then it will default to the value of -#' [rlang::is_interactive] (that is, `TRUE` for interactive sessions). -#' @param ... optional arguments to be passed through to [arrow::read_parquet] -#' @returns A tibble -#' @examples \dontrun{ -#' # if a full filepath is available then path can be ignored -#' read_azure_parquet(cont, "data/folder/path/1.parquet") -#' # you can provide a filename without the '.parquet' extension -#' # if you wish to use this partial file name matching it is probably easier -#' # to provide a 'path' -#' read_azure_parquet(cont, "case_details", "storage/parquet/2025/06/29") -#' } -#' @export -read_azure_parquet <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "parquet", info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL) |> - arrow::read_parquet(...) -} - -#' Read a json file from Azure storage -#' -#' @inheritParams read_azure_parquet -#' @param ... optional arguments to be passed through to -#' [yyjsonr::read_json_raw] -#' @returns A list -#' @export -read_azure_json <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "json", info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL) |> - yyjsonr::read_json_raw(...) -} - -#' Read a json.gz file from Azure storage -#' -#' @inheritParams read_azure_parquet -#' @param ... optional arguments to be passed through to -#' [yyjsonr::read_json_file] -#' @returns A list -#' @export -read_azure_jsongz <- function(container, file, path = "/", info = NULL, ...) { - full_path <- check_blob_exists(container, file, "json.gz", info, path) - dl <- withr::local_tempfile( - pattern = tools::file_path_sans_ext(basename(full_path), TRUE), - fileext = "json.gz" - ) - AzureStor::download_blob(container, src = full_path, dest = dl) - yyjsonr::read_json_file(dl, ...) 
-} - -#' Read an rds file from Azure storage -#' -#' @inheritParams read_azure_parquet -#' @param ... optional arguments to be passed through to -#' [AzureStor::storage_load_rds]. For example, a compression type (one of -#' c("unknown", "gzip", "bzip2", "xz", "zstd", "none")) can be provided using -#' the argument `type`, which will be passed on to [memDecompress] via -#' [AzureStor::storage_load_rds]. -# If nothing is provided here, the compression type will be set to "none". -#' @returns The data object that was stored in the rds file -#' @export -read_azure_rds <- function(container, file, path = "/", info = NULL, ...) { - # If the user doesn't specify a (de)compression type with `type` in `...`, we - # will set a `type` of "none", as this seems to be the standard on SU Azure - dots <- rlang::dots_list(..., type = "none", .homonyms = "first") - blob <- check_blob_exists(container, file, "rds", info, path) - rlang::inject(AzureStor::storage_load_rds(container, blob, !!!dots)) -} - -#' Read a csv file from Azure storage -#' -#' @inheritParams read_azure_parquet -#' @param ... optional arguments to be passed through to [readr::read_delim] -#' @returns A tibble -#' @export -read_azure_csv <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "csv", info, path) |> - AzureStor::storage_read_csv(container, file = _, ...) -} - -#' Read any file from Azure storage -#' -#' @inheritParams read_azure_parquet -#' @param ext If a custom extension needs to be supplied, you can specify it -#' here. If `NULL`, the default, the extension of `file` will be used -#' @param ... optional arguments to be passed through to -#' [AzureStor::download_blob] -#' @returns A raw data stream -#' @export -read_azure_file <- function( - container, - file, - path = "/", - info = NULL, - ext = NULL, - ... -) { - ext <- ext %||% tools::file_ext(file) - check_blob_exists(container, file, ext, info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL, ...) -} - -#' Ensures that the filepath for the file to read exists -#' -#' @inheritParams read_azure_parquet -#' @param ext The standard file extension for the file type, e.g. "json" -#' @keywords internal -check_blob_exists <- function(container, file, ext, info, path) { - stopifnot("no container found" = inherits(container, "blob_container")) - path <- if (path %in% c("", "/")) "" else path - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - dir_name <- if (dirname(file) == ".") "" else dirname(file) - # Potentially the user could provide a partial file path in `path` and a - # further sub-directory as part of `file`. This handles that eventuality, - # though this usage pattern should be quite rare! 
- dpath <- file.path(path, dir_name) - fname <- basename(file) - if (nzchar(ext) && !gregg(fname, "\\.{ext}$")) { - fname <- glue::glue("{fname}.{ext}") - } - # remove duplicate slashes and any initial slashes - file_path <- sub("^/", "", gsub("/+", "/", file.path(dpath, fname))) - - filepath_out <- AzureStor::list_blobs(container, dpath, recursive = FALSE) |> - dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> - dplyr::pull("name") - - msg1 <- ct_error_msg("no matching {ext} file found") - msg2 <- cst_error_msg("multiple matching {ext} files found") - check_that(filepath_out, \(x) length(x) > 0, msg1) # check length > 0 - check_scalar_type(filepath_out, "character", msg2) # check length == 1 - - info_option <- getOption("azkit.info") - stopifnot(rlang::is_scalar_logical(info) || is.null(info)) - stopifnot(rlang::is_scalar_logical(info_option) || is.null(info_option)) - if (info %||% info_option %||% rlang::is_interactive()) { - cli::cli_alert_info("File {.val {filepath_out}} will be read in") - } - filepath_out -} +#' Read a parquet file from Azure storage +#' +#' @param container An Azure container object, as returned by [get_container] +#' @param file The name of the file to be read, as a string. NB The file +#' extension does not need to be included (though it can be). The function +#' will error if multiple files are somehow matched. +#' @param path The path to the directory where `file` is located, as a string. +#' Only needed if `file` does not already contain its full path. If file is +#' just a file name with no path, then provide the path to the directory here. +#' This must be the full path to the file location, as the function will not +#' search into subdirectories recursively. Set to `"/"` (the root of the +#' container) by default. +#' @param info Boolean. Whether to print user feedback about the file that is +#' being read. Useful for checking the function is doing what is expected, but +#' can be turned off with `FALSE`. Can be set persistently with the option +#' "azkit.info". If `NULL` then it will default to the value of +#' [rlang::is_interactive] (that is, `TRUE` for interactive sessions). +#' @param ... optional arguments to be passed through to [arrow::read_parquet] +#' @returns A tibble +#' @examples \dontrun{ +#' # if a full filepath is available then path can be ignored +#' read_azure_parquet(cont, "data/folder/path/1.parquet") +#' # you can provide a filename without the '.parquet' extension +#' # if you wish to use this partial file name matching it is probably easier +#' # to provide a 'path' +#' read_azure_parquet(cont, "case_details", "storage/parquet/2025/06/29") +#' } +#' @export +read_azure_parquet <- function(container, file, path = "/", info = NULL, ...) { + check_blob_exists(container, file, "parquet", info, path) |> + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, src = _, dest = NULL) |> + arrow::read_parquet(...) +} + +#' Read a json file from Azure storage +#' +#' @inheritParams read_azure_parquet +#' @param ... optional arguments to be passed through to +#' [yyjsonr::read_json_raw] +#' @returns A list +#' @export +read_azure_json <- function(container, file, path = "/", info = NULL, ...) { + check_blob_exists(container, file, "json", info, path) |> + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, src = _, dest = NULL) |> + yyjsonr::read_json_raw(...) 
+} + +#' Read a json.gz file from Azure storage +#' +#' @inheritParams read_azure_parquet +#' @param ... optional arguments to be passed through to +#' [yyjsonr::read_json_file] +#' @returns A list +#' @export +read_azure_jsongz <- function(container, file, path = "/", info = NULL, ...) { + full_path <- check_blob_exists(container, file, "json.gz", info, path) + dl <- withr::local_tempfile( + pattern = tools::file_path_sans_ext(basename(full_path), TRUE), + fileext = "json.gz" + ) + AzureStor::download_blob(container, src = full_path, dest = dl) + yyjsonr::read_json_file(dl, ...) +} + +#' Read an rds file from Azure storage +#' +#' @inheritParams read_azure_parquet +#' @param ... optional arguments to be passed through to +#' [AzureStor::storage_load_rds]. For example, a compression type (one of +#' c("unknown", "gzip", "bzip2", "xz", "zstd", "none")) can be provided using +#' the argument `type`, which will be passed on to [memDecompress] via +#' [AzureStor::storage_load_rds]. +# If nothing is provided here, the compression type will be set to "none". +#' @returns The data object that was stored in the rds file +#' @export +read_azure_rds <- function(container, file, path = "/", info = NULL, ...) { + # If the user doesn't specify a (de)compression type with `type` in `...`, we + # will set a `type` of "none", as this seems to be the standard on SU Azure + dots <- rlang::dots_list(..., type = "none", .homonyms = "first") + blob <- check_blob_exists(container, file, "rds", info, path) + rlang::inject(AzureStor::storage_load_rds(container, blob, !!!dots)) +} + +#' Read a csv file from Azure storage +#' +#' @inheritParams read_azure_parquet +#' @param ... optional arguments to be passed through to [readr::read_delim] +#' @returns A tibble +#' @export +read_azure_csv <- function(container, file, path = "/", info = NULL, ...) { + check_blob_exists(container, file, "csv", info, path) |> + AzureStor::storage_read_csv(container, file = _, ...) +} + +#' Read any file from Azure storage +#' +#' @inheritParams read_azure_parquet +#' @param ext If a custom extension needs to be supplied, you can specify it +#' here. If `NULL`, the default, the extension of `file` will be used +#' @param ... optional arguments to be passed through to +#' [AzureStor::download_blob] +#' @returns A raw data stream +#' @export +read_azure_file <- function( + container, + file, + path = "/", + info = NULL, + ext = NULL, + ... +) { + ext <- ext %||% tools::file_ext(file) + check_blob_exists(container, file, ext, info, path) |> + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, src = _, dest = NULL, ...) +} + +#' Ensures that the filepath for the file to read exists +#' +#' @inheritParams read_azure_parquet +#' @param ext The standard file extension for the file type, e.g. "json" +#' @keywords internal +check_blob_exists <- function(container, file, ext, info, path) { + stopifnot("no container found" = inherits(container, "blob_container")) + path <- if (path %in% c("", "/")) "" else path + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + dir_name <- if (dirname(file) == ".") "" else dirname(file) + # Potentially the user could provide a partial file path in `path` and a + # further sub-directory as part of `file`. This handles that eventuality, + # though this usage pattern should be quite rare! 
+ dpath <- file.path(path, dir_name) + fname <- basename(file) + if (nzchar(ext) && !gregg(fname, "\\.{ext}$")) { + fname <- glue::glue("{fname}.{ext}") + } + # remove duplicate slashes and any initial slashes + file_path <- sub("^/", "", gsub("/+", "/", file.path(dpath, fname))) + + filepath_out <- AzureStor::list_blobs(container, dpath, recursive = FALSE) |> + dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> + dplyr::pull("name") + + msg1 <- ct_error_msg("no matching {ext} file found") + msg2 <- cst_error_msg("multiple matching {ext} files found") + check_that(filepath_out, \(x) length(x) > 0, msg1) # check length > 0 + check_scalar_type(filepath_out, "character", msg2) # check length == 1 + + info_option <- getOption("azkit.info") + stopifnot(rlang::is_scalar_logical(info) || is.null(info)) + stopifnot(rlang::is_scalar_logical(info_option) || is.null(info_option)) + if (info %||% info_option %||% rlang::is_interactive()) { + cli::cli_alert_info("File {.val {filepath_out}} will be read in") + } + filepath_out +} diff --git a/R/read_azure_table.R b/R/read_azure_table.R index db9559d..3dd3998 100644 --- a/R/read_azure_table.R +++ b/R/read_azure_table.R @@ -1,39 +1,39 @@ -#' Read in data from an Azure table -#' -#' @param table_name name of the table to be read. If left as `NULL`, -#' the default, the function will look instead for a value stored in the -#' environment variable "AZ_TABLE_NAME" -#' @param table_endpoint URL of the Azure table endpoint. If left as `NULL`, -#' the default, the function will look instead for a value stored in the -#' environment variable "AZ_TABLE_EP" -#' @param ... parameters to be passed through to [get_auth_token] -#' @inheritParams get_container -#' @returns A tibble -#' @export -read_azure_table <- function( - table_name = NULL, - table_endpoint = NULL, - token = NULL, - ... -) { - table_name <- table_name %||% check_envvar("AZ_TABLE_NAME") - table_ep <- table_endpoint %||% check_envvar("AZ_TABLE_EP") - token <- token %||% get_auth_token(...) - access_token <- token |> - purrr::pluck("credentials", "access_token") - headers <- list("2025-11-05", "application/json;odata=nometadata") |> - purrr::set_names(c("x-ms-version", "Accept")) - - resp <- httr2::request(table_ep) |> - httr2::req_url_path_append(table_name) |> - httr2::req_auth_bearer_token(access_token) |> - httr2::req_headers(!!!headers) |> - httr2::req_perform() |> - httr2::resp_check_status() - - resp |> - httr2::resp_body_json() |> - purrr::pluck("value") |> - purrr::map(tibble::as_tibble) |> - purrr::list_rbind() -} +#' Read in data from an Azure table +#' +#' @param table_name name of the table to be read. If left as `NULL`, +#' the default, the function will look instead for a value stored in the +#' environment variable "AZ_TABLE_NAME" +#' @param table_endpoint URL of the Azure table endpoint. If left as `NULL`, +#' the default, the function will look instead for a value stored in the +#' environment variable "AZ_TABLE_EP" +#' @param ... parameters to be passed through to [get_auth_token] +#' @inheritParams get_container +#' @returns A tibble +#' @export +read_azure_table <- function( + table_name = NULL, + table_endpoint = NULL, + token = NULL, + ... +) { + table_name <- table_name %||% check_envvar("AZ_TABLE_NAME") + table_ep <- table_endpoint %||% check_envvar("AZ_TABLE_EP") + token <- token %||% get_auth_token(...) 
+ access_token <- token |> + purrr::pluck("credentials", "access_token") + headers <- list("2025-11-05", "application/json;odata=nometadata") |> + purrr::set_names(c("x-ms-version", "Accept")) + + resp <- httr2::request(table_ep) |> + httr2::req_url_path_append(table_name) |> + httr2::req_auth_bearer_token(access_token) |> + httr2::req_headers(!!!headers) |> + httr2::req_perform() |> + httr2::resp_check_status() + + resp |> + httr2::resp_body_json() |> + purrr::pluck("value") |> + purrr::map(tibble::as_tibble) |> + purrr::list_rbind() +} diff --git a/azkit.Rproj b/azkit.Rproj index 38b9011..69fafd4 100644 --- a/azkit.Rproj +++ b/azkit.Rproj @@ -1,22 +1,22 @@ -Version: 1.0 - -RestoreWorkspace: No -SaveWorkspace: No -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX - -AutoAppendNewline: Yes -StripTrailingWhitespace: Yes -LineEndingConversion: Posix - -BuildType: Package -PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes +LineEndingConversion: Posix + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/azkit.code-workspace b/azkit.code-workspace index 7f23b68..01efb84 100644 --- a/azkit.code-workspace +++ b/azkit.code-workspace @@ -1,59 +1,59 @@ { - "folders": [ - { - "path": "." - } - ], - "settings": { - "workbench.colorCustomizations": { - "editorRuler.foreground": "#ff4081", - "statusBarItem.warningBackground": "#41e6ec", - "statusBarItem.warningForeground": "#000000", - "statusBarItem.warningHoverBackground": "#41e6ec", - "statusBarItem.warningHoverForeground": "#00000090", - "statusBarItem.remoteBackground": "#4ef3f9", - "statusBarItem.remoteForeground": "#000000", - "statusBarItem.remoteHoverBackground": "#5bffff", - "statusBarItem.remoteHoverForeground": "#00000090", - "statusBar.background": "#41e6ec", - "statusBar.foreground": "#000000", - "statusBar.border": "#41e6ec", - "statusBar.debuggingBackground": "#41e6ec", - "statusBar.debuggingForeground": "#000000", - "statusBar.debuggingBorder": "#41e6ec", - "statusBar.noFolderBackground": "#41e6ec", - "statusBar.noFolderForeground": "#000000", - "statusBar.noFolderBorder": "#41e6ec", - "statusBar.prominentBackground": "#41e6ec", - "statusBar.prominentForeground": "#000000", - "statusBar.prominentHoverBackground": "#41e6ec", - "statusBar.prominentHoverForeground": "#00000090", - "focusBorder": "#41e6ec99", - "progressBar.background": "#41e6ec", - "textLink.foreground": "#81ffff", - "textLink.activeForeground": "#8effff", - "selection.background": "#34d9df", - "list.highlightForeground": "#41e6ec", - "list.focusAndSelectionOutline": "#41e6ec99", - "button.background": "#41e6ec", - "button.foreground": "#000000", - "button.hoverBackground": "#4ef3f9", - "tab.activeBorderTop": "#4ef3f9", - "pickerGroup.foreground": "#4ef3f9", - "list.activeSelectionBackground": "#41e6ec4d", - "panelTitle.activeBorder": "#4ef3f9", - "activityBar.activeBorder": "#41e6ec", - "activityBarBadge.foreground": "#000000", - "activityBarBadge.background": "#41e6ec" - }, - "window.title": "{azkit}", - "projectColors.name": "{azkit}", - 
"projectColors.mainColor": "#41e6ec", - "projectColors.isActivityBarColored": false, - "projectColors.isTitleBarColored": false, - "projectColors.isStatusBarColored": true, - "projectColors.isProjectNameColored": true, - "projectColors.isActiveItemsColored": true, - "projectColors.setWindowTitle": true - } + "folders": [ + { + "path": "." + } + ], + "settings": { + "workbench.colorCustomizations": { + "editorRuler.foreground": "#ff4081", + "statusBarItem.warningBackground": "#41e6ec", + "statusBarItem.warningForeground": "#000000", + "statusBarItem.warningHoverBackground": "#41e6ec", + "statusBarItem.warningHoverForeground": "#00000090", + "statusBarItem.remoteBackground": "#4ef3f9", + "statusBarItem.remoteForeground": "#000000", + "statusBarItem.remoteHoverBackground": "#5bffff", + "statusBarItem.remoteHoverForeground": "#00000090", + "statusBar.background": "#41e6ec", + "statusBar.foreground": "#000000", + "statusBar.border": "#41e6ec", + "statusBar.debuggingBackground": "#41e6ec", + "statusBar.debuggingForeground": "#000000", + "statusBar.debuggingBorder": "#41e6ec", + "statusBar.noFolderBackground": "#41e6ec", + "statusBar.noFolderForeground": "#000000", + "statusBar.noFolderBorder": "#41e6ec", + "statusBar.prominentBackground": "#41e6ec", + "statusBar.prominentForeground": "#000000", + "statusBar.prominentHoverBackground": "#41e6ec", + "statusBar.prominentHoverForeground": "#00000090", + "focusBorder": "#41e6ec99", + "progressBar.background": "#41e6ec", + "textLink.foreground": "#81ffff", + "textLink.activeForeground": "#8effff", + "selection.background": "#34d9df", + "list.highlightForeground": "#41e6ec", + "list.focusAndSelectionOutline": "#41e6ec99", + "button.background": "#41e6ec", + "button.foreground": "#000000", + "button.hoverBackground": "#4ef3f9", + "tab.activeBorderTop": "#4ef3f9", + "pickerGroup.foreground": "#4ef3f9", + "list.activeSelectionBackground": "#41e6ec4d", + "panelTitle.activeBorder": "#4ef3f9", + "activityBar.activeBorder": "#41e6ec", + "activityBarBadge.foreground": "#000000", + "activityBarBadge.background": "#41e6ec" + }, + "window.title": "{azkit}", + "projectColors.name": "{azkit}", + "projectColors.mainColor": "#41e6ec", + "projectColors.isActivityBarColored": false, + "projectColors.isTitleBarColored": false, + "projectColors.isStatusBarColored": true, + "projectColors.isProjectNameColored": true, + "projectColors.isActiveItemsColored": true, + "projectColors.setWindowTitle": true + } } diff --git a/inst/azkit.svg b/inst/azkit.svg index d0427bf..f845cb4 100644 --- a/inst/azkit.svg +++ b/inst/azkit.svg @@ -260,8 +260,8 @@ inkscape:label="cloud" style="display:none" sodipodi:insensitive="true"> - - + +