From 4aff1f99ed251e9d66bcf83df4b80bbac4c407f3 Mon Sep 17 00:00:00 2001 From: Tom Jemmett Date: Fri, 13 Mar 2026 16:54:12 +0000 Subject: [PATCH] runs air format --- R/get_auth_token.R | 550 ++++++++++---------- R/get_container.R | 166 +++--- R/list_files.R | 114 ++-- R/read_azure_files.R | 326 ++++++------ R/read_azure_table.R | 78 +-- tests/testthat/test-read_azure_files.R | 686 ++++++++++++------------- 6 files changed, 960 insertions(+), 960 deletions(-) diff --git a/R/get_auth_token.R b/R/get_auth_token.R index 9e8a0b8..3a0e7a5 100644 --- a/R/get_auth_token.R +++ b/R/get_auth_token.R @@ -1,275 +1,275 @@ -#' Get Azure authentication token -#' -#' This function retrieves an Azure token for a specified resource. -#' -#' If the environment variables `AZ_TENANT_ID`, `AZ_CLIENT_ID` and -#' `AZ_APP_SECRET` are all set, it will try to use these to return a token. -#' -#' Otherwise it will try to get a managed token from a managed resource such as -#' Azure VM or Azure App Service. -#' -#' If neither of these approaches has returned a token, it will try to retrieve -#' a user token using the provided parameters, requiring the user to have -#' authenticated using their device. If `force_refresh` is set to `TRUE`, a -#' fresh web authentication process should be launched. Otherwise it will -#' attempt to use a cached token matching the given `resource`, `tenant` and -#' `aad_version`. -#' -#' @param resource For v1, a simple URL such as `"https://storage.azure.com/"` -#' should be supplied.For v2, a vector specifying the URL of the Azure resource -#' for which the token is requested as well as any desired scopes. See -#' [AzureAuth::get_azure_token] for details. Use [generate_resource] -#' to help provide an appropriate string or vector. The values default to -#' `c("https://storage.azure.com/.default", "openid", "offline_access")`. -#' If setting version to 2, ensure that the `aad_version` argument is also set -#' to 2. Both are set to use AAD version 1 by default. -#' @param tenant A string specifying the Azure tenant. Defaults to -#' `"common"`. See [AzureAuth::get_azure_token] for other values. -#' @param client_id A string specifying the application ID (client ID). If -#' `NULL`, (the default) the function attempts to obtain the client ID from the -#' Azure Resource Manager token, or prompts the user to log in to obtain it. -#' @param auth_method A string specifying the authentication method. Defaults to -#' `"authorization_code"`. See [AzureAuth::get_azure_token] for other values. -#' @param aad_version Numeric. The AAD version, either 1 or 2 (1 by default) -#' @param force_refresh Boolean: whether to use a stored token if available -#' (`FALSE`, the default), or try to obtain a new one from Azure (`TRUE`). -#' This may be useful if you wish to generate a new token with the same -#' `resource` value as an existing token, but a different `tenant` or -#' `auth_method`. Note that you can also try using [refresh_token] which will -#' cause an existing token to refresh itself, without obtaining a new token -#' from Azure via online reauthentication -#' @param ... Optional arguments (`token_args` or `use_cache`) to be passed on -#' to [AzureAuth::get_managed_token] or [AzureAuth::get_azure_token]. -#' -#' @returns An Azure token object -#' @examples -#' \dontrun{ -#' # Get a token for the default resource -#' token <- get_auth_token() -#' -#' # Force generation of a new token via online reauthentication -#' token <- get_auth_token(force_refresh = TRUE) -#' -#' # Get a token for a specific resource and tenant -#' token <- get_auth_token( -#' resource = "https://graph.microsoft.com", -#' tenant = "my-tenant-id" -#' ) -#' -#' # Get a token using a specific app ID -#' token <- get_auth_token(client_id = "my-app-id") -#' } -#' @export -get_auth_token <- function( - resource = generate_resource(), - tenant = "common", - client_id = NULL, - auth_method = "authorization_code", - aad_version = 1, - force_refresh = FALSE, - ... -) { - aad_msg <- "Invalid {.arg aad_version} variable supplied (must be 1 or 2)" - aad_version <- check_that(aad_version, \(x) x %in% seq(2), aad_msg) - - safely_get_token <- \(...) purrr::safely(AzureAuth::get_azure_token)(...) - get_azure_token <- purrr::partial( - safely_get_token, - resource = resource, - version = aad_version - ) - possibly_get_mtk <- \(...) purrr::possibly(AzureAuth::get_managed_token)(...) - - dots <- rlang::list2(...) - # If the user specifies force_refresh = TRUE we turn off `use_cache`, - # otherwise we leave `use_cache` as it is (or as `NULL`, its default value) - use_cached <- !force_refresh && (dots[["use_cache"]] %||% TRUE) - dots <- rlang::dots_list(!!!dots, use_cache = use_cached, .homonyms = "last") - - # We have 4 approaches to get a token, depending on the context - # 1. Use environment variables if all three are set - token_resp <- rlang::inject(try_token_from_vars(get_azure_token, !!!dots)) - token <- token_resp[["result"]] - token_error <- token_resp[["error"]] - - # 2. Try to get a managed token (for example on Azure VM, App Service) - if (is.null(token)) { - token <- rlang::inject(possibly_get_mtk(resource, !!!dots)) - } - - # 3. If neither of those has worked, try to get an already stored user token - # (unless `force_refresh` is on, in which case skip to option 4) - if (is.null(token) && use_cached) { - token <- match_cached_token(resource, tenant, aad_version) - } - - # 4. If we still don't have a token, try to get a new one via reauthentication - if (is.null(token)) { - if (!force_refresh) { - cli::cli_alert_info("No matching cached token found: fetching new token") - } - client_id <- client_id %||% get_client_id() - token_resp <- rlang::inject( - get_azure_token( - tenant = tenant, - app = client_id, - auth_type = auth_method, - !!!dots - ) - ) - token <- token_resp[["result"]] - token_error <- token_error %||% token_resp[["error"]] - } - - # Give some helpful feedback if the steps above have not succeeded - if (is.null(token) || length(token) == 0) { - cli::cli_alert_info("No authentication token was obtained.") - cli::cli_alert_info("Please check any variables you have supplied.") - cli::cli_alert_info( - "Alternatively, running {.fn AzureRMR::get_azure_login} or - {.fn AzureRMR::list_azure_tokens} may shed some light on the problem." - ) - error_msg <- "{.fn get_auth_token}: No authentication token was obtained." - cli::cli_abort(as.character(token_error %||% error_msg)) - } else { - if (aad_version == 2) { - check_that(token, AzureAuth::is_azure_v2_token, "Invalid token returned") - } else { - check_that(token, AzureAuth::is_azure_v1_token, "Invalid token returned") - } - } -} - - -#' Get token via app and secret environment variables -#' Sub-routine for `get_auth_token()` -#' @keywords internal -#' @returns A list with elements `result` and `error`. If this method is -#' successful, the `result` element will contain a token. -try_token_from_vars <- function(get_token_fun, ...) { - tenant_id_env <- Sys.getenv("AZ_TENANT_ID") - client_id_env <- Sys.getenv("AZ_CLIENT_ID") - client_secret <- Sys.getenv("AZ_APP_SECRET") - - if (all(nzchar(c(tenant_id_env, client_id_env, client_secret)))) { - rlang::inject( - get_token_fun( - tenant = tenant_id_env, - app = client_id_env, - password = client_secret, - ... - ) - ) - } else { - list(result = NULL, error = NULL) - } -} - - -#' Find an already cached token that matches desired parameters -#' Sub-routine for `get_auth_token()` -#' @keywords internal -#' @returns A token from local cache, or NULL if none matches -match_cached_token <- function(resource, tenant, aad_version) { - # list tokens already locally cached - local_tokens <- AzureAuth::list_azure_tokens() - if (length(local_tokens) > 0) { - resources <- purrr::map(local_tokens, "resource") - scopes <- purrr::map(local_tokens, list("scope", 1)) - resources <- purrr::map2_chr(resources, scopes, `%||%`) - tenants <- purrr::map_chr(local_tokens, "tenant") - versions <- purrr::map_int(local_tokens, "version") - - resource_index <- gregg(resources, "^{resource[[1]]}") - tenant_index <- tenants == tenant - version_index <- versions == aad_version - - # return a token matching `resource`, `tenant` and `version`, if any - token_index <- which(resource_index & tenant_index & version_index)[1] - if (!is.na(token_index)) local_tokens[[token_index]] else NULL - } else { - NULL - } -} - - -#' Sub-routine for `get_auth_token()` -#' -#' Pulled out mainly to tidy up the main function code a bit -#' @keywords internal -#' @returns A string (the client ID) -get_client_id <- function() { - pluck_client_id <- function() { - suppressMessages(AzureRMR::get_azure_login()) |> - purrr::pluck("token", "client", "client_id") - } - possibly_pluck_client_id <- \(...) purrr::possibly(pluck_client_id)(...) - azure_cli_default_client_id <- "04b07795-8ddb-461a-bbee-02f9e1bf7b46" - - client_id <- possibly_pluck_client_id() - # if that fails, prompt the user to log in, then try again... - if (is.null(client_id)) { - AzureRMR::create_azure_login() - # ...using the default Azure CLI client ID as a final fallback - client_id <- possibly_pluck_client_id() %||% azure_cli_default_client_id - } - client_id -} - - -#' Generate appropriate values for the `resource` parameter in [get_auth_token] -#' -#' A helper function to generate appropriate values. Ensure that the `version` -#' argument matches the `aad_version` argument to [get_auth_token]. -#' It's unlikely that you will ever want to set `authorise` to `FALSE` but it's -#' here as an option since [AzureAuth::get_azure_token] supports it. Similarly, -#' you are likely to want to keep `refresh` turned on (this argument has no -#' effect on v1 tokens, it only applies to v2). -#' -#' @param version numeric. The AAD version, either 1 or 2 (1 by default) -#' @param url The URL of the Azure resource host -#' @param path For v2, the path designating the access scope -#' @param authorise Boolean, whether to return a token with authorisation scope, -#' (TRUE, the default) or one that just provides authentication. You are -#' unlikely to want to turn this off -#' @param refresh Boolean, applies to v2 tokens only, whether to return a token -#' that has a refresh token also supplied. -#' @returns A scalar character, or (in most v2 situations) a character vector -#' @export -generate_resource <- function( - version = 1, - url = "https://storage.azure.com", - path = "/.default", - authorise = TRUE, - refresh = TRUE -) { - stopifnot("version must be 1 or 2" = version %in% seq(2)) - scopes <- if (refresh) c("openid", "offline_access") else "openid" - if (authorise) { - if (version == 2) { - c(paste0(url, path), scopes) - } else { - url - } - } else { - if (version == 2) { - scopes - } else { - "" - } - } -} - - -#' Use a token's internal `refresh()` method to refresh it -#' -#' This method avoids the need to refresh by re-authenticating online. It seems -#' like this only works with v1 tokens. v2 tokens always seem to refresh by -#' re-authenticating with Azure online. But v2 tokens _ought_ to refresh -#' automatically and not need manual refreshing. To instead generate a -#' completely fresh token, pass `use_cache = FALSE` or `force_refresh = TRUE` -#' to [get_auth_token]. -#' @param token An Azure authentication token -#' @returns An Azure authentication token -#' @export -refresh_token <- \(token) token$refresh() +#' Get Azure authentication token +#' +#' This function retrieves an Azure token for a specified resource. +#' +#' If the environment variables `AZ_TENANT_ID`, `AZ_CLIENT_ID` and +#' `AZ_APP_SECRET` are all set, it will try to use these to return a token. +#' +#' Otherwise it will try to get a managed token from a managed resource such as +#' Azure VM or Azure App Service. +#' +#' If neither of these approaches has returned a token, it will try to retrieve +#' a user token using the provided parameters, requiring the user to have +#' authenticated using their device. If `force_refresh` is set to `TRUE`, a +#' fresh web authentication process should be launched. Otherwise it will +#' attempt to use a cached token matching the given `resource`, `tenant` and +#' `aad_version`. +#' +#' @param resource For v1, a simple URL such as `"https://storage.azure.com/"` +#' should be supplied.For v2, a vector specifying the URL of the Azure resource +#' for which the token is requested as well as any desired scopes. See +#' [AzureAuth::get_azure_token] for details. Use [generate_resource] +#' to help provide an appropriate string or vector. The values default to +#' `c("https://storage.azure.com/.default", "openid", "offline_access")`. +#' If setting version to 2, ensure that the `aad_version` argument is also set +#' to 2. Both are set to use AAD version 1 by default. +#' @param tenant A string specifying the Azure tenant. Defaults to +#' `"common"`. See [AzureAuth::get_azure_token] for other values. +#' @param client_id A string specifying the application ID (client ID). If +#' `NULL`, (the default) the function attempts to obtain the client ID from the +#' Azure Resource Manager token, or prompts the user to log in to obtain it. +#' @param auth_method A string specifying the authentication method. Defaults to +#' `"authorization_code"`. See [AzureAuth::get_azure_token] for other values. +#' @param aad_version Numeric. The AAD version, either 1 or 2 (1 by default) +#' @param force_refresh Boolean: whether to use a stored token if available +#' (`FALSE`, the default), or try to obtain a new one from Azure (`TRUE`). +#' This may be useful if you wish to generate a new token with the same +#' `resource` value as an existing token, but a different `tenant` or +#' `auth_method`. Note that you can also try using [refresh_token] which will +#' cause an existing token to refresh itself, without obtaining a new token +#' from Azure via online reauthentication +#' @param ... Optional arguments (`token_args` or `use_cache`) to be passed on +#' to [AzureAuth::get_managed_token] or [AzureAuth::get_azure_token]. +#' +#' @returns An Azure token object +#' @examples +#' \dontrun{ +#' # Get a token for the default resource +#' token <- get_auth_token() +#' +#' # Force generation of a new token via online reauthentication +#' token <- get_auth_token(force_refresh = TRUE) +#' +#' # Get a token for a specific resource and tenant +#' token <- get_auth_token( +#' resource = "https://graph.microsoft.com", +#' tenant = "my-tenant-id" +#' ) +#' +#' # Get a token using a specific app ID +#' token <- get_auth_token(client_id = "my-app-id") +#' } +#' @export +get_auth_token <- function( + resource = generate_resource(), + tenant = "common", + client_id = NULL, + auth_method = "authorization_code", + aad_version = 1, + force_refresh = FALSE, + ... +) { + aad_msg <- "Invalid {.arg aad_version} variable supplied (must be 1 or 2)" + aad_version <- check_that(aad_version, \(x) x %in% seq(2), aad_msg) + + safely_get_token <- \(...) purrr::safely(AzureAuth::get_azure_token)(...) + get_azure_token <- purrr::partial( + safely_get_token, + resource = resource, + version = aad_version + ) + possibly_get_mtk <- \(...) purrr::possibly(AzureAuth::get_managed_token)(...) + + dots <- rlang::list2(...) + # If the user specifies force_refresh = TRUE we turn off `use_cache`, + # otherwise we leave `use_cache` as it is (or as `NULL`, its default value) + use_cached <- !force_refresh && (dots[["use_cache"]] %||% TRUE) + dots <- rlang::dots_list(!!!dots, use_cache = use_cached, .homonyms = "last") + + # We have 4 approaches to get a token, depending on the context + # 1. Use environment variables if all three are set + token_resp <- rlang::inject(try_token_from_vars(get_azure_token, !!!dots)) + token <- token_resp[["result"]] + token_error <- token_resp[["error"]] + + # 2. Try to get a managed token (for example on Azure VM, App Service) + if (is.null(token)) { + token <- rlang::inject(possibly_get_mtk(resource, !!!dots)) + } + + # 3. If neither of those has worked, try to get an already stored user token + # (unless `force_refresh` is on, in which case skip to option 4) + if (is.null(token) && use_cached) { + token <- match_cached_token(resource, tenant, aad_version) + } + + # 4. If we still don't have a token, try to get a new one via reauthentication + if (is.null(token)) { + if (!force_refresh) { + cli::cli_alert_info("No matching cached token found: fetching new token") + } + client_id <- client_id %||% get_client_id() + token_resp <- rlang::inject( + get_azure_token( + tenant = tenant, + app = client_id, + auth_type = auth_method, + !!!dots + ) + ) + token <- token_resp[["result"]] + token_error <- token_error %||% token_resp[["error"]] + } + + # Give some helpful feedback if the steps above have not succeeded + if (is.null(token) || length(token) == 0) { + cli::cli_alert_info("No authentication token was obtained.") + cli::cli_alert_info("Please check any variables you have supplied.") + cli::cli_alert_info( + "Alternatively, running {.fn AzureRMR::get_azure_login} or + {.fn AzureRMR::list_azure_tokens} may shed some light on the problem." + ) + error_msg <- "{.fn get_auth_token}: No authentication token was obtained." + cli::cli_abort(as.character(token_error %||% error_msg)) + } else { + if (aad_version == 2) { + check_that(token, AzureAuth::is_azure_v2_token, "Invalid token returned") + } else { + check_that(token, AzureAuth::is_azure_v1_token, "Invalid token returned") + } + } +} + + +#' Get token via app and secret environment variables +#' Sub-routine for `get_auth_token()` +#' @keywords internal +#' @returns A list with elements `result` and `error`. If this method is +#' successful, the `result` element will contain a token. +try_token_from_vars <- function(get_token_fun, ...) { + tenant_id_env <- Sys.getenv("AZ_TENANT_ID") + client_id_env <- Sys.getenv("AZ_CLIENT_ID") + client_secret <- Sys.getenv("AZ_APP_SECRET") + + if (all(nzchar(c(tenant_id_env, client_id_env, client_secret)))) { + rlang::inject( + get_token_fun( + tenant = tenant_id_env, + app = client_id_env, + password = client_secret, + ... + ) + ) + } else { + list(result = NULL, error = NULL) + } +} + + +#' Find an already cached token that matches desired parameters +#' Sub-routine for `get_auth_token()` +#' @keywords internal +#' @returns A token from local cache, or NULL if none matches +match_cached_token <- function(resource, tenant, aad_version) { + # list tokens already locally cached + local_tokens <- AzureAuth::list_azure_tokens() + if (length(local_tokens) > 0) { + resources <- purrr::map(local_tokens, "resource") + scopes <- purrr::map(local_tokens, list("scope", 1)) + resources <- purrr::map2_chr(resources, scopes, `%||%`) + tenants <- purrr::map_chr(local_tokens, "tenant") + versions <- purrr::map_int(local_tokens, "version") + + resource_index <- gregg(resources, "^{resource[[1]]}") + tenant_index <- tenants == tenant + version_index <- versions == aad_version + + # return a token matching `resource`, `tenant` and `version`, if any + token_index <- which(resource_index & tenant_index & version_index)[1] + if (!is.na(token_index)) local_tokens[[token_index]] else NULL + } else { + NULL + } +} + + +#' Sub-routine for `get_auth_token()` +#' +#' Pulled out mainly to tidy up the main function code a bit +#' @keywords internal +#' @returns A string (the client ID) +get_client_id <- function() { + pluck_client_id <- function() { + suppressMessages(AzureRMR::get_azure_login()) |> + purrr::pluck("token", "client", "client_id") + } + possibly_pluck_client_id <- \(...) purrr::possibly(pluck_client_id)(...) + azure_cli_default_client_id <- "04b07795-8ddb-461a-bbee-02f9e1bf7b46" + + client_id <- possibly_pluck_client_id() + # if that fails, prompt the user to log in, then try again... + if (is.null(client_id)) { + AzureRMR::create_azure_login() + # ...using the default Azure CLI client ID as a final fallback + client_id <- possibly_pluck_client_id() %||% azure_cli_default_client_id + } + client_id +} + + +#' Generate appropriate values for the `resource` parameter in [get_auth_token] +#' +#' A helper function to generate appropriate values. Ensure that the `version` +#' argument matches the `aad_version` argument to [get_auth_token]. +#' It's unlikely that you will ever want to set `authorise` to `FALSE` but it's +#' here as an option since [AzureAuth::get_azure_token] supports it. Similarly, +#' you are likely to want to keep `refresh` turned on (this argument has no +#' effect on v1 tokens, it only applies to v2). +#' +#' @param version numeric. The AAD version, either 1 or 2 (1 by default) +#' @param url The URL of the Azure resource host +#' @param path For v2, the path designating the access scope +#' @param authorise Boolean, whether to return a token with authorisation scope, +#' (TRUE, the default) or one that just provides authentication. You are +#' unlikely to want to turn this off +#' @param refresh Boolean, applies to v2 tokens only, whether to return a token +#' that has a refresh token also supplied. +#' @returns A scalar character, or (in most v2 situations) a character vector +#' @export +generate_resource <- function( + version = 1, + url = "https://storage.azure.com", + path = "/.default", + authorise = TRUE, + refresh = TRUE +) { + stopifnot("version must be 1 or 2" = version %in% seq(2)) + scopes <- if (refresh) c("openid", "offline_access") else "openid" + if (authorise) { + if (version == 2) { + c(paste0(url, path), scopes) + } else { + url + } + } else { + if (version == 2) { + scopes + } else { + "" + } + } +} + + +#' Use a token's internal `refresh()` method to refresh it +#' +#' This method avoids the need to refresh by re-authenticating online. It seems +#' like this only works with v1 tokens. v2 tokens always seem to refresh by +#' re-authenticating with Azure online. But v2 tokens _ought_ to refresh +#' automatically and not need manual refreshing. To instead generate a +#' completely fresh token, pass `use_cache = FALSE` or `force_refresh = TRUE` +#' to [get_auth_token]. +#' @param token An Azure authentication token +#' @returns An Azure authentication token +#' @export +refresh_token <- \(token) token$refresh() diff --git a/R/get_container.R b/R/get_container.R index 31a5d3d..2370ede 100644 --- a/R/get_container.R +++ b/R/get_container.R @@ -1,83 +1,83 @@ -#' Get Azure storage container -#' -#' The environment variable "AZ_STORAGE_EP" should be set. This provides the URL -#' for the default Azure storage endpoint. -#' Use [list_container_names] to get a list of available container names. -#' -#' @param container_name Name of the container as a string. `NULL` by default, -#' which means the function will look instead for a container name stored in -#' the environment variable "AZ_CONTAINER" -#' @param token An Azure authentication token. If left as `NULL`, a token -#' returned by [get_auth_token] will be used -#' @param endpoint_url An Azure endpoint URL. If left as `NULL`, the default, -#' the value of the environment variable "AZ_STORAGE_EP" will be used -#' @param ... arguments to be passed through to [get_auth_token], if a token is -#' not already supplied -#' @returns An Azure blob container (list object of class "blob_container") -#' @export -get_container <- function( - container_name = NULL, - token = NULL, - endpoint_url = NULL, - ... -) { - msg1 <- paste0( - "{.var container_name} is empty. ", - "Did you forget to set an environment variable?" - ) - msg2 <- paste0( - "{.var endpoint_url} is empty. ", - "Did you forget to set an environment variable?" - ) - container_name <- (container_name %||% check_envvar("AZ_CONTAINER")) |> - check_nzchar(msg1) - endpoint_url <- (endpoint_url %||% check_envvar("AZ_STORAGE_EP")) |> - check_nzchar(msg2) - token <- token %||% get_auth_token(...) - - get_azure_endpoint(token, endpoint_url) |> - AzureStor::blob_container(container_name) -} - - -#' Return a list of container names that are found at the endpoint -#' -#' @inheritParams get_container -#' @returns A character vector of all container names found -#' @export -list_container_names <- function(token = NULL, endpoint_url = NULL, ...) { - token <- token %||% get_auth_token(...) - endpoint <- get_azure_endpoint(token, endpoint_url) - container_list <- AzureStor::list_blob_containers(endpoint) - stopifnot("no containers found" = length(container_list) >= 1L) - names(container_list) -} - - -#' Return an Azure "blob_endpoint" -#' -#' This function will return the endpoint specified in the environment variable -#' "AZ_STORAGE_EP" by default -#' -#' @inheritParams get_container -#' @returns An Azure blob endpoint (object of class "blob_endpoint") -#' @keywords internal -get_azure_endpoint <- function(token = NULL, endpoint_url = NULL, ...) { - token <- token %||% get_auth_token(...) - endpoint_url <- endpoint_url %||% check_envvar("AZ_STORAGE_EP") - AzureStor::blob_endpoint(endpoint_url, token = token) -} - - -#' Check that an environment variable exists -#' -#' The function prints a helpful error if the variable is not found, else -#' it returns the value of `Sys.getenv(x)` -#' -#' @param x the *name* of the environment variable to be found and checked -#' @returns the value of the environment variable named in `x` -#' @export -check_envvar <- function(x) { - cst_msg <- cst_error_msg("The environment variable {.envvar {x}} is not set") - check_scalar_type(Sys.getenv(x, NA_character_), "string", cst_msg) -} +#' Get Azure storage container +#' +#' The environment variable "AZ_STORAGE_EP" should be set. This provides the URL +#' for the default Azure storage endpoint. +#' Use [list_container_names] to get a list of available container names. +#' +#' @param container_name Name of the container as a string. `NULL` by default, +#' which means the function will look instead for a container name stored in +#' the environment variable "AZ_CONTAINER" +#' @param token An Azure authentication token. If left as `NULL`, a token +#' returned by [get_auth_token] will be used +#' @param endpoint_url An Azure endpoint URL. If left as `NULL`, the default, +#' the value of the environment variable "AZ_STORAGE_EP" will be used +#' @param ... arguments to be passed through to [get_auth_token], if a token is +#' not already supplied +#' @returns An Azure blob container (list object of class "blob_container") +#' @export +get_container <- function( + container_name = NULL, + token = NULL, + endpoint_url = NULL, + ... +) { + msg1 <- paste0( + "{.var container_name} is empty. ", + "Did you forget to set an environment variable?" + ) + msg2 <- paste0( + "{.var endpoint_url} is empty. ", + "Did you forget to set an environment variable?" + ) + container_name <- (container_name %||% check_envvar("AZ_CONTAINER")) |> + check_nzchar(msg1) + endpoint_url <- (endpoint_url %||% check_envvar("AZ_STORAGE_EP")) |> + check_nzchar(msg2) + token <- token %||% get_auth_token(...) + + get_azure_endpoint(token, endpoint_url) |> + AzureStor::blob_container(container_name) +} + + +#' Return a list of container names that are found at the endpoint +#' +#' @inheritParams get_container +#' @returns A character vector of all container names found +#' @export +list_container_names <- function(token = NULL, endpoint_url = NULL, ...) { + token <- token %||% get_auth_token(...) + endpoint <- get_azure_endpoint(token, endpoint_url) + container_list <- AzureStor::list_blob_containers(endpoint) + stopifnot("no containers found" = length(container_list) >= 1L) + names(container_list) +} + + +#' Return an Azure "blob_endpoint" +#' +#' This function will return the endpoint specified in the environment variable +#' "AZ_STORAGE_EP" by default +#' +#' @inheritParams get_container +#' @returns An Azure blob endpoint (object of class "blob_endpoint") +#' @keywords internal +get_azure_endpoint <- function(token = NULL, endpoint_url = NULL, ...) { + token <- token %||% get_auth_token(...) + endpoint_url <- endpoint_url %||% check_envvar("AZ_STORAGE_EP") + AzureStor::blob_endpoint(endpoint_url, token = token) +} + + +#' Check that an environment variable exists +#' +#' The function prints a helpful error if the variable is not found, else +#' it returns the value of `Sys.getenv(x)` +#' +#' @param x the *name* of the environment variable to be found and checked +#' @returns the value of the environment variable named in `x` +#' @export +check_envvar <- function(x) { + cst_msg <- cst_error_msg("The environment variable {.envvar {x}} is not set") + check_scalar_type(Sys.getenv(x, NA_character_), "string", cst_msg) +} diff --git a/R/list_files.R b/R/list_files.R index cfff3f3..99cad18 100644 --- a/R/list_files.R +++ b/R/list_files.R @@ -1,57 +1,57 @@ -#' List files in a container -#' -#' Recursively (or not, if desired) lists all files found in a container. Search -#' can be restricted to a particular 'subdirectory' of the container, and/or -#' to files with a specific extension. The function assumes that all file names -#' end with a ".ext" extension of some sort. -#' -#' The function does not support filtering by file name, only by file extension. -#' -#' The returned file list (character vector) contains the full paths to the -#' files, ready to be passed perhaps to a `read_azure_*` function, or further -#' filtered by you. If you just want the names of the files without the folder -#' path, use [basename()] to extract these. -#' -#' @inheritParams read_azure_parquet -#' @param path (optional) subdirectory of the container to list files within. -#' `""` (the root folder of the container) by default -#' @param ext (optional) A string giving the extension of a particular file type -#' you want to restrict the list to. No need to include the initial ".". The -#' default, `""`, means no filtering by file extension will be applied. Can be -#' a regular expression. -#' @param recursive A Boolean value: whether to list files recursively. `TRUE` -#' by default -#' -#' @importFrom rlang .data -#' @returns A vector of file names, or an empty character vector if none found -#' @examples \dontrun{ -#' list_files(get_container("example"), ext = "csv") -#' } -#' @export -list_files <- function(container, path = "", ext = "", recursive = TRUE) { - stopifnot(rlang::is_character(c(path, ext), 2)) - stopifnot(rlang::is_bool(recursive)) - pnf_msg <- ct_error_msg("Path {.val {path}} not found") - check_that(path, \(x) AzureStor::blob_dir_exists(container, x), pnf_msg) - - tbl <- AzureStor::list_blobs(container, path, recursive = recursive) - if (nrow(tbl) > 0) { - ext_rx <- if (nzchar(ext)) sub("^\\.+", "", ext) else ".*" # nolint - tbl <- tbl |> - dplyr::filter(!.data[["isdir"]] & gregg(.data[["name"]], "\\.{ext_rx}$")) - } - - # A zero-row tbl can result if `path` is initially empty, or via the filter - # step above. We handle this the same way, no matter which route led here. - if (nrow(tbl) == 0) { - fix_path <- \(p) sub("^/+$", "", sub("^([^/])(.*)", "/\\1\\2", p)) # nolint - ext <- if (nzchar(ext)) paste0(" ", ext) - msg <- "No{ext} files found in {.val [{container$name}]:{fix_path(path)}}" - if (rlang::is_interactive()) { - cli::cli_alert_info(msg) - } - invisible(character(0)) - } else { - tbl[["name"]] - } -} +#' List files in a container +#' +#' Recursively (or not, if desired) lists all files found in a container. Search +#' can be restricted to a particular 'subdirectory' of the container, and/or +#' to files with a specific extension. The function assumes that all file names +#' end with a ".ext" extension of some sort. +#' +#' The function does not support filtering by file name, only by file extension. +#' +#' The returned file list (character vector) contains the full paths to the +#' files, ready to be passed perhaps to a `read_azure_*` function, or further +#' filtered by you. If you just want the names of the files without the folder +#' path, use [basename()] to extract these. +#' +#' @inheritParams read_azure_parquet +#' @param path (optional) subdirectory of the container to list files within. +#' `""` (the root folder of the container) by default +#' @param ext (optional) A string giving the extension of a particular file type +#' you want to restrict the list to. No need to include the initial ".". The +#' default, `""`, means no filtering by file extension will be applied. Can be +#' a regular expression. +#' @param recursive A Boolean value: whether to list files recursively. `TRUE` +#' by default +#' +#' @importFrom rlang .data +#' @returns A vector of file names, or an empty character vector if none found +#' @examples \dontrun{ +#' list_files(get_container("example"), ext = "csv") +#' } +#' @export +list_files <- function(container, path = "", ext = "", recursive = TRUE) { + stopifnot(rlang::is_character(c(path, ext), 2)) + stopifnot(rlang::is_bool(recursive)) + pnf_msg <- ct_error_msg("Path {.val {path}} not found") + check_that(path, \(x) AzureStor::blob_dir_exists(container, x), pnf_msg) + + tbl <- AzureStor::list_blobs(container, path, recursive = recursive) + if (nrow(tbl) > 0) { + ext_rx <- if (nzchar(ext)) sub("^\\.+", "", ext) else ".*" # nolint + tbl <- tbl |> + dplyr::filter(!.data[["isdir"]] & gregg(.data[["name"]], "\\.{ext_rx}$")) + } + + # A zero-row tbl can result if `path` is initially empty, or via the filter + # step above. We handle this the same way, no matter which route led here. + if (nrow(tbl) == 0) { + fix_path <- \(p) sub("^/+$", "", sub("^([^/])(.*)", "/\\1\\2", p)) # nolint + ext <- if (nzchar(ext)) paste0(" ", ext) + msg <- "No{ext} files found in {.val [{container$name}]:{fix_path(path)}}" + if (rlang::is_interactive()) { + cli::cli_alert_info(msg) + } + invisible(character(0)) + } else { + tbl[["name"]] + } +} diff --git a/R/read_azure_files.R b/R/read_azure_files.R index e76183b..70c5d5e 100644 --- a/R/read_azure_files.R +++ b/R/read_azure_files.R @@ -1,163 +1,163 @@ -#' Read a parquet file from Azure storage -#' -#' @param container An Azure container object, as returned by [get_container] -#' @param file The name of the file to be read, as a string. NB The file -#' extension does not need to be included (though it can be). The function -#' will error if multiple files are somehow matched. -#' @param path The path to the directory where `file` is located, as a string. -#' Only needed if `file` does not already contain its full path. If file is -#' just a file name with no path, then provide the path to the directory here. -#' This must be the full path to the file location, as the function will not -#' search into subdirectories recursively. Set to `"/"` (the root of the -#' container) by default. -#' @param info Boolean. Whether to print user feedback about the file that is -#' being read. Useful for checking the function is doing what is expected, but -#' can be turned off with `FALSE`. Can be set persistently with the option -#' "azkit.info". If `NULL` then it will default to the value of -#' [rlang::is_interactive] (that is, `TRUE` for interactive sessions). -#' @param ... optional arguments to be passed through to [arrow::read_parquet] -#' @returns A tibble -#' @examples \dontrun{ -#' # if a full filepath is available then path can be ignored -#' read_azure_parquet(cont, "data/folder/path/1.parquet") -#' # you can provide a filename without the '.parquet' extension -#' # if you wish to use this partial file name matching it is probably easier -#' # to provide a 'path' -#' read_azure_parquet(cont, "case_details", "storage/parquet/2025/06/29") -#' } -#' @export -read_azure_parquet <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "parquet", info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL) |> - arrow::read_parquet(...) -} - - -#' Read a json file from Azure storage -#' -#' @inheritParams read_azure_parquet -#' @param ... optional arguments to be passed through to -#' [yyjsonr::read_json_raw] -#' @returns A list -#' @export -read_azure_json <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "json", info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL) |> - yyjsonr::read_json_raw(...) -} - - -#' Read a json.gz file from Azure storage -#' -#' @inheritParams read_azure_parquet -#' @param ... optional arguments to be passed through to -#' [yyjsonr::read_json_file] -#' @returns A list -#' @export -read_azure_jsongz <- function(container, file, path = "/", info = NULL, ...) { - full_path <- check_blob_exists(container, file, "json.gz", info, path) - dl <- withr::local_tempfile( - pattern = tools::file_path_sans_ext(basename(full_path), TRUE), - fileext = "json.gz" - ) - AzureStor::download_blob(container, src = full_path, dest = dl) - yyjsonr::read_json_file(dl, ...) -} - - -#' Read an rds file from Azure storage -#' -#' @inheritParams read_azure_parquet -#' @param ... optional arguments to be passed through to -#' [AzureStor::storage_load_rds]. For example, a compression type (one of -#' c("unknown", "gzip", "bzip2", "xz", "zstd", "none")) can be provided using -#' the argument `type`, which will be passed on to [memDecompress] via -#' [AzureStor::storage_load_rds]. -# If nothing is provided here, the compression type will be set to "none". -#' @returns The data object that was stored in the rds file -#' @export -read_azure_rds <- function(container, file, path = "/", info = NULL, ...) { - # If the user doesn't specify a (de)compression type with `type` in `...`, we - # will set a `type` of "none", as this seems to be the standard on SU Azure - dots <- rlang::dots_list(..., type = "none", .homonyms = "first") - blob <- check_blob_exists(container, file, "rds", info, path) - rlang::inject(AzureStor::storage_load_rds(container, blob, !!!dots)) -} - - -#' Read a csv file from Azure storage -#' -#' @inheritParams read_azure_parquet -#' @param ... optional arguments to be passed through to [readr::read_delim] -#' @returns A tibble -#' @export -read_azure_csv <- function(container, file, path = "/", info = NULL, ...) { - check_blob_exists(container, file, "csv", info, path) |> - AzureStor::storage_read_csv(container, file = _, ...) -} - - -#' Read any file from Azure storage -#' -#' @inheritParams read_azure_parquet -#' @param ext If a custom extension needs to be supplied, you can specify it -#' here. If `NULL`, the default, the extension of `file` will be used -#' @param ... optional arguments to be passed through to -#' [AzureStor::download_blob] -#' @returns A raw data stream -#' @export -read_azure_file <- function( - container, - file, - path = "/", - info = NULL, - ext = NULL, - ... -) { - ext <- ext %||% tools::file_ext(file) - check_blob_exists(container, file, ext, info, path) |> - # using `dest = NULL` means pass the data through as a raw vector - AzureStor::download_blob(container, src = _, dest = NULL, ...) -} - - -#' Ensures that the filepath for the file to read exists -#' -#' @inheritParams read_azure_parquet -#' @param ext The standard file extension for the file type, e.g. "json" -#' @keywords internal -check_blob_exists <- function(container, file, ext, info, path) { - stopifnot("no container found" = inherits(container, "blob_container")) - path <- if (path %in% c("", "/")) "" else path - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - dir_name <- if (dirname(file) == ".") "" else dirname(file) - # Potentially the user could provide a partial file path in `path` and a - # further sub-directory as part of `file`. This handles that eventuality, - # though this usage pattern should be quite rare! - dpath <- file.path(path, dir_name) - fname <- basename(file) - if (nzchar(ext) && !gregg(fname, "\\.{ext}$")) { - fname <- glue::glue("{fname}.{ext}") - } - # remove duplicate slashes and any initial slashes - file_path <- sub("^/", "", gsub("/+", "/", file.path(dpath, fname))) - - filepath_out <- AzureStor::list_blobs(container, dpath, recursive = FALSE) |> - dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> - dplyr::pull("name") - - msg1 <- ct_error_msg("no matching {ext} file found") - msg2 <- cst_error_msg("multiple matching {ext} files found") - check_that(filepath_out, \(x) length(x) > 0, msg1) # check length > 0 - check_scalar_type(filepath_out, "character", msg2) # check length == 1 - - info_option <- getOption("azkit.info") - stopifnot(rlang::is_scalar_logical(info) || is.null(info)) - stopifnot(rlang::is_scalar_logical(info_option) || is.null(info_option)) - if (info %||% info_option %||% rlang::is_interactive()) { - cli::cli_alert_info("File {.val {filepath_out}} will be read in") - } - filepath_out -} +#' Read a parquet file from Azure storage +#' +#' @param container An Azure container object, as returned by [get_container] +#' @param file The name of the file to be read, as a string. NB The file +#' extension does not need to be included (though it can be). The function +#' will error if multiple files are somehow matched. +#' @param path The path to the directory where `file` is located, as a string. +#' Only needed if `file` does not already contain its full path. If file is +#' just a file name with no path, then provide the path to the directory here. +#' This must be the full path to the file location, as the function will not +#' search into subdirectories recursively. Set to `"/"` (the root of the +#' container) by default. +#' @param info Boolean. Whether to print user feedback about the file that is +#' being read. Useful for checking the function is doing what is expected, but +#' can be turned off with `FALSE`. Can be set persistently with the option +#' "azkit.info". If `NULL` then it will default to the value of +#' [rlang::is_interactive] (that is, `TRUE` for interactive sessions). +#' @param ... optional arguments to be passed through to [arrow::read_parquet] +#' @returns A tibble +#' @examples \dontrun{ +#' # if a full filepath is available then path can be ignored +#' read_azure_parquet(cont, "data/folder/path/1.parquet") +#' # you can provide a filename without the '.parquet' extension +#' # if you wish to use this partial file name matching it is probably easier +#' # to provide a 'path' +#' read_azure_parquet(cont, "case_details", "storage/parquet/2025/06/29") +#' } +#' @export +read_azure_parquet <- function(container, file, path = "/", info = NULL, ...) { + check_blob_exists(container, file, "parquet", info, path) |> + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, src = _, dest = NULL) |> + arrow::read_parquet(...) +} + + +#' Read a json file from Azure storage +#' +#' @inheritParams read_azure_parquet +#' @param ... optional arguments to be passed through to +#' [yyjsonr::read_json_raw] +#' @returns A list +#' @export +read_azure_json <- function(container, file, path = "/", info = NULL, ...) { + check_blob_exists(container, file, "json", info, path) |> + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, src = _, dest = NULL) |> + yyjsonr::read_json_raw(...) +} + + +#' Read a json.gz file from Azure storage +#' +#' @inheritParams read_azure_parquet +#' @param ... optional arguments to be passed through to +#' [yyjsonr::read_json_file] +#' @returns A list +#' @export +read_azure_jsongz <- function(container, file, path = "/", info = NULL, ...) { + full_path <- check_blob_exists(container, file, "json.gz", info, path) + dl <- withr::local_tempfile( + pattern = tools::file_path_sans_ext(basename(full_path), TRUE), + fileext = "json.gz" + ) + AzureStor::download_blob(container, src = full_path, dest = dl) + yyjsonr::read_json_file(dl, ...) +} + + +#' Read an rds file from Azure storage +#' +#' @inheritParams read_azure_parquet +#' @param ... optional arguments to be passed through to +#' [AzureStor::storage_load_rds]. For example, a compression type (one of +#' c("unknown", "gzip", "bzip2", "xz", "zstd", "none")) can be provided using +#' the argument `type`, which will be passed on to [memDecompress] via +#' [AzureStor::storage_load_rds]. +# If nothing is provided here, the compression type will be set to "none". +#' @returns The data object that was stored in the rds file +#' @export +read_azure_rds <- function(container, file, path = "/", info = NULL, ...) { + # If the user doesn't specify a (de)compression type with `type` in `...`, we + # will set a `type` of "none", as this seems to be the standard on SU Azure + dots <- rlang::dots_list(..., type = "none", .homonyms = "first") + blob <- check_blob_exists(container, file, "rds", info, path) + rlang::inject(AzureStor::storage_load_rds(container, blob, !!!dots)) +} + + +#' Read a csv file from Azure storage +#' +#' @inheritParams read_azure_parquet +#' @param ... optional arguments to be passed through to [readr::read_delim] +#' @returns A tibble +#' @export +read_azure_csv <- function(container, file, path = "/", info = NULL, ...) { + check_blob_exists(container, file, "csv", info, path) |> + AzureStor::storage_read_csv(container, file = _, ...) +} + + +#' Read any file from Azure storage +#' +#' @inheritParams read_azure_parquet +#' @param ext If a custom extension needs to be supplied, you can specify it +#' here. If `NULL`, the default, the extension of `file` will be used +#' @param ... optional arguments to be passed through to +#' [AzureStor::download_blob] +#' @returns A raw data stream +#' @export +read_azure_file <- function( + container, + file, + path = "/", + info = NULL, + ext = NULL, + ... +) { + ext <- ext %||% tools::file_ext(file) + check_blob_exists(container, file, ext, info, path) |> + # using `dest = NULL` means pass the data through as a raw vector + AzureStor::download_blob(container, src = _, dest = NULL, ...) +} + + +#' Ensures that the filepath for the file to read exists +#' +#' @inheritParams read_azure_parquet +#' @param ext The standard file extension for the file type, e.g. "json" +#' @keywords internal +check_blob_exists <- function(container, file, ext, info, path) { + stopifnot("no container found" = inherits(container, "blob_container")) + path <- if (path %in% c("", "/")) "" else path + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + dir_name <- if (dirname(file) == ".") "" else dirname(file) + # Potentially the user could provide a partial file path in `path` and a + # further sub-directory as part of `file`. This handles that eventuality, + # though this usage pattern should be quite rare! + dpath <- file.path(path, dir_name) + fname <- basename(file) + if (nzchar(ext) && !gregg(fname, "\\.{ext}$")) { + fname <- glue::glue("{fname}.{ext}") + } + # remove duplicate slashes and any initial slashes + file_path <- sub("^/", "", gsub("/+", "/", file.path(dpath, fname))) + + filepath_out <- AzureStor::list_blobs(container, dpath, recursive = FALSE) |> + dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> + dplyr::pull("name") + + msg1 <- ct_error_msg("no matching {ext} file found") + msg2 <- cst_error_msg("multiple matching {ext} files found") + check_that(filepath_out, \(x) length(x) > 0, msg1) # check length > 0 + check_scalar_type(filepath_out, "character", msg2) # check length == 1 + + info_option <- getOption("azkit.info") + stopifnot(rlang::is_scalar_logical(info) || is.null(info)) + stopifnot(rlang::is_scalar_logical(info_option) || is.null(info_option)) + if (info %||% info_option %||% rlang::is_interactive()) { + cli::cli_alert_info("File {.val {filepath_out}} will be read in") + } + filepath_out +} diff --git a/R/read_azure_table.R b/R/read_azure_table.R index 5a85452..d95c3b4 100644 --- a/R/read_azure_table.R +++ b/R/read_azure_table.R @@ -1,39 +1,39 @@ -#' Read in data from an Azure table -#' -#' @param table_name name of the table to be read. If left as `NULL`, -#' the default, the function will look instead for a value stored in the -#' environment variable "AZ_TABLE_NAME" -#' @param table_endpoint URL of the Azure table endpoint. If left as `NULL`, -#' the default, the function will look instead for a value stored in the -#' environment variable "AZ_TABLE_EP" -#' @param ... parameters to be passed through to [get_auth_token] -#' @inheritParams get_container -#' @returns A tibble -#' @export -read_azure_table <- function( - table_name = NULL, - table_endpoint = NULL, - token = NULL, - ... -) { - table_name <- table_name %||% check_envvar("AZ_TABLE_NAME") - table_ep <- table_endpoint %||% check_envvar("AZ_TABLE_EP") - token <- token %||% get_auth_token(...) - access_token <- token |> - purrr::pluck("credentials", "access_token") - headers <- list("2025-11-05", "application/json;odata=nometadata") |> - purrr::set_names(c("x-ms-version", "Accept")) - - resp <- httr2::request(table_ep) |> - httr2::req_url_path_append(table_name) |> - httr2::req_auth_bearer_token(access_token) |> - httr2::req_headers(!!!headers) |> - httr2::req_perform() |> - httr2::resp_check_status() - - resp |> - httr2::resp_body_json() |> - purrr::pluck("value") |> - purrr::map(tibble::as_tibble) |> - purrr::list_rbind() -} +#' Read in data from an Azure table +#' +#' @param table_name name of the table to be read. If left as `NULL`, +#' the default, the function will look instead for a value stored in the +#' environment variable "AZ_TABLE_NAME" +#' @param table_endpoint URL of the Azure table endpoint. If left as `NULL`, +#' the default, the function will look instead for a value stored in the +#' environment variable "AZ_TABLE_EP" +#' @param ... parameters to be passed through to [get_auth_token] +#' @inheritParams get_container +#' @returns A tibble +#' @export +read_azure_table <- function( + table_name = NULL, + table_endpoint = NULL, + token = NULL, + ... +) { + table_name <- table_name %||% check_envvar("AZ_TABLE_NAME") + table_ep <- table_endpoint %||% check_envvar("AZ_TABLE_EP") + token <- token %||% get_auth_token(...) + access_token <- token |> + purrr::pluck("credentials", "access_token") + headers <- list("2025-11-05", "application/json;odata=nometadata") |> + purrr::set_names(c("x-ms-version", "Accept")) + + resp <- httr2::request(table_ep) |> + httr2::req_url_path_append(table_name) |> + httr2::req_auth_bearer_token(access_token) |> + httr2::req_headers(!!!headers) |> + httr2::req_perform() |> + httr2::resp_check_status() + + resp |> + httr2::resp_body_json() |> + purrr::pluck("value") |> + purrr::map(tibble::as_tibble) |> + purrr::list_rbind() +} diff --git a/tests/testthat/test-read_azure_files.R b/tests/testthat/test-read_azure_files.R index 1f9e0fc..de7432e 100644 --- a/tests/testthat/test-read_azure_files.R +++ b/tests/testthat/test-read_azure_files.R @@ -1,343 +1,343 @@ -test_that("basic success", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (ie locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - token <- get_auth_token() - expect_false(is.null(token)) - expect_s3_class(token, "AzureToken") - - # explore behaviour with blob_endpoint - ep <- AzureStor::blob_endpoint(endpoint_uri, token = token) |> - expect_no_error() - expect_s3_class(ep, "blob_endpoint") - inputs_data <- AzureStor::blob_container(ep, "inputs-data") |> - expect_no_error() - expect_s3_class(inputs_data, "blob_container") - files <- AzureStor::list_blobs(inputs_data, "dev", recursive = FALSE) |> - dplyr::filter(grepl("\\.parquet$", .data[["name"]])) |> - dplyr::pull("name") - name <- "baseline" - pqt_file <- grep(name, files, value = TRUE) - expect_length(pqt_file, 1) - expect_true(AzureStor::blob_exists(inputs_data, pqt_file)) - out <- AzureStor::download_blob(inputs_data, pqt_file, dest = NULL) |> - arrow::read_parquet() - expect_s3_class(out, "tbl_df") - expect_length(out, 6) # ncol - } -}) - - -test_that("understand some new errors in check_blob_exists", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - inp <- expect_no_error(get_container("inputs-data")) - path <- "dev" - file <- "baseline" - ext <- "parquet" - path <- if (path %in% c("", "/")) "" else path - expect_equal(path, "dev") - dir_name <- if (dirname(file) == ".") "" else dirname(file) - expect_equal(dir_name, "") - p2 <- file.path(path, dir_name) - expect_equal(p2, "dev") - file_name <- paste0(basename(file), ".", ext) - expect_equal(file_name, "baseline.parquet") - file_path <- sub("^/", "", sub("/+", "/", file.path(p2, file_name))) - expect_equal(file_path, "dev/baseline.parquet") - dir_list <- AzureStor::list_blobs(inp, p2, recursive = FALSE) - file_name_out <- dir_list |> - dplyr::filter(dplyr::if_any("name", \(x) x == file_path)) |> - dplyr::pull("name") - expect_equal(file_name_out, "dev/baseline.parquet") - expect_no_error(check_blob_exists(inp, file, ext, FALSE, path)) - - # check still works if full filepath is passed to file arg - path <- "" - file <- "dev/baseline.parquet" - path <- if (path %in% c("", "/")) "" else path - expect_equal(path, "") - dir_name <- if (dirname(file) == ".") "" else dirname(file) - expect_equal(dir_name, "dev") - p2 <- glue::glue("{path}/{dir_name}") - expect_equal(p2, "/dev") - file_name <- basename(file) - expect_equal(file_name, "baseline.parquet") - file_path <- sub("^/", "", sub("/+", "/", glue::glue("{p2}/{file_name}"))) - expect_equal(file_path, "dev/baseline.parquet") - dir_list <- AzureStor::list_blobs(inp, p2, recursive = FALSE) - file_name_out <- dir_list |> - dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> - dplyr::pull("name") - expect_equal(file_name_out, file) - expect_no_error(check_blob_exists(inp, file, ext, FALSE, path)) - } -}) - - -test_that("whole read_parquet function works", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - inputs_container <- expect_no_error(get_container("inputs-data")) - expect_no_error(read_azure_parquet( - inputs_container, - "baseline", - path = "dev" - )) - out1 <- read_azure_parquet(inputs_container, "baseline", path = "dev") - expect_s3_class(out1, "tbl_df") - # check that it works with the file extension included - out2 <- read_azure_parquet( - inputs_container, - "baseline.parquet", - path = "dev" - ) - expect_length(out2, 6) # ncol - - res <- get_container(Sys.getenv("AZ_RESULTS_CONTAINER")) - pqt_file <- Sys.getenv("TEST_PARQUET_FILE") - path <- "/" - file_ext <- "parquet" - - # experiment with changes to code: - path <- if (path %in% c("", "/")) "" else path - expect_equal(path, "") - dir_name <- if (dirname(pqt_file) == ".") "" else dirname(pqt_file) - dpath <- glue::glue("{path}/{dir_name}") - file_name <- sub(glue::glue("\\.{file_ext}$"), "", basename(pqt_file)) - - file_name <- if (gregg(basename(pqt_file), "\\.{file_ext}$")) { - basename(pqt_file) - } else { - glue::glue("{basename(pqt_file)}.{file_ext}") - } - # remove duplicate slashes and any initial slashes - fpath <- sub("^/", "", sub("/+", "/", glue::glue("{dpath}/{file_name}"))) - expect_equal(fpath, pqt_file) - # now function should run without error - expect_no_error(check_blob_exists(res, pqt_file, "parquet", FALSE, "/")) - # we want this to error if the file_ext doesn't match the file - expect_error(check_blob_exists(res, pqt_file, "rds", FALSE, "/")) - } -}) - - -test_that("read_azure_json basically works", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - download_azure_blob <- function(container, file, file_ext, path = "") { - check_blob_exists(container, file, file_ext, FALSE, path) |> - AzureStor::download_blob(container, src = _, dest = NULL) - } - expect_no_error(support_container <- get_container("supporting-data")) - raw_out <- support_container |> - download_azure_blob("providers", "json") - expect_type(raw_out, "raw") - expect_no_error(yyjsonr::read_json_raw(raw_out)) - out <- yyjsonr::read_json_raw(raw_out) - expect_type(out, "character") - expect_length(out, 138) - } -}) - -test_that("dirname and basename logic works", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - expect_no_error(support_container <- get_container("supporting-data")) - file <- "neecom_table.rds" - path <- "" - expect_no_error(AzureStor::blob_exists(support_container, file)) - file2 <- paste0("/", file) - file3 <- paste0("//", file) - expect_no_error(AzureStor::blob_exists(support_container, file2)) - expect_no_error(AzureStor::blob_exists(support_container, file3)) - expect_equal(dirname(file), ".") - file <- paste0(path, "/", file) - expect_equal(dirname(file), "/") - path <- "/" - file <- paste0(path, "/", file) - expect_equal(dirname(file), "/") - - new_path <- sub("^\\.$", "/", dirname(file)) - expect_equal(new_path, "/") - path <- "QA" - file <- "none.txt" - file <- paste0(path, "/", file) - new_path <- sub("^\\.$", "/", dirname(file)) - expect_equal(new_path, "QA") - - res <- get_container("results") - path <- "prod/dev/national/national" - file <- "test-2025" - file_ext <- "json.gz" - filepath <- sub("^/+", "", paste0(path, "/", file)) - expect_equal(filepath, "prod/dev/national/national/test-2025") - - path <- "/" - filepath <- sub("^/+", "", paste0(path, "/", file)) - expect_equal(dirname(filepath), ".") - - path <- "prod/dev/national/national" - filepath <- sub("^/+", "", paste0(path, "/", file)) - expect_equal(filepath, "prod/dev/national/national/test-2025") - path <- sub("^\\.$", "/", dirname(filepath)) - - filepath_out <- AzureStor::list_blobs(res, path, recursive = FALSE) |> - dplyr::filter( - !dplyr::if_any("isdir") & - dplyr::if_any("name", \(x) { - gregg(x, "\\.{file_ext}$") & gregg(x, "^{filepath}") - }) - ) |> - dplyr::pull("name") - expect_match(filepath_out, "prod/dev/national/national/test-2025.*json.gz") - } -}) - -test_that("tdd of check_blob_exists", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - expect_no_error(support_container <- get_container("supporting-data")) - check_blob_exists <- function(container, file, path = "/") { - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - } - expect_no_error(check_blob_exists(support_container, "file")) - - check_blob_exists <- function(container, file, path = "/") { - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - file_ext <- "json" - AzureStor::list_blobs(container, path, recursive = FALSE) |> - dplyr::filter( - !dplyr::if_any("isdir") & - dplyr::if_any("name", \(x) { - grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) - }) - ) |> - dplyr::pull("name") - } - expect_no_error(check_blob_exists(support_container, "sites")) - expect_length(check_blob_exists(support_container, "sites"), 2) - - check_blob_exists <- function(container, file, path = "/") { - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - file_ext <- "json" - filepath <- AzureStor::list_blobs(container, path, recursive = FALSE) |> - dplyr::filter( - !dplyr::if_any("isdir") & - dplyr::if_any("name", \(x) { - grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) - }) - ) |> - dplyr::pull("name") - stop_msg1 <- glue::glue("no matching {file_ext} file found") - stop_msg2 <- glue::glue("multiple matching {file_ext} files found") - check_that(filepath, \(x) length(x) > 0, stop_msg1) # check length > 0 - check_scalar_type(filepath, "character", stop_msg2) # check length == 1 - } - expect_error(check_blob_exists(support_container, "unmatched"), "matching") - expect_error(check_blob_exists(support_container, "sites"), "multiple") - - check_blob_exists <- function(container, file, path = "/") { - stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) - file_ext <- "json" - filepath <- AzureStor::list_blobs(container, path, recursive = FALSE) |> - dplyr::filter( - !dplyr::if_any("isdir") & - dplyr::if_any("name", \(x) { - grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) - }) - ) |> - dplyr::pull("name") - stop_msg1 <- glue::glue("no matching {file_ext} file found") - stop_msg2 <- glue::glue("multiple matching {file_ext} files found") - check_that(filepath, \(x) length(x) > 0, stop_msg1) # check length > 0 - check_scalar_type(filepath, "character", stop_msg2) # check length == 1 - filepath - } - expect_no_error(check_blob_exists(support_container, "providers")) - expect_no_error(check_blob_exists(support_container, "providers.json")) - } -}) - - -# parquet and json read functions need `download_blob` first before reading in. -# However there are 'native' {AzureStor} functions for csv and rds files, so we -# should use those instead. Requiring a slightly different (simpler) workflow. - -test_that("read_azure_csv basically works", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - download_azure_blob <- function(container, file, file_ext, path = "") { - check_blob_exists(container, file, file_ext, FALSE, path) |> - AzureStor::download_blob(container, src = _, dest = NULL) - } - expect_no_error(support_container <- get_container("supporting-data")) - support_container |> - # should construct the file name OK - download_azure_blob("mitigator-lookup", "csv") |> - expect_no_error() - support_container |> - download_azure_blob("mitigator-lookup.csv", "csv") |> - expect_no_error() - raw_out <- support_container |> - download_azure_blob("mitigator-lookup.csv", "csv") - expect_type(raw_out, "raw") - expect_no_error(readr::read_csv(raw_out, show_col_types = FALSE)) - dat <- readr::read_csv(raw_out, show_col_types = FALSE) - expect_type(dat, "list") - expect_s3_class(dat, "tbl_df") - } -}) - - -test_that("... parameters are passed through", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - download_azure_blob <- function(container, file, file_ext, path = "") { - check_blob_exists(container, file, file_ext, FALSE, path) |> - AzureStor::download_blob(container, src = _, dest = NULL) - } - expect_no_error(support_container <- get_container("supporting-data")) - col_types <- "ccc------" - csv_out1 <- support_container |> - download_azure_blob("mitigator-lookup.csv", "csv") |> - readr::read_csv(col_types = col_types) |> - expect_no_error() - csv_out2 <- support_container |> - AzureStor::storage_read_csv( - "mitigator-lookup.csv", - col_types = col_types - ) |> - expect_no_error() - expect_identical(csv_out1, csv_out2) - expect_length(csv_out1, 3) # ncol - } -}) - -test_that("read functions all work a bit at least", { - endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") - # only run the test if this variable is set (i.e. locally, but not on GitHub) - if (nzchar(endpoint_uri)) { - res <- get_container(Sys.getenv("AZ_RESULTS_CONTAINER")) - expect_no_error(read_azure_parquet(res, Sys.getenv("TEST_PARQUET_FILE"))) - supp <- get_container(Sys.getenv("AZ_SUPPORT_CONTAINER")) - expect_no_error(read_azure_json(supp, Sys.getenv("TEST_JSON_FILE"))) - expect_no_error(read_azure_rds(supp, Sys.getenv("TEST_RDS_FILE"))) - read_azure_csv(supp, Sys.getenv("TEST_CSV_FILE"), show_col_types = FALSE) |> - expect_no_error() - } -}) - - -test_that("check that sub-functions inherit options in R!", { - yes <- \() getOption("do_it") %||% "nope" - wrap <- \() yes() - expect_identical(withr::with_options(list(do_it = "yes"), wrap()), "yes") -}) +test_that("basic success", { + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (ie locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + token <- get_auth_token() + expect_false(is.null(token)) + expect_s3_class(token, "AzureToken") + + # explore behaviour with blob_endpoint + ep <- AzureStor::blob_endpoint(endpoint_uri, token = token) |> + expect_no_error() + expect_s3_class(ep, "blob_endpoint") + inputs_data <- AzureStor::blob_container(ep, "inputs-data") |> + expect_no_error() + expect_s3_class(inputs_data, "blob_container") + files <- AzureStor::list_blobs(inputs_data, "dev", recursive = FALSE) |> + dplyr::filter(grepl("\\.parquet$", .data[["name"]])) |> + dplyr::pull("name") + name <- "baseline" + pqt_file <- grep(name, files, value = TRUE) + expect_length(pqt_file, 1) + expect_true(AzureStor::blob_exists(inputs_data, pqt_file)) + out <- AzureStor::download_blob(inputs_data, pqt_file, dest = NULL) |> + arrow::read_parquet() + expect_s3_class(out, "tbl_df") + expect_length(out, 6) # ncol + } +}) + + +test_that("understand some new errors in check_blob_exists", { + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + inp <- expect_no_error(get_container("inputs-data")) + path <- "dev" + file <- "baseline" + ext <- "parquet" + path <- if (path %in% c("", "/")) "" else path + expect_equal(path, "dev") + dir_name <- if (dirname(file) == ".") "" else dirname(file) + expect_equal(dir_name, "") + p2 <- file.path(path, dir_name) + expect_equal(p2, "dev") + file_name <- paste0(basename(file), ".", ext) + expect_equal(file_name, "baseline.parquet") + file_path <- sub("^/", "", sub("/+", "/", file.path(p2, file_name))) + expect_equal(file_path, "dev/baseline.parquet") + dir_list <- AzureStor::list_blobs(inp, p2, recursive = FALSE) + file_name_out <- dir_list |> + dplyr::filter(dplyr::if_any("name", \(x) x == file_path)) |> + dplyr::pull("name") + expect_equal(file_name_out, "dev/baseline.parquet") + expect_no_error(check_blob_exists(inp, file, ext, FALSE, path)) + + # check still works if full filepath is passed to file arg + path <- "" + file <- "dev/baseline.parquet" + path <- if (path %in% c("", "/")) "" else path + expect_equal(path, "") + dir_name <- if (dirname(file) == ".") "" else dirname(file) + expect_equal(dir_name, "dev") + p2 <- glue::glue("{path}/{dir_name}") + expect_equal(p2, "/dev") + file_name <- basename(file) + expect_equal(file_name, "baseline.parquet") + file_path <- sub("^/", "", sub("/+", "/", glue::glue("{p2}/{file_name}"))) + expect_equal(file_path, "dev/baseline.parquet") + dir_list <- AzureStor::list_blobs(inp, p2, recursive = FALSE) + file_name_out <- dir_list |> + dplyr::filter(dplyr::if_any("name", \(x) x == {{ file_path }})) |> + dplyr::pull("name") + expect_equal(file_name_out, file) + expect_no_error(check_blob_exists(inp, file, ext, FALSE, path)) + } +}) + + +test_that("whole read_parquet function works", { + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + inputs_container <- expect_no_error(get_container("inputs-data")) + expect_no_error(read_azure_parquet( + inputs_container, + "baseline", + path = "dev" + )) + out1 <- read_azure_parquet(inputs_container, "baseline", path = "dev") + expect_s3_class(out1, "tbl_df") + # check that it works with the file extension included + out2 <- read_azure_parquet( + inputs_container, + "baseline.parquet", + path = "dev" + ) + expect_length(out2, 6) # ncol + + res <- get_container(Sys.getenv("AZ_RESULTS_CONTAINER")) + pqt_file <- Sys.getenv("TEST_PARQUET_FILE") + path <- "/" + file_ext <- "parquet" + + # experiment with changes to code: + path <- if (path %in% c("", "/")) "" else path + expect_equal(path, "") + dir_name <- if (dirname(pqt_file) == ".") "" else dirname(pqt_file) + dpath <- glue::glue("{path}/{dir_name}") + file_name <- sub(glue::glue("\\.{file_ext}$"), "", basename(pqt_file)) + + file_name <- if (gregg(basename(pqt_file), "\\.{file_ext}$")) { + basename(pqt_file) + } else { + glue::glue("{basename(pqt_file)}.{file_ext}") + } + # remove duplicate slashes and any initial slashes + fpath <- sub("^/", "", sub("/+", "/", glue::glue("{dpath}/{file_name}"))) + expect_equal(fpath, pqt_file) + # now function should run without error + expect_no_error(check_blob_exists(res, pqt_file, "parquet", FALSE, "/")) + # we want this to error if the file_ext doesn't match the file + expect_error(check_blob_exists(res, pqt_file, "rds", FALSE, "/")) + } +}) + + +test_that("read_azure_json basically works", { + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + download_azure_blob <- function(container, file, file_ext, path = "") { + check_blob_exists(container, file, file_ext, FALSE, path) |> + AzureStor::download_blob(container, src = _, dest = NULL) + } + expect_no_error(support_container <- get_container("supporting-data")) + raw_out <- support_container |> + download_azure_blob("providers", "json") + expect_type(raw_out, "raw") + expect_no_error(yyjsonr::read_json_raw(raw_out)) + out <- yyjsonr::read_json_raw(raw_out) + expect_type(out, "character") + expect_length(out, 138) + } +}) + +test_that("dirname and basename logic works", { + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + expect_no_error(support_container <- get_container("supporting-data")) + file <- "neecom_table.rds" + path <- "" + expect_no_error(AzureStor::blob_exists(support_container, file)) + file2 <- paste0("/", file) + file3 <- paste0("//", file) + expect_no_error(AzureStor::blob_exists(support_container, file2)) + expect_no_error(AzureStor::blob_exists(support_container, file3)) + expect_equal(dirname(file), ".") + file <- paste0(path, "/", file) + expect_equal(dirname(file), "/") + path <- "/" + file <- paste0(path, "/", file) + expect_equal(dirname(file), "/") + + new_path <- sub("^\\.$", "/", dirname(file)) + expect_equal(new_path, "/") + path <- "QA" + file <- "none.txt" + file <- paste0(path, "/", file) + new_path <- sub("^\\.$", "/", dirname(file)) + expect_equal(new_path, "QA") + + res <- get_container("results") + path <- "prod/dev/national/national" + file <- "test-2025" + file_ext <- "json.gz" + filepath <- sub("^/+", "", paste0(path, "/", file)) + expect_equal(filepath, "prod/dev/national/national/test-2025") + + path <- "/" + filepath <- sub("^/+", "", paste0(path, "/", file)) + expect_equal(dirname(filepath), ".") + + path <- "prod/dev/national/national" + filepath <- sub("^/+", "", paste0(path, "/", file)) + expect_equal(filepath, "prod/dev/national/national/test-2025") + path <- sub("^\\.$", "/", dirname(filepath)) + + filepath_out <- AzureStor::list_blobs(res, path, recursive = FALSE) |> + dplyr::filter( + !dplyr::if_any("isdir") & + dplyr::if_any("name", \(x) { + gregg(x, "\\.{file_ext}$") & gregg(x, "^{filepath}") + }) + ) |> + dplyr::pull("name") + expect_match(filepath_out, "prod/dev/national/national/test-2025.*json.gz") + } +}) + +test_that("tdd of check_blob_exists", { + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + expect_no_error(support_container <- get_container("supporting-data")) + check_blob_exists <- function(container, file, path = "/") { + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + } + expect_no_error(check_blob_exists(support_container, "file")) + + check_blob_exists <- function(container, file, path = "/") { + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + file_ext <- "json" + AzureStor::list_blobs(container, path, recursive = FALSE) |> + dplyr::filter( + !dplyr::if_any("isdir") & + dplyr::if_any("name", \(x) { + grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) + }) + ) |> + dplyr::pull("name") + } + expect_no_error(check_blob_exists(support_container, "sites")) + expect_length(check_blob_exists(support_container, "sites"), 2) + + check_blob_exists <- function(container, file, path = "/") { + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + file_ext <- "json" + filepath <- AzureStor::list_blobs(container, path, recursive = FALSE) |> + dplyr::filter( + !dplyr::if_any("isdir") & + dplyr::if_any("name", \(x) { + grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) + }) + ) |> + dplyr::pull("name") + stop_msg1 <- glue::glue("no matching {file_ext} file found") + stop_msg2 <- glue::glue("multiple matching {file_ext} files found") + check_that(filepath, \(x) length(x) > 0, stop_msg1) # check length > 0 + check_scalar_type(filepath, "character", stop_msg2) # check length == 1 + } + expect_error(check_blob_exists(support_container, "unmatched"), "matching") + expect_error(check_blob_exists(support_container, "sites"), "multiple") + + check_blob_exists <- function(container, file, path = "/") { + stopifnot("path not found" = AzureStor::blob_dir_exists(container, path)) + file_ext <- "json" + filepath <- AzureStor::list_blobs(container, path, recursive = FALSE) |> + dplyr::filter( + !dplyr::if_any("isdir") & + dplyr::if_any("name", \(x) { + grepl(glue::glue("\\.{file_ext}$"), x) & grepl(file, x) + }) + ) |> + dplyr::pull("name") + stop_msg1 <- glue::glue("no matching {file_ext} file found") + stop_msg2 <- glue::glue("multiple matching {file_ext} files found") + check_that(filepath, \(x) length(x) > 0, stop_msg1) # check length > 0 + check_scalar_type(filepath, "character", stop_msg2) # check length == 1 + filepath + } + expect_no_error(check_blob_exists(support_container, "providers")) + expect_no_error(check_blob_exists(support_container, "providers.json")) + } +}) + + +# parquet and json read functions need `download_blob` first before reading in. +# However there are 'native' {AzureStor} functions for csv and rds files, so we +# should use those instead. Requiring a slightly different (simpler) workflow. + +test_that("read_azure_csv basically works", { + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + download_azure_blob <- function(container, file, file_ext, path = "") { + check_blob_exists(container, file, file_ext, FALSE, path) |> + AzureStor::download_blob(container, src = _, dest = NULL) + } + expect_no_error(support_container <- get_container("supporting-data")) + support_container |> + # should construct the file name OK + download_azure_blob("mitigator-lookup", "csv") |> + expect_no_error() + support_container |> + download_azure_blob("mitigator-lookup.csv", "csv") |> + expect_no_error() + raw_out <- support_container |> + download_azure_blob("mitigator-lookup.csv", "csv") + expect_type(raw_out, "raw") + expect_no_error(readr::read_csv(raw_out, show_col_types = FALSE)) + dat <- readr::read_csv(raw_out, show_col_types = FALSE) + expect_type(dat, "list") + expect_s3_class(dat, "tbl_df") + } +}) + + +test_that("... parameters are passed through", { + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + download_azure_blob <- function(container, file, file_ext, path = "") { + check_blob_exists(container, file, file_ext, FALSE, path) |> + AzureStor::download_blob(container, src = _, dest = NULL) + } + expect_no_error(support_container <- get_container("supporting-data")) + col_types <- "ccc------" + csv_out1 <- support_container |> + download_azure_blob("mitigator-lookup.csv", "csv") |> + readr::read_csv(col_types = col_types) |> + expect_no_error() + csv_out2 <- support_container |> + AzureStor::storage_read_csv( + "mitigator-lookup.csv", + col_types = col_types + ) |> + expect_no_error() + expect_identical(csv_out1, csv_out2) + expect_length(csv_out1, 3) # ncol + } +}) + +test_that("read functions all work a bit at least", { + endpoint_uri <- Sys.getenv("AZ_STORAGE_EP") + # only run the test if this variable is set (i.e. locally, but not on GitHub) + if (nzchar(endpoint_uri)) { + res <- get_container(Sys.getenv("AZ_RESULTS_CONTAINER")) + expect_no_error(read_azure_parquet(res, Sys.getenv("TEST_PARQUET_FILE"))) + supp <- get_container(Sys.getenv("AZ_SUPPORT_CONTAINER")) + expect_no_error(read_azure_json(supp, Sys.getenv("TEST_JSON_FILE"))) + expect_no_error(read_azure_rds(supp, Sys.getenv("TEST_RDS_FILE"))) + read_azure_csv(supp, Sys.getenv("TEST_CSV_FILE"), show_col_types = FALSE) |> + expect_no_error() + } +}) + + +test_that("check that sub-functions inherit options in R!", { + yes <- \() getOption("do_it") %||% "nope" + wrap <- \() yes() + expect_identical(withr::with_options(list(do_it = "yes"), wrap()), "yes") +})