From a6a0c30f2356e84f44baf2bafdb576961f9fba18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Art=C3=BCr=20Manukyan?= Date: Fri, 10 Apr 2026 20:24:57 +0200 Subject: [PATCH 1/9] Update TODO list for ZarrArray integration Both `ImageArray` and `SpatialData` now depends on BioC/ZarrArray. --- TODO | 8 -------- 1 file changed, 8 deletions(-) diff --git a/TODO b/TODO index fd559b9..9c5d2a6 100644 --- a/TODO +++ b/TODO @@ -39,14 +39,6 @@ o Benchmarking: After ZarrArray is accepted and added to BioC 3.23 ================================================== -o Only packages that seem to be using Rarr::ZarrArray() at the moment are - - ImageArray from Artür: - https://github.com/Bioconductor/Contributions/issues/3946) - - SpatialData from Helena L. Crowell (not submitted to Bioconductor yet): - https://github.com/HelenaLC/SpatialData - Make them use ZarrArray::ZarrArray() instead, after replacing Rarr with - ZarrArray in Imports. - o Register realization backend _ZarrArray in the DelayedArray package. o Implement saveZarrSummarizedExperiment() and From decca7c7ffd75bb8f5a94a8bd5e635e3dcbb44a7 Mon Sep 17 00:00:00 2001 From: Artur-man Date: Fri, 1 May 2026 21:20:47 +0200 Subject: [PATCH 2/9] implement ZarrSparseMatrix --- .Rbuildignore | 2 + .gitignore | 5 + DESCRIPTION | 10 +- NAMESPACE | 21 +- R/ZarrSparseMatrix-class.R | 49 ++ R/ZarrSparseMatrixSeed-class.R | 686 +++++++++++++++++++ R/zarr_mread.R | 110 +++ R/zarr_utils.R | 232 +++++++ tests/testthat/test-ZarrSparseMatrix-class.R | 35 + 9 files changed, 1147 insertions(+), 3 deletions(-) create mode 100644 .Rbuildignore create mode 100644 .gitignore create mode 100644 R/ZarrSparseMatrix-class.R create mode 100644 R/ZarrSparseMatrixSeed-class.R create mode 100644 R/zarr_mread.R create mode 100644 R/zarr_utils.R create mode 100644 tests/testthat/test-ZarrSparseMatrix-class.R diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ 
+^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dcc69b0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata +*Rproj \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index ec16449..01c1730 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,15 +28,21 @@ Authors@R: c( email="levi.waldron@sph.cuny.edu", comment=c(ORCID="0000-0003-2725-0694"))) Depends: R (>= 3.4), methods, SparseArray, DelayedArray -Imports: stats, tools, BiocGenerics, S4Vectors, IRanges, S4Arrays, +Imports: stats, tools, BiocGenerics, S4Vectors, IRanges, S4Arrays, h5mread, Rarr (>= 1.11.33) -Suggests: paws.storage, HDF5Array, testthat, knitr, rmarkdown, BiocStyle +Suggests: paws.storage, HDF5Array, testthat, knitr, rmarkdown, BiocStyle, + anndataR VignetteBuilder: knitr Collate: utils.R options.R + zarr_utils.R + zarr_mread.R ZarrArraySeed-class.R ZarrArray-class.R + ZarrSparseMatrixSeed-class.R + ZarrSparseMatrix-class.R writeZarrArray-auto-args.R writeZarrArray.R zzz.R +RoxygenNote: 7.3.3 diff --git a/NAMESPACE b/NAMESPACE index 4480344..aaa6a95 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,9 +18,18 @@ importFrom(Rarr, read_zarr_array, create_empty_zarr_array, update_zarr_array) exportClasses( "ZarrArraySeed", "ZarrArray", "ZarrMatrix", - "ZarrRealizationSink" + "ZarrRealizationSink", + "ZarrSparseMatrixSeed", "CSC_ZarrSparseMatrixSeed", "CSR_ZarrSparseMatrixSeed", + "ZarrSparseMatrix" ) +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Export S3 methods +### + +S3method(t, CSC_ZarrSparseMatrixSeed) +S3method(t, CSR_ZarrSparseMatrixSeed) + ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### Export S4 methods for generics not defined in ZarrArray @@ -35,6 +44,12 @@ exportMethods( ## Methods for generics defined in the BiocGenerics package: path, type, + + ## Methods for generics defined in the S4Arrays package: + extract_array, is_sparse, + 
+ ## Methods for generics defined in the SparseArray package: + nzcount, extract_sparse_array, ## Methods for generics defined in the S4Arrays package: extract_array, write_block, @@ -62,6 +77,10 @@ export( get_writeZarrArray_chunk_maxlen, set_writeZarrArray_chunk_maxlen, get_writeZarrArray_chunk_shape, set_writeZarrArray_chunk_shape, get_writeZarrArray_auto_chunkdim, + + ## + ZarrSparseMatrixSeed, + ZarrSparseMatrix, ## writeZarrArray.R: ZarrRealizationSink, writeZarrArray diff --git a/R/ZarrSparseMatrix-class.R b/R/ZarrSparseMatrix-class.R new file mode 100644 index 0000000..9761a61 --- /dev/null +++ b/R/ZarrSparseMatrix-class.R @@ -0,0 +1,49 @@ +### ========================================================================= +### ZarrSparseMatrix objects +### ------------------------------------------------------------------------- +### + + +setClass("ZarrSparseMatrix", + contains="DelayedMatrix", + representation(seed="ZarrSparseMatrixSeed") +) + + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Constructor +### + +setMethod("DelayedArray", "ZarrSparseMatrixSeed", + function(seed) new_DelayedArray(seed, Class="ZarrSparseMatrix") +) + +### Works directly on an ZarrSparseMatrixSeed derivative, in which case it must +### be called with a single argument. 
+ZarrSparseMatrix <- function(filepath, group)
+{
+    ## 'filepath' may be either a path to a Zarr store or an already
+    ## constructed ZarrSparseMatrixSeed derivative. In the latter case the
+    ## seed is used as-is and 'group' must not be supplied.
+    if (is(filepath, "ZarrSparseMatrixSeed")) {
+        if (!missing(group))
+            stop(wmsg("ZarrSparseMatrix() must be called with a single argument ",
+                      "when passed an ZarrSparseMatrixSeed object"))
+        seed <- filepath
+    } else {
+        seed <- ZarrSparseMatrixSeed(filepath, group)
+    }
+    ## Dispatches on the DelayedArray() method defined for
+    ## ZarrSparseMatrixSeed objects.
+    DelayedArray(seed)
+}
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### Taking advantage of sparsity
+###
+
+### Simply delegates to the nzcount() method for the seed.
+setMethod("nzcount", "ZarrSparseMatrix", function(x) nzcount(x@seed))
+
+### Disabled for now.
+# setMethod("extractNonzeroDataByCol", "ZarrSparseMatrix",
+#     function(x, j) extractNonzeroDataByCol(x@seed, j)
+# )
+#
+# setMethod("extractNonzeroDataByRow", "ZarrSparseMatrix",
+#     function(x, i) extractNonzeroDataByRow(x@seed, i)
+# )
diff --git a/R/ZarrSparseMatrixSeed-class.R b/R/ZarrSparseMatrixSeed-class.R
new file mode 100644
index 0000000..c98a993
--- /dev/null
+++ b/R/ZarrSparseMatrixSeed-class.R
@@ -0,0 +1,686 @@
+### =========================================================================
+### ZarrSparseMatrixSeed objects
+### -------------------------------------------------------------------------
+
+### Virtual class. The 2 concrete subclasses (CSC_ZarrSparseMatrixSeed and
+### CSR_ZarrSparseMatrixSeed) are defined below and add no slots.
+setClass("ZarrSparseMatrixSeed",
+    contains=c("Array", "OutOfMemoryObject"),
+    representation(
+        "VIRTUAL",
+
+        ## --------------------- user supplied slots ---------------------
+
+        ## Absolute path to the Zarr file so the object won't break when
+        ## the user changes the working directory (e.g. with 'setwd()').
+        filepath="character",
+
+        ## Name of the group in the Zarr file where the sparse matrix is
+        ## stored.
+        group="character",
+
+        ## If 'paste0(group, "/data")' is a group, name of a dataset in
+        ## that group. Otherwise, must be set to NULL.
+        subdata="character_OR_NULL",
+
+        ## ---------------- automatically populated slots ----------------
+
+        dim="integer",
+
+        ## 2-column data.frame ("start" and "width") with one row per
+        ## compressed column (CSC) or row (CSR).
+        ## Can't use an IRanges object for this at the moment because IRanges
+        ## objects don't support large integer start/end values yet.
+        indptr_ranges="data.frame",
+
+        ## ------------- populated by specialized subclasses -------------
+
+        dimnames="list"
+    ),
+    prototype(
+        dimnames=list(NULL, NULL)
+    )
+)
+
+### Returns the name of the dataset that holds the nonzero values:
+###   - "data" or "data/<subdata>" when 'group' is NULL (i.e. name relative
+###     to the group),
+###   - "<group>/data" or "<group>/data/<subdata>" otherwise.
+.get_data_name <- function(subdata, group=NULL)
+{
+    name <- "data"
+    if (!is.null(subdata))
+        name <- paste0(name, "/", subdata)
+    if (!is.null(group))
+        name <- paste0(group, "/", name)
+    name
+}
+
+setClass("CSC_ZarrSparseMatrixSeed", contains="ZarrSparseMatrixSeed")
+setClass("CSR_ZarrSparseMatrixSeed", contains="ZarrSparseMatrixSeed")
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### Transposition
+###
+### Transposition does not touch the file: it only reverses the 'dim' and
+### 'dimnames' slots and switches the class of the object from CSC to CSR
+### (or from CSR to CSC). All the other slots are left untouched.
+###
+
+### S3/S4 combo for t.CSC_ZarrSparseMatrixSeed
+t.CSC_ZarrSparseMatrixSeed <- function(x)
+{
+    x@dim <- rev(x@dim)
+    x@dimnames <- rev(x@dimnames)
+    class(x) <- class(new("CSR_ZarrSparseMatrixSeed"))
+    x
+}
+setMethod("t", "CSC_ZarrSparseMatrixSeed", t.CSC_ZarrSparseMatrixSeed)
+
+### S3/S4 combo for t.CSR_ZarrSparseMatrixSeed
+t.CSR_ZarrSparseMatrixSeed <- function(x)
+{
+    x@dim <- rev(x@dim)
+    x@dimnames <- rev(x@dimnames)
+    class(x) <- class(new("CSC_ZarrSparseMatrixSeed"))
+    x
+}
+setMethod("t", "CSR_ZarrSparseMatrixSeed", t.CSR_ZarrSparseMatrixSeed)
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### path() getter/setter
+###
+
+### Does NOT access the file.
+setMethod("path", "ZarrSparseMatrixSeed", function(object) object@filepath)
+
+### Just a placeholder for now. Doesn't actually allow changing the path of
+### the object yet.
+### The supplied path is normalized and compared with the current one. The
+### object is returned unchanged if they are the same, otherwise an error
+### is raised.
+setReplaceMethod("path", "ZarrSparseMatrixSeed",
+    function(object, value)
+    {
+        new_filepath <- normarg_zarr_filepath(value,
+                                              what1="the supplied path",
+                                              what2="the sparse matrix")
+        old_filepath <- path(object)
+        if (new_filepath != old_filepath)
+            stop(wmsg("changing the path of a ", class(object), " object ",
+                      "is not supported yet"))
+        object
+    }
+)
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### dim() and dimnames() getters
+###
+### They access the slot, not the file.
+###
+
+setMethod("dim", "ZarrSparseMatrixSeed", function(x) x@dim)
+
+setMethod("dimnames", "ZarrSparseMatrixSeed",
+    function(x) S4Arrays:::simplify_NULL_dimnames(x@dimnames)
+)
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### chunkdim() getter
+###
+
+### Does NOT access the file.
+### A chunk is a single column for the CSC layout and a single row for the
+### CSR layout. min(..., 1L) makes the chunk extent 0 along an empty
+### dimension.
+setMethod("chunkdim", "CSC_ZarrSparseMatrixSeed",
+    function(x) c(nrow(x), min(ncol(x), 1L))
+)
+
+setMethod("chunkdim", "CSR_ZarrSparseMatrixSeed",
+    function(x) c(min(nrow(x), 1L), ncol(x))
+)
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### is_sparse() and nzcount() methods
+###
+
+### This is about **structural** sparsity, not about quantitative sparsity
+### measured by sparsity().
+setMethod("is_sparse", "ZarrSparseMatrixSeed", function(x) TRUE)
+
+### Accesses the file. The number of nonzero values is the length of the
+### "data" dataset. We get it with zarrdim(), like the
+### ZarrSparseMatrixSeed() constructor does, and not with h5length() which
+### only knows how to open HDF5 files.
+setMethod("nzcount", "ZarrSparseMatrixSeed",
+    function(x) zarrdim(x@filepath, .get_data_name(x@subdata, x@group))
+)
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### Low-level internal zarrsparse data readers
+###
+
+### All the zarrsparse components are monodimensional.
+### Reads dataset 'name' (a name relative to 'group') with zarr_mread().
+###   - 'start', 'count': NULL or numeric vectors. They get wrapped in a
+###     list before being passed to zarr_mread(), which expects one list
+###     element per dimension.
+###   - 'as.integer': passed to zarr_mread() as-is.
+### The dataset is read only once and nothing is printed.
+read_zarrsparse_component <- function(filepath, group, name,
+                                      start=NULL, count=NULL, as.integer=FALSE)
+{
+    name <- paste0(group, "/", name)
+    if (!is.null(start))
+        start <- list(start)
+    if (!is.null(count))
+        count <- list(count)
+    zarr_mread(filepath, name, starts=start, counts=count,
+               as.vector=TRUE, as.integer=as.integer)
+}
+
+### Returns a numeric vector (integer or double).
+.read_zarrsparse_dim <- function(filepath, group)
+{
+    if (zarrexists(filepath, paste0(group, "/shape"))) {
+        ## 10x format
+        return(read_zarrsparse_component(filepath, group, "shape"))
+    }
+    ## No 'shape' dataset: get the shape from the attributes of the group.
+    ## We use [[ and not $ to extract the attributes because $ does partial
+    ## matching on lists.
+    zarrattrs <- Rarr::read_zarr_attributes(file.path(filepath, group))
+    shape <- zarrattrs[["shape"]]
+    if (is.null(shape))
+        shape <- zarrattrs[["h5sparse_shape"]]
+    if (is.null(shape))
+        stop(wmsg("Group \"", group, "\" in Zarr file \"", filepath,"\" ",
+                  "contains no 'shape' dataset and has no 'shape' ",
+                  "or 'h5sparse_shape' attribute. As a consequence, the ",
+                  "dimensions of the sparse matrix can't be determined."))
+    ## We pass 'shape' thru as.vector() to drop its class attribute in case
+    ## it's an array.
+    rev(as.vector(shape))
+}
+
+### Returns "csr" or "csc" (layout as described in the Zarr store).
+.read_zarrsparse_layout <- function(filepath, group)
+{
+    if (zarrexists(filepath, paste0(group, "/shape"))) {
+        ## 10x format
+        return("csr")
+    }
+    ## No 'shape' dataset: get the layout from the attributes of the group.
+    ## Defaults to "csr" if the group has no 'encoding-type' and no
+    ## 'h5sparse_format' attribute.
+    zarrattrs <- Rarr::read_zarr_attributes(file.path(filepath, group))
+    h5sparse_layout <- zarrattrs[["encoding-type"]]
+    if (is.null(h5sparse_layout))
+        h5sparse_layout <- zarrattrs[["h5sparse_format"]]
+    if (is.null(h5sparse_layout))
+        return("csr")
+    ans <- tolower(substr(h5sparse_layout, 1L, 3L))
+    if (!(ans %in% c("csr", "csc")))
+        stop(wmsg("sparse matrix in group \"", group, "\" in Zarr ",
+                  "file \"", filepath,"\" is stored in unsupported ",
+                  "layout \"", h5sparse_layout, "\""))
+    ans
+}
+
+.read_zarrsparse_indptr <- function(filepath, group)
+    read_zarrsparse_component(filepath, group, "indptr")
+
+### 'subdata' must be NULL or the name of the dataset in group
+### "<group>/data" that contains the nonzero values.
+.read_zarrsparse_data <-
+    function(filepath, group, subdata, start=NULL, count=NULL)
+    {
+        name <- .get_data_name(subdata)
+        read_zarrsparse_component(filepath, group, name, start=start, count=count)
+    }
+
+### The row (or column) indices stored in Zarr dataset "indices" are 0-based
+### and we return them as such.
+.read_zarrsparse_indices <- function(filepath, group, start=NULL, count=NULL)
+    read_zarrsparse_component(filepath, group, "indices",
+                              start=start, count=count, as.integer=TRUE)
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### Constructor
+###
+
+### Raises an error if 'group' does not exist in the Zarr store or is not
+### a group. Returns nothing useful otherwise.
+.check_group <- function(filepath, group)
+{
+    if (!zarrexists(filepath, group))
+        stop(wmsg("Zarr group \"", group, "\" does not exist ",
+                  "in this Zarr file"))
+    if (zarrisdataset(filepath, group)) {
+        is_zarr_X_or_layer <- group == "/X" ||
+                              substr(group, 1L, 8L) == "/layers/"
+        msg1 <- c("\"", group, "\" is an Zarr dataset, not an Zarr group, ",
+                  "so it looks like the matrix that you are trying to ",
+                  "access is not stored in a sparse format. Please ",
+                  "consider using the ")
+        if (is_zarr_X_or_layer) {
+            msg2 <- c("ZarrMatrix() constructor if you are trying ",
+                      "to access the central matrix of an h5ad file. ",
+                      "Otherwise, use the ZarrArray() constructor.")
+        } else {
+            msg2 <- "ZarrArray() constructor to access this dataset."
+        }
+        stop(wmsg(msg1, msg2))
+    }
+    if (!zarrisgroup(filepath, group))
+        stop(wmsg("Zarr object \"", group, "\" is not a group"))
+}
+
+### Raises an error if "<group>/data" and 'subdata' are not compatible:
+###   - if 'subdata' is NULL then "<group>/data" must be a dataset,
+###   - otherwise "<group>/data" must be a group and
+###     "<group>/data/<subdata>" must be a dataset.
+.check_data_and_subdata <- function(filepath, group, subdata)
+{
+    data_fullname <- paste0(group, "/data")
+    if (!zarrexists(filepath, data_fullname))
+        stop(wmsg("Zarr object \"", data_fullname, "\" does not ",
+                  "exist in this Zarr file. Are you sure that Zarr ",
+                  "group \"", group, "\" contains a sparse matrix ",
+                  "stored in CSR/CSC/Yale layout?"))
+    if (is.null(subdata)) {
+        if (zarrisgroup(filepath, data_fullname))
+            stop(wmsg("\"", data_fullname, "\" is an Zarr group, not an ",
+                      "Zarr dataset. Please use the 'subdata' argument to ",
+                      "specify the name of the dataset in this group that ",
+                      "contains the matrix data."))
+        if (!zarrisdataset(filepath, data_fullname))
+            stop(wmsg("Zarr object \"", data_fullname, "\" is not a dataset."))
+    } else {
+        if (!isSingleString(subdata) || subdata == "")
+            stop(wmsg("'subdata' must be NULL or a single non-empty string"))
+        if (zarrisdataset(filepath, data_fullname))
+            stop(wmsg("\"", data_fullname, "\" is an Zarr dataset, not an ",
+                      "Zarr group. Please note that the 'subdata' argument ",
+                      "can be used only when it's a group."))
+        if (!zarrisgroup(filepath, data_fullname))
+            stop(wmsg("Zarr object \"", data_fullname, "\" is not a group."))
+        subdata_fullname <- .get_data_name(subdata, group)
+        if (!zarrexists(filepath, subdata_fullname))
+            stop(wmsg("Zarr object \"", subdata_fullname, "\" does not ",
+                      "exist in this Zarr file."))
+        if (!zarrisdataset(filepath, subdata_fullname))
+            stop(wmsg("Zarr object \"", subdata_fullname, "\" is ",
+                      "not a dataset."))
+    }
+}
+
+### Returns the dimensions of the sparse matrix as an integer vector of
+### length 2. They are read from the Zarr store if 'dim' is NULL, otherwise
+### the user-supplied 'dim' is checked and returned.
+.get_sparse_matrix_dim <- function(filepath, group, dim=NULL)
+{
+    if (is.null(dim)) {
+        dim <- .read_zarrsparse_dim(filepath, group)
+        stopifnot(length(dim) == 2L)
+        return(dim_as_integer(dim, filepath, group, what="sparse matrix"))
+    }
+    ## Check user-supplied 'dim'.
+    if (!is.numeric(dim) || length(dim) != 2L || anyNA(dim))
+        stop(wmsg("supplied 'dim' must be an integer vector ",
+                  "of length 2 with no NAs"))
+    if (!is.integer(dim)) {
+        if (any(dim > .Machine$integer.max))
+            stop(wmsg("supplied dimensions are too big (all dimensions ",
+                      "must be <= '.Machine$integer.max' (= 2^31 - 1))"))
+        dim <- as.integer(dim)
+    }
+    if (any(dim < 0L))
+        stop(wmsg("supplied 'dim' cannot contain negative values"))
+    dim
+}
+
+### Must return "CSC" or "CSR".
+.get_sparse_matrix_layout <- function(filepath, group, sparse.layout=NULL)
+{
+    if (is.null(sparse.layout)) {
+        zarrsparse_layout <- .read_zarrsparse_layout(filepath, group)
+        ## Layout in R will be transposed w.r.t. layout used in Zarr store.
+        ans <- switch(zarrsparse_layout, `csr`="CSC", `csc`="CSR",
+                      stop(wmsg("unsupported 'zarrsparse_layout': ",
+                                zarrsparse_layout)))
+        return(ans)
+    }
+    ## Check user-supplied 'sparse.layout'.
+    if (!isSingleString(sparse.layout))
+        stop(wmsg("'sparse.layout' must be a single string"))
+    ans <- toupper(sparse.layout)
+    if (!(ans %in% c("CSC", "CSR")))
+        stop(wmsg("'sparse.layout' must be either \"CSC\" or \"CSR\""))
+    ans
+}
+
+### Returns a ZarrSparseMatrixSeed derivative (can be either a
+### CSC_ZarrSparseMatrixSeed or CSR_ZarrSparseMatrixSeed object).
+ZarrSparseMatrixSeed <- function(filepath, group, subdata=NULL,
+                                 dim=NULL, sparse.layout=NULL)
+{
+    ## Check 'filepath', 'group', and 'subdata'.
+    filepath <- normarg_zarr_filepath(filepath,
+                                      what2="the sparse matrix")
+    group <- normarg_zarr_name(group,
+                               what1="'group'",
+                               what2="the name of the group",
+                               what3=" that stores the sparse matrix")
+    .check_group(filepath, group)
+    .check_data_and_subdata(filepath, group, subdata)
+
+    ## Get matrix dimensions.
+    dim <- .get_sparse_matrix_dim(filepath, group, dim=dim)
+
+    ## Get sparse layout to use ("CSC" or "CSR").
+    ## Note that R has the notions of rows and columns flipped w.r.t.
+    ## Zarr so:
+    ##   - "compressed sparse row" at the Zarr level translates
+    ##     into "compressed sparse column" at the R level,
+    ##   - "compressed sparse column" at the Zarr level translates
+    ##     into "compressed sparse row" at the R level.
+    layout <- .get_sparse_matrix_layout(filepath, group,
+                                        sparse.layout=sparse.layout)
+    if (layout == "CSC") {
+        expected_indptr_len <- dim[[2L]] + 1L
+        ans_class <- "CSC_ZarrSparseMatrixSeed"
+    } else {
+        expected_indptr_len <- dim[[1L]] + 1L
+        ans_class <- "CSR_ZarrSparseMatrixSeed"
+    }
+
+    ## Get 'indptr_ranges'.
+    ## The "data" and "indices" datasets must have the same length, and
+    ## "indptr" must start at 0 and end at that length.
+    nzcount <- zarrdim(filepath, .get_data_name(subdata, group))
+    indices_len <- zarrdim(filepath, paste0(group, "/indices"))
+    stopifnot(indices_len == nzcount)
+    indptr <- .read_zarrsparse_indptr(filepath, group)
+    stopifnot(length(indptr) == expected_indptr_len,
+              indptr[[1L]] == 0L,
+              indptr[[length(indptr)]] == nzcount)
+    ## 'start' is 1-based and kept as double ('+ 1', not '+ 1L') so it can
+    ## hold values > .Machine$integer.max.
+    indptr_ranges <- data.frame(start=indptr[-length(indptr)] + 1,
+                                width=as.integer(diff(indptr)))
+
+    ## 'subdata' must be stored in the object: all the readers get the name
+    ## of the "data" dataset from the 'subdata' slot.
+    new2(ans_class, filepath=filepath, group=group, subdata=subdata,
+                    dim=dim, indptr_ranges=indptr_ranges)
+}
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### .load_CSC_ZarrSparseMatrixSeed
+###
+### Loads whole CSC_ZarrSparseMatrixSeed object 'x' into memory as an
+### SVT_SparseMatrix object, or only selected columns if 'j' is specified.
+### This is the workhorse behind the extract_sparse_array(), extract_array(),
+### and read_block_as_sparse() methods for ZarrSparseMatrixSeed objects, as
+### well as behind coercion from CSC_ZarrSparseMatrixSeed to SVT_SparseMatrix.
+### Does NOT propagate the dimnames.
+###
+### Notes:
+### - SparseArray:::make_SVT_SparseMatrix_from_CSC() will fail if passed
+###   long vectors via its 'data' and/or 'row_indices' arguments because R
+###   does not support passing long vectors to the .Call interface yet!
+###   So we use a block strategy where we load blocks of adjacent columns
+###   and convert them to SVT_SparseMatrix objects, then cbind() all the
+###   objects together. By default, blocks are made of 125 millions
+###   data/indices elements.
+### - Supports parallelization via the 'BPPARAM' argument. However some
+###   quick testing with 'BiocParallel::MulticoreParam(2)' on a powerful
+###   Linux server seemed to indicate that it's not worth it. Execution
+###   time remained about the same but memory footprint increased
+###   significantly!
+
+### 'j' must be NULL or a strictly sorted integer vector of valid column
+### indices. 'DATABLOCKLEN' must be a single non-negative integer, 0L
+### meaning "no block processing".
+.load_CSC_ZarrSparseMatrixSeed <- function(x, j=NULL,
+                                           DATABLOCKLEN=125000000L,
+                                           BPPARAM=NULL)
+{
+    stopifnot(is(x, "CSC_ZarrSparseMatrixSeed"),
+              isSingleInteger(DATABLOCKLEN), DATABLOCKLEN >= 0L)
+    ## 'w' is the number of nonzero values in each column to load.
+    if (is.null(j)) {
+        ans_ncol <- ncol(x)
+        w <- x@indptr_ranges[ , "width"]
+    } else {
+        stopifnot(is.integer(j))
+        ans_ncol <- length(j)
+        if (ans_ncol != 0L)
+            stopifnot(isStrictlySorted(j),
+                      1L <= j[[1L]], j[[ans_ncol]] <= ncol(x))
+        w <- x@indptr_ranges[j , "width"]
+    }
+    ans_dim <- c(nrow(x), ans_ncol)
+    ## 'cumsum(as.double(w))' instead of 'cumsum(w)' to avoid integer overflow.
+    ans_indptr <- c(0, cumsum(as.double(w)))
+    ans_nzcount <- ans_indptr[[length(ans_indptr)]]
+
+    ## DATABLOCKLEN == 0L means no block processing.
+    if (DATABLOCKLEN == 0L || ans_nzcount <= DATABLOCKLEN) {
+        ## Everything is loaded with a single read of the "data" dataset
+        ## and a single read of the "indices" dataset.
+        if (is.null(j)) {
+            start <- count <- NULL
+        } else {
+            start <- x@indptr_ranges[j, "start"]
+            count <- x@indptr_ranges[j, "width"]
+        }
+        ans_data <- .read_zarrsparse_data(x@filepath, x@group, x@subdata,
+                                          start=start, count=count)
+        ans_row_indices <- .read_zarrsparse_indices(x@filepath, x@group,
+                                                    start=start, count=count)
+        ans <- SparseArray:::make_SVT_SparseMatrix_from_CSC(ans_dim,
+                            ans_indptr, ans_data, ans_row_indices)
+        return(ans)
+    }
+
+    ## Compute 'nblock' (will always be >= 2).
+    nblock <- ans_nzcount %/% DATABLOCKLEN
+    if (ans_nzcount %% DATABLOCKLEN != 0L)
+        nblock <- nblock + 1L
+
+    ## Partition column indices in ranges (nb of ranges is guaranteed to be
+    ## >= 1 and <= 'min(nblock, ans_ncol)').
+    col_ranges <- breakInChunks(ans_ncol, nblock)
+    ## There will be zero-width ranges if and only if 'nblock' > 'ans_ncol'.
+    ## Drop them.
+    col_ranges <- col_ranges[width(col_ranges) != 0L]
+    s <- start(col_ranges)
+    e <- end(col_ranges)
+
+    ## Load ranges of columns into SVT_SparseMatrix objects.
+    ## 's[[b]]' and 'e[[b]]' are positions in 'j' (or column indices if 'j'
+    ## is NULL).
+    objects <- S4Arrays:::bplapply2(seq_along(col_ranges),
+        function(b, x, j, s, e) {
+            k1 <- s[[b]]
+            k2 <- e[[b]]
+            jj <- if (is.null(j)) k1:k2 else j[k1:k2]
+            ## Set 'DATABLOCKLEN' to 0L to disable block processing.
+            .load_CSC_ZarrSparseMatrixSeed(x, jj, DATABLOCKLEN=0L)
+        },
+        x, j, s, e,
+        BPPARAM=BPPARAM
+    )
+
+    ## Combine all objects together.
+    do.call(cbind, objects)
+}
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### extract_sparse_array() and extract_array() methods
+###
+
+### .load_CSC_ZarrSparseMatrixSeed() only accepts a strictly sorted 'j' so,
+### if the supplied column index is not strictly sorted, we load the sorted
+### unique columns and then use match() to put the columns back in the
+### requested order (and to restore the duplicates). The row index is
+### applied to the loaded SVT_SparseMatrix object.
+.extract_sparse_array_from_CSC_ZarrSparseMatrixSeed <- function(x, index)
+{
+    j <- index[[2L]]
+    if (!is.null(j)) {
+        if (!is.integer(j))
+            j <- as.integer(j)
+        sort_j <- !isStrictlySorted(j)
+        if (sort_j) {
+            j0 <- j
+            j <- unique(sort(j))
+        }
+    }
+    svt <- .load_CSC_ZarrSparseMatrixSeed(x, j=j)
+    index2 <- list(index[[1L]], NULL)
+    if (!is.null(j) && sort_j)
+        index2[[2L]] <- match(j0, j)
+    extract_sparse_array(svt, index2)
+}
+
+setMethod("extract_sparse_array", "CSC_ZarrSparseMatrixSeed",
+    function(x, index)
+        .extract_sparse_array_from_CSC_ZarrSparseMatrixSeed(x, index)
+)
+
+### The CSR case is handled by transposing the seed (which turns it into a
+### CSC seed without touching the file), extracting with the reversed
+### index, and transposing the result.
+setMethod("extract_sparse_array", "CSR_ZarrSparseMatrixSeed",
+    function(x, index)
+        t(.extract_sparse_array_from_CSC_ZarrSparseMatrixSeed(t(x), rev(index)))
+)
+
+setMethod("extract_array", "ZarrSparseMatrixSeed",
+    function(x, index) as.array(extract_sparse_array(x, index))
+)
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### Show
+###
+
+### NOTE(review): dirname() and basename() are called on the object itself,
+### not on its path. Presumably this relies on the dirname() and basename()
+### generics from BiocGenerics -- confirm that they are imported.
+setMethod("show", "ZarrSparseMatrixSeed",
+    function(object)
+    {
+        cat(S4Arrays:::array_as_one_line_summary(object), ":\n", sep="")
+        cat("# dirname: ", dirname(object), "\n", sep="")
+        cat("# basename: ", basename(object), "\n", sep="")
+        cat("# group: ", object@group, "\n", sep="")
+    }
+)
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### extractNonzeroDataByCol() and extractNonzeroDataByRow()
+###
+### TODO: Deprecate these 2 generics and their methods.
+### These 2 generics are
+### weird and don't have good/strong use cases. I suspect nobody uses them
+### nor is aware of them.
+
+### Variant of base::sequence() that can handle a 'from' that is >
+### .Machine$integer.max (base::sequence() does not handle this properly).
+### Note that the 2nd argument is 'offset' instead of 'from' ('offset'
+### being the same as 'from' - 1).
+### Returns 'numeric(0)' if 'lengths' is empty.
+### TODO: Does .sequence2() work if sum(lengths) is > .Machine$integer.max?
+.sequence2 <- function(lengths, offset=0)
+{
+    n <- length(lengths)
+    if (n != 0L) {
+        ## Nb of elements that precede each run in the result.
+        nb_before <- cumsum(c(0L, lengths[-n]))
+        shifts <- offset - nb_before
+        return(seq_len(sum(lengths)) + rep.int(shifts, lengths))
+    }
+    numeric(0)
+}
+
+### 'j' must be an integer vector containing valid col indices.
+### Return data indices in a NumericList object parallel to 'j' i.e. with
+### one list element per col index in 'j'.
+.get_data_indices_by_col <- function(x, j)
+{
+    selected_ranges <- S4Vectors:::extract_data_frame_rows(x@indptr_ranges, j)
+    widths <- selected_ranges[ , "width"]
+    offsets <- selected_ranges[ , "start"] - 1L
+    data_idx <- .sequence2(widths, offset=offsets)
+    ### Will this work if 'data_idx' is a long vector?
+    relist(data_idx, PartitioningByWidth(widths))
+}
+
+### 'j1' and 'j2' must be 2 single integers representing a valid range of
+### col indices.
+### Returns a NumericList or IntegerList object parallel
+### to 'j1:j2' i.e. with one list element per col index in 'j1:j2'.
+.extract_data_from_adjacent_cols <- function(x, j1, j2)
+{
+    ## The nonzero data of adjacent columns is contiguous in the "data"
+    ## dataset so we can get it with a single start/count read.
+    widths <- x@indptr_ranges[j1:j2, "width"]
+    nzdata <- .read_zarrsparse_data(x@filepath, x@group, x@subdata,
+                                    start=x@indptr_ranges[j1, "start"],
+                                    count=sum(widths))
+    relist(nzdata, PartitioningByWidth(widths))
+}
+
+### Replaces method "auto" with "linear" or "random". Any other method is
+### returned as-is.
+.normarg_method <- function(method, j)
+{
+    if (method == "auto") {
+        if (is.null(j)) {
+            method <- "linear"
+        } else if (length(j) == 0L) {
+            method <- "random"
+        } else {
+            ## 'density' is > 0 and <= 1. It is 1 when the requested
+            ## columns are adjacent (e.g. j <- 18:25) and gets close to 0
+            ## when they are separated by many columns that are not
+            ## requested. The "linear" method is typically 10x or 20x
+            ## faster than the "random" method when 'density' is close to
+            ## 1, so we choose it when 'density' is >= 0.2
+            span <- max(j) - min(j) + 1L
+            density <- length(j) / span
+            method <- if (density >= 0.2) "linear" else "random"
+        }
+    }
+    method
+}
+
+### Extract nonzero data using the "random" method.
+### This method passes the position of each individual nonzero value to the
+### reader (no 'count'), so it can retrieve an arbitrary/random subset of
+### the data.
+### 'j' must be an integer vector containing valid col indices. It cannot
+### be NULL.
+.random_extract_nonzero_data_by_col <- function(x, j)
+{
+    idx_by_col <- .get_data_indices_by_col(x, j)
+    nzdata <- .read_zarrsparse_data(x@filepath, x@group, x@subdata,
+                                    start=unlist(idx_by_col, use.names=FALSE))
+    relist(nzdata, idx_by_col)
+}
+
+### Extract nonzero data using the "linear" method.
+### This method passes a single start/count pair to the reader, so it
+### retrieves a linear subset of the data. This should be more efficient
+### than passing seq(start, length.out=count) as the start.
+### 'j' must be NULL or a non-empty vector of valid col indices.
+### The data of all the columns between 'min(j)' and 'max(j)' is loaded,
+### then only the requested columns are kept.
+.linear_extract_nonzero_data_by_col <- function(x, j)
+{
+    all_cols <- is.null(j)
+    if (all_cols) {
+        first_col <- 1L
+        last_col <- ncol(x)
+    } else {
+        stopifnot(is.numeric(j), length(j) != 0L)
+        first_col <- min(j)
+        last_col <- max(j)
+    }
+    data_by_col <- .extract_data_from_adjacent_cols(x, first_col, last_col)
+    if (all_cols) {
+        data_by_col
+    } else {
+        data_by_col[match(j, first_col:last_col)]
+    }
+}
+
+### 'j' must be NULL or an integer vector containing valid col indices.
+### 'method' must be "auto" (the default), "random", or "linear".
+### Return a NumericList or IntegerList object parallel to 'j' i.e. with
+### one list element per col index in 'j'.
+.extract_nonzero_csc_sparse_data_by_col <-
+    function(x, j, method=c("auto", "random", "linear"))
+    {
+        method <- .normarg_method(match.arg(method), j)
+        extract_FUN <- if (method == "random") {
+            .random_extract_nonzero_data_by_col
+        } else {
+            .linear_extract_nonzero_data_by_col
+        }
+        extract_FUN(x, j)
+    }
+
+### Return a NumericList or IntegerList object parallel to 'j' i.e. with
+### one list element per col index in 'j'.
+setGeneric("extractNonzeroDataByCol", signature="x",
+    function(x, j) standardGeneric("extractNonzeroDataByCol")
+)
+
+### 'j' is normalized against the nb of columns and the colnames of 'x'
+### before the data is extracted.
+setMethod("extractNonzeroDataByCol", "CSC_ZarrSparseMatrixSeed",
+    function(x, j)
+    {
+        j <- S4Arrays:::normalizeSingleBracketSubscript2(j, ncol(x),
+                                                         colnames(x))
+        .extract_nonzero_csc_sparse_data_by_col(x, j)
+    }
+)
+
+### Return a NumericList or IntegerList object parallel to 'i' i.e. with
+### one list element per row index in 'i'.
+setGeneric("extractNonzeroDataByRow", signature="x",
+    function(x, i) standardGeneric("extractNonzeroDataByRow")
+)
+
+### The rows of a CSR seed are the columns of its transpose (a CSC seed).
+setMethod("extractNonzeroDataByRow", "CSR_ZarrSparseMatrixSeed",
+    function(x, i)
+    {
+        i <- S4Arrays:::normalizeSingleBracketSubscript2(i, nrow(x),
+                                                         rownames(x))
+        .extract_nonzero_csc_sparse_data_by_col(t(x), i)
+    }
+)
\ No newline at end of file
diff --git a/R/zarr_mread.R b/R/zarr_mread.R
new file mode 100644
index 0000000..0c3b709
--- /dev/null
+++ b/R/zarr_mread.R
@@ -0,0 +1,110 @@
+
+### =========================================================================
+### zarr_mread()
+### -------------------------------------------------------------------------
+###
+
+### The R type returned by zarr_mread() is determined by arguments 'filepath',
+### 'name', and 'as.integer'.
+### NOTE(review): this calls a C entry point named
+### "C_get_zarrmread_returned_type" in package HDF5Array. Nothing in this
+### file provides it and zarr_mread() does not use this function --
+### confirm that the entry point exists before using this function.
+get_zarrmread_returned_type <- function(filepath, name, as.integer=FALSE)
+{
+    name <- normarg_zarr_name(name)
+
+    .Call2("C_get_zarrmread_returned_type", filepath, name, as.integer,
+           PACKAGE="HDF5Array")
+}
+
+### Turns 'starts' and 'counts' into the list of subscripts to pass to
+### read_zarr_array(), with one list element per dimension.
+### Along a given dimension, each value in 'starts[[i]]' is the position of
+### the first element of a block and the corresponding value in
+### 'counts[[i]]' is the number of elements in the block. The blocks get
+### expanded into the positions of all their elements. For example starts
+### c(3, 10) and counts c(2, 3) select positions 3, 4, 10, 11, 12.
+### 'starts' is returned as-is if 'counts' is NULL.
+.expand_zarr_selection <- function(starts, counts)
+{
+    if (is.null(counts))
+        return(starts)
+    if (!is.list(counts) || length(counts) != length(starts))
+        stop(wmsg("'counts' must be NULL or a list of the same length ",
+                  "as 'starts'"))
+    lapply(seq_along(starts),
+        function(i) {
+            start <- starts[[i]]
+            count <- counts[[i]]
+            if (is.null(count))
+                return(start)
+            if (is.null(start))
+                stop(wmsg("a list element in 'counts' must be NULL when ",
+                          "the corresponding list element in 'starts' ",
+                          "is NULL"))
+            if (!is.numeric(start) || !is.numeric(count) ||
+                length(start) != length(count))
+                stop(wmsg("corresponding list elements in 'starts' and ",
+                          "'counts' must be numeric vectors of the same ",
+                          "length"))
+            ## The result is of type double if 'start' is, so positions
+            ## > .Machine$integer.max are supported.
+            rep.int(start, count) + sequence(count) - 1L
+        })
+}
+
+### 'starts' and 'counts' must be NULL or lists with one list element per
+### dimension in the dataset. Each list element must be NULL (i.e. no
+### selection along that dimension) or a numeric vector.
+### If 'counts' is NULL, the list elements in 'starts' are the positions to
+### read. They can be in any order and contain duplicates.
+### When both 'starts' and 'counts' are specified, the list elements in
+### 'starts' are the starts of the blocks to read and the list elements in
+### 'counts' are the lengths of the blocks. The selection must be strictly
+### ascending along each dimension (this is not checked).
+### Set 'as.vector' to TRUE to return the result as an ordinary vector.
+### Set 'as.integer' to TRUE to force returning the result as an integer array.
+### 'noreduce' is currently ignored.
+### NOTE(review): the 'as.sparse=TRUE' code path subsets the object returned
+### by read_zarr_array() with [[1L]], [[2L]], and [[3L]], as if it was a
+### list of length 3 -- confirm what read_zarr_array() returns before
+### relying on it.
+zarr_mread <- function(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE,
+                       as.vector=NA, as.integer=FALSE, as.sparse=FALSE)
+{
+    if (!isTRUEorFALSE(as.sparse))
+        stop(wmsg("'as.sparse' must be TRUE or FALSE"))
+    order_starts <- FALSE
+    if (is.null(starts)) {
+        if (!is.null(counts))
+            stop(wmsg("'counts' must be NULL when 'starts' is NULL"))
+    } else if (is.list(starts)) {
+        order_starts <- is.null(counts) &&
+                        !all(S4Vectors:::sapply_isNULL(starts))
+        if (order_starts) {
+            ## Round the 'starts'.
+            starts0 <- lapply(starts,
+                function(start) {
+                    if (is.null(start))
+                        return(NULL)
+                    if (!is.numeric(start))
+                        stop(wmsg("each list element in 'starts' must ",
+                                  "be NULL or a numeric vector"))
+                    if (!is.integer(start))
+                        start <- round(start)
+                    start
+                })
+            ok <- vapply(starts0,
+                function(start0) is.null(start0) || isStrictlySorted(start0),
+                logical(1))
+            order_starts <- !all(ok)
+            if (order_starts) {
+                if (length(ok) != 1L && isTRUE(as.vector))
+                    stop(wmsg("when using 'as.vector=TRUE' on a ",
+                              "multidimensional dataset, list elements ",
+                              "in 'starts' must be strictly sorted"))
+                ## Read the sorted unique positions. The requested order
+                ## (and the duplicates) get restored after the read.
+                starts <- lapply(seq_along(starts0),
+                    function(i) {
+                        start0 <- starts0[[i]]
+                        if (ok[[i]])
+                            return(start0)
+                        start0 <- sort(start0)
+                        start <- unique(start0)
+                        if (as.sparse && length(start) != length(start0))
+                            stop(wmsg("when using 'as.sparse=TRUE', list ",
+                                      "elements in 'starts' are not allowed ",
+                                      "to contain duplicates"))
+                        start
+                    })
+            } else {
+                starts <- starts0
+            }
+        }
+    } else {
+        stop(wmsg("'starts' must be a list (or NULL)"))
+    }
+
+    ## Read the data.
+    if (is.null(starts)) {
+        ## No selection: one NULL subscript per dimension.
+        ndim <- length(zarrdim(filepath, name))
+        index <- vector("list", ndim)
+    } else {
+        ## The 'counts' must be taken into account here, otherwise only the
+        ## first element of each block gets read.
+        index <- .expand_zarr_selection(starts, counts)
+    }
+    ans <- read_zarr_array(file.path(filepath, name), index = index)
+
+    if (as.sparse)
+        ans <- COO_SparseArray(ans[[1L]], ans[[2L]], ans[[3L]], check=FALSE)
+
+    ## Restore the requested order.
+    if (order_starts) {
+        index <- lapply(seq_along(starts0),
+            function(i) {
+                if (ok[[i]])
+                    return(NULL)
+                match(starts0[[i]], starts[[i]])
+            })
+        if (as.sparse) {
+            ans <- extract_sparse_array(ans, index)
+        } else if (is.array(ans)) {
+            ans <- extract_array(ans, index)
+        } else if (length(index) == 1L) {
+            ans <- ans[index[[1L]]]
+        } else {
+            ## Sanity check (should never happen).
+            stop(wmsg(".Call entry point C_zarrmread returned an unexpected object"))
+        }
+    }
+
+    ## Honor 'as.integer' and 'as.vector'.
+    if (!as.sparse) {
+        if (isTRUE(as.integer) && !is.integer(ans))
+            storage.mode(ans) <- "integer"
+        if (isTRUE(as.vector))
+            ans <- base::as.vector(ans)
+    }
+    ans
+}
\ No newline at end of file
diff --git a/R/zarr_utils.R b/R/zarr_utils.R
new file mode 100644
index 0000000..98e693b
--- /dev/null
+++ b/R/zarr_utils.R
@@ -0,0 +1,232 @@
+### =========================================================================
+### Some low-level Zarr utilities
+### -------------------------------------------------------------------------
+###
+### Nothing in this file is exported.
+###
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### create_zarr_group() and create_zarr()
+###
+
+#' create_zarr_group
+#'
+#' create zarr groups
+#'
+#' @param store the location of (zarr) store
+#' @param name name of the group
+#' @param version zarr version
+#' @export
+create_zarr_group <- function(store, name, version = "v2") {
+  split.name <- strsplit(name, split = "\\/")[[1]]
+  if (length(split.name) > 1) {
+    # Turn the path components into cumulative paths ("a", "a/b", "a/b/c")
+    # and keep the last two: the group itself and its parent.
+    split.name <- vapply(seq_len(length(split.name)),
+                         function(x) paste(split.name[seq_len(x)], collapse = "/"),
+                         FUN.VALUE = character(1))
+    split.name <- rev(tail(split.name, 2))
+    # Create the parent group first if it is missing.
+    if (!dir.exists(file.path(store, split.name[2])))
+      create_zarr_group(store = store, name = split.name[2], version = version)
+  }
+  dir.create(file.path(store, split.name[1]), showWarnings = FALSE)
+  switch(version,
+         v2 = {
+           write("{\"zarr_format\":2}", file = file.path(store, split.name[1], ".zgroup"))},
+         v3 = {
+           stop("Currently only zarr v2 is supported!")
+         },
+         stop("only zarr v2 is supported. Use version = 'v2'")
+  )
+}
+
+#' create_zarr
+#'
+#' create zarr store
+#'
+#' @param store path to the zarr store to create (e.g. "path/to/test.zarr")
+#' @param version zarr version
+#' @examples
+#' dir.create(td <- tempfile())
+#' store <- file.path(td, "test.zarr")
+#' create_zarr(store = store)
+#' dir.exists(store)
+#' @export
+create_zarr <- function(store, version = "v2") {
+  prefix <- basename(store)
+  # dirname() instead of removing 'prefix' from 'store' with a regular
+  # expression: 'prefix' can contain regex metacharacters, and a 'store'
+  # with no directory part would resolve to "/<prefix>".
+  dir <- dirname(store)
+  create_zarr_group(store = dir, name = prefix, version = version)
+}
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### zarrexists()
+###
+
+### TRUE if 'name' is a directory in the Zarr store.
+zarrexists <- function(filepath, name)
+{
+    dir.exists(file.path(filepath, name))
+    # zarr.array <- pizzarr::zarr_open(store = filepath, mode = "r")
+    # if(grepl(".zarr$", filepath)){
+    #     zarr.array$contains_item(name)
+    # } else {
+    #     return(FALSE)
+    # }
+}
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### zarrisgroup()
+###
+
+### TRUE if directory 'name' contains a ".zgroup" file.
+zarrisgroup <- function(filepath, name)
+{
+    # zarr.array <- pizzarr::zarr_open(store = filepath, mode = "r")
+    # did <- try(zarr.array$get_item(name))
+    # ans <- !inherits(did, "try-error")
+    # ans
+    file.exists(file.path(filepath, name, ".zgroup"))
+}
+
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### zarrisdataset()
+###
+
+### TRUE if directory 'name' contains a ".zarray" file.
+zarrisdataset <- function(filepath, name)
+{
+    # zarr.array <- pizzarr::zarr_open(store = filepath, mode = "r")
+    # did <- try(zarr.array$get_item(name))
+    # ans <- !inherits(did, "try-error")
+    # ans
+    file.exists(file.path(filepath, name, ".zarray"))
+}
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### zarrdim() and zarrchunkdim()
+###
+
+zarrdim <- function(filepath, name)
+{
+    # zarr.array <- pizzarr::zarr_open(store = filepath, mode = "r")
+    # zarrmat <- zarr.array$get_item(name)
+    # zarrmat$get_shape()
+    overview <- zarr_overview(file.path(filepath, 
name), + as_data_frame = TRUE) + overview$dim[[1]] +} + +zarrchunkdim <- function(filepath, name, adjust=FALSE) +{ + overview <- zarr_overview(file.path(filepath, name), + as_data_frame = TRUE) + chunkdim <- overview$chunk_dim[[1]] + if (adjust) { + dim <- overview$dim[[1]] + stopifnot(length(chunkdim) == length(dim)) + chunkdim <- as.integer(pmin(dim, chunkdim)) + } + chunkdim +} + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### normarg_zarr_filepath() and normarg_zarr_name() +### + +normarg_zarr_filepath <- function(path, what1="'filepath'", what2="the dataset") +{ + if (!isSingleString(path)) + stop(wmsg(what1, " must be a single string specifying the path ", + "to the Zarr directory where ", what2, " is located")) + tools::file_path_as_absolute(path) # return absolute path in canonical form +} + +normarg_zarr_name <- function(name, what1="'name'", + what2="the name of a dataset", + what3="") +{ + if (!isSingleString(name)) + stop(wmsg(what1, " must be a single string specifying ", + what2, " in the Zarr directory", what3)) + if (name == "") + stop(wmsg(what1, " cannot be the empty string")) + if (substr(name, start=1L, stop=1L) == "/") { + name <- sub("^/*", "/", name) # only keep first leading slash + } else { + name <- paste0("/", name) + } + name +} + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Used in validity methods +### + +### 'path' is expected to be the **absolute** path to a local zarr array. +validate_zarr_absolute_path <- function(path, what="'path'") +{ + if (!(isSingleString(path) && nzchar(path))) + return(paste0(what, " must be a single non-empty string")) + + ## Check that 'path' points to an Zarr directory that is accessible. 
+ if (!file.exists(path)) + return(paste0(what, " (\"", path, "\") must be the path to ", + "an existing zarr array")) + if (!grepl(".zarr$", path)) + return(paste0(what, " (\"", path, "\") doesn't seem to be ", + "the path to a valid zarr array")) + if (path != tools::file_path_as_absolute(path)) + return(paste0(what, " (\"", path, "\") must be the absolute ", + "canonical path the Zarr array")) + TRUE +} + +validate_zarr_dataset_name <- function(path, name, what="'name'") +{ + if (!(isSingleString(name) && nzchar(name))) + return(paste0(what, " must be a single non-empty string")) + + if (!zarrexists(path, name)) + return(paste0(what, " (\"", name, "\") doesn't exist ", + "in Zarr directory \"", path, "\"")) + if (!zarrisdataset(path, name)) + return(paste0(what, " (\"", name, "\") is not a dataset ", + "in Zarr directory \"", path, "\"")) + zarr_dim <- try(zarrdim(path, name), silent=TRUE) + if (inherits(zarr_dim, "try-error")) + return(paste0(what, " (\"", name, "\") is a dataset with ", + "no dimensions in Zarr directory \"", path, "\"")) + TRUE +} + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### ZarrCreateDataset() +### + +compute_max_string_size <- function(x) +{ + if (type(x) != "character") + return(NULL) + if (length(x) == 0L) + return(0L) + max(nchar(x, type="bytes", keepNA=FALSE)) +} + +ZarrCreateDataset <- function(filepath, + name, + dim, + maxdim=dim, + type="double", + size=NULL, + chunkdim=dim, + level=6L) +{ + stopifnot(is.numeric(dim), + is.numeric(maxdim), length(maxdim) == length(dim)) + if (!is.null(chunkdim)) { + stopifnot(is.numeric(chunkdim), length(chunkdim) == length(dim)) + chunkdim <- pmin(chunkdim, maxdim) + } + create_empty_zarr_array(file.path(filepath,name), + dim = dim, + chunk_dim = chunkdim, + data_type = type, + nchar = size) +} \ No newline at end of file diff --git a/tests/testthat/test-ZarrSparseMatrix-class.R b/tests/testthat/test-ZarrSparseMatrix-class.R new file mode 100644 index 
0000000..9d32f73 --- /dev/null +++ b/tests/testthat/test-ZarrSparseMatrix-class.R @@ -0,0 +1,35 @@ +library(Rarr) +library(anndataR) +library(h5mread) +library(ZarrArray) + +# zarr file +zarr_dir <- system.file("extdata", "example_v2.zarr.zip", package = "anndataR") +td <- tempdir(check = TRUE) +unzip(zarr_dir, exdir = td) +store <- file.path(td, "example_v2.zarr") + +filepath <- store +name <- "/layers/csc_counts/indices" +start <- list(c( 1, 45, 87, 130, 171, 4099, 4144, 4190, 4235, 4279)) +count <- list(c(44, 42, 43, 41, 42, 45, 46, 45, 44, 39)) +as.integer <- TRUE +zarr_mread(filepath, name, starts=start, counts=count, + as.vector=TRUE, as.integer=as.integer) + +filepath <- system.file("extdata", "example.h5ad", package = "anndataR") +h5mread(filepath, name, starts=start, counts=count, + as.vector=TRUE, as.integer=as.integer) + +# # read sparse matrix +# name <- "layers/csc_counts" +# ZarrSparseMatrix(store, name) + +# test_that("read sparse", { +# +# # read sparse matrix +# name <- "layers/csc_counts" +# ZarrSparseMatrix(store, name) +# +# expect_equal(1,1) +# }) \ No newline at end of file From 8e4f19fa0c209f9ce1417e68b778f7414d1da1fb Mon Sep 17 00:00:00 2001 From: Artur-man Date: Fri, 1 May 2026 21:52:22 +0200 Subject: [PATCH 3/9] zarr sparse matrix works --- R/ZarrSparseMatrixSeed-class.R | 9 ------- R/zarr_mread.R | 8 +++++- tests/testthat/test-ZarrSparseMatrix-class.R | 28 +++++--------------- 3 files changed, 13 insertions(+), 32 deletions(-) diff --git a/R/ZarrSparseMatrixSeed-class.R b/R/ZarrSparseMatrixSeed-class.R index c98a993..47c27ce 100644 --- a/R/ZarrSparseMatrixSeed-class.R +++ b/R/ZarrSparseMatrixSeed-class.R @@ -154,15 +154,6 @@ read_zarrsparse_component <- function(filepath, group, name, start <- list(start) if (!is.null(count)) count <- list(count) - print(filepath) - print(name) - print(start) - print(count) - print(as.integer) - print( - zarr_mread(filepath, name, starts=start, counts=count, - as.vector=TRUE, as.integer=as.integer) - ) 
zarr_mread(filepath, name, starts=start, counts=count, as.vector=TRUE, as.integer=as.integer) } diff --git a/R/zarr_mread.R b/R/zarr_mread.R index 0c3b709..e774888 100644 --- a/R/zarr_mread.R +++ b/R/zarr_mread.R @@ -83,7 +83,13 @@ zarr_mread <- function(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE, ndim <- length(zarrdim(filepath, name)) index <- vector("list", ndim) } else { - index <- starts + index <- mapply(function(x,y){ + unlist( + mapply(function(xx,yy){ + seq(xx, xx+yy-1) + }, x, y) + ) + }, starts, counts, SIMPLIFY = FALSE) } ans <- read_zarr_array(file.path(filepath, name), index = index) diff --git a/tests/testthat/test-ZarrSparseMatrix-class.R b/tests/testthat/test-ZarrSparseMatrix-class.R index 9d32f73..d478278 100644 --- a/tests/testthat/test-ZarrSparseMatrix-class.R +++ b/tests/testthat/test-ZarrSparseMatrix-class.R @@ -9,27 +9,11 @@ td <- tempdir(check = TRUE) unzip(zarr_dir, exdir = td) store <- file.path(td, "example_v2.zarr") -filepath <- store -name <- "/layers/csc_counts/indices" -start <- list(c( 1, 45, 87, 130, 171, 4099, 4144, 4190, 4235, 4279)) -count <- list(c(44, 42, 43, 41, 42, 45, 46, 45, 44, 39)) -as.integer <- TRUE -zarr_mread(filepath, name, starts=start, counts=count, - as.vector=TRUE, as.integer=as.integer) +test_that("read sparse", { -filepath <- system.file("extdata", "example.h5ad", package = "anndataR") -h5mread(filepath, name, starts=start, counts=count, - as.vector=TRUE, as.integer=as.integer) + # read sparse matrix + name <- "layers/csc_counts" + ZarrSparseMatrix(store, name) -# # read sparse matrix -# name <- "layers/csc_counts" -# ZarrSparseMatrix(store, name) - -# test_that("read sparse", { -# -# # read sparse matrix -# name <- "layers/csc_counts" -# ZarrSparseMatrix(store, name) -# -# expect_equal(1,1) -# }) \ No newline at end of file + expect_equal(1,1) +}) \ No newline at end of file From 1c04ad6dc03f41af0dc8507389d07b7671eda81f Mon Sep 17 00:00:00 2001 From: Artur-man Date: Fri, 1 May 2026 23:16:09 +0200 
Subject: [PATCH 4/9] add some tests and fix documentation --- DESCRIPTION | 4 +- R/ZarrSparseMatrix-class.R | 14 +- R/ZarrSparseMatrixSeed-class.R | 25 ++-- R/zarr_mread.R | 12 +- R/zarr_utils.R | 131 +++++++++---------- tests/testthat/test-ZarrSparseMatrix-class.R | 44 ++++--- 6 files changed, 111 insertions(+), 119 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 01c1730..1197926 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,8 +28,8 @@ Authors@R: c( email="levi.waldron@sph.cuny.edu", comment=c(ORCID="0000-0003-2725-0694"))) Depends: R (>= 3.4), methods, SparseArray, DelayedArray -Imports: stats, tools, BiocGenerics, S4Vectors, IRanges, S4Arrays, h5mread, - Rarr (>= 1.11.33) +Imports: stats, tools, BiocGenerics, S4Vectors, IRanges, S4Arrays, + Rarr (>= 1.11.33), jsonlite Suggests: paws.storage, HDF5Array, testthat, knitr, rmarkdown, BiocStyle, anndataR VignetteBuilder: knitr diff --git a/R/ZarrSparseMatrix-class.R b/R/ZarrSparseMatrix-class.R index 9761a61..f19223e 100644 --- a/R/ZarrSparseMatrix-class.R +++ b/R/ZarrSparseMatrix-class.R @@ -40,10 +40,10 @@ ZarrSparseMatrix <- function(filepath, group) setMethod("nzcount", "ZarrSparseMatrix", function(x) nzcount(x@seed)) -# setMethod("extractNonzeroDataByCol", "ZarrSparseMatrix", -# function(x, j) extractNonzeroDataByCol(x@seed, j) -# ) -# -# setMethod("extractNonzeroDataByRow", "ZarrSparseMatrix", -# function(x, i) extractNonzeroDataByCol(x@seed, i) -# ) +setMethod("extractNonzeroDataByCol", "ZarrSparseMatrix", + function(x, j) extractNonzeroDataByCol(x@seed, j) +) + +setMethod("extractNonzeroDataByRow", "ZarrSparseMatrix", + function(x, i) extractNonzeroDataByRow(x@seed, i) +) diff --git a/R/ZarrSparseMatrixSeed-class.R b/R/ZarrSparseMatrixSeed-class.R index 47c27ce..de73e06 100644 --- a/R/ZarrSparseMatrixSeed-class.R +++ b/R/ZarrSparseMatrixSeed-class.R @@ -137,7 +137,7 @@ setMethod("chunkdim", "CSR_ZarrSparseMatrixSeed", setMethod("is_sparse", "ZarrSparseMatrixSeed", function(x) TRUE)
setMethod("nzcount", "ZarrSparseMatrixSeed", - function(x) h5length(x@filepath, .get_data_name(x@subdata, x@group)) + function(x) zarrlength(x@filepath, .get_data_name(x@subdata, x@group)) ) @@ -166,10 +166,10 @@ read_zarrsparse_component <- function(filepath, group, name, return(read_zarrsparse_component(filepath, group, "shape")) } ## zarr format - h5attrs <- Rarr::read_zarr_attributes(file.path(filepath, group)) - shape <- h5attrs$shape + zarrattrs <- Rarr::read_zarr_attributes(file.path(filepath, group)) + shape <- zarrattrs$shape if (is.null(shape)) - shape <- h5attrs$h5sparse_shape + shape <- zarrattrs$h5sparse_shape if (is.null(shape)) stop(wmsg("Group \"", group, "\" in Zarr file \"", filepath,"\" ", "contains no 'shape' dataset and has no 'shape' ", @@ -186,18 +186,19 @@ read_zarrsparse_component <- function(filepath, group, name, ## 10x format return("csr") } - ## h5ad format + ## anndata-zarr ? zarrattrs <- Rarr::read_zarr_attributes(file.path(filepath, group)) - h5sparse_layout <- zarrattrs[["encoding-type"]] - if (is.null(h5sparse_layout)) - h5sparse_layout <- zarrattrs[["h5sparse_format"]] - if (is.null(h5sparse_layout)) + zarrsparse_layout <- zarrattrs[["encoding-type"]] + if (is.null(zarrsparse_layout)) + # TODO: is there h5sparse_format like attribute in ... somewhere ? + zarrsparse_layout <- zarrattrs[["h5sparse_format"]] + if (is.null(zarrsparse_layout)) return("csr") - ans <- tolower(substr(h5sparse_layout, 1L, 3L)) + ans <- tolower(substr(zarrsparse_layout, 1L, 3L)) if (!(ans %in% c("csr", "csc"))) stop(wmsg("sparse matrix in group \"", group, "\" in Zarr ", "file \"", filepath,"\" is stored in unsupported ", - "layout \"", h5sparse_layout, "\"")) + "layout \"", zarrsparse_layout, "\"")) ans } @@ -360,9 +361,7 @@ ZarrSparseMatrixSeed <- function(filepath, group, subdata=NULL, } ## Get 'indptr_ranges'. 
- # nzcount <- h5length(filepath, .get_data_name(subdata, group)) nzcount <- zarrdim(filepath, .get_data_name(subdata, group)) - # indices_len <- h5length(filepath, paste0(group, "/indices")) indices_len <- zarrdim(filepath, paste0(group, "/indices")) stopifnot(indices_len == nzcount) indptr <- .read_zarrsparse_indptr(filepath, group) diff --git a/R/zarr_mread.R b/R/zarr_mread.R index e774888..b1ae410 100644 --- a/R/zarr_mread.R +++ b/R/zarr_mread.R @@ -4,16 +4,6 @@ ### ------------------------------------------------------------------------- ### -### The R type returned by zarr_mread() is determined by arguments 'filepath', -### 'name', and 'as.integer'. -get_zarrmread_returned_type <- function(filepath, name, as.integer=FALSE) -{ - name <- normarg_zarr_name(name) - - .Call2("C_get_zarrmread_returned_type", filepath, name, as.integer, - PACKAGE="HDF5Array") -} - ### When both 'starts' and 'counts' are specified, the selection must be ### strictly ascending along each dimension. ### By default the user-supplied selection is checked and reduced (if it @@ -78,6 +68,8 @@ zarr_mread <- function(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE, } # read zarr + # This block mimics .Call2("C_h5mread", ...) from h5mread package + # TODO: are we using all .Call2("C_h5mread") arguments ? 
if(is.null(starts)){ # TODO: is this necessary ndim <- length(zarrdim(filepath, name)) diff --git a/R/zarr_utils.R b/R/zarr_utils.R index 98e693b..0b2e487 100644 --- a/R/zarr_utils.R +++ b/R/zarr_utils.R @@ -6,70 +6,37 @@ ### ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### create_zarr_group() and create_zarr() +### zarrexists() ### -#' create_zarr_group -#' -#' create zarr groups -#' -#' @param store the location of (zarr) store -#' @param name name of the group -#' @param version zarr version -#' @export -create_zarr_group <- function(store, name, version = "v2") { - split.name <- strsplit(name, split = "\\/")[[1]] - if (length(split.name) > 1) { - split.name <- vapply(seq_len(length(split.name)), - function(x) paste(split.name[seq_len(x)], collapse = "/"), - FUN.VALUE = character(1)) - split.name <- rev(tail(split.name, 2)) - if (!dir.exists(file.path(store, split.name[2]))) - create_zarr_group(store = store, name = split.name[2]) - } - dir.create(file.path(store, split.name[1]), showWarnings = FALSE) - switch(version, - v2 = { - write("{\"zarr_format\":2}", file = file.path(store, split.name[1], ".zgroup"))}, - v3 = { - stop("Currently only zarr v2 is supported!") - }, - stop("only zarr v2 is supported. 
Use version = 'v2'") - ) -} - -#' create_zarr -#' -#' create zarr store -#' -#' @param dir the location of zarr store -#' @param prefix prefix of the zarr store -#' @param version zarr version -#' @examples -#' dir.create(td <- tempfile()) -#' zarr_name <- "test" -#' create_zarr(dir = td, prefix = "test") -#' dir.exists(file.path(td, "test.zarr")) -#' @export -create_zarr <- function(store, version = "v2") { - prefix <- basename(store) - dir <- gsub(paste0(prefix, "$"), "", store) - create_zarr_group(store = dir, name = prefix, version = version) +zarrexists <- function(filepath, name) +{ + dir.exists(file.path(filepath, name)) } ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### zarrexists() +### zarrtype() ### -zarrexists <- function(filepath, name) +zarrtype <- function(filepath, name) { - dir.exists(file.path(filepath, name)) - # zarr.array <- pizzarr::zarr_open(store = filepath, mode = "r") - # if(grepl(".zarr$", filepath)){ - # zarr.array$contains_item(name) - # } else { - # return(FALSE) - # } + loc <- file.path(filepath, name) + if(file.exists(file.path(loc, ".zarray"))) + return("array") + + if(file.exists(file.path(loc, ".zgroup"))) + return("group") + + zarrjson <- file.path(loc, "zarr.json") + if(file.exists(zarrjson)){ + zarrmeta <- jsonlite::read_json(zarrjson) + if(zarrmeta[["node_type"]] == "group") { + return("group") + } else { + return("array") + } + } + stop("Zarr node type cannot be determined!") } ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -78,11 +45,7 @@ zarrexists <- function(filepath, name) zarrisgroup <- function(filepath, name) { - # zarr.array <- pizzarr::zarr_open(store = filepath, mode = "r") - # did <- try(zarr.array$get_item(name)) - # ans <- !inherits(did, "try-error") - # ans - file.exists(file.path(filepath, name, ".zgroup")) + zarrtype(filepath, name) == "group" } @@ -92,25 +55,21 @@ zarrisgroup <- function(filepath, name) zarrisdataset <- function(filepath, 
name) { - # zarr.array <- pizzarr::zarr_open(store = filepath, mode = "r") - # did <- try(zarr.array$get_item(name)) - # ans <- !inherits(did, "try-error") - # ans - file.exists(file.path(filepath, name, ".zarray")) + zarrtype(filepath, name) == "array" } ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### zarrdim() and zarrchunkdim() ### -zarrdim <- function(filepath, name) +zarrdim <- function(filepath, name, as.integer = TRUE) { - # zarr.array <- pizzarr::zarr_open(store = filepath, mode = "r") - # zarrmat <- zarr.array$get_item(name) - # zarrmat$get_shape() overview <- zarr_overview(file.path(filepath, name), as_data_frame = TRUE) - overview$dim[[1]] + dim <- overview$dim[[1]] + if (as.integer) + dim <- dim_as_integer(dim, filepath, name) + dim } zarrchunkdim <- function(filepath, name, adjust=FALSE) @@ -126,6 +85,23 @@ zarrchunkdim <- function(filepath, name, adjust=FALSE) chunkdim } +# TODO: is this needed ? +dim_as_integer <- function(dim, filepath, name, what = "Zarr dataset") +{ + if (is.integer(dim)) + return(dim) + if (any(dim > .Machine$integer.max)) { + dim_in1string <- paste0(dim, collapse = " x ") + stop(wmsg("Dimensions of ", what, " are too big: ", dim_in1string), + "\n\n ", wmsg("(This error is about Zarr dataset '", + name, "' ", "from file '", filepath, "'.)"), + "\n\n ", wmsg("Please note that the ZarrArray package only ", + "supports datasets where each dimension is ", + "<= '.Machine$integer.max' (= 2**31 - 1).")) + } + as.integer(dim) +} + ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### normarg_zarr_filepath() and normarg_zarr_name() ### @@ -196,6 +172,19 @@ validate_zarr_dataset_name <- function(path, name, what="'name'") TRUE } +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Manipulate one-dimensional HDF5 datasets +### + +### Length of a one-dimensional HDF5 dataset. 
+### Return the length as a single integer (if < 2^31) or numeric (if >= 2^31). +zarrlength <- function(filepath, name) +{ + len <- zarrdim(filepath, name, as.integer=FALSE) + stopifnot(length(len) == 1L) + len +} + ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### ZarrCreateDataset() ### diff --git a/tests/testthat/test-ZarrSparseMatrix-class.R b/tests/testthat/test-ZarrSparseMatrix-class.R index d478278..a3b41aa 100644 --- a/tests/testthat/test-ZarrSparseMatrix-class.R +++ b/tests/testthat/test-ZarrSparseMatrix-class.R @@ -1,19 +1,31 @@ library(Rarr) -library(anndataR) -library(h5mread) library(ZarrArray) +skip_if_not_installed("anndataR") -# zarr file -zarr_dir <- system.file("extdata", "example_v2.zarr.zip", package = "anndataR") -td <- tempdir(check = TRUE) -unzip(zarr_dir, exdir = td) -store <- file.path(td, "example_v2.zarr") - -test_that("read sparse", { - - # read sparse matrix - name <- "layers/csc_counts" - ZarrSparseMatrix(store, name) - - expect_equal(1,1) -}) \ No newline at end of file +# test on both v2 and v3 +for(v in c("v2", "v3")){ + + # unpack zarr + zarr_dir <- system.file("extdata", + paste0("example_", v, ".zarr.zip"), + package = "anndataR") + td <- tempdir(check = TRUE) + unzip(zarr_dir, exdir = td) + zarr_path <- file.path(td, paste0("example_", v, ".zarr")) + + test_that("read sparse", { + + # read sparse matrix + name <- "layers/csc_counts" + A <- ZarrSparseMatrix(zarr_path, name) + expect_true(is(A, "ZarrSparseMatrix")) + expect_true(is(A, "DelayedArray")) + expect_true(is(seed(A), "ZarrSparseMatrixSeed")) + expect_identical(tools::file_path_as_absolute(path(A)), zarr_path) + expect_identical(dim(A), c(100L, 50L)) + expect_identical(type(A), "double") + expect_identical(chunkdim(A), c(1L, 50L)) + + expect_equal(1,1) + }) +} \ No newline at end of file From cd7685c187c23722e9f15972f849e55c5d62ddce Mon Sep 17 00:00:00 2001 From: Artur-man Date: Sun, 3 May 2026 18:08:46 +0200 Subject: [PATCH 5/9] do a NULL 
check --- R/zarr_mread.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/zarr_mread.R b/R/zarr_mread.R index b1ae410..041aea1 100644 --- a/R/zarr_mread.R +++ b/R/zarr_mread.R @@ -76,6 +76,8 @@ zarr_mread <- function(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE, index <- vector("list", ndim) } else { index <- mapply(function(x,y){ + if(length(x) < 1) + return(numeric(0)) unlist( mapply(function(xx,yy){ seq(xx, xx+yy-1) From 354837db1e9d33569b48aa26e05f10d4b1518684 Mon Sep 17 00:00:00 2001 From: Artur-man Date: Sun, 3 May 2026 21:13:31 +0200 Subject: [PATCH 6/9] add ZarrADMatrix(seed) class and add additonal documentation --- R/ZarrADMatrix-class.R | 53 +++++++ R/ZarrADMatrixSeed-class.R | 134 ++++++++++++++++++ man/ZarrADMatrix-class.Rd | 118 ++++++++++++++++ man/ZarrADMatrixSeed-class.Rd | 133 ++++++++++++++++++ man/ZarrSparseMatrix-class.Rd | 95 +++++++++++++ man/ZarrSparseMatrixSeed-class.Rd | 223 ++++++++++++++++++++++++++++++ 6 files changed, 756 insertions(+) create mode 100644 R/ZarrADMatrix-class.R create mode 100644 R/ZarrADMatrixSeed-class.R create mode 100644 man/ZarrADMatrix-class.Rd create mode 100644 man/ZarrADMatrixSeed-class.Rd create mode 100644 man/ZarrSparseMatrix-class.Rd create mode 100644 man/ZarrSparseMatrixSeed-class.Rd diff --git a/R/ZarrADMatrix-class.R b/R/ZarrADMatrix-class.R new file mode 100644 index 0000000..073053d --- /dev/null +++ b/R/ZarrADMatrix-class.R @@ -0,0 +1,53 @@ +### ========================================================================= +### ZarrADMatrix objects +### ------------------------------------------------------------------------- +### + + +setClass("ZarrADMatrix", + contains="DelayedMatrix", + representation(seed="ZarrADMatrixSeed") +) + + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Constructor +### + +setMethod("DelayedArray", "ZarrADMatrixSeed", + function(seed) new_DelayedArray(seed, Class="ZarrADMatrix") +) + +### Works directly on an 
ZarrADMatrixSeed derivative, in which case it must +### be called with a single argument. +ZarrADMatrix <- function(filepath, layer=NULL) +{ + if (is(filepath, "ZarrADMatrixSeed")) { + if (!is.null(layer)) + stop(wmsg("ZarrADMatrix() must be called with a single argument ", + "when passed an ZarrADMatrixSeed derivative")) + seed <- filepath + } else { + seed <- ZarrADMatrixSeed(filepath, layer=layer) + } + DelayedArray(seed) +} + + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Taking advantage of sparsity +### + +### Will work only if the seed is a ZarrSparseMatrixSeed derivative, that is, +### if it's a CSC_ZarrADMatrixSeed or CSR_ZarrADMatrixSeed object. +setMethod("nzcount", "ZarrADMatrix", function(x) nzcount(x@seed)) + +### Will work only if the seed is a CSC_ZarrADMatrixSeed object. +setMethod("extractNonzeroDataByCol", "ZarrADMatrix", + function(x, j) extractNonzeroDataByCol(x@seed, j) +) + +### Will work only if the seed is a CSR_ZarrADMatrixSeed object.
+setMethod("extractNonzeroDataByRow", "ZarrADMatrix", + function(x, i) extractNonzeroDataByRow(x@seed, i) +) \ No newline at end of file diff --git a/R/ZarrADMatrixSeed-class.R b/R/ZarrADMatrixSeed-class.R new file mode 100644 index 0000000..4dc56e3 --- /dev/null +++ b/R/ZarrADMatrixSeed-class.R @@ -0,0 +1,134 @@ +### ========================================================================= +### ZarrADMatrixSeed objects +### ------------------------------------------------------------------------- + + +setClass("ZarrADMatrixSeed", + contains=c("Array", "OutOfMemoryObject"), + representation("VIRTUAL") +) + +setClass("Dense_ZarrADMatrixSeed", + contains=c("ZarrADMatrixSeed", "ZarrArraySeed"), + representation(dimnames="list"), + prototype(dimnames=list(NULL, NULL)) +) +setClass("CSC_ZarrADMatrixSeed", + contains=c("ZarrADMatrixSeed", "CSC_ZarrSparseMatrixSeed") +) +setClass("CSR_ZarrADMatrixSeed", + contains=c("ZarrADMatrixSeed", "CSR_ZarrSparseMatrixSeed") +) + + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### dimnames() method for Dense_ZarrADMatrixSeed objects +### + +### We overwrite the method for ZarrArraySeed objects with a method that +### accesses the slot, not the store +setMethod("dimnames", "Dense_ZarrADMatrixSeed", + function(x) S4Arrays:::simplify_NULL_dimnames(x@dimnames) +) + + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Transposition +### + +### S3/S4 combo for t.CSC_ZarrADMatrixSeed +t.CSC_ZarrADMatrixSeed <- function(x) +{ + new2("CSR_ZarrADMatrixSeed", callNextMethod()) +} +setMethod("t", "CSC_ZarrADMatrixSeed", t.CSC_ZarrADMatrixSeed) + +### S3/S4 combo for t.CSR_ZarrADMatrixSeed +t.CSR_ZarrADMatrixSeed <- function(x) +{ + new2("CSC_ZarrADMatrixSeed", callNextMethod()) +} +setMethod("t", "CSR_ZarrADMatrixSeed", t.CSR_ZarrADMatrixSeed) + + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Constructor +### + +.load_zarr_ad_rownames <- 
function(filepath, name="var") +{ + ok <- try(zarrisdataset(filepath, name), silent=TRUE) + if (isTRUE(ok)) { + ## NOTE(review): h5read() is rhdf5's HDF5 reader and rhdf5 does not appear in + ## Imports; presumably this branch cannot work on a Zarr store -- TODO confirm. + ans <- h5read(filepath, name)$index + if (!is.null(ans)) + ans <- as.character(ans) + return(ans) + } + ok <- try(zarrisgroup(filepath, name), silent=TRUE) + if (!isTRUE(ok)) + return(NULL) + ROWNAMES_DATASET <- paste0(name, "/_index") + ok <- try(zarrisdataset(filepath, ROWNAMES_DATASET), silent=TRUE) + if (!isTRUE(ok)) + return(NULL) + zarr_mread(filepath, ROWNAMES_DATASET, as.vector=TRUE) +} + +### Must return a list of length 2. +.load_zarr_ad_dimnames <- function(filepath) +{ + ans_rownames <- .load_zarr_ad_rownames(filepath) + ans_colnames <- .load_zarr_ad_rownames(filepath, name="obs") + if (is.null(ans_rownames) && is.null(ans_colnames)) + warning(wmsg("could not find dimnames in this anndata-zarr store")) + list(ans_rownames, ans_colnames) +} + +### Returns a ZarrADMatrixSeed derivative (can be either a Dense_ZarrADMatrixSeed, +### or a CSC_ZarrADMatrixSeed, or a CSR_ZarrADMatrixSeed object).
+ZarrADMatrixSeed <- function(filepath, layer=NULL) +{ + if (!isSingleString(filepath)) + stop(wmsg("'filepath' must be a single string specifying the ", + "path to the anndata-zarr store")) + filepath <- file_path_as_absolute(filepath) + if (is.null(layer)) { + name <- "/X" + } else { + if (!isSingleString(layer) || layer == "") + stop(wmsg("'layer' must be NULL or a single non-empty string")) + name <- paste0("/layers/", layer) + } + if (!zarrexists(filepath, name)) { + msg <- c("Zarr object \"", name, "\" does not exist ", + "in this Zarr store") + if (is.null(layer)) + msg <- c(msg, " Is this a valid anndata-zarr store?") + stop(wmsg(msg)) + } + dimnames <- .load_zarr_ad_dimnames(filepath) + + if (zarrisdataset(filepath, name)) { + ans0 <- HDF5ArraySeed(filepath, name) + if (length(dim(ans0)) != 2L) + stop(wmsg("Zarr dataset \"", name, "\" in store \"", filepath, "\" ", + "does not have exactly 2 dimensions. Please consider ", + "using the HDF5Array() constructor to access this ", + "dataset.")) + ans <- new2("Dense_ZarrADMatrixSeed", ans0, dimnames=dimnames) + } else if (zarrisgroup(filepath, name)) { + ans0 <- ZarrSparseMatrixSeed(filepath, name) + if (is(ans0, "CSC_ZarrSparseMatrixSeed")) + ans_class <- "CSC_ZarrADMatrixSeed" + else + ans_class <- "CSR_ZarrADMatrixSeed" + ans <- new2(ans_class, ans0, dimnames=dimnames) + } else { + stop(wmsg("Zarr object \"", name, "\" in store \"", filepath, "\" ", + "is neither a dataset or a group. 
Is this a valid ", + "anndata-zarr store?")) + } + ans +} \ No newline at end of file diff --git a/man/ZarrADMatrix-class.Rd b/man/ZarrADMatrix-class.Rd new file mode 100644 index 0000000..a9a277a --- /dev/null +++ b/man/ZarrADMatrix-class.Rd @@ -0,0 +1,118 @@ +\name{ZarrADMatrix-class} +\docType{class} + +\alias{class:ZarrADMatrix} +\alias{ZarrADMatrix-class} +\alias{ZarrADMatrix} + +\alias{DelayedArray,ZarrADMatrixSeed-method} + +\alias{nzcount,ZarrADMatrix-method} +\alias{extractNonzeroDataByCol,ZarrADMatrix-method} +\alias{extractNonzeroDataByRow,ZarrADMatrix-method} + +\title{anndata-zarr central matrices (or matrices in the /layers group) + as DelayedMatrix objects} + +\description{ + \code{anndata-zarr} stores are Zarr stores used for on-disk representation + of AnnData Python objects. At the very minimum, they contain a central + data matrix, named \code{X}, of shape #observations x #variables, and + possibly additional data matrices (stored in the Zarr group \code{/layers}) + that share the shape and dimnames of \code{X}. + See \url{https://anndata.readthedocs.io/} for more information. + + The ZarrADMatrix class is a \link[DelayedArray]{DelayedMatrix} subclass + for representing and operating on the central matrix of an \code{anndata-zarr} + store, or any matrix in its \code{/layers} group. + + All the operations available for \link[DelayedArray]{DelayedMatrix} + objects work on ZarrADMatrix objects. +} + +\usage{ +## Constructor function: +ZarrADMatrix(filepath, layer=NULL) +} + +\arguments{ + \item{filepath}{ + The path (as a single string) to the \code{anndata-zarr} store. + } + \item{layer}{ + \code{NULL} (the default) or the name of a matrix in the \code{/layers} + group. By default (i.e. when \code{layer} is not specified) + \code{ZarrADMatrix()} returns the central matrix (\code{X}). + } +} + +\value{ + \code{ZarrADMatrix()} returns an ZarrADMatrix object of shape #variables x + #observations. 
Note that in Python and Zarr the shape of this matrix is + considered to be #observations x #variables, but in R it is transposed. + This follows the widely adopted convention of transposing Zarr matrices + when they get loaded into R. +} + +\references{ + \url{https://anndata.readthedocs.io/} for AnnData Python objects + and the \code{anndata-zarr} format. +} + +\seealso{ + \itemize{ + \item \link{ZarrArray} objects for representing conventional (a.k.a. + dense) Zarr datasets as \link[DelayedArray]{DelayedArray} objects. + + \item \link{H5SparseMatrix} objects for representing Zarr sparse matrices + as \link[DelayedArray]{DelayedMatrix} objects. + + \item \link[DelayedArray]{DelayedMatrix} objects in the \pkg{DelayedArray} + package. + + \item The \link{ZarrADMatrixSeed} helper class. + + \item \code{\link[zellkonverter]{readH5AD}} and + \code{\link[zellkonverter]{writeH5AD}} in + the \pkg{zellkonverter} package for + importing/exporting an \code{h5ad} file as/from a + \link[SingleCellExperiment]{SingleCellExperiment} object. + + \item \link[SparseArray]{SparseArray} objects in the \pkg{SparseArray} + package. + } +} + +\examples{ +library(anndataR) +zarr_zip <- system.file("extdata", + paste0("example_v2.zarr.zip"), + package = "anndataR") +td <- tempdir(check = TRUE) +unzip(zarr_zip, exdir = td) +zarr_store <- file.path(td, paste0("example_v2.zarr")) + +X <- ZarrADMatrix(zarr_store) +X + +class(X) # ZarrADMatrix +is(X, "DelayedMatrix") # TRUE + +class(seed(X)) # Dense_ZarrADMatrixSeed + +dim(X) +path(X) +is_sparse(X) # FALSE + +## Use coercion to load the full dataset into memory: +as.matrix(X) # as ordinary array (usually not recommended) + +\dontrun{ + ## Works only if ZarrADMatrix object is sparse! 
+ as(X, "dgCMatrix") # as dgCMatrix + as(X, "SparseArray") # as SparseArray object (most efficient) + SparseArray(X) # equivalent to 'as(X, "SparseArray")' +} +} +\keyword{classes} +\keyword{methods} diff --git a/man/ZarrADMatrixSeed-class.Rd b/man/ZarrADMatrixSeed-class.Rd new file mode 100644 index 0000000..546c69b --- /dev/null +++ b/man/ZarrADMatrixSeed-class.Rd @@ -0,0 +1,133 @@ +\name{ZarrADMatrixSeed-class} +\docType{class} + +\alias{class:ZarrADMatrixSeed} +\alias{ZarrADMatrixSeed-class} +\alias{ZarrADMatrixSeed} + +\alias{class:Dense_ZarrADMatrixSeed} +\alias{Dense_ZarrADMatrixSeed-class} +\alias{Dense_ZarrADMatrixSeed} + +\alias{class:CSC_ZarrADMatrixSeed} +\alias{CSC_ZarrADMatrixSeed-class} +\alias{CSC_ZarrADMatrixSeed} + +\alias{class:CSR_ZarrADMatrixSeed} +\alias{CSR_ZarrADMatrixSeed-class} +\alias{CSR_ZarrADMatrixSeed} + +\alias{dimnames,Dense_ZarrADMatrixSeed-method} + +\alias{t.CSC_ZarrADMatrixSeed} +\alias{t,CSC_ZarrADMatrixSeed-method} +\alias{t.CSR_ZarrADMatrixSeed} +\alias{t,CSR_ZarrADMatrixSeed-method} + +\title{ZarrADMatrixSeed objects} + +\description{ + ZarrADMatrixSeed is a low-level helper class used to represent a pointer + to the central matrix stored in an \code{anndata-zarr} store, or to one of the + matrices in the \code{/layers} group. + + It is a virtual class with three concrete subclasses: Dense_ZarrADMatrixSeed, + CSC_ZarrADMatrixSeed, and CSR_ZarrADMatrixSeed: + \itemize{ + \item The Dense_ZarrADMatrixSeed class is used when the matrix is stored + as a conventional Zarr dataset in the \code{anndata-zarr} store. It is + a direct extension of the \link{ZarrArraySeed} class. + + \item The CSC_ZarrADMatrixSeed or CSR_ZarrADMatrixSeed class is used + when the matrix is stored in the \emph{Compressed Sparse Column} + or \emph{Compressed Sparse Row} format in the \code{anndata-zarr} + store. 
CSC_ZarrADMatrixSeed is a direct extension of + \link{CSC_ZarrSparseMatrixSeed}, and CSR_ZarrADMatrixSeed a + direct extension of \link{CSR_ZarrSparseMatrixSeed}. + } + + Note that a ZarrADMatrixSeed derivative is not intended to be used directly. + Most end users will typically create and manipulate a higher-level + \link{ZarrADMatrix} object instead. See \code{?\link{ZarrADMatrix}} for + more information. +} + +\usage{ +## Constructor function: +ZarrADMatrixSeed(filepath, layer=NULL) +} + +\arguments{ + \item{filepath, layer}{ + See \code{?\link{ZarrADMatrix}} for a description of these arguments. + } +} + +\details{ + Dense_ZarrADMatrixSeed objects support the same limited set of methods as + \link{ZarrArraySeed} objects, and CSC_ZarrADMatrixSeed and CSR_ZarrADMatrixSeed + objects support the same limited set of methods as \link{ZarrSparseMatrixSeed} + objects. + See \code{?\link{ZarrArraySeed}} and \code{?\link{ZarrSparseMatrixSeed}} + for the details. +} + +\value{ + \code{ZarrADMatrixSeed()} returns a ZarrADMatrixSeed derivative + (Dense_ZarrADMatrixSeed or CSC_ZarrADMatrixSeed or CSR_ZarrADMatrixSeed) + of shape #variables x #observations. +} + +\section{ZarrADMatrixSeed vs ZarrADMatrix objects}{ + In order to have access to the full set of operations that are available + for \link[DelayedArray]{DelayedMatrix} objects, a ZarrADMatrixSeed + derivative first needs to be wrapped in a \link[DelayedArray]{DelayedMatrix} + object, typically by calling the \code{\link[DelayedArray]{DelayedArray}()} + constructor on it. + + This is what the \code{\link{ZarrADMatrix}()} constructor function does. + + Note that the result of this wrapping is a \link{ZarrADMatrix} object, + which is just a ZarrADMatrixSeed derivative wrapped in a + \link[DelayedArray]{DelayedMatrix} object. +} + +\references{ + \url{https://anndata.readthedocs.io/} for AnnData Python objects + and the \code{anndata-zarr} format. +} + +\seealso{ + \itemize{ + \item \link{ZarrADMatrix} objects. 
+ + \item \link{ZarrArraySeed} and \link{ZarrSparseMatrixSeed} objects. + + \item \code{\link[anndataR]{read_zarr}} and + \code{\link[anndataR]{write_zarr}} in + the \pkg{anndataR} package for + importing/exporting an \code{anndata-zarr} store as/from a + \link[SingleCellExperiment]{SingleCellExperiment} object. + } +} + +\examples{ +library(anndataR) +zarr_zip <- system.file("extdata", + paste0("example_v2.zarr.zip"), + package = "anndataR") +td <- tempdir(check = TRUE) +unzip(zarr_zip, exdir = td) +zarr_store <- file.path(td, paste0("example_v2.zarr")) + +seed <- ZarrADMatrixSeed(zarr_store) +seed +path(seed) +dim(seed) +is_sparse(seed) + +DelayedArray(seed) +stopifnot(class(DelayedArray(seed)) == "ZarrADMatrix") +} +\keyword{classes} +\keyword{methods} diff --git a/man/ZarrSparseMatrix-class.Rd b/man/ZarrSparseMatrix-class.Rd new file mode 100644 index 0000000..106ed29 --- /dev/null +++ b/man/ZarrSparseMatrix-class.Rd @@ -0,0 +1,95 @@ +\name{ZarrSparseMatrix-class} +\docType{class} + +\alias{class:ZarrSparseMatrix} +\alias{ZarrSparseMatrix-class} +\alias{ZarrSparseMatrix} + +\alias{DelayedArray,ZarrSparseMatrixSeed-method} + +\alias{nzcount,ZarrSparseMatrix-method} +\alias{extractNonzeroDataByCol,ZarrSparseMatrix-method} +\alias{extractNonzeroDataByRow,ZarrSparseMatrix-method} + +\title{Zarr sparse matrices as DelayedMatrix objects} + +\description{ + The ZarrSparseMatrix class is a \link[DelayedArray]{DelayedMatrix} subclass + for representing and operating on a Zarr sparse matrix stored in + CSR/CSC/Yale format. + + All the operations available for \link[DelayedArray]{DelayedMatrix} + objects work on ZarrSparseMatrix objects. +} + +\usage{ +## Constructor function: +ZarrSparseMatrix(filepath, group) +} + +\arguments{ + \item{filepath}{ + The path (as a single string) to the anndata-zarr store (\code{.zarr}) where + the sparse matrix is located. + } + \item{group}{ + The name of the group in the Zarr store where the sparse matrix is stored. 
+ } +} + +\value{ + An ZarrSparseMatrix object. +} + +\seealso{ + \itemize{ + \item \link{ZarrArray} objects for representing conventional (a.k.a. + dense) Zarr datasets as \link[DelayedArray]{DelayedArray} objects. + + \item \link{ZarrADMatrix} objects for representing anndata-zarr central + matrices (or matrices in the \code{/layers} group) + as \link[DelayedArray]{DelayedMatrix} objects. + + \item \link{TENxMatrix} objects for representing 10x Genomics + datasets as \link[DelayedArray]{DelayedMatrix} objects. + + \item \link[DelayedArray]{DelayedMatrix} objects in the \pkg{DelayedArray} + package. + + \item The \link{ZarrSparseMatrixSeed} helper class. + + \item \link[SparseArray]{SparseArray} objects in the \pkg{SparseArray} + package. + } +} + +\examples{ +library(anndataR) +zarr_zip <- system.file("extdata", + paste0("example_v2.zarr.zip"), + package = "anndataR") +td <- tempdir(check = TRUE) +unzip(zarr_zip, exdir = td) +zarr_store <- file.path(td, paste0("example_v2.zarr")) + +M <- ZarrSparseMatrix(zarr_store, "/obsp/connectivities") +M + +class(M) # ZarrSparseMatrix +is(M, "DelayedMatrix") # TRUE + +seed(M) +class(seed(M)) # CSC_ZarrSparseMatrixSeed + +dim(M) +path(M) +is_sparse(M) # TRUE + +## Use coercion to load the full dataset into memory: +as.matrix(M) # as ordinary array (usually not recommended) +as(M, "dgCMatrix") # as dgCMatrix +as(M, "SparseArray") # as SparseArray object (most efficient) +SparseArray(M) # equivalent to 'as(M, "SparseArray")' +} +\keyword{classes} +\keyword{methods} diff --git a/man/ZarrSparseMatrixSeed-class.Rd b/man/ZarrSparseMatrixSeed-class.Rd new file mode 100644 index 0000000..8387311 --- /dev/null +++ b/man/ZarrSparseMatrixSeed-class.Rd @@ -0,0 +1,223 @@ +\name{ZarrSparseMatrixSeed-class} +\docType{class} + +\alias{class:ZarrSparseMatrixSeed} +\alias{ZarrSparseMatrixSeed-class} +\alias{ZarrSparseMatrixSeed} + +\alias{class:CSC_ZarrSparseMatrixSeed} +\alias{CSC_ZarrSparseMatrixSeed-class} +\alias{CSC_ZarrSparseMatrixSeed} 
+ +\alias{class:CSR_ZarrSparseMatrixSeed} +\alias{CSR_ZarrSparseMatrixSeed-class} +\alias{CSR_ZarrSparseMatrixSeed} + +\alias{t.CSC_ZarrSparseMatrixSeed} +\alias{t,CSC_ZarrSparseMatrixSeed-method} +\alias{t.CSR_ZarrSparseMatrixSeed} +\alias{t,CSR_ZarrSparseMatrixSeed-method} + +\alias{path,ZarrSparseMatrixSeed-method} +\alias{path<-,ZarrSparseMatrixSeed-method} +\alias{dim,ZarrSparseMatrixSeed-method} +\alias{dimnames,ZarrSparseMatrixSeed-method} +\alias{chunkdim,CSC_ZarrSparseMatrixSeed-method} +\alias{chunkdim,CSR_ZarrSparseMatrixSeed-method} +\alias{is_sparse,ZarrSparseMatrixSeed-method} +\alias{nzcount,ZarrSparseMatrixSeed-method} + +\alias{extract_sparse_array,CSC_ZarrSparseMatrixSeed-method} +\alias{extract_sparse_array,CSR_ZarrSparseMatrixSeed-method} +\alias{extract_array,ZarrSparseMatrixSeed-method} + +\alias{show,ZarrSparseMatrixSeed-method} + +% OLD STUFF +\alias{extractNonzeroDataByCol} +\alias{extractNonzeroDataByCol,CSC_ZarrSparseMatrixSeed-method} +\alias{extractNonzeroDataByRow} +\alias{extractNonzeroDataByRow,CSR_ZarrSparseMatrixSeed-method} + +\title{ZarrSparseMatrixSeed objects} + +\description{ + ZarrSparseMatrixSeed is a low-level helper class for representing a + pointer to a sparse matrix stored in a Zarr store and compressed + using the CSC or CSR layout. + + It is a virtual class with two concrete subclasses: CSC_ZarrSparseMatrixSeed + for the \emph{Compressed Sparse Column} layout, and CSR_ZarrSparseMatrixSeed + for the \emph{Compressed Sparse Row} layout. + The former is used by 10x Genomics (e.g. "1.3 Million Brain Cell Dataset"). + \code{anndata-zarr} stores can use one or the other layout to store a + sparse matrix. + + Note that a ZarrSparseMatrixSeed derivative is not intended to be used + directly. Most end users will typically create and manipulate a + higher-level \link{ZarrSparseMatrix} object instead. + See \code{?\link{ZarrSparseMatrix}} for more information. 
+} + +\usage{ +## --- Constructor function --- + +ZarrSparseMatrixSeed(filepath, group, subdata=NULL, + dim=NULL, sparse.layout=NULL) + +## --- Accessors -------------- + +\S4method{path}{ZarrSparseMatrixSeed}(object) + +\S4method{dim}{ZarrSparseMatrixSeed}(x) + +\S4method{dimnames}{ZarrSparseMatrixSeed}(x) + +\S4method{chunkdim}{CSC_ZarrSparseMatrixSeed}(x) +\S4method{chunkdim}{CSR_ZarrSparseMatrixSeed}(x) + +## --- Data extraction -------- + +\S4method{extract_array}{ZarrSparseMatrixSeed}(x, index) + +\S4method{extract_sparse_array}{CSC_ZarrSparseMatrixSeed}(x, index) +\S4method{extract_sparse_array}{CSR_ZarrSparseMatrixSeed}(x, index) + +\S4method{extractNonzeroDataByCol}{CSC_ZarrSparseMatrixSeed}(x, j) +\S4method{extractNonzeroDataByRow}{CSR_ZarrSparseMatrixSeed}(x, i) + +## --- Other methods ---------- + +\S4method{is_sparse}{ZarrSparseMatrixSeed}(x) + +\S4method{nzcount}{ZarrSparseMatrixSeed}(x) +} + +\arguments{ + \item{filepath, group}{ + See \code{?\link{ZarrSparseMatrix}} for a description of these arguments. + } + \item{subdata}{ + Experimental. Don't use! + } + \item{dim, sparse.layout}{ + The \code{ZarrSparseMatrixSeed()} constructor should be able to + automatically detect the dimensions and layout of the sparse matrix + stored in the Zarr store, so the user shouldn't need to specify these + arguments. + + See Details section below for some rare situations where the user might + need to specify them. + } + \item{object, x}{ + A ZarrSparseMatrixSeed derivative. + } + \item{index}{ + See \code{?\link[S4Arrays]{extract_array}} in the \pkg{S4Arrays} + package. + } + \item{j}{ + An integer vector containing valid column indices. + } + \item{i}{ + An integer vector containing valid row indices. + } +} + +\details{ + *** Layout in R vs physical layout *** + + The implementation of CSC_ZarrSparseMatrixSeed and + CSR_ZarrSparseMatrixSeed objects follows the usual convention of + transposing the matrix stored in the Zarr store when loading it into R. 
+ This means that a CSC_ZarrSparseMatrixSeed object represents a + sparse matrix stored physically in the CSR layout (Compressed Sparse Row) + at the Zarr level, and a CSR_ZarrSparseMatrixSeed object represents a + sparse matrix stored physically in the CSC layout (Compressed Sparse Column) + at the Zarr level. + + *** Automatic detection of the dimensions and layout *** + + The \code{ZarrSparseMatrixSeed()} constructor should be able to + automatically detect the dimensions and layout of the sparse matrix + stored in the Zarr store. However, in some rare situations, the user + might want to bypass the detection mechanism, or they might be dealing + with a sparse matrix stored in a Zarr group that doesn't provide this + information (e.g. the group only contains the \code{data}, \code{indices}, + and \code{indptr} components). In which case, they can supply the + \code{dim} and \code{sparse.layout} arguments: + \itemize{ + \item \code{dim} must be an integer vector of length 2. + \item \code{sparse.layout} must be \code{"CSC"} or \code{"CSR"}. + } + Note that both values must describe the dimensions and layout of the + R object that will be returned, that is, \emph{after} transposition + from the physical layout used at the Zarr level. Also be aware that the + supplied values will take precedence over whatever the Zarr store says, + which means that bad things will happen if they don't reflect the actual + dimensions and layout of the sparse matrix. Use these arguments only if + you know what you are doing! + + *** ZarrSparseMatrixSeed object vs ZarrSparseMatrix object *** + + Note that ZarrSparseMatrixSeed derivatives support a very limited set + of methods: + \itemize{ + \item \code{path()}: Returns the path to the Zarr store where the sparse + matrix is located. + \item \code{dim()}, \code{dimnames()}. 
+ \item \code{extract_array()}, \code{is_sparse()}, + \code{extract_sparse_array()}, \code{chunkdim()}: + These generics are defined and documented in other packages e.g. + in \pkg{S4Arrays} for \code{\link[S4Arrays]{extract_array}()} + and \code{\link[S4Arrays]{is_sparse}()}, in \pkg{SparseArray} + for \code{\link[SparseArray]{extract_sparse_array}()}, and in + \pkg{DelayedArray} for \code{\link[DelayedArray]{chunkdim}()}. + \item \code{nzcount()}: Returns the number of nonzero values in the + object. + \item \code{extractNonzeroDataByCol()}: Works on CSC_ZarrSparseMatrixSeed + objects only. Returns a \link[IRanges]{NumericList} or + \link[IRanges]{IntegerList} object \emph{parallel} to \code{j}, + that is, with one list element per column index in \code{j}. + The row indices of the values are not returned. Furthermore, the + values within a given list element can be returned in **any order**. + In particular, do NOT assume that they are ordered by ascending + row index. + \item \code{extractNonzeroDataByRow()}: Works on CSR_ZarrSparseMatrixSeed + objects only. Returns a \link[IRanges]{NumericList} or + \link[IRanges]{IntegerList} object \emph{parallel} to \code{i}, + that is, with one list element per row index in \code{i}. + The column indices of the values are not returned. Furthermore, the + values within a given list element can be returned in **any order**. + In particular, do NOT assume that they are ordered by ascending + column index. + } + In order to have access to the full set of operations that + are available for \link[DelayedArray]{DelayedMatrix} objects, + an ZarrSparseMatrixSeed derivative would first need to be wrapped in + a \link[DelayedArray]{DelayedMatrix} object, typically by calling + the \code{\link[DelayedArray]{DelayedArray}()} constructor on it. +} + +\value{ + \code{ZarrSparseMatrixSeed()} returns an ZarrSparseMatrixSeed derivative + (CSC_ZarrSparseMatrixSeed or CSR_ZarrSparseMatrixSeed object). 
+} + +\references{ + \url{https://en.wikipedia.org/wiki/Sparse_matrix} for a description + of the CSR/CSC/Yale format (section "Compressed sparse row (CSR, CRS + or Yale format)"). +} + +\seealso{ + \itemize{ + \item \link{ZarrSparseMatrix} objects. + } +} + +\examples{ +showClass("ZarrSparseMatrixSeed") +} +\keyword{classes} +\keyword{methods} From efe2fdf6c32503e431104c17a82f62fc075d2383 Mon Sep 17 00:00:00 2001 From: Artur-man Date: Sun, 3 May 2026 21:33:59 +0200 Subject: [PATCH 7/9] additional documentation and exports --- DESCRIPTION | 4 +++- NAMESPACE | 44 +++++++++++++++++++++++++++++++---- R/ZarrADMatrix-class.R | 1 - R/zarr_utils.R | 16 ++++++------- man/ZarrADMatrix-class.Rd | 10 ++++---- man/ZarrSparseMatrix-class.Rd | 3 --- 6 files changed, 56 insertions(+), 22 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1197926..85b7ef7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,7 +42,9 @@ Collate: ZarrArray-class.R ZarrSparseMatrixSeed-class.R ZarrSparseMatrix-class.R + ZarrADMatrix-class.R + ZarrADMatrixSeed-class.R writeZarrArray-auto-args.R writeZarrArray.R zzz.R -RoxygenNote: 7.3.3 +Config/roxygen2/version: 8.0.0 diff --git a/NAMESPACE b/NAMESPACE index aaa6a95..799af3b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,7 +20,10 @@ exportClasses( "ZarrArray", "ZarrMatrix", "ZarrRealizationSink", "ZarrSparseMatrixSeed", "CSC_ZarrSparseMatrixSeed", "CSR_ZarrSparseMatrixSeed", - "ZarrSparseMatrix" + "ZarrSparseMatrix", + "ZarrADMatrixSeed", + "Dense_ZarrADMatrixSeed", "CSC_ZarrADMatrixSeed", "CSR_ZarrADMatrixSeed", + "ZarrADMatrix" ) ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -29,7 +32,19 @@ exportClasses( S3method(t, CSC_ZarrSparseMatrixSeed) S3method(t, CSR_ZarrSparseMatrixSeed) +S3method(t, CSC_ZarrADMatrixSeed) +S3method(t, CSR_ZarrADMatrixSeed) +### We also export them thru the export() directive so that (a) they can be +### called directly, (b) tab-completion on the name of the generic shows them, +### and 
(c) methods() doesn't asterisk them. + +export( + t.CSC_ZarrSparseMatrixSeed, + t.CSR_ZarrSparseMatrixSeed, + t.CSC_ZarrADMatrixSeed, + t.CSR_ZarrADMatrixSeed +) ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### Export S4 methods for generics not defined in ZarrArray @@ -77,12 +92,33 @@ export( get_writeZarrArray_chunk_maxlen, set_writeZarrArray_chunk_maxlen, get_writeZarrArray_chunk_shape, set_writeZarrArray_chunk_shape, get_writeZarrArray_auto_chunkdim, + + ## writeZarrArray.R: + ZarrRealizationSink, writeZarrArray, - ## + ## ZarrSparseMatrixSeed-class.R ZarrSparseMatrixSeed, + + ## ZarrSparseMatrix-class.R ZarrSparseMatrix, + + ## ZarrADMatrixSeed-class.R + ZarrADMatrixSeed, + + ## ZarrADMatrix-class.R + ZarrADMatrix +) - ## writeZarrArray.R: - ZarrRealizationSink, writeZarrArray +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Export S4 generics defined in HDF5Array, and corresponding methods +### + +export( + ## ZarrSparseMatrixSeed-class.R: + extractNonzeroDataByCol, extractNonzeroDataByRow ) +### Exactly the same list as above. 
+exportMethods( + extractNonzeroDataByCol, extractNonzeroDataByRow +) diff --git a/R/ZarrADMatrix-class.R b/R/ZarrADMatrix-class.R index 073053d..5556a8b 100644 --- a/R/ZarrADMatrix-class.R +++ b/R/ZarrADMatrix-class.R @@ -3,7 +3,6 @@ ### ------------------------------------------------------------------------- ### - setClass("ZarrADMatrix", contains="DelayedMatrix", representation(seed="ZarrADMatrixSeed") diff --git a/R/zarr_utils.R b/R/zarr_utils.R index 0b2e487..e6ad859 100644 --- a/R/zarr_utils.R +++ b/R/zarr_utils.R @@ -189,14 +189,14 @@ zarrlength <- function(filepath, name) ### ZarrCreateDataset() ### -compute_max_string_size <- function(x) -{ - if (type(x) != "character") - return(NULL) - if (length(x) == 0L) - return(0L) - max(nchar(x, type="bytes", keepNA=FALSE)) -} +# compute_max_string_size <- function(x) +# { +# if (type(x) != "character") +# return(NULL) +# if (length(x) == 0L) +# return(0L) +# max(nchar(x, type="bytes", keepNA=FALSE)) +# } ZarrCreateDataset <- function(filepath, name, diff --git a/man/ZarrADMatrix-class.Rd b/man/ZarrADMatrix-class.Rd index a9a277a..48ce07c 100644 --- a/man/ZarrADMatrix-class.Rd +++ b/man/ZarrADMatrix-class.Rd @@ -64,7 +64,7 @@ ZarrADMatrix(filepath, layer=NULL) \item \link{ZarrArray} objects for representing conventional (a.k.a. dense) Zarr datasets as \link[DelayedArray]{DelayedArray} objects. - \item \link{H5SparseMatrix} objects for representing Zarr sparse matrices + \item \link{ZarrSparseMatrix} objects for representing Zarr sparse matrices as \link[DelayedArray]{DelayedMatrix} objects. \item \link[DelayedArray]{DelayedMatrix} objects in the \pkg{DelayedArray} @@ -72,10 +72,10 @@ ZarrADMatrix(filepath, layer=NULL) \item The \link{ZarrADMatrixSeed} helper class. 
- \item \code{\link[zellkonverter]{readH5AD}} and - \code{\link[zellkonverter]{writeH5AD}} in - the \pkg{zellkonverter} package for - importing/exporting an \code{h5ad} file as/from a + \item \code{\link[anndataR]{read_zarr}} and + \code{\link[anndataR]{write_zarr}} in + the \pkg{anndataR} package for + importing/exporting an \code{anndata-zarr} store as/from a \link[SingleCellExperiment]{SingleCellExperiment} object. \item \link[SparseArray]{SparseArray} objects in the \pkg{SparseArray} diff --git a/man/ZarrSparseMatrix-class.Rd b/man/ZarrSparseMatrix-class.Rd index 106ed29..081586e 100644 --- a/man/ZarrSparseMatrix-class.Rd +++ b/man/ZarrSparseMatrix-class.Rd @@ -50,9 +50,6 @@ ZarrSparseMatrix(filepath, group) matrices (or matrices in the \code{/layers} group) as \link[DelayedArray]{DelayedMatrix} objects. - \item \link{TENxMatrix} objects for representing 10x Genomics - datasets as \link[DelayedArray]{DelayedMatrix} objects. - \item \link[DelayedArray]{DelayedMatrix} objects in the \pkg{DelayedArray} package. 
From 7e7fadb874a6101ef16c2d5ccbf2cbef1f98dc4d Mon Sep 17 00:00:00 2001 From: Artur-man Date: Sun, 3 May 2026 21:45:12 +0200 Subject: [PATCH 8/9] add additional tests, correct dimnames calls --- R/ZarrADMatrixSeed-class.R | 2 +- tests/testthat/test-ZarrADMatrix-class.R | 30 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/test-ZarrADMatrix-class.R diff --git a/R/ZarrADMatrixSeed-class.R b/R/ZarrADMatrixSeed-class.R index 4dc56e3..229ec5a 100644 --- a/R/ZarrADMatrixSeed-class.R +++ b/R/ZarrADMatrixSeed-class.R @@ -70,7 +70,7 @@ setMethod("t", "CSR_ZarrADMatrixSeed", t.CSR_ZarrADMatrixSeed) if (!isTRUE(ok)) return(NULL) ROWNAMES_DATASET <- paste0(name, "/_index") - ok <- try(zarrhisdataset(filepath, ROWNAMES_DATASET), silent=TRUE) + ok <- try(zarrisdataset(filepath, ROWNAMES_DATASET), silent=TRUE) if (!isTRUE(ok)) return(NULL) zarr_mread(filepath, ROWNAMES_DATASET, as.vector=TRUE) diff --git a/tests/testthat/test-ZarrADMatrix-class.R b/tests/testthat/test-ZarrADMatrix-class.R new file mode 100644 index 0000000..79c78f5 --- /dev/null +++ b/tests/testthat/test-ZarrADMatrix-class.R @@ -0,0 +1,30 @@
+library(Rarr)
+library(ZarrArray)
+skip_if_not_installed("anndataR")
+
+# Run the same checks against the v2 and v3 example stores shipped with anndataR
+for(v in c("v2", "v3")){
+
+  # Unzip the example anndata-zarr store for this version into the session temp dir
+  zarr_dir <- system.file("extdata",
+                          paste0("example_", v, ".zarr.zip"),
+                          package = "anndataR")
+  td <- tempdir(check = TRUE)
+  unzip(zarr_dir, exdir = td)
+  zarr_path <- file.path(td, paste0("example_", v, ".zarr"))
+
+  test_that("read sparse", {
+
+    # Open the store with the default layer=NULL and check class, path, shape, type and chunking
+    A <- ZarrADMatrix(zarr_path)
+    expect_true(is(A, "ZarrADMatrix"))
+    expect_true(is(A, "DelayedArray"))
+    expect_true(is(seed(A), "ZarrADMatrixSeed"))
+    expect_identical(tools::file_path_as_absolute(path(A)), zarr_path)  # NOTE(review): zarr_path itself is not normalised -- confirm this holds when tempdir() is a symlinked path
+    expect_identical(dim(A), c(100L, 50L))
+    expect_identical(type(A), "double")
+    expect_identical(chunkdim(A), c(100L, 1L))
+
+    expect_equal(1,1)  # NOTE(review): placeholder assertion, always passes
+  })
+}
\ No newline at end of file From 
88bd164c8f8d254fc2b32ecffb635f29ab502d8e Mon Sep 17 00:00:00 2001 From: Artur-man Date: Mon, 4 May 2026 09:26:07 +0200 Subject: [PATCH 9/9] remove as.sparse and as.vector from zarr_mread --- R/ZarrADMatrixSeed-class.R | 2 +- R/ZarrSparseMatrixSeed-class.R | 6 +++--- R/zarr_mread.R | 25 ++++++++----------------- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/R/ZarrADMatrixSeed-class.R b/R/ZarrADMatrixSeed-class.R index 229ec5a..c623a3e 100644 --- a/R/ZarrADMatrixSeed-class.R +++ b/R/ZarrADMatrixSeed-class.R @@ -73,7 +73,7 @@ setMethod("t", "CSR_ZarrADMatrixSeed", t.CSR_ZarrADMatrixSeed) ok <- try(zarrisdataset(filepath, ROWNAMES_DATASET), silent=TRUE) if (!isTRUE(ok)) return(NULL) - zarr_mread(filepath, ROWNAMES_DATASET, as.vector=TRUE) + zarr_mread(filepath, ROWNAMES_DATASET) } ### Must return a list of length 2. diff --git a/R/ZarrSparseMatrixSeed-class.R b/R/ZarrSparseMatrixSeed-class.R index de73e06..a5c9bba 100644 --- a/R/ZarrSparseMatrixSeed-class.R +++ b/R/ZarrSparseMatrixSeed-class.R @@ -147,15 +147,15 @@ setMethod("nzcount", "ZarrSparseMatrixSeed", ### All the zarrsparse components are monodimensional. read_zarrsparse_component <- function(filepath, group, name, - start=NULL, count=NULL, as.integer=FALSE) + start=NULL, count=NULL, + as.integer=FALSE) { name <- paste0(group, "/", name) if (!is.null(start)) start <- list(start) if (!is.null(count)) count <- list(count) - zarr_mread(filepath, name, starts=start, counts=count, - as.vector=TRUE, as.integer=as.integer) + zarr_mread(filepath, name, starts=start, counts=count, as.integer=as.integer) } ### Returns a numeric vector (integer or double). diff --git a/R/zarr_mread.R b/R/zarr_mread.R index 041aea1..9f73ce4 100644 --- a/R/zarr_mread.R +++ b/R/zarr_mread.R @@ -8,16 +8,13 @@ ### strictly ascending along each dimension. ### By default the user-supplied selection is checked and reduced (if it ### can be). -### Set 'noreduce' to TRUE to skip the reduction step. 
### Set 'as.integer' to TRUE to force returning the result as an integer array. -zarr_mread <- function(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE, - as.vector=NA, as.integer=FALSE, as.sparse=FALSE) +zarr_mread <- function(filepath, name, starts=NULL, counts=NULL, + as.integer=FALSE) { # check name # name <- normarg_zarr_name(name) - if (!isTRUEorFALSE(as.sparse)) - stop(wmsg("'as.sparse' must be TRUE or FALSE")) if (is.null(starts)) { if (!is.null(counts)) stop(wmsg("'counts' must be NULL when 'starts' is NULL")) @@ -42,9 +39,9 @@ zarr_mread <- function(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE, logical(1)) order_starts <- !all(ok) if (order_starts) { - if (length(ok) != 1L && isTRUE(as.vector)) - stop(wmsg("when using 'as.vector=TRUE' on a ", - "multidimensional dataset, list elements ", + # if (length(ok) != 1L && isTRUE(as.vector)) + if (length(ok) != 1L) + stop(wmsg("when using multidimensional dataset, list elements ", "in 'starts' must be strictly sorted")) starts <- lapply(seq_along(starts0), function(i) { @@ -53,10 +50,6 @@ zarr_mread <- function(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE, return(start0) start0 <- sort(start0) start <- unique(start0) - if (as.sparse && length(start) != length(start0)) - stop(wmsg("when using 'as.sparse=TRUE', list ", - "elements in 'starts' are not allowed ", - "to contain duplicates")) start }) } else { @@ -85,10 +78,10 @@ zarr_mread <- function(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE, ) }, starts, counts, SIMPLIFY = FALSE) } + if(as.integer) + index <- lapply(index, as.integer) ans <- read_zarr_array(file.path(filepath, name), index = index) - if (as.sparse) - ans <- COO_SparseArray(ans[[1L]], ans[[2L]], ans[[3L]], check=FALSE) if (is.null(starts) || !order_starts) return(ans) index <- lapply(seq_along(starts0), @@ -97,9 +90,7 @@ zarr_mread <- function(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE, return(NULL) match(starts0[[i]], starts[[i]]) }) - if 
(as.sparse) { - extract_sparse_array(ans, index) - } else if (is.array(ans)) { + if (is.array(ans)) { extract_array(ans, index) } else if (length(index) == 1L) { ans[index[[1L]]]