Skip to content

Commit d1eff50

Browse files
committed
Merge branch 'dev'
2 parents b82337b + 9382ce8 commit d1eff50

53 files changed

Lines changed: 57402 additions & 299 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/R-CMD-check.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@ on:
55
branches:
66
- main
77
- master
8+
- dev
89
pull_request:
910
branches:
1011
- main
1112
- master
13+
- dev
1214

1315
name: R-CMD-check
1416

CRAN-RELEASE

Lines changed: 0 additions & 2 deletions
This file was deleted.

DESCRIPTION

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
Package: RcppCWB
22
Type: Package
33
Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB')
4-
Version: 0.3.2
5-
Date: 2021-02-03
4+
Version: 0.4.0
5+
Date: 2021-06-25
66
Author: Andreas Blaette [aut, cre],
77
Bernard Desgraupes [aut],
88
Sylvain Loiseau [aut],
@@ -52,7 +52,8 @@ LinkingTo: Rcpp
5252
Biarch: true
5353
URL: https://github.com/PolMine/RcppCWB
5454
BugReports: https://github.com/PolMine/RcppCWB/issues
55-
RoxygenNote: 6.1.1
55+
RoxygenNote: 7.1.1
56+
Roxygen: list(markdown = TRUE)
5657
Collate:
5758
'RcppCWB_package.R'
5859
'cl.R'
@@ -66,3 +67,4 @@ Collate:
6667
'region_matrix.R'
6768
'misc.R'
6869
'zzz.R'
70+
'xml.R'

NAMESPACE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ export(check_s_attribute)
1212
export(check_strucs)
1313
export(cl_charset_name)
1414
export(cl_delete_corpus)
15+
export(cl_struc_values)
16+
export(corpus_data_dir)
1517
export(cqp_dump_subcorpus)
1618
export(cqp_get_registry)
1719
export(cqp_initialize)
@@ -22,6 +24,7 @@ export(cqp_query)
2224
export(cqp_reset_registry)
2325
export(cqp_subcorpus_size)
2426
export(cwb_compress_rdx)
27+
export(cwb_encode)
2528
export(cwb_huffcode)
2629
export(cwb_makeall)
2730
export(get_cbow_matrix)
@@ -31,6 +34,8 @@ export(get_region_matrix)
3134
export(get_tmp_registry)
3235
export(region_matrix_to_count_matrix)
3336
export(region_matrix_to_ids)
37+
export(s_attr_is_descendent)
38+
export(s_attr_is_sibling)
3439
export(s_attribute_decode)
3540
export(use_tmp_registry)
3641
exportPattern("^[[:alpha:]]+")

NEWS.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,25 @@
1+
# RcppCWB 0.4.0
2+
3+
## New Features
4+
5+
* Encode XML (vrt file format) with the new function `cwb_encode()`, which exposes the functionality of the `cwb-encode` CWB utility.
6+
* Functions `cl_cpos2lbound()` and `cl_cpos2rbound()` will now accept an integer vector with length > 1 as argument `cpos` and return a vector with the same length. Useful to speed up iterated queries for left and right boundaries of regions (#19).
7+
* A new function `cl_struc_values()` exposes the corresponding C function of the Corpus Library (CL). The previous implicit assumption that all structural attributes have values can thus be tested. Intended to work with annotations of sentences and paragraphs, i.e. common structural attributes that usually do not have values.
8+
* A new function `corpus_data_dir()` will derive the data directory from the internal C representation of a corpus.
9+
* New function `s_attr_regions()` will derive regions defined by a structural attribute from the *.rng file. Fastest option for large corpora.
10+
* New functions `s_attr_is_sibling()` and `s_attr_is_descendent()` test the sibling/descendent relationship of structural attributes.
11+
12+
13+
## Minor Improvements
14+
15+
* Function `check_corpus()` now includes checks whether the registry provided (argument `registry`) is identical with the registry defined internally by CQP. The registry is reset if directories are not identical.
16+
* Minor adjustments of configure script for aarch64, adding -fPIC to CFLAGS so that this flag will be used when Linux default configuration is used as fallback.
17+
* The implementation of the `s_attribute_decode()` method was incomplete for method "Rcpp". This alternative to the "pure R" approach is now implemented (#2).
18+
* The unused file 'setpaths.R' has been removed from the tools directory (#10).
19+
* The argument `method` previously setting "wininet" in ./tools/winlibs.R is omitted to avoid the warning "the 'wininet' method is deprecated for http:// and https:// URLs" on Windows.
20+
* The configure script will print the libdirs derived using pcre-config and link against libintl on macOS by default.
21+
22+
123
# RcppCWB 0.3.2
224

325
* If RcppCWB is compiled on macOS, the package configure script checks the architecture of the machine and ensures that (if glib-2.0 is not yet present) a version of glib-2.0 compiled for Apple Silicon/the M1 chip is loaded in case an arm64 architecture is detected.

R/RcppExports.R

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,14 @@
109109
.Call(`_RcppCWB_check_corpus`, corpus)
110110
}
111111

112+
# Thin R-level wrapper: passes `corpus`, `s_attribute` and `registry` unchanged
# via .Call() to the compiled routine `_RcppCWB__cl_struc_values` and returns
# whatever that routine returns.
# NOTE(review): this file (R/RcppExports.R) looks auto-generated by Rcpp -
# confirm before editing by hand.
.cl_struc_values <- function(corpus, s_attribute, registry) {
113+
.Call(`_RcppCWB__cl_struc_values`, corpus, s_attribute, registry)
114+
}
115+
116+
# Thin R-level wrapper: passes `corpus` and `registry` unchanged via .Call()
# to the compiled routine `_RcppCWB__corpus_data_dir` and returns its result.
# NOTE(review): this file (R/RcppExports.R) looks auto-generated by Rcpp -
# confirm before editing by hand.
.corpus_data_dir <- function(corpus, registry) {
117+
.Call(`_RcppCWB__corpus_data_dir`, corpus, registry)
118+
}
119+
112120
.decode_s_attribute <- function(corpus, s_attribute, registry) {
113121
.Call(`_RcppCWB_decode_s_attribute`, corpus, s_attribute, registry)
114122
}
@@ -149,3 +157,7 @@
149157
.Call(`_RcppCWB_cwb_compress_rdx`, x, registry_dir, p_attribute)
150158
}
151159

160+
# Thin R-level wrapper: passes the registry file path, the data directory, the
# vrt directory and the p-/s-attribute declarations unchanged via .Call() to
# the compiled routine `_RcppCWB_cwb_encode` and returns its result.
# NOTE(review): this file (R/RcppExports.R) looks auto-generated by Rcpp -
# confirm before editing by hand.
.cwb_encode <- function(regfile, data_dir, vrt_dir, p_attributes, s_attributes_anno, s_attributes_noanno) {
161+
.Call(`_RcppCWB_cwb_encode`, regfile, data_dir, vrt_dir, p_attributes, s_attributes_anno, s_attributes_noanno)
162+
}
163+

R/checks.R

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,26 @@ check_registry <- function(registry){
3131
#' @rdname checks
3232
#' @export check_corpus
3333
# Validate that `corpus` is a length-one character vector naming a corpus that
# is available, and make sure CQP works with the registry directory passed as
# `registry`. CQP is initialized if necessary; if CQP uses a different registry
# directory, a warning is issued and the registry is reset. Stops with an error
# if a check fails, returns TRUE otherwise.
check_corpus <- function(corpus, registry){

  if (length(corpus) != 1L)
    stop("corpus needs to be a vector of length 1")

  if (!is.character(corpus))
    stop("corpus needs to be a character vector")

  # mustWork = FALSE: a missing directory is reported by the explicit check
  # below, not by an additional normalizePath() warning.
  registry <- normalizePath(path.expand(registry), mustWork = FALSE)
  if (isFALSE(dir.exists(registry)))
    stop(sprintf("Registry directory '%s' does not exist.", registry))

  if (isFALSE(cqp_is_initialized())) cqp_initialize(registry = registry)

  # Compare directories rather than raw strings: the same directory spelled
  # differently (trailing slash, '~', symlink) must not trigger a reset.
  # identical() also copes with a zero-length or NA value, which would make a
  # plain `!=` inside if() fail.
  cqp_registry <- cqp_get_registry()
  if (!identical(normalizePath(path.expand(cqp_registry), mustWork = FALSE), registry)){
    warning(sprintf("Resetting registry directory from '%s' to '%s'", cqp_registry, registry))
    cqp_reset_registry(registry = registry)
  }

  if (.check_corpus(toupper(corpus)) == 0L)
    stop(sprintf("corpus %s is not available (check whether there is a typo)", sQuote(corpus)))

  TRUE
}
4456

R/cl.R

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ cl_lexicon_size <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_R
5959
#'
6060
#' @param corpus name of a CWB corpus (upper case)
6161
#' @param s_attribute name of structural attribute (character vector)
62-
#' @param cpos corpus positions (integer vector)
62+
#' @param cpos An \code{integer} vector with corpus positions.
6363
#' @param struc a struc identifying a region
6464
#' @param registry path to the registry directory, defaults to the value of the
6565
#' environment variable CORPUS_REGISTRY
@@ -303,3 +303,40 @@ cl_charset_name <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
303303
.cl_charset_name(corpus = corpus, registry = registry)
304304
}
305305

306+
#' Check whether structural attribute has values
#'
#' Structural attributes do not necessarily have values: structural attributes
#' (such as annotations of sentences or paragraphs) may just define regions of
#' corpus positions. Use this function to test whether an attribute has values.
#'
#' @param corpus Corpus ID, a length-one `character` vector.
#' @param s_attribute Structural attribute to check, a length-one `character`
#'   vector.
#' @param registry The registry directory of the corpus.
#' @return A length-one `logical` vector: `TRUE` if the attribute has values
#'   and `FALSE` if not. `NA` if the structural attribute is not available.
#' @export cl_struc_values
#' @examples
#' cl_struc_values("REUTERS", "places") # TRUE - attribute has values
#' cl_struc_values("REUTERS", "date") # NA - attribute does not exist
cl_struc_values <- function(corpus, s_attribute, registry = Sys.getenv("CORPUS_REGISTRY")){
  check_corpus(corpus = corpus, registry = registry)
  registry <- normalizePath(path.expand(registry))
  i <- .cl_struc_values(corpus = corpus, s_attribute = s_attribute, registry = registry)
  # Always return a logical: a negative code signals an unavailable attribute
  # (logical NA, not integer NA), 0 means "no values", any positive code means
  # "has values" (previously codes > 1 fell through and returned NULL).
  if (is.na(i) || i < 0L) NA else i > 0L
}
327+
328+
#' Get data directory of a corpus
#'
#' Look up the data directory of a corpus in the internal C representation of
#' the content of its registry file.
#'
#' @param corpus A length-one `character` vector with the corpus ID.
#' @param registry A length-one `character` vector with the registry directory.
#' @return A length-one `character` vector stating the data directory.
#' @export corpus_data_dir
#' @examples
#' corpus_data_dir("REUTERS")
corpus_data_dir <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY")){
  check_corpus(corpus = corpus, registry = registry)
  # The expanded, normalized registry path is handed on to the C-level lookup.
  .corpus_data_dir(
    corpus = corpus,
    registry = normalizePath(path.expand(registry))
  )
}

R/cwb.R

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#' wrappers will always perform a specific indexing/compression step on one
66
#' positional attribute, and produce all components.
77
#'
8-
#' @rdname cwb_utils
98
#' @param corpus name of a CWB corpus (upper case)
109
#' @param p_attribute name p-attribute
1110
#' @param registry path to the registry directory, defaults to the value of the
@@ -79,3 +78,69 @@ cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGI
7978
cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY")){
8079
.cwb_compress_rdx(x = corpus, p_attribute = p_attribute, registry_dir = registry)
8180
}
81+
82+
#' @param p_attributes Positional attributes (p-attributes) to be declared.
#' @param data_dir The data directory where `cwb_encode` will put the binary
#'   files of the indexed corpus.
#' @param vrt_dir Directory with input corpus files (verticalised format / file
#'   ending `*.vrt`).
#' @param s_attributes A `list` of named `character` vectors to declare
#'   structural attributes that shall be encoded. The names of the list are the
#'   XML elements present in the corpus. Character vectors making up the list
#'   declare the attributes that include the metadata of regions. To declare a
#'   structural attribute without annotations, provide a zero-length character
#'   vector using `character()` - see examples.
#' @rdname cwb_utils
#' @export cwb_encode
#' @examples
#' \dontrun{
#' data_dir <- file.path(tempdir(), "tmp_data_dir")
#' dir.create(data_dir)
#'
#' cwb_encode(
#'   corpus = "BTMIN",
#'   registry = Sys.getenv("CORPUS_REGISTRY"),
#'   vrt_dir = system.file(package = "RcppCWB", "extdata", "vrt"),
#'   data_dir = data_dir,
#'   p_attributes = c("word", "pos", "lemma"),
#'   s_attributes = list(
#'     plenary_protocol = c(
#'       "lp", "protocol_no", "date", "year", "birthday", "version",
#'       "url", "filetype"
#'     ),
#'     speaker = c(
#'       "id", "type", "lp", "protocol_no", "date", "year", "ai_no", "ai_id",
#'       "ai_type", "who", "name", "parliamentary_group", "party", "role"
#'     ),
#'     p = character()
#'   )
#' )
#'
#' # data_dir is a directory, so it needs recursive = TRUE to be removed; the
#' # registry file is written using the lower-case corpus ID.
#' unlink(data_dir, recursive = TRUE)
#' unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin"))
#' }
cwb_encode <- function(corpus, registry = Sys.getenv("CORPUS_REGISTRY"), data_dir, vrt_dir, p_attributes = c("word", "pos", "lemma"), s_attributes){

  s_attributes <- as.list(s_attributes)
  s_attr_names <- names(s_attributes)

  # Without names the XML elements cannot be identified; fail loudly instead
  # of silently encoding no structural attributes at all.
  if (length(s_attributes) > 0L &&
      (is.null(s_attr_names) || any(is.na(s_attr_names) | !nzchar(s_attr_names))))
    stop("Argument 's_attributes' needs to be a named list (names are the XML elements).")

  # Elements declared with a zero-length vector only define regions.
  has_anno <- vapply(s_attributes, function(x) length(x) > 0L, logical(1L))

  # as.character() / vapply() guarantee character(0) (not NULL or list()) for
  # empty input, which is what the compiled code expects.
  s_attributes_noanno <- as.character(s_attr_names[!has_anno])

  # Declaration format: <element>:0+<attr_1>+<attr_2>+...
  annotated <- s_attributes[has_anno]
  s_attributes_anno <- vapply(
    seq_along(annotated),
    function(i) paste0(names(annotated)[[i]], ":", 0L, "+", paste(annotated[[i]], collapse = "+")),
    character(1L)
  )

  .cwb_encode(
    regfile = file.path(registry, tolower(corpus)),
    data_dir = data_dir,
    vrt_dir = vrt_dir,
    p_attributes = p_attributes,
    s_attributes_anno = s_attributes_anno,
    s_attributes_noanno = s_attributes_noanno
  )
}

R/decode.R

Lines changed: 72 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,20 @@
77
#' the directory specified by \code{data_dir}. An implementation using Rcpp will use the
88
#' registry file for \code{corpus} to find the data directory.
99
#'
10-
#' @param corpus a CWB corpus
11-
#' @param s_attribute a structural attribute
12-
#' @param data_dir data directory where binary files for corpus are stored
13-
#' @param encoding encoding of the values ("latin-1" or "utf-8")
14-
#' @param registry registry directory
15-
#' @param method character vector, whether to use "R" or "Rcpp" implementation
16-
#' @return A \code{data.frame} with three columns. Column \code{cpos_left} are the start
17-
#' corpus positions of a structural annotation, \code{cpos_right} the end corpus positions.
18-
#' Column \code{value} is the value of the annotation.
10+
#' @param corpus A CWB corpus (ID in upper case).
11+
#' @param s_attribute A structural attribute (length 1 `character` vector).
12+
#' @param data_dir The data directory where the binary files of the corpus are
13+
#' stored.
14+
#' @param encoding Encoding of the values ("latin-1" or "utf-8")
15+
#' @param registry The CWB registry directory.
16+
#' @param method A length-one `character` vector, whether to use "R" or "Rcpp"
17+
#' implementation for decoding structural attribute.
18+
#' @return A \code{data.frame} with three columns. Column \code{cpos_left} are
19+
#' the start corpus positions of a structural annotation, \code{cpos_right}
20+
#' the end corpus positions. Column \code{value} is the value of the
21+
#' annotation.
1922
#' @export s_attribute_decode
2023
#' @rdname s_attribute_decode
21-
#' @return a character vector
2224
#' @examples
2325
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
2426
#' Sys.setenv(CORPUS_REGISTRY = registry)
@@ -27,8 +29,21 @@
2729
#' b <- s_attribute_decode(
2830
#' data_dir = system.file(package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "reuters"),
2931
#' s_attribute = "places", method = "R"
30-
#' )
32+
#' )
33+
#'
34+
#' # Using Rcpp wrappers for CWB C code
35+
#' b <- s_attribute_decode(
36+
#' corpus = "REUTERS",
37+
#' data_dir = system.file(package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "reuters"),
38+
#' s_attribute = "places",
39+
#' method = "Rcpp"
40+
#' )
3141
s_attribute_decode <- function(corpus, data_dir, s_attribute, encoding = NULL, registry = Sys.getenv("CORPUS_REGISTRY"), method = c("R", "Rcpp")){
42+
43+
if (!is.character(method)) stop("Argument 'method' needs to be a character vector.")
44+
if (length(method) != 1L) stop("Argument 'method' needs to be a length 1 vector.")
45+
if (!method %in% c("Rcpp", "R")) stop("Argument 'method' needs to be either 'R' or 'Rcpp'.")
46+
3247
if (method == "R"){
3348

3449
if (missing(data_dir)) stop("data_dir needs to be specified to use R method")
@@ -64,19 +79,58 @@ s_attribute_decode <- function(corpus, data_dir, s_attribute, encoding = NULL, r
6479
check_registry(registry = registry)
6580
check_corpus(corpus = corpus, registry = registry)
6681
check_s_attribute(corpus = corpus, registry = registry, s_attribute = s_attribute)
82+
6783
values <- .decode_s_attribute(corpus = corpus, s_attribute = s_attribute, registry = registry)
68-
warning("region matrix can not yet be generated with Rcpp method")
69-
region_matrix <- NULL
84+
85+
s_attr_size <- cl_attribute_size(
86+
corpus = corpus,
87+
attribute = s_attribute,
88+
attribute_type = "s",
89+
registry = registry
90+
)
7091

92+
region_matrix <- get_region_matrix(
93+
corpus = corpus,
94+
s_attribute = s_attribute,
95+
strucs = 0L:(s_attr_size - 1L),
96+
registry = registry
97+
)
98+
7199
df <- data.frame(
72-
cpos_left = NA,
73-
cpos_right = NA,
100+
cpos_left = region_matrix[,1],
101+
cpos_right = region_matrix[,2],
74102
value = values,
75103
stringsAsFactors = FALSE
76104
)
77-
78-
} else {
79-
stop("method needs to be either 'R' or 'Rcpp'")
80105
}
106+
81107
df
82108
}
109+
110+
111+
#' Get regions defined by a structural attribute
#'
#' Get all regions defined by a structural attribute. Unlike
#' `get_region_matrix()` that returns a region matrix for a defined subset of
#' strucs, all regions are returned. As it is the fastest option, the function
#' reads the binary `*.rng` file for the structural attribute directly. The
#' corpus library (CL) is not used in this case.
#'
#' @param corpus A length-one `character` vector with a corpus ID.
#' @param s_attr A length-one `character` vector stating a structural attribute.
#' @param registry A length-one `character` vector stating the registry
#'   directory (defaults to CORPUS_REGISTRY environment variable).
#' @param data_dir The data directory of the corpus.
#' @return A two-column `matrix` with the regions defined by the structural
#'   attribute: Column 1 defines left corpus positions and column 2 right corpus
#'   positions of regions.
#' @examples
#' s_attr_regions("REUTERS", s_attr = "id")
s_attr_regions <- function(corpus, s_attr, registry = Sys.getenv("CORPUS_REGISTRY"), data_dir = corpus_data_dir(corpus = corpus, registry = registry)){
  rng_file <- file.path(data_dir, paste(s_attr, "rng", sep = "."))
  if (!file.exists(rng_file))
    stop(sprintf("File '%s' does not exist.", rng_file))

  # The file is read as a sequence of 4-byte big-endian integers.
  n_values <- file.info(rng_file)[["size"]] %/% 4L

  con <- file(rng_file, open = "rb")
  # Close the connection even if readBin() fails.
  on.exit(close(con), add = TRUE)
  rng <- readBin(con, what = integer(), size = 4L, n = n_values, endian = "big")

  # Consecutive pairs of values are the left and right boundary of a region.
  matrix(rng, ncol = 2L, byrow = TRUE)
}

0 commit comments

Comments
 (0)