diff --git a/DESCRIPTION b/DESCRIPTION index 168d7ba6..ded9fa55 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -63,5 +63,5 @@ Imports: Rcpp, purrr, stringr, lavaan, rlang, MplusAutomation, nlme, dplyr, Depends: R (>= 4.1.0) URL: https://modsem.org -Suggests: knitr, rmarkdown, ggpubr, RColorBrewer +Suggests: knitr, rmarkdown, ggpubr, RColorBrewer, rstan VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 9b1e3758..b87ad0c2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,7 @@ S3method(centered_estimates,modsem_da) S3method(coef,modsem_da) S3method(coef,modsem_mplus) S3method(coef,modsem_pi) +S3method(coef,modsem_stan) S3method(coefficients,modsem_da) S3method(coefficients,modsem_pi) S3method(compare_fit,modsem_da) @@ -59,6 +60,7 @@ S3method(parameter_estimates,lavaan) S3method(parameter_estimates,modsem_da) S3method(parameter_estimates,modsem_mplus) S3method(parameter_estimates,modsem_pi) +S3method(parameter_estimates,modsem_stan) S3method(print,ModsemMatrix) S3method(print,ModsemParTable) S3method(print,ModsemRelcorr) @@ -68,6 +70,7 @@ S3method(print,modsem_da) S3method(print,modsem_mplus) S3method(print,modsem_partable_summary) S3method(print,modsem_pi) +S3method(print,modsem_stan) S3method(print,simple_slopes) S3method(print,summary_da) S3method(print,summary_modsem_pi) @@ -77,19 +80,23 @@ S3method(standardized_estimates,lavaan) S3method(standardized_estimates,modsem_da) S3method(standardized_estimates,modsem_mplus) S3method(standardized_estimates,modsem_pi) +S3method(standardized_estimates,modsem_stan) S3method(summary,modsem_da) S3method(summary,modsem_mplus) S3method(summary,modsem_pi) +S3method(summary,modsem_stan) S3method(var_interactions,data.frame) S3method(var_interactions,modsem_da) S3method(var_interactions,modsem_mplus) S3method(vcov,modsem_da) S3method(vcov,modsem_mplus) S3method(vcov,modsem_pi) +S3method(vcov,modsem_stan) export(bootstrap_modsem) export(centered_estimates) export(colorize_output) export(compare_fit) 
+export(compile_stan_model) export(default_settings_da) export(default_settings_pi) export(estimate_h0) @@ -107,6 +114,7 @@ export(modsem_mplus) export(modsem_nobs) export(modsem_pi) export(modsem_predict) +export(modsem_stan) export(modsem_vcov) export(modsemify) export(parameter_estimates) diff --git a/R/construct_matrices_da.R b/R/construct_matrices_da.R index e08a7d9a..5e064e13 100644 --- a/R/construct_matrices_da.R +++ b/R/construct_matrices_da.R @@ -198,7 +198,6 @@ constructGamma <- function(DVs, IVs, parTable, auto.fix.first = TRUE) { exprsGamma <- rbind(exprsGamma1, exprsGamma3) - numDVs <- length(DVs) numIVs <- length(IVs) gamma <- matrix(0, nrow = numDVs, ncol = numIVs, dimnames = list(DVs, IVs)) diff --git a/R/generics_modsem_da.R b/R/generics_modsem_da.R index 09357376..98e27775 100644 --- a/R/generics_modsem_da.R +++ b/R/generics_modsem_da.R @@ -854,7 +854,8 @@ centered_estimates.modsem_da <- function(object, -#' @describeIn modsem_predict +#' @describeIn modsem_predict Predict From \code{modsem} Models +#' #' Computes (optionally standardised) factor scores via the #' regression method using the baseline model unless \code{H0 = FALSE}. #' diff --git a/R/model_stan.R b/R/model_stan.R new file mode 100644 index 00000000..9809d578 --- /dev/null +++ b/R/model_stan.R @@ -0,0 +1,1615 @@ +STAN_LAVAAN_MODELS <- rlang::env( + syntaxes = NULL, + compiled = NULL +) + + +STAN_OPERATOR_LABELS <- c( + "__INTERCEPT" = "~1", + "__REGRESSION__" = "~", + "__COVARIANCE__" = "~~", + # "__STDDEV__" = "~~~", + "__MEASUREMENT__" = "=~", + "__XWITH__" = ":" +) + + +STAN_SYNTAX_BLOCKS <- " +functions { +%s + +} + +data { +%s + +} + +parameters { +%s + +} + +transformed parameters { +%s + +} + +model { +%s + +} + +generated quantities { +%s + +} +" + + +#' Compile \code{STAN} model based on a \code{lavaan} model +#' +#' @param model.syntax \code{lavaan} syntax. +#' @param compile Should compilation be performed? 
If \code{FALSE} only the \code{STAN} +#' syntax is generated, and not compiled. +#' @param force Should compilation of previously compiled models be forced? +#' @param ordered Ordered (i.e., ordinal) variables. +#' @param ordered.link Link function to be used for ordered variables. +#' @param parameterization What parameterization to use? Default is \code{"centered"}. +#' @export +compile_stan_model <- function(model.syntax, compile = TRUE, force = FALSE, + ordered = NULL, + ordered.link = c("logit", "probit"), + parameterization = c("centered", "non-centered")) { + promptInstallRstan() + + if (!requireNamespace("rstan", quietly = TRUE)) + stop2("The 'rstan' package is required to use the `modsem_stan()` function!") + + ordered.link <- tolower(ordered.link) + ordered.link <- match.arg(ordered.link) + + parameterization <- tolower(parameterization) + parameterization <- match.arg(parameterization) + + if (is.null(ordered)) + ordered <- character(0) + + if (length(ordered)) { + model.syntax <- paste( + model.syntax, + sprintf("#ORDERED %s", paste0(ordered, collapse = " ")), + sep = "\n" + ) + } + + if (ordered.link == "probit") { + resVarFormulaInd <- "1" + resSDFormulaInd <- "1" + orderedLinkFun <- "ordered_probit" + } else if (ordered.link == "logit") { + resVarFormulaInd <- "(pi()^2)/3" + resSDFormulaInd <- sprintf("sqrt(%s)", resVarFormulaInd) + orderedLinkFun <- "ordered_logistic" + } + + parTable <- modsemify(model.syntax) + parTable <- addResidualCovariancesParTable(parTable) + + # endogenous variables (etas) + etas <- getSortedEtas(parTable, isLV = TRUE, checkAny = TRUE) + etas <- etas[length(etas):1] # reverse + numEtas <- length(etas) + + indsEtas <- getIndsLVs(parTable, etas) + numIndsEtas <- vapply(indsEtas, FUN.VALUE = vector("integer", 1L), + FUN = length) + allIndsEtas <- unique(unlist(indsEtas)) + numAllIndsEtas <- length(allIndsEtas) + + # exogenous variables (xis) and interaction terms + intTerms <- unique(parTable[grepl(":", parTable$rhs), "rhs"]) + 
numInts <- length(intTerms) + varsInts <- stringr::str_split(intTerms, pattern = ":") + varsInts <- stats::setNames(varsInts, nm = intTerms) + allVarsInInts <- unique(unlist(varsInts)) + + xis <- getXis(parTable, checkAny = TRUE) + numXis <- length(xis) + + indsXis <- getIndsLVs(parTable, xis) + numIndsXis <- vapply(indsXis, FUN.VALUE = vector("integer", 1L), + FUN = length) + allIndsXis <- unique(unlist(indsXis)) + numAllIndsXis <- length(allIndsXis) + + lVs <- c(xis, etas) + numLVs <- length(lVs) + indsLVs <- getIndsLVs(parTable, lVs) + allInds <- c(allIndsXis, allIndsEtas) + numAllInds <- length(allInds) + + # NEW: normalize and precompute ordinal sets + if (is.null(ordered)) ordered <- character(0) + ordSet <- unique(ordered) + is_allOrdinal_lv <- vapply(lVs, function(lv) { + inds <- indsLVs[[lv]] + length(inds) > 0 && all(inds %in% ordSet) + }, logical(1)) + + collapse <- function(..., sep = "\n") { + args <- list(...) + do.call( + "paste", + args = c(lapply(args, FUN = \(arg) paste0(arg, collapse = sep)), + list(sep = sep)) + ) + } + + FUNCTIONS <- " + // empirical covariance helper (for generated quantities) + real cov_vector(vector a, vector b) { + int N = num_elements(a); + + // Center the vectors + vector[N] a_centered = a - mean(a); + vector[N] b_centered = b - mean(b); + + // dot_product is faster than manual loop and clearer than dot_self + return dot_product(a_centered, b_centered) / (N - 1); + } + " + + getMod <- \(lhs, op, rhs) getModParTable(lhs, op, rhs, parTable = parTable) + + DATA <- "int N; // sample size" + PARAMETERS <- NULL + TRANSFORMED_PARAMETERS <- NULL + MODEL <- NULL + GENERATED_QUANTITIES <- NULL + EXCLUDE.PARS <- NULL + + pair_key <- function(a, b) { + vals <- sort(c(a, b)) + paste0(vals, collapse = "___") + } + + normalizeCovPairs <- function(df) { + if (!NROW(df)) return(df) + df$lhs <- pmin(df$lhs, df$rhs) + df$rhs <- pmax(df$lhs, df$rhs) + df <- df[!duplicated(df[, c("lhs", "rhs")]), , drop = FALSE] + df + } + + buildComponents 
<- function(nodes, edges, order_ref) { + if (!length(nodes)) { + map <- setNames(rep(0L, length(order_ref)), order_ref) + return(list(groups = list(), map = map)) + } + + nodes <- unique(nodes) + if (!missing(order_ref) && length(order_ref)) { + nodes <- nodes[order(match(nodes, order_ref))] + } else { + order_ref <- nodes + } + + adj <- stats::setNames(vector("list", length(nodes)), nodes) + + if (NROW(edges)) { + for (i in seq_len(NROW(edges))) { + v1 <- edges$lhs[[i]] + v2 <- edges$rhs[[i]] + + adj[[v1]] <- unique(c(adj[[v1]], v2)) + adj[[v2]] <- unique(c(adj[[v2]], v1)) + } + } + + visited <- stats::setNames(rep(FALSE, length(nodes)), nodes) + groups <- list() + + for (node in nodes) { + if (visited[[node]]) next + + stack <- node + visited[[node]] <- TRUE + comp <- character() + + while (length(stack)) { + current <- stack[[length(stack)]] + stack <- stack[-length(stack)] + comp <- c(comp, current) + + neighbors <- adj[[current]] + for (nbr in neighbors) { + if (!visited[[nbr]]) { + visited[[nbr]] <- TRUE + stack <- c(stack, nbr) + } + } + } + + comp <- comp[order(match(comp, order_ref))] + groups[[length(groups) + 1L]] <- comp + } + + groups <- Filter(function(g) length(g) > 1L, groups) + + map <- setNames(rep(0L, length(order_ref)), order_ref) + if (length(groups)) { + for (i in seq_along(groups)) { + map[groups[[i]]] <- i + } + } + + list(groups = groups, map = map) + } + + # Residual covariance metadata: continuous indicators + contIndicatorsAll <- setdiff(allInds, ordSet) + + contCovPairs <- parTable[ + parTable$op == "~~" & + parTable$lhs %in% contIndicatorsAll & + parTable$rhs %in% contIndicatorsAll & + parTable$lhs != parTable$rhs, + c("lhs", "rhs", "mod"), + drop = FALSE + ] + + if (NROW(contCovPairs)) { + contCovPairs$mod <- as.character(contCovPairs$mod) + contCovPairs$mod[contCovPairs$mod == ""] <- NA_character_ + contCovPairs <- normalizeCovPairs(contCovPairs) + } + + contCovVars <- if (NROW(contCovPairs)) unique(c(contCovPairs$lhs, 
contCovPairs$rhs)) else character(0) + contCorrelatedVars <- contCovVars[order(match(contCovVars, contIndicatorsAll))] + + contComponent <- buildComponents( + nodes = contCorrelatedVars, + edges = contCovPairs, + order_ref = contIndicatorsAll + ) + + contGroupMap <- contComponent$map + contResidGroups <- contComponent$groups + + contResidIndex <- setNames(seq_along(contCorrelatedVars), contCorrelatedVars) + + contPairLookup <- if (NROW(contCovPairs)) { + keys <- mapply(pair_key, contCovPairs$lhs, contCovPairs$rhs, SIMPLIFY = TRUE, USE.NAMES = FALSE) + stats::setNames(contCovPairs$mod, keys) + } else { + setNames(character(0), character(0)) + } + + parTable <- addResidualCovariancesParTable(parTable) + continuous_meta <- list() + + # Residual covariance metadata: latent etas + etaCovPairs <- parTable[ + parTable$op == "~~" & + parTable$lhs %in% etas & + parTable$rhs %in% etas & + parTable$lhs != parTable$rhs, + c("lhs", "rhs", "mod"), + drop = FALSE + ] + + if (NROW(etaCovPairs)) { + etaCovPairs$mod <- as.character(etaCovPairs$mod) + etaCovPairs$mod[etaCovPairs$mod == ""] <- NA_character_ + etaCovPairs <- normalizeCovPairs(etaCovPairs) + } + + etaCovVars <- if (NROW(etaCovPairs)) unique(c(etaCovPairs$lhs, etaCovPairs$rhs)) else character(0) + etaCorrelatedVars <- etaCovVars[order(match(etaCovVars, etas))] + + etaComponent <- buildComponents( + nodes = etaCorrelatedVars, + edges = etaCovPairs, + order_ref = etas + ) + + etaGroupMap <- etaComponent$map + etaResidGroups <- etaComponent$groups + + etaPairLookup <- if (NROW(etaCovPairs)) { + keys <- mapply(pair_key, etaCovPairs$lhs, etaCovPairs$rhs, SIMPLIFY = TRUE, USE.NAMES = FALSE) + stats::setNames(etaCovPairs$mod, keys) + } else { + setNames(character(0), character(0)) + } + + eta_meta <- list() + + STAN_INDS_LV <- function(lV) { + inds <- indsLVs[[lV]] + k <- length(inds) + + parLines <- character() + modelLines <- character() + quantLines <- character() + dataInds <- sprintf("matrix[N, %d] INDICATORS_%s;", 
length(inds), lV) + dataLines <- dataInds + tparLines <- character() + + # Intercepts only for continuous indicators + contInds <- inds[!inds %in% ordSet] + ordInds <- inds[inds %in% ordSet] + + fixContVar <- length(contInds) + length(ordInds) <= 1L + + for (i in seq_along(contInds)) { + ind <- contInds[[i]] + parInterceptInd <- sprintf("real %s__INTERCEPT;", ind) + parLambdaInd <- sprintf("real %s__MEASUREMENT__%s;", ind, ind) + + modTauInd <- getMod(lhs = ind, op = "~1", rhs = "") + modLambdaInd <- getMod(lhs = lV, op = "=~", rhs = ind) + modVarInd <- getMod(lhs = ind, op = "~~", rhs = ind) + + fixTauInd <- !is.na(modTauInd) + fixLambdaInd <- !is.na(modLambdaInd) || i == 1L + fixContVarInd <- !is.na(modVarInd) || fixContVar + + if (fixTauInd) { + parTauInd <- NULL + tparTauInd <- sprintf("real %s__INTERCEPT = %s;", ind, modTauInd) + } else { + parTauInd <- sprintf("real %s__INTERCEPT;", ind) + tparTauInd <- NULL + } + + if (fixLambdaInd) { + mod <- if (!is.na(modLambdaInd)) modLambdaInd else "1" + parLambdaInd <- NULL + tparLambdaInd <- sprintf("real %s__MEASUREMENT__%s = %s;", lV, ind, mod) + } else { + parLambdaInd <- sprintf("real %s__MEASUREMENT__%s;", lV, ind) + tparLambdaInd <- NULL + } + + if (fixContVarInd) { + mod <- if (!is.na(modVarInd)) modVarInd else "0" + parResSDInd <- NULL + tparResSDInd <- sprintf("real %s__STDDEV__%s = sqrt(%s);", ind, ind, mod) + } else { + parResSDInd <- sprintf("real %s__STDDEV__%s;", ind, ind) + tparResSDInd <- NULL + } + + tparResVarInd <- sprintf( + "real %s__COVARIANCE__%s = %s__STDDEV__%s^2;", + ind, ind, ind, ind + ) + + groupId <- 0L + if (length(contGroupMap)) { + groupId <- contGroupMap[[ind]] + if (is.na(groupId)) groupId <- 0L + } + + sigmaName <- sprintf("%s__STDDEV__%s", ind, ind) + interceptName <- sprintf("%s__INTERCEPT", ind) + loadingName <- sprintf("%s__MEASUREMENT__%s", lV, ind) + dataIdx <- if (ind %in% names(contResidIndex)) contResidIndex[[ind]] else NA_integer_ + + continuous_meta[[ind]] <<- list( + 
intercept = interceptName, + loading = loadingName, + latent = lV, + sigma = sigmaName, + data_index = dataIdx, + group = groupId + ) + + if (groupId <= 0L) { + modMeasrInd <- sprintf( + "INDICATORS_%s[,%d] ~ normal(%s__INTERCEPT + %s__MEASUREMENT__%s * %s, %s__STDDEV__%s);", + lV, i, ind, lV, ind, lV, ind, ind + ) + } else { + modMeasrInd <- NULL + } + + parLines <- c( + parLines, + parTauInd, + parLambdaInd, + parResSDInd + ) + + tparLines <- c( + tparLines, + tparTauInd, + tparLambdaInd, + tparResSDInd, + tparResVarInd + ) + + modelLines <- c( + modelLines, + modMeasrInd + ) + } + + for (j in seq_along(ordInds)) { + ind <- ordInds[[j]] + + if (TRUE) parLambdaInd <- sprintf("real %s__UMEASUREMENT__%s;", lV, ind) + else parLambdaInd <- sprintf("real %s__UMEASUREMENT__%s;", lV, ind) + + dataK_Ind <- sprintf("int K_%s;", ind) + dataInd <- sprintf("int ORD_INDICATOR_%s[N];", ind, ind) + parCutInd <- sprintf("ordered[K_%s - 1] %s__UCUTPOINTS;", ind, ind) + responseInd <- sprintf("vector[N] LV_RESPONSE_%s_ = %s__UMEASUREMENT__%s * %s;", ind, lV, ind, lV) + likelihoodInd <- sprintf("ORD_INDICATOR_%s ~ %s(LV_RESPONSE_%s_, %s__UCUTPOINTS);", + ind, orderedLinkFun, ind, ind) + modMeasrInd <- sprintf("{\n %s\n %s\n}", responseInd, likelihoodInd) + + quantTotalVarInd <- sprintf( + "real TOTAL_VAR_%s_ = (%s__UMEASUREMENT__%s ^ 2) * cov_vector(%s, %s) + %s;", + ind, lV, ind, lV, lV, resVarFormulaInd + ) + + quantStdCutPoints <- sprintf( + "ordered[K_%s - 1] %s__CUTPOINTS = %s__UCUTPOINTS / sqrt(TOTAL_VAR_%s_);", + ind, ind, ind, ind + ) + + quantLambdaInd <- sprintf( + "real %s__MEASUREMENT__%s = %s__UMEASUREMENT__%s / sqrt(TOTAL_VAR_%s_);", + lV, ind, lV, ind, ind + ) + + quantResVarInd <- sprintf( + "real %s__COVARIANCE__%s = %s / TOTAL_VAR_%s_;", + ind, ind, resVarFormulaInd, ind + ) + + dataLines <- c(dataLines, dataK_Ind, dataInd) + parLines <- c(parLines, parLambdaInd, parCutInd) + quantLines <- c(quantLines, quantTotalVarInd, quantStdCutPoints, + quantResVarInd, 
quantLambdaInd) + modelLines <- c(modelLines, modMeasrInd) + } + + list( + parameters = collapse(parLines), + model = collapse(modelLines), + data = collapse(dataLines), + transformed_parameters = collapse(tparLines), + generated_quantities = collapse(quantLines) + ) + } + + # Centered parameterization + STAN_PAR_XIS_C <- function(xis) { + k <- length(xis) + + # Identify which xi's are all-ordinal + fix_idx <- which(vapply(xis, function(lv) is_allOrdinal_lv[lv], logical(1))) + free_idx <- setdiff(seq_len(k), fix_idx) + + parLOmega <- sprintf("cholesky_factor_corr[%d] L_Omega;", k) + + if (length(free_idx) > 0L) { + parSqrtDPhi <- sprintf("vector[%d] sqrtD_Phi_free;", length(free_idx)) + } else { + parSqrtDPhi <- NULL + } + + parXiMat <- sprintf("matrix[N, %d] XI_Matrix;", k) + + # transformed parameters: rebuild sqrtD_Phi with fixed 1's at fix_idx + tparBuildSqrt <- c( + sprintf("vector[%d] sqrtD_Phi;", k), + " {", + if (length(free_idx) > 0L) " int c = 1;" else NULL, + if (length(seq_len(k)) > 0L) { + paste0( + vapply(seq_len(k), function(i) { + if (i %in% fix_idx) { + sprintf(" sqrtD_Phi[%d] = 1;", i) + } else { + sprintf(" sqrtD_Phi[%d] = sqrtD_Phi_free[c]; c += 1;", i) + } + }, character(1L)), + collapse = "\n" + ) + } else NULL, + " }" + ) + + tparLSigma <- sprintf("matrix[%d, %d] L_Sigma = diag_pre_multiply(sqrtD_Phi, L_Omega);", k, k) + tparMuXi <- sprintf("row_vector[%d] MU_XI = rep_row_vector(0, %d);", k, k) + tparXiArr <- sprintf("array[N] vector[%d] XI_Array;", k) + tparXiArrFill <- "for (i in 1:N) {XI_Array[i] = (XI_Matrix[i, ])';}" + + xiVectors <- NULL + for (i in seq_along(xis)) { + name <- xis[[i]] + xiVector <- sprintf("vector[N] %s = col(XI_Matrix, %d);", name, i) + xiVectors <- c(xiVectors, xiVector) + } + tparXiVectors <- collapse(xiVectors) + + modLOmega <- "L_Omega ~ lkj_corr_cholesky(1);" + modXiArr <- "XI_Array ~ multi_normal_cholesky(MU_XI, L_Sigma);" + + parameters <- collapse(c(parLOmega, parSqrtDPhi, parXiMat)) + tparameters <- 
collapse(c(tparBuildSqrt, tparLSigma, tparXiVectors, tparMuXi, tparXiArr, tparXiArrFill)) + model <- collapse(c(modLOmega, modXiArr)) + + EXCLUDE.PARS <<- c(EXCLUDE.PARS, "XI_Array", "XI_Matrix", xis) + + list(parameters = parameters, transformed_parameters = tparameters, model = model) + } + + # Non-centered parameterization + STAN_PAR_XIS_NC <- function(xis) { + k <- length(xis) + + # Identify which xi's are all-ordinal (variance fixed to 1) + fix_idx <- which(vapply(xis, function(lv) is_allOrdinal_lv[lv], logical(1))) + free_idx <- setdiff(seq_len(k), fix_idx) + + # parameters + parLOmega <- sprintf("cholesky_factor_corr[%d] L_Omega;", k) + + if (length(free_idx) > 0L) { + parSqrtDPhi <- sprintf("vector[%d] sqrtD_Phi_free;", length(free_idx)) + } else { + parSqrtDPhi <- NULL + } + + # Non-centered: matrix of standard normals (N x k) + parZ <- sprintf("matrix[N, %d] z_XI;", k) + + # transformed parameters + # Rebuild sqrtD_Phi with fixed 1's at fix_idx + tparBuildSqrt <- c( + sprintf("vector[%d] sqrtD_Phi;", k), + "{", + if (length(free_idx) > 0L) " int c = 1;" else NULL, + if (k > 0L) { + paste0( + vapply(seq_len(k), function(i) { + if (i %in% fix_idx) { + sprintf(" sqrtD_Phi[%d] = 1;", i) + } else { + sprintf(" sqrtD_Phi[%d] = sqrtD_Phi_free[c]; c += 1;", i) + } + }, character(1L)), + collapse = "\n" + ) + } else NULL, + "}" + ) + + tparLSigma <- sprintf("matrix[%d, %d] L_Sigma = diag_pre_multiply(sqrtD_Phi, L_Omega);", k, k) + + # Row-vector mean (0 by default). Change if you want non-zero LV means. + tparMuXi <- sprintf("row_vector[%d] MU_XI = rep_row_vector(0, %d);", k, k) + + # Build XI_Matrix in one vectorised statement: + # each row: MU_XI + z_i * L_Sigma' + tparXiMatDecl <- sprintf("matrix[N, %d] XI_Matrix;", k) + tparXiMatFill <- "XI_Matrix = rep_matrix(MU_XI, N) + z_XI * L_Sigma';" + + # Also expose named column vectors (X, Z, ...) 
for downstream code + xiVectors <- NULL + for (i in seq_along(xis)) { + xiVectors <- c(xiVectors, sprintf("vector[N] %s = col(XI_Matrix, %d);", xis[[i]], i)) + } + tparXiVectors <- collapse(xiVectors) + + # model + modLOmega <- "L_Omega ~ lkj_corr_cholesky(1);" + # Non-centered prior: standard normals (vectorised via to_vector) + modZ <- "to_vector(z_XI) ~ std_normal();" + # (Optionally add a weak prior for sqrtD_Phi_free if desired.) + + parameters <- collapse(c(parLOmega, parSqrtDPhi, parZ)) + transformed <- collapse(c( + tparBuildSqrt, tparLSigma, tparMuXi, + tparXiMatDecl, tparXiMatFill, + tparXiVectors + )) + model <- collapse(c(modLOmega, modZ)) + + # Hide internals if you use an exclusion mechanism + EXCLUDE.PARS <<- c(EXCLUDE.PARS, "z_XI", "XI_Matrix", xis) + + list( + parameters = parameters, + transformed_parameters = transformed, + model = model + ) + } + + if (parameterization == "centered") STAN_PAR_XIS <- STAN_PAR_XIS_C + else if (parameterization == "non-centered") STAN_PAR_XIS <- STAN_PAR_XIS_NC + else stop2("Unrecognized value for `parameterization`: ", parameterization) + + STAN_PAR_ETA <- function(eta) { + indeps <- unique(parTable[parTable$lhs == eta & parTable$op == "~", "rhs"]) + indeps <- stringr::str_replace_all(indeps, ":", "__XWITH__") + + labBeta <- sprintf("%s__REGRESSION__%s", eta, indeps) + allOrd <- is_allOrdinal_lv[eta] + + parBeta <- if (length(labBeta)) sprintf("real %s;", labBeta) else NULL + parValues <- sprintf("vector[N] %s;", eta) + + tparLines <- character() + + meanExpr <- if (length(indeps)) collapse(sprintf("%s * %s", labBeta, indeps), sep = " + ") else "0" + meanExprVec <- if (meanExpr == "0") "rep_vector(0, N)" else meanExpr + meanName <- sprintf("%s__MEAN", eta) + tparLines <- c(tparLines, sprintf("vector[N] %s = %s;", meanName, meanExprVec)) + + groupId <- if (length(etaGroupMap)) etaGroupMap[[eta]] else 0L + if (is.na(groupId)) groupId <- 0L + + if (allOrd) { + parSD <- NULL + sigmaExpr <- "1" + tparLines <- 
c(tparLines, sprintf("real %s__COVARIANCE__%s = 1;", eta, eta)) + } else { + labSD <- sprintf("%s__STDDEV__%s", eta, eta) + parSD <- sprintf("real %s;", labSD) + sigmaExpr <- labSD + tparLines <- c(tparLines, sprintf("real %s__COVARIANCE__%s = %s^2;", eta, eta, labSD)) + } + + if (groupId > 0L) { + modEta <- NULL + } else { + modEta <- sprintf("%s ~ normal(%s, %s);", eta, meanName, sigmaExpr) + } + + eta_meta[[eta]] <<- list( + mean = meanName, + sigma = sigmaExpr, + group = groupId + ) + + parameters <- collapse(c(parValues, parSD, parBeta)) + EXCLUDE.PARS <<- c(EXCLUDE.PARS, eta, meanName) + + list( + parameters = parameters, + model = modEta, + transformed_parameters = collapse(tparLines) + ) + } + + STAN_CORRELATED_CONTINUOUS <- function() { + if (!length(contResidGroups)) return(list()) + + parLines <- character() + tparLines <- character() + modelLines <- character() + + if (length(names(contPairLookup))) { + for (key in names(contPairLookup)) { + vars <- strsplit(key, "___", fixed = TRUE)[[1L]] + v1 <- vars[[1L]] + v2 <- vars[[2L]] + covName <- sprintf("%s__COVARIANCE__%s", v1, v2) + modVal <- contPairLookup[[key]] + + if (is.na(modVal)) { + parLines <- c(parLines, sprintf("real %s;", covName)) + tparLines <- c( + tparLines, + sprintf("real %s__COVARIANCE__%s = %s;", v2, v1, covName) + ) + } else { + tparLines <- c( + tparLines, + sprintf("real %s__COVARIANCE__%s = %s;", v1, v2, modVal), + sprintf("real %s__COVARIANCE__%s = %s;", v2, v1, modVal) + ) + } + } + } + + for (g in seq_along(contResidGroups)) { + vars <- contResidGroups[[g]] + size <- length(vars) + thetaName <- sprintf("Theta_cont_%d", g) + lName <- sprintf("L_Theta_cont_%d", g) + + tparLines <- c( + tparLines, + sprintf("matrix[%d, %d] %s;", size, size, thetaName), + sprintf("cholesky_factor_cov[%d] %s;", size, lName), + sprintf("%s = rep_matrix(0, %d, %d);", thetaName, size, size) + ) + + for (i in seq_along(vars)) { + var <- vars[[i]] + sigmaExpr <- continuous_meta[[var]]$sigma + tparLines <- 
c( + tparLines, + sprintf("%s[%d, %d] = square(%s);", thetaName, i, i, sigmaExpr) + ) + } + + if (size >= 2L) { + for (i in seq_along(vars)) { + for (j in seq_len(i - 1L)) { + vi <- vars[[i]] + vj <- vars[[j]] + key <- pair_key(vi, vj) + if (key %in% names(contPairLookup)) { + covExpr <- sprintf("%s__COVARIANCE__%s", vi, vj) + } else { + covExpr <- "0" + } + tparLines <- c( + tparLines, + sprintf("%s[%d, %d] = %s;", thetaName, i, j, covExpr), + sprintf("%s[%d, %d] = %s;", thetaName, j, i, covExpr) + ) + } + } + } + + tparLines <- c( + tparLines, + sprintf("%s = cholesky_decompose(%s);", lName, thetaName) + ) + + EXCLUDE.PARS <<- c(EXCLUDE.PARS, thetaName, lName) + + loopLines <- c( + sprintf("for (n in 1:N) {"), + sprintf(" vector[%d] resid;", size) + ) + + for (i in seq_along(vars)) { + var <- vars[[i]] + meta <- continuous_meta[[var]] + dataIdx <- meta$data_index + if (is.na(dataIdx)) { + stop2("Missing data index for correlated indicator '", var, "'.") + } + expr <- sprintf("%s + %s * %s[n]", + meta$intercept, + meta$loading, + meta$latent) + loopLines <- c( + loopLines, + sprintf(" resid[%d] = CONTINUOUS_INDICATORS[n, %d] - (%s);", + i, dataIdx, expr) + ) + } + + loopLines <- c( + loopLines, + sprintf(" target += multi_normal_cholesky_lpdf(resid | rep_vector(0, %d), %s);", size, lName), + "}" + ) + + modelLines <- c(modelLines, paste(loopLines, collapse = "\n")) + } + + dataLines <- sprintf("matrix[N, %d] CONTINUOUS_INDICATORS;", length(contResidIndex)) + + list( + data = dataLines, + parameters = collapse(parLines), + transformed_parameters = collapse(tparLines), + model = collapse(modelLines) + ) + } + + STAN_CORRELATED_ETAS <- function() { + if (!length(etaResidGroups)) return(list()) + + parLines <- character() + tparLines <- character() + modelLines <- character() + + if (length(names(etaPairLookup))) { + for (key in names(etaPairLookup)) { + vars <- strsplit(key, "___", fixed = TRUE)[[1L]] + v1 <- vars[[1L]] + v2 <- vars[[2L]] + covName <- 
sprintf("%s__COVARIANCE__%s", v1, v2) + modVal <- etaPairLookup[[key]] + + if (is.na(modVal)) { + parLines <- c(parLines, sprintf("real %s;", covName)) + tparLines <- c( + tparLines, + sprintf("real %s__COVARIANCE__%s = %s;", v2, v1, covName) + ) + } else { + tparLines <- c( + tparLines, + sprintf("real %s__COVARIANCE__%s = %s;", v1, v2, modVal), + sprintf("real %s__COVARIANCE__%s = %s;", v2, v1, modVal) + ) + } + } + } + + for (g in seq_along(etaResidGroups)) { + vars <- etaResidGroups[[g]] + size <- length(vars) + psiName <- sprintf("Psi_eta_%d", g) + lName <- sprintf("L_Psi_eta_%d", g) + + tparLines <- c( + tparLines, + sprintf("matrix[%d, %d] %s;", size, size, psiName), + sprintf("cholesky_factor_cov[%d] %s;", size, lName), + sprintf("%s = rep_matrix(0, %d, %d);", psiName, size, size) + ) + + for (i in seq_along(vars)) { + var <- vars[[i]] + sigmaExpr <- eta_meta[[var]]$sigma + tparLines <- c( + tparLines, + sprintf("%s[%d, %d] = square(%s);", psiName, i, i, sigmaExpr) + ) + } + + if (size >= 2L) { + for (i in seq_along(vars)) { + for (j in seq_len(i - 1L)) { + vi <- vars[[i]] + vj <- vars[[j]] + key <- pair_key(vi, vj) + if (key %in% names(etaPairLookup)) { + covExpr <- sprintf("%s__COVARIANCE__%s", vi, vj) + } else { + covExpr <- "0" + } + tparLines <- c( + tparLines, + sprintf("%s[%d, %d] = %s;", psiName, i, j, covExpr), + sprintf("%s[%d, %d] = %s;", psiName, j, i, covExpr) + ) + } + } + } + + tparLines <- c( + tparLines, + sprintf("%s = cholesky_decompose(%s);", lName, psiName) + ) + + EXCLUDE.PARS <<- c(EXCLUDE.PARS, psiName, lName) + + loopLines <- c( + sprintf("for (n in 1:N) {"), + sprintf(" vector[%d] resid;", size) + ) + + for (i in seq_along(vars)) { + var <- vars[[i]] + meta <- eta_meta[[var]] + loopLines <- c( + loopLines, + sprintf(" resid[%d] = %s[n] - %s[n];", i, var, meta$mean) + ) + } + + loopLines <- c( + loopLines, + sprintf(" target += multi_normal_cholesky_lpdf(resid | rep_vector(0, %d), %s);", size, lName), + "}" + ) + + modelLines <- 
c(modelLines, paste(loopLines, collapse = "\n")) + } + + list( + parameters = collapse(parLines), + transformed_parameters = collapse(tparLines), + model = collapse(modelLines) + ) + } + + STAN_COMPUTED_PRODUCTS <- function(intTerms) { + transformed_parameters <- NULL + + for (intTerm in intTerms) { + elems <- stringr::str_split(intTerm, pattern = ":")[[1L]] + prodEq <- collapse(elems, sep = " .* ") + nameIntTerm <- collapse(elems, sep = "__XWITH__") + + transformed_parameters <- c( + transformed_parameters, + sprintf("vector[N] %s = %s;", nameIntTerm, prodEq) + ) + + EXCLUDE.PARS <<- c(EXCLUDE.PARS, nameIntTerm) + } + + list(transformed_parameters = collapse(transformed_parameters)) + } + + + STAN_COMPUTED_COVARIANCES <- function(vars) { + vars <- stringr::str_replace_all(vars, pattern = ":", replacement = "__XWITH__") + combos <- getUniqueCombos(vars) + generated_quantities <- NULL + + for (i in seq_len(NROW(combos))) { + lhs <- combos[[1L]][[i]] + rhs <- combos[[2L]][[i]] + + generated_quantities <- c( + generated_quantities, + sprintf("real %s__COVARIANCE__%s = cov_vector(%s, %s);", + lhs, rhs, lhs, rhs) + ) + } + + list(generated_quantities = collapse(generated_quantities)) + } + + + STAN_COMPUTED_VARIANCES <- function(vars) { + vars <- stringr::str_replace_all(vars, pattern = ":", replacement = "__XWITH__") + generated_quantities <- NULL + + for (var in vars) { + generated_quantities <- c( + generated_quantities, + sprintf("real %s__COVARIANCE__%s = cov_vector(%s, %s);", + var, var, var, var) + ) + } + + list(generated_quantities = collapse(generated_quantities)) + } + + + add2block <- function(FUN, ...) { + blocks <- FUN(...) 
+ + for (name in names(blocks)) { + block <- blocks[[name]] + switch(name, + functions = { + FUNCTIONS <<- collapse(FUNCTIONS, block) + }, + + data = { + DATA <<- collapse(DATA, block) + }, + + parameters = { + PARAMETERS <<- collapse(PARAMETERS, block) + }, + + transformed_parameters = { + TRANSFORMED_PARAMETERS <<- collapse(TRANSFORMED_PARAMETERS, block) + }, + + model = { + MODEL <<- collapse(MODEL, block) + }, + + generated_quantities = { + GENERATED_QUANTITIES <<- collapse(GENERATED_QUANTITIES, block) + } + ) + } + } + + for (lV in lVs) add2block(STAN_INDS_LV, lV = lV) + add2block(STAN_PAR_XIS, xis = xis) + add2block(STAN_COMPUTED_PRODUCTS, intTerms = intTerms) + for (eta in etas) add2block(STAN_PAR_ETA, eta = eta) + + add2block(STAN_CORRELATED_CONTINUOUS) + add2block(STAN_CORRELATED_ETAS) + + add2block(STAN_COMPUTED_COVARIANCES, vars = c(xis, intTerms)) + add2block(STAN_COMPUTED_VARIANCES, vars = c(xis, intTerms)) + + stanModelSyntax <- sprintf(STAN_SYNTAX_BLOCKS, + FUNCTIONS, DATA, PARAMETERS, + TRANSFORMED_PARAMETERS, + MODEL, GENERATED_QUANTITIES) + + SYNTAXES <- STAN_LAVAAN_MODELS$syntaxes + COMPILED <- STAN_LAVAAN_MODELS$compiled + match <- STAN_LAVAAN_MODELS$syntaxes == model.syntax + + if (compile && any(match) && !force) { + message("Reusing compiled Stan model...") + stanModel <- last(COMPILED[match]) # if a duplicate somehow appears, pick last/newest match + + } else if (compile) { + message("Compiling Stan model...") + + stanModel <- rstan::stan_model(model_code = stanModelSyntax) + + SYNTAXES <- c(SYNTAXES, model.syntax) + COMPILED <- c(COMPILED, stanModel) + + STAN_LAVAAN_MODELS$syntaxes <- SYNTAXES + STAN_LAVAAN_MODELS$compiled <- COMPILED + + } else stanModel <- NULL + + + list(syntax = stanModelSyntax, + stan_model = stanModel, + info = list(lVs = lVs, + xis = xis, + etas = etas, + indsLVs = indsLVs, + allIndsXis = allIndsXis, + allIndsEtas = allIndsEtas, + cont_resid_vars = contCorrelatedVars, + cont_resid_groups = contResidGroups, + 
eta_resid_groups = etaResidGroups, + exclude.pars = unique(EXCLUDE.PARS), + parTable = parTable)) +} + + +getStanData <- function(compiled_model, data, missing = "listwise", ordered = NULL) { + if (is.null(ordered)) ordered <- character(0) + + lVs <- compiled_model$info$lVs + indsLVs <- compiled_model$info$indsLVs + allIndsXis <- compiled_model$info$allIndsXis + allIndsEtas <- compiled_model$info$allIndsEtas + contResidVars <- compiled_model$info$cont_resid_vars + if (is.null(contResidVars)) contResidVars <- character(0) + + # 1) Pre-coerce requested ordinal columns in the raw data + # (safe even if columns are already numeric; ensures stable ordering) + for (col in ordered) { + if (!col %in% names(data)) { + stop("`ordered` indicator '", col, "' not found in data.") + } + data[[col]] <- as.integer(as.ordered(data[[col]])) + } + + # 2) Run your existing missing-data preparation (listwise or otherwise) + INDICATORS <- prepDataModsemDA(data, allIndsXis, allIndsEtas, + missing = missing)$data.full + + stan_data <- list(N = nrow(INDICATORS)) + + # 3) Emit the latent-specific indicator matrices (unchanged) + for (lV in lVs) { + name <- sprintf("INDICATORS_%s", lV) + inds <- indsLVs[[lV]] + if (!all(inds %in% colnames(INDICATORS))) { + missing_cols <- paste(setdiff(inds, colnames(INDICATORS)), collapse = ", ") + stop("Indicators missing from prepared data for latent '", lV, "': ", missing_cols) + } + stan_data[[name]] <- as.matrix(INDICATORS[, inds, drop = FALSE]) + } + + # 4) For each ordered indicator, add integer vector 1..K and K + remap2consecutive <- function(x) { + # x should be atomic, no NAs expected after listwise handling + u <- sort(unique(x)) + # Create a 1..K mapping even if labels were not consecutive (e.g., 0/2/5) + map <- stats::setNames(seq_along(u), as.character(u)) + as.integer(unname(map[as.character(x)])) + } + + for (ind in ordered) { + if (!ind %in% colnames(INDICATORS)) { + stop("`ordered` indicator '", ind, "' not found after preprocessing.") + 
} + x_raw <- INDICATORS[, ind] + # Ensure integer 1..K coding regardless of original labels + x_int <- remap2consecutive(x_raw) + K <- as.integer(max(x_int)) + + stan_data[[sprintf("ORD_INDICATOR_%s", ind)]] <- x_int + stan_data[[sprintf("K_%s", ind)]] <- K + } + + if (length(contResidVars)) { + stan_data[["CONTINUOUS_INDICATORS"]] <- as.matrix(INDICATORS[, contResidVars, drop = FALSE]) + } + + stan_data +} + + +# General Stan Model for estimating SEMs with interaction terms +STAN_MODEL_GENERAL <- " +// sem_latent_interaction.stan (updated) +// SEM with three latent factors (X, Z, Y) and latent interaction X*Z -> Y +// Identification: first loading fixed to 1; *latent means constrained to 0*. +// All nine indicator intercepts tau are now free. + +functions { + vector getIthProduct(int i, int N_LVS, int N, matrix PRODUCTS, matrix ETA) { + + vector[N] product = rep_vector(0, N); + + int firstFound = 0; + for (j in 1:N_LVS) { + + if (PRODUCTS[i, j]) { + + if (!firstFound) { + product = ETA[, j]; + firstFound = 1; + + } else { + product = product .* ETA[, j]; + } + } + } + + return product; + } +} + + +data { + int N; // Sample size + int K; // Number of indicators + int N_XIS; // Number of xis + int N_ETAS; // Number of etas + int N_LVS; // Number of lVs + int N_INT; // Number of interaction terms + + matrix[K, N_LVS] LAMBDA; // Structure of measurement model + matrix[N_ETAS, N_LVS] GAMMA; // Structure of structural model + matrix[N_ETAS, N_INT] OMEGA; // Structure of interaction-coefficients + matrix[N_INT, N_LVS] PRODUCTS; // Structure of interaction terms + + matrix[N_LVS, N_LVS] PSI; // Structure of latent covariances + matrix[K, K] THETA; // Structure of indicator covariances + vector[K] TAU; // Structure of indicator intercepts + vector[N_LVS] ALPHA; // Structure of LV intercepts + + int N_FREE_LAMBDA; // sum(abs(LAMBDA)) + int N_FREE_GAMMA; // sum(abs(GAMMA)) + int N_FREE_OMEGA; + + int N_FREE_DIAG_THETA; // sum(abs(diag(THETA))) + int N_FREE_LOWER_THETA; // 
sum(abs(THETA[is.lower(THETA)])) + + int N_FREE_DIAG_PSI; // sum(abs(diag(PSI))) + int N_FREE_LOWER_PSI; // sum(abs(GAMMA[is.lower(GAMMA)])) + + int N_FREE_TAU; // sum(abs(TAU)) + int N_FREE_ALPHA; // sum(abs(ALPHA)) + + // Observed Data + matrix[N, K] Y; +} + + +parameters { + // Measurement model + vector[N_FREE_LAMBDA] lambda; + vector[N_FREE_TAU] tau; + vector[N_FREE_DIAG_THETA] theta_d; + vector[N_FREE_LOWER_THETA] theta_l; + + // Structural model + vector[N_FREE_GAMMA] gamma; + vector[N_FREE_ALPHA] alpha; + vector[N_FREE_OMEGA] omega; + vector[N_FREE_DIAG_PSI] psi_d; + vector[N_FREE_LOWER_PSI] psi_l; + + // LVs + matrix[N, N_LVS] XI; + + // Indicator disturbances + // matrix[N, K] EPSILON; +} + + +transformed parameters { + // Declare matrices + matrix[K, N_LVS] Lambda; // Structure of measurement model + matrix[N_ETAS, N_LVS] Gamma; // Structure of structural model + matrix[N_ETAS, N_INT] Omega; + + matrix[N_LVS, N_LVS] Psi; // Structure of latent covariances + matrix[K, K] Theta; // Structure of indicator covariances + vector[K] Tau; // Structure of indicator intercepts + vector[N_LVS] Alpha; // Structure of LV intercepts + + // Fill Matrices + Lambda = rep_matrix(0, K, N_LVS); + Gamma = rep_matrix(0, N_ETAS, N_LVS); + Omega = rep_matrix(0, N_ETAS, N_INT); + Theta = rep_matrix(0, K, K); + Psi = rep_matrix(0, N_LVS, N_LVS); + Tau = rep_vector(0, K); + Alpha = rep_vector(0, N_LVS); + + { + // Fill Lambda + int k = 1; + for (j in 1:N_LVS) { + real filledFirst = 0; + + for (i in 1:K) { + real fill = LAMBDA[i, j]; + + if (fill && !filledFirst) { + Lambda[i, j] = 1; + filledFirst = 1; + + } else if (fill) { + Lambda[i, j] = lambda[k]; + k = k + 1; + } + } + } + // Fill Gamma + k = 1; + for (i in 1:N_ETAS) { + for (j in 1:N_LVS) { + real fill = GAMMA[i, j]; + + if (fill) { + Gamma[i, j] = gamma[k]; + k = k + 1; + } + } + } + + // Fill OMEGA + k = 1; + for (i in 1:N_ETAS) { + for (j in 1:N_INT) { + real fill = OMEGA[i, j]; + + if (fill) { + Omega[i, j] = omega[k]; 
+ k = k + 1; + } + } + } + + // Fill Diagonal Theta + k = 1; + for (i in 1:K) { + real fill = THETA[i, i]; + + if (fill) { + Theta[i, i] = theta_d[k]; + k = k + 1; + } + } + + // Fill Off-Diagonal Theta + k = 1; + for (i in 1:K) { + + for (j in 1:(i-1)) { + real fill = THETA[i, j]; + + if (fill) { + Theta[i, j] = theta_l[k]; + Theta[j, i] = theta_l[k]; + k = k + 1; + } + } + } + + // Fill Diagonal Psi + k = 1; + for (i in 1:N_LVS) { + real fill = PSI[i, i]; + + if (fill) { + Psi[i, i] = psi_d[k]; + k = k + 1; + } + } + + // Fill Off-Diagonal Psi + k = 1; + for (i in 1:N_LVS) { + + for (j in 1:(i-1)) { + real fill = PSI[i, j]; + + if (fill) { + Psi[i, j] = psi_l[k]; + Psi[j, i] = psi_l[k]; + k = k + 1; + } + } + } + + // Fill Alpha + k = 1; + for (i in 1:N_LVS) { + real fill = ALPHA[i]; + if (fill) { + Alpha[i] = alpha[k]; + k = k + 1; + } + } + + // Fill Tau + k = 1; + for (i in 1:K) { + real fill = TAU[i]; + + if (fill) { + Tau[i] = tau[k]; + k = k + 1; + } + } + } + + // Calculate LV values + matrix[N, N_LVS] ETA; + + for (i in 1:N_LVS) { + ETA[, i] = XI[, i]; // For xis the actual values are filled + // For etas we start with the disturbances + } + + for (i in 1:N_ETAS) { + int idx = N_XIS + i; + + for (j in 1:N_LVS) { + if (GAMMA[i, j]) { + ETA[, idx] = ETA[, idx] + Gamma[i, j] * ETA[, j]; + } + } + + for (j in 1:N_INT) { + if (OMEGA[i, j]) { + ETA[, idx] = + ETA[, idx] + Omega[i, j] * getIthProduct(j, N_LVS, N, PRODUCTS, ETA); + } + } + } + + // Calculate Expected Indicator values + matrix[N, K] X = ETA * Lambda'; + for (i in 1:K) { + X[, i] = Tau[i] + X[, i]; + } + + // DEBUGGING + // print(\"Omega\"); + // print(Omega); + // print(\"Gamma\"); + // print(Gamma); + // print(\"Lambda\"); + // print(Lambda); + // print(\"PRODUCTS\"); + // print(PRODUCTS); + // print(\"Theta\"); + // print(Theta); + // print(\"Psi\"); + // print(Psi); + + // print(\"head(ETA)\"); + // print(ETA[1:5, ]); + // + // print(\"head(XI)\"); + // print(XI[1:5, ]); + // + // 
print(\"head(X)\");
+  // print(X[1:5, ]);
+
+  // print(\"head(XZ)\");
+  // print(getIthProduct(1, N_LVS, N, PRODUCTS, ETA)[1:5]);
+
+  array[N] vector[N_LVS] marginalXI;
+  row_vector[N_LVS] marginalMeanXI = rep_row_vector(0, N_LVS);
+
+  for (n in 1:N) {
+    marginalXI[n] = (XI[n, ])' - Alpha;
+  }
+
+  array[N] vector[K] marginalY;
+  row_vector[K] marginalMeanY = rep_row_vector(0, K);
+
+  for (n in 1:N) {
+    marginalY[n] = (Y[n, ] - X[n, ])';
+  }
+}
+
+model {
+  // Priors
+  // No priors (yet)
+
+  marginalXI ~ multi_normal(marginalMeanXI, Psi);
+  marginalY ~ multi_normal(marginalMeanY, Theta);
+}
+
+"
+
+
+# Functions
+specifyModelSTAN <- function(syntax = NULL,
+                             data = NULL,
+                             auto.fix.first = TRUE,
+                             auto.fix.single = TRUE,
+                             orthogonal.y = FALSE,
+                             orthogonal.x = FALSE,
+                             mean.observed = TRUE) {
+  if (!is.null(syntax)) parTable <- modsemify(syntax)
+  stopif(is.null(parTable), "No parTable found")
+
+  # endogenous variables (etas)
+  etas <- getSortedEtas(parTable, isLV = TRUE, checkAny = TRUE)
+  etas <- etas[length(etas):1] # reverse
+  numEtas <- length(etas)
+
+  indsEtas <- getIndsLVs(parTable, etas)
+  numIndsEtas <- vapply(indsEtas, FUN.VALUE = vector("integer", 1L),
+                        FUN = length)
+  allIndsEtas <- unique(unlist(indsEtas))
+  numAllIndsEtas <- length(allIndsEtas)
+
+  # exogenous variables (xis) and interaction terms
+  intTerms <- unique(parTable[grepl(":", parTable$rhs), "rhs"])
+  numInts <- length(intTerms)
+  varsInts <- stringr::str_split(intTerms, pattern = ":")
+  varsInts <- stats::setNames(varsInts, nm = intTerms)
+  allVarsInInts <- unique(unlist(varsInts))
+
+  xis <- getXis(parTable, checkAny = TRUE)
+  numXis <- length(xis)
+
+  indsXis <- getIndsLVs(parTable, xis)
+  numIndsXis <- vapply(indsXis, FUN.VALUE = vector("integer", 1L),
+                       FUN = length)
+  allIndsXis <- unique(unlist(indsXis))
+  numAllIndsXis <- length(allIndsXis)
+
+  lVs <- c(xis, etas)
+  numLVs <- length(lVs)
+  indsLVs <- getIndsLVs(parTable, lVs)
+  allInds <- c(allIndsXis, allIndsEtas)
+  
numAllInds <- length(allInds) + + # clean data + data <- prepDataModsemDA(data, allIndsXis, allIndsEtas)$data.full + + # measurement model x + listLambda <- constructLambda(lVs, indsLVs, parTable = parTable, + auto.fix.first = auto.fix.first) + LAMBDA <- listLambda$numeric + LAMBDA[is.na(LAMBDA)] <- 1 + + listTau <- constructTau(lVs, indsLVs, parTable = parTable) + TAU <- listTau$numeric + TAU[is.na(TAU)] <- 1 + + listAlpha <- constructAlpha(lVs, parTable = parTable, + mean.observed = mean.observed) + ALPHA <- listAlpha$numeric + ALPHA[is.na(ALPHA)] <- 1 + + listTheta <- constructTheta(lVs, indsLVs, parTable = parTable, + auto.fix.single = auto.fix.single) + THETA <- listTheta$numeric + THETA[is.na(THETA)] <- 1 + + listGamma <- constructGamma(etas, lVs, parTable = parTable) + GAMMA <- listGamma$numeric + GAMMA[is.na(GAMMA)] <- 1 + + listOmega <- constructGamma(etas, intTerms, parTable = parTable) + OMEGA <- listOmega$numeric + OMEGA[is.na(OMEGA)] <- 1 + + listPsi <- constructPsi(etas, parTable = parTable, orthogonal.y = orthogonal.y) + psi <- listPsi$numeric + + listPhi <- constructPhi(xis, method = "qml", cov.syntax = NULL, + parTable = parTable, orthogonal.x = orthogonal.x) + phi <- listPhi$numeric + + PSI <- diagPartitionedMat(phi, psi) + PSI[is.na(PSI)] <- 1 + + PRODUCTS <- matrix(0, nrow = numInts, ncol = numLVs, dimnames = list(intTerms, lVs)) + for (intTerm in intTerms) { + elems <- varsInts[[intTerm]] + + PRODUCTS[intTerm, elems] <- 1 + } + + lowerTri <- \(X) X[lower.tri(X, diag = FALSE)] + + list(N = nrow(data), + K = numAllInds, + N_XIS = numXis, + N_ETAS = numEtas, + N_LVS = numLVs, + N_INT = numInts, + + LAMBDA = LAMBDA, + GAMMA = GAMMA, + OMEGA = OMEGA, + PRODUCTS = PRODUCTS, + + PSI = PSI, + THETA = THETA, + + ALPHA = as.vector(ALPHA), + TAU = as.vector(TAU), + + N_FREE_LAMBDA = sum(is.na(listLambda$numeric)), + N_FREE_GAMMA = sum(GAMMA), + N_FREE_OMEGA = sum(OMEGA), + + N_FREE_DIAG_THETA = sum(diag(THETA)), + N_FREE_LOWER_THETA = 
sum(lowerTri(THETA)),
+
+       N_FREE_DIAG_PSI = sum(diag(PSI)),
+       N_FREE_LOWER_PSI = sum(lowerTri(PSI)),
+
+       N_FREE_ALPHA = sum(ALPHA),
+       N_FREE_TAU = sum(TAU),
+
+       Y = as.matrix(data)
+  )
+}
+
+
+getModParTable <- function(lhs, op, rhs, parTable, .default = NA) {
+  parTable[parTable$mod == "", "mod"] <- .default
+
+  if (op == "~~") {
+    out <- parTable[((parTable$lhs == lhs & parTable$rhs == rhs) |
+                     (parTable$lhs == rhs & parTable$rhs == lhs)) &
+                    parTable$op == op, "mod"]
+  } else {
+    out <- parTable[parTable$lhs == lhs & parTable$rhs == rhs &
+                    parTable$op == op, "mod"]
+  }
+
+  if (!length(out)) .default else out
+}
+
+
+## Example
+# model.syntax <- '
+# X =~ x1 + x2 + x3
+# Z =~ z1 + z2 + z3
+# Y =~ y1 + y2 + y3
+#
+# Y ~ X + Z + X:Z
+# '
+# stan_data <- specifyModelSTAN(model.syntax, data = oneInt)
+# stan_model <- stan_model(model_code = STAN_MODEL_GENERAL)
+#
+# fit <- sampling(
+#   object = stan_model,
+#   data = stan_data,
+#   chains = 2,
+#   iter = 2000,
+#   warmup = 1000
+# )
+#
+# summary(fit, c("gamma"))
+# summary(fit, c("omega"))
+# summary(fit, c("Psi"))
+# stan_rhat(fit)
+# stan_trace(fit, "omega")
+# stan_trace(fit, "gamma")
diff --git a/R/modsem_stan.R b/R/modsem_stan.R
new file mode 100644
index 00000000..81b7cac3
--- /dev/null
+++ b/R/modsem_stan.R
@@ -0,0 +1,400 @@
+#' Interaction between latent variables using Markov Chain Monte Carlo sampling via \code{STAN}
+#'
+#' @param model.syntax \code{lavaan} syntax. Overwritten by \code{compiled_model}.
+#'
+#' @param data A dataframe with observed variables used in the model.
+#'
+#' @param compiled_model Compiled model from \code{\link{compile_stan_model}}. Saves time if the
+#' same \code{model.syntax} has to be reused multiple times.
+#'
+#' @param chains A positive integer specifying the number of Markov chains.
+#' The default is 2.
+#'
+#' @param iter A positive integer specifying the number of iterations for
+#' each chain (including warmup). The default is 2000.
+#'
+#' @param warmup A positive integer specifying the number of warmup (aka
+#' burnin) iterations per chain. If step-size adaptation is on
+#' (which it is by default), this also controls the number of
+#' iterations for which adaptation is run (and hence these
+#' warmup samples should not be used for inference). The number
+#' of warmup iterations should be smaller than \code{iter} and the
+#' default is \code{iter/2}.
+#'
+#' @param ordered Ordered (i.e., ordinal) variables.
+#'
+#' @param ordered.link Link function to be used for ordered variables.
+#'
+#' @param parameterization What parameterization to use? Default is \code{"centered"}.
+#'
+#' @param rcs Should latent variable indicators be replaced with reliability-corrected
+#' single item indicators instead? See \code{\link{relcorr_single_item}}.
+#'
+#' @param rcs.choose Which latent variables should get their indicators replaced with
+#' reliability-corrected single items? It is passed to \code{\link{relcorr_single_item}}
+#' as the \code{choose} argument.
+#'
+#' @param rcs.scale.corrected Should reliability-corrected items be scale-corrected? If \code{TRUE}
+#' reliability-corrected single items are corrected for differences in factor loadings between
+#' the items. Default is \code{TRUE}.
+#'
+#' @param ... Arguments passed to \code{rstan::sampling}.
+#'
+#' @export
+modsem_stan <- function(model.syntax = NULL,
+                        data = NULL,
+                        compiled_model = NULL,
+                        chains = 2,
+                        iter = 2000,
+                        warmup = iter / 2,
+                        ordered = NULL,
+                        ordered.link = "logit",
+                        parameterization = "centered",
+                        rcs = FALSE,
+                        rcs.choose = NULL,
+                        rcs.scale.corrected = TRUE,
+                        ...) {
+  promptInstallRstan()
+
+  if (!requireNamespace("rstan", quietly = TRUE))
+    stop2("The 'rstan' package is required to use the `modsem_stan()` function!")
+
+  if (rcs) { # use reliability-corrected single items?
+ corrected <- relcorr_single_item( + syntax = model.syntax, + data = data, + choose = rcs.choose, + scale.corrected = rcs.scale.corrected, + warn.lav = FALSE + ) + + model.syntax <- corrected$syntax + data <- corrected$data + } + + stopif(is.null(model.syntax) && rcs, + "`model.syntax` argument is needed when `rcs=TRUE`!") + + if (is.null(compiled_model) || rcs) { + stopif(is.null(model.syntax), + "One of `model.syntax` or `compiled_model` has to be provided!") + # pass ordered through so codegen knows which indicators are ordinal + compiled_model <- compile_stan_model(model.syntax, + ordered = ordered, + ordered.link = ordered.link, + parameterization = parameterization) + + } else { + # normalize ordered for downstream data + # prep even when compiled_model is provided + if (is.null(ordered)) ordered <- character(0) + } + + lVs <- compiled_model$info$lVs + indsLVs <- compiled_model$info$indsLVs + inds <- unique(unlist(indsLVs)) + etas <- compiled_model$info$etas + deps <- c(inds, etas) + + # IMPORTANT: pass ordered to the data builder so it supplies INDICATORS_ and K_ + stan_data <- getStanData(compiled_model = compiled_model, + data = data, + ordered = ordered) + + message("Sampling Stan model...") + fit <- rstan::sampling(object = compiled_model$stan_model, + data = stan_data, + chains = chains, + iter = iter, + warmup = warmup, + pars = compiled_model$info$exclude.pars, + include = FALSE, + ...) + + diagnostics <- rstan::summary(fit)$summary + + samples <- as.matrix(fit) + # DO NOT drop indexed params globally; we need cutpoints like x2__CUTPOINTS[1]. + # Vectors of length N (e.g., latent time-series) are already excluded via `exclude.pars`. 
+ + # Map Stan parameter labels -> lavaan-like labels + cleanPars <- \(pars) stringr::str_replace_all(pars, STAN_OPERATOR_LABELS) + + # We will build lhs/op/rhs in two passes: + # (A) cutpoints: __CUTPOINTS[i] -> lhs=, op="|", rhs=paste0("t", i) + # (B) everything else: use existing operator mapping + + par_names_raw <- colnames(samples) + + is_cut <- grepl("__CUTPOINTS\\[[0-9]+\\]$", par_names_raw) + cutRaw <- par_names_raw[is_cut] + + # Extract lhs (indicator) and threshold index + cutLhs <- sub("__CUTPOINTS\\[[0-9]+\\]$", "", cutRaw) + cut_k <- as.integer(sub("^.*__CUTPOINTS\\[([0-9]+)\\]$", "\\1", cutRaw)) + cutRhs <- if (length(cut_k)) paste0("t", cut_k) else NULL + cutOp <- rep("|", length(cutRaw)) + + # Non-cutpoint parameters + noncutRaw <- par_names_raw[!is_cut] + noncutClean <- cleanPars(noncutRaw) + + OP <- "~~~|~~|=~|~1|~" # lavaan operators we already produce + noncutOp <- stringr::str_extract(noncutClean, pattern = OP) + noncutOp[is.na(noncutOp)] <- ":=" + + lr <- stringr::str_split_fixed(noncutClean, pattern = OP, n = 2) + noncutLhs <- lr[, 1] + noncutRhs <- lr[, 2] + + # Square residual SDs (variances) for deps (as before) + # isSD <- noncutLhs == noncutRhs & noncutOp == "~~~" + # if (any(isSD)) samples[, match(noncutRaw[isSD], par_names_raw)] <- samples[, match(noncutRaw[isSD], par_names_raw)]^2 + + # Combine back the label pieces for all params (cutpoints first or interleaved – order does not matter) + allLhs <- c(cutLhs, noncutLhs) + allOp <- c(cutOp, noncutOp) + allRhs <- c(cutRhs, noncutRhs) + allOp[allOp == "~~~"] <- "~~" + + # Reorder samples consistently with the combined vectors + samples <- samples[, c(cutRaw, noncutRaw), drop = FALSE] + + # Remove := + keep <- allOp != ":=" + lhs <- allLhs[keep] + op <- allOp[keep] + rhs <- allRhs[keep] + + samples <- samples[, keep, drop = FALSE] + namesSamplesRaw <- colnames(samples) + diagnostics <- diagnostics[namesSamplesRaw, , drop = FALSE] + + # Detect duplicated residual covariance columns 
(order-insensitive) + sym_mask <- op == "~~" & lhs != rhs + dedup_key <- paste( + op, + ifelse(sym_mask, pmin(lhs, rhs), lhs), + ifelse(sym_mask, pmax(lhs, rhs), rhs), + sep = "::" + ) + + dup_idx <- duplicated(dedup_key) + if (any(dup_idx)) { + samples <- samples[, !dup_idx, drop = FALSE] + lhs <- lhs[!dup_idx] + op <- op[!dup_idx] + rhs <- rhs[!dup_idx] + diagnostics <- diagnostics[!dup_idx, , drop = FALSE] + dedup_key <- dedup_key[!dup_idx] + } + + # Align residual covariance orientation with the compiled parTable, if possible + sym_mask <- op == "~~" & lhs != rhs + if (any(sym_mask)) { + par_input <- compiled_model$info$parTable + input_sym <- par_input$op == "~~" & par_input$lhs != par_input$rhs + if (any(input_sym)) { + input_key <- paste( + par_input$op[input_sym], + pmin(par_input$lhs[input_sym], par_input$rhs[input_sym]), + pmax(par_input$lhs[input_sym], par_input$rhs[input_sym]), + sep = "::" + ) + + current_key <- dedup_key[sym_mask] + match_idx <- match(current_key, input_key) + has_match <- !is.na(match_idx) + + if (any(has_match)) { + idx <- which(sym_mask)[has_match] + lhs[idx] <- par_input$lhs[input_sym][match_idx[has_match]] + rhs[idx] <- par_input$rhs[input_sym][match_idx[has_match]] + } + + if (any(!has_match)) { + idx <- which(sym_mask)[!has_match] + lhs_new <- pmin(lhs[idx], rhs[idx]) + rhs_new <- pmax(lhs[idx], rhs[idx]) + lhs[idx] <- lhs_new + rhs[idx] <- rhs_new + } + } else { + idx <- which(sym_mask) + lhs_new <- pmin(lhs[idx], rhs[idx]) + rhs_new <- pmax(lhs[idx], rhs[idx]) + lhs[idx] <- lhs_new + rhs[idx] <- rhs_new + } + } + + colnames(samples) <- paste0(lhs, op, rhs) + rownames(diagnostics) <- colnames(samples) + + # Summaries + coefs <- apply(samples, MARGIN = 2, FUN = mean) + vcov <- stats::cov(samples) + rhat <- diagnostics[, "Rhat"] + neff <- diagnostics[, "n_eff"] + + # Build parTable (lavaan-like), including thresholds + pars_clean_for_table <- cleanPars(colnames(samples)) # human-friendly labels where relevant + + se <- 
sqrt(diag(vcov)) + ci.lower <- apply(samples, MARGIN = 2, FUN = quantile, probs = 0.05) + ci.upper <- apply(samples, MARGIN = 2, FUN = quantile, probs = 0.95) + # non-symmetric P-value (Mplus note: https://www.statmodel.com/download/FAQ-Bootstrap%20-%20Pvalue.pdf) + B <- NROW(samples) + M <- apply(samples, MARGIN = 2, FUN = \(x) sum(x < 0)) + p.value <- 2 * apply(cbind(M/B, 1 - M/B), MARGIN = 1, FUN = min) # Two-sided (non-symmetric) + + # handle NaNs and zero SEs + se.zero <- se <= .Machine$double.eps + se [se.zero] <- NA + rhat[se.zero | is.nan(rhat)] <- NA + neff[se.zero | is.nan(neff)] <- NA + + parTable <- data.frame( + lhs = lhs, op = op, rhs = rhs, + est = coefs, std.error = se, + z.value = coefs / se, + p.value = p.value, + ci.lower = ci.lower, + ci.upper = ci.upper, + R.hat = rhat, n.eff = neff, + row.names = NULL + ) + + parTable <- tryCatch(sortParTableStan(parTable, compiled_model$info$parTable), + error = \(e) parTable) + parTable <- modsemParTable(parTable) + + out <- list( + fit = fit, + parTable = parTable, + coefs = coefs, + vcov = vcov, + samples = samples + ) + + class(out) <- "modsem_stan" + out +} + + +#' @export +summary.modsem_stan <- function(object, standardized = FALSE, ...) { + parTable <- parameter_estimates(object) + + if (standardized) { + parTable <- addTransformedEstimatesPT( + parTable = parTable, + values.from = "est", + values.to = "Std.all", + merge.by = c("lhs", "op", "rhs"), + FUN = standardized_estimates, + object = object, + ... + ) + } + + parTable$n.eff <- as.character(round(parTable$n.eff)) # print as integer, not float + summarize_partable(parTable) +} + + +#' @export +print.modsem_stan <- function(x, ...) { + print(x$parTable) +} + + +#' @export +parameter_estimates.modsem_stan <- function(object, ...) { + object$parTable +} + + +#' @export +standardized_estimates.modsem_stan <- function(object, + monte.carlo = FALSE, + mc.reps = 10000, + tolerance.zero = 1e-10, + ...) 
{ + stdSolution <- standardizedSolutionCOEFS( + object, + monte.carlo = monte.carlo, + mc.reps = mc.reps, + tolerance.zero = tolerance.zero, + ... + ) + + stdSolution$parTable +} + + +#' @export +#' @importFrom stats vcov +vcov.modsem_stan <- function(object, ...) { + object$vcov +} + + +#' @export +#' @importFrom stats coef +coef.modsem_stan <- function(object, ...) { + object$coefs +} + + +sortParTableStan <- function(parTable, parTable.input) { + etas <- getSortedEtas(parTable.input, isLV = TRUE) + xis <- getXis(parTable.input, checkAny = FALSE, etas = etas, isLV = TRUE) + + indsXis <- unlist(getIndsLVs(parTable.input, lVs = xis)) + indsEtas <- unlist(getIndsLVs(parTable.input, lVs = etas)) + + opOrder <- c("=~", "~", "~1", "~~", "|", ":=") + varOrder <- unique(c(indsXis, indsEtas, xis, etas)) + + getScore <- function(x, order.by) { + order.by <- unique(c(order.by, x)) # ensure that all of x is in order.by + mapping <- stats::setNames(seq_along(order.by), nm = order.by) + score <- mapping[x] + + if (length(score) != length(x)) { + warning2("Sorting of parameter estimates failed!\n", + immediate. = FALSE) + + return(seq_along(x)) + } + + score + } + + scoreLhs <- getScore(x = parTable$lhs, order.by = varOrder) + scoreOp <- getScore(x = parTable$op, order.by = opOrder) + scoreRhs <- getScore(x = parTable$rhs, order.by = varOrder) + + out <- parTable[order(scoreOp, scoreLhs, scoreRhs), , drop = FALSE] + rownames(out) <- NULL + + out +} + + +promptInstallRstan <- function() { + if (!interactive() || requireNamespace("rstan", quietly = TRUE)) + return(NULL) + + printf("The `rstan` package is needed to use `modsem_stan()`\n") + printf("Do you want to install it? 
(y/n)\n") + + choice <- readLines(n = 1L) + + if (!nchar(choice)) + return(invisible(NULL)) + + if (tolower(substr(choice, 1L, 1L)) == "y") + install.packages("rstan") +} diff --git a/R/standardized_solution.R b/R/standardized_solution.R index cd2d4780..6c09c805 100644 --- a/R/standardized_solution.R +++ b/R/standardized_solution.R @@ -7,9 +7,10 @@ transformedSolutionCOEFS <- function(object, center = TRUE, standardize = TRUE, ...) { - stopif(!inherits(object, c("modsem_da", "modsem_pi", "lavaan", "modsem_mplus")), - "The model must be of class `modsem_da`, `modsem_mplus`, `modsem_pi` or `lavaan`!") + stopif(!inherits(object, c("modsem_da", "modsem_pi", "lavaan", "modsem_mplus", "modsem_stan")), + "The model must be of class `modsem_da`, `modsem_mplus`, `modsem_pi`, `modsem_stan` or `lavaan`!") + isStan <- inherits(object, "modsem_stan") isLav <- inherits(object, "lavaan") isDA <- inherits(object, "modsem_da") isMplus <- inherits(object, "modsem_mplus") @@ -34,14 +35,16 @@ transformedSolutionCOEFS <- function(object, if (!NROW(parTable)) return(NULL) - if (isDA || isMplus) { + if (!"label" %in% names(parTable)) + parTable$label <- "" + + if (isDA || isMplus || isStan) { cols.keep <- c("lhs", "op", "rhs", "label", "est", "std.error") if ("group" %in% names(parTable)) cols.keep <- c(cols.keep, "group") cols.keep <- intersect(cols.keep, names(parTable)) parTable <- parTable[cols.keep] } else { # modsem_pi or lavaan - if (!"label" %in% names(parTable)) parTable$label <- "" if (!"se" %in% names(parTable)) parTable$se <- NA cols.keep <- c("lhs", "op", "rhs", "label", "est", "se") diff --git a/R/summarize_partable.R b/R/summarize_partable.R index b01ed923..21f06b04 100644 --- a/R/summarize_partable.R +++ b/R/summarize_partable.R @@ -59,6 +59,11 @@ summarize_partable <- function(parTable, z = "z.value" ) + standard.cols <- c("lhs", "op", "rhs", "label", "est", + "std.error", "p.value", "z.value", + "ci.lower", "ci.upper") + extra.cols <- setdiff(colnames(parTable), 
standard.cols) + width.out <- getWidthPrintedParTable( parTable = parTable, scientific = scientific, @@ -68,7 +73,8 @@ summarize_partable <- function(parTable, regressions = regressions, covariances = covariances, intercepts = intercepts, - variances = variances + variances = variances, + extra.cols = NULL ) info.names <- c("Number of model parameters", @@ -91,7 +97,8 @@ summarize_partable <- function(parTable, variances = variances, width.out = width.out, info.names = info.names, - info.values = info.values + info.values = info.values, + extra.cols = extra.cols ) class(out) <- c("list", "modsem_partable_summary") @@ -122,6 +129,7 @@ print.modsem_partable_summary <- function(x, ...) { regressions = x$regressions, covariances = x$covariances, intercepts = x$intercepts, - variances = x$variances + variances = x$variances, + extra.cols = x$extra.cols ) } diff --git a/R/utils_da.R b/R/utils_da.R index ff4e7802..307ccb7c 100644 --- a/R/utils_da.R +++ b/R/utils_da.R @@ -762,6 +762,41 @@ isPureEta <- function(eta, parTable) { } +addResidualCovariancesParTable <- function(parTable) { + etas <- getEtas(parTable, checkAny = FALSE) + + if (!length(etas)) + return(parTable) + + pureEtas <- etas[isPureEta(etas, parTable = parTable)] + k <- length(pureEtas) + + if (k <= 1L) + return(parTable) + + for (j in seq_len(k)) { # iterate in reverse order + eta.j <- pureEtas[[j]] + + for (i in seq_len(j - 1L)) { + eta.i <- pureEtas[[i]] + + exists <- ( + (parTable$lhs == eta.i & parTable$op == "~~" & parTable$rhs == eta.j) | + (parTable$lhs == eta.j & parTable$op == "~~" & parTable$rhs == eta.i) + ) + + if (any(exists)) next + + parTable <- rbind(parTable, data.frame( + lhs = eta.i, op = "~~", rhs = eta.j, mod = "" + )) + } + } + + parTable +} + + getCoefMatricesDA <- function(parTable, xis = NULL, etas = NULL, diff --git a/man/compile_stan_model.Rd b/man/compile_stan_model.Rd new file mode 100644 index 00000000..129f91c9 --- /dev/null +++ b/man/compile_stan_model.Rd @@ -0,0 +1,32 @@ +% 
Generated by roxygen2: do not edit by hand +% Please edit documentation in R/model_stan.R +\name{compile_stan_model} +\alias{compile_stan_model} +\title{Compile \code{STAN} model based on a \code{lavaan} model} +\usage{ +compile_stan_model( + model.syntax, + compile = TRUE, + force = FALSE, + ordered = NULL, + ordered.link = c("logit", "probit"), + parameterization = c("centered", "non-centered") +) +} +\arguments{ +\item{model.syntax}{\code{lavaan} syntax.} + +\item{compile}{Should compilation be performed? If \code{FALSE} only the \code{STAN} +is generated, and not compiled.} + +\item{force}{Should compilation of previously compiled models be forced?} + +\item{ordered}{Ordered (i.e., ordinal) variables.} + +\item{ordered.link}{Link function to be used for ordered variables.} + +\item{parameterization}{What parameterization to use? Default is \code{"centered"}.} +} +\description{ +Compile \code{STAN} model based on a \code{lavaan} model +} diff --git a/man/modsem_predict.Rd b/man/modsem_predict.Rd index 637ae2df..ceb526c2 100644 --- a/man/modsem_predict.Rd +++ b/man/modsem_predict.Rd @@ -49,8 +49,10 @@ values or factor scores from \code{\link{modsem}} models. } \section{Methods (by class)}{ \itemize{ -\item \code{modsem_predict(modsem_da)}: Computes (optionally standardised) factor scores via the -regression method using the baseline model unless \code{H0 = FALSE}. +\item \code{modsem_predict(modsem_da)}: Predict From \code{modsem} Models + +Computes (optionally standardised) factor scores via the + regression method using the baseline model unless \code{H0 = FALSE}. 
\item \code{modsem_predict(modsem_pi)}: Wrapper for \code{lavaan::predict} diff --git a/man/modsem_stan.Rd b/man/modsem_stan.Rd new file mode 100644 index 00000000..0276c82e --- /dev/null +++ b/man/modsem_stan.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/modsem_stan.R +\name{modsem_stan} +\alias{modsem_stan} +\title{Interaction between latent variables using Markow-Chain Monte-Carlo sampling via \code{STAN}} +\usage{ +modsem_stan( + model.syntax = NULL, + data = NULL, + compiled_model = NULL, + chains = 2, + iter = 2000, + warmup = iter/2, + ordered = NULL, + ordered.link = "logit", + parameterization = "centered", + rcs = FALSE, + rcs.choose = NULL, + rcs.scale.corrected = TRUE, + ... +) +} +\arguments{ +\item{model.syntax}{\code{lavaan} syntax. Overwritten by \code{compiled_model}.} + +\item{data}{A dataframe with observed variables used in the model.} + +\item{compiled_model}{Compiled model from \code{\link{compile_stan_model}}. Saves time if the +same \code{model.syntax} has to be reused multiple times.} + +\item{chains}{A positive integer specifying the number of Markov chains. +The default is 4.} + +\item{iter}{A positive integer specifying the number of iterations for +each chain (including warmup). The default is 2000.} + +\item{warmup}{A positive integer specifying the number of warmup (aka +burnin) iterations per chain. If step-size adaptation is on +(which it is by default), this also controls the number of +iterations for which adaptation is run (and hence these +warmup samples should not be used for inference). The number +of warmup iterations should be smaller than \code{iter} and the +default is \code{iter/2}.} + +\item{ordered}{Ordered (i.e., ordinal) variables.} + +\item{ordered.link}{Link function to be used for ordered variables.} + +\item{parameterization}{What parameterization to use? 
Default is \code{"centered"}.} + +\item{rcs}{Should latent variable indicators be replaced with reliability-corrected +single item indicators instead? See \code{\link{relcorr_single_item}}.} + +\item{rcs.choose}{Which latent variables should get their indicators replaced with +reliability-corrected single items? It is passed to \code{\link{relcorr_single_item}} +as the \code{choose} argument.} + +\item{rcs.scale.corrected}{Should reliability-corrected items be scale-corrected? If \code{TRUE} +reliability-corrected single items are corrected for differences in factor loadings between +the items. Default is \code{TRUE}.} + +\item{...}{Arguments passed to \code{stan::sampling}.} +} +\description{ +Interaction between latent variables using Markow-Chain Monte-Carlo sampling via \code{STAN} +} diff --git a/tests/testthat/test_modsem_stan.R b/tests/testthat/test_modsem_stan.R new file mode 100644 index 00000000..dd54eab1 --- /dev/null +++ b/tests/testthat/test_modsem_stan.R @@ -0,0 +1,296 @@ +devtools::load_all() + + +TEST_STAN <- FALSE + +if (TEST_STAN) { + library(mvtnorm) + library(rstan) + + rstan_options(auto_write = TRUE, threads_per_chain = 4) # cache compiled models + options(mc.cores = parallel::detectCores()) + + # Here we estimate the models using STAN. 
+ # We're essentially specifying the models as + # Bayesian models with flat priors for the model + # parameters, mimicking ML-estimation + + # ------------------------------------------------------------------------------ + # Two-way Interaction + # ------------------------------------------------------------------------------ + + m1 <- ' + X =~ x1 + x2 + x3 + Z =~ z1 + z2 + z3 + Y =~ y1 + y2 + y3 + + Y ~ X + Z + X:Z + ' + + # First we compile the STAN model, this can be slow + # and is therefore done once, such that the compiled + # STAN code can be reused for the same model syntax later + compiled_model.2way <- compile_stan_model(m1) + compiled_model.2way <- compile_stan_model(m1) + cat(compiled_model.2way$syntax) + + # Fit the model based on the compiled STAN code + fit.2way <- modsem_stan(compiled_model = compiled_model.2way, + data = oneInt, iter = 2000, chains = 2) + # We can get a summary + summary(fit.2way) + + # We can also get the standardized (and unstandardized) + # estimates + standardized_estimates(fit.2way) + parameter_estimates(fit.2way) + + + # ------------------------------------------------------------------------------ + # Three-way Interaction + # ------------------------------------------------------------------------------ + + # Simulate data + + set.seed(29723234) + n <- 1400 + Sigma <- matrix(c( + 1.2, 0.7, 0.8, + 0.7, 1.8, 0.6, + 0.8, 0.6, 1.4 + ), nrow = 3) + + XI <- rmvnorm(n, sigma = Sigma) + + X <- XI[, 1] + Z <- XI[, 2] + W <- XI[, 3] + + Y <- 1.2 * X + 0.4 * Z + 0.7 * W + + 0.2 * W * Z + + 0.7 * W * X + + 1.2 * X * Z + + 2.2 * X * Z * W + rnorm(n, sd = sqrt(2)) + + createInd <- \(x, lambda, tau, epsilon = 0.2) { + tau + lambda * x + rnorm(n, sd = sqrt(epsilon)) + } + + x1 <- createInd(X, 1.0, 1.2) + x2 <- createInd(X, 0.8, 0.8) + x3 <- createInd(X, 0.9, 1.0) + + z1 <- createInd(Z, 1.0, 1.2) + z2 <- createInd(Z, 0.8, 0.8) + z3 <- createInd(Z, 0.9, 1.0) + + w1 <- createInd(W, 1.0, 1.2) + w2 <- createInd(W, 0.8, 0.8) + w3 <- 
createInd(W, 0.9, 1.0) + + y1 <- createInd(Y, 1.0, 1.2) + y2 <- createInd(Y, 0.8, 0.8) + y3 <- createInd(Y, 0.9, 1.0) + + data.3way <- data.frame(x1, x2, x3, + z1, z2, z3, + w1, w2, w3, + y1, y2, y3) + m.3way <- ' + X =~ x1 + x2 + x3 + Z =~ z1 + z2 + z3 + W =~ w1 + w2 + w3 + Y =~ y1 + y2 + y3 + + Y ~ X + Z + W + X:Z + X:W + Z:W + X:Z:W + # True values are + # Y ~ 1.2 * X + + # 0.4 * Z + + # 0.7 * W + + # 1.2 * X:Z + + # 0.7 * X:W + + # 0.2 * Z:W + + # 2.2 * X:Z:W + + ' + + # First we compile the STAN model, this can be slow + # and is therefore done once, such that the compiled + # STAN code can be reused for the same model syntax later + compiled_model.3way <- compile_stan_model(m.3way) + + # Fit the model based on the compiled STAN code + fit.3way <- modsem_stan(compiled_model = compiled_model.3way, + data = data.3way, + chains = 2, + iter = 10000) # More iterations should yield more stable estimates + #> Regressions: + #> Estimate Std.Error z.value P(>|z|) + #> Y ~ + #> X 1.040 0.159 6.535 0.000 + #> Z 0.423 0.128 3.296 0.001 + #> W 0.696 0.126 5.524 0.000 + #> X:Z 1.385 0.173 8.023 0.000 + #> X:W 0.733 0.175 4.193 0.000 + #> Z:W 0.190 0.178 1.070 0.284 + #> X:Z:W 2.227 0.107 20.731 0.000 + + summary(fit.3way) + standardized_estimates(fit.3way) + parameter_estimates(fit.3way) + + fit.3way.rcs <- modsem_stan(model.syntax = m.3way, + data = data.3way, + rcs = TRUE, + chains = 2, + iter = 10000) + #> Regressions: (n = 1400) + #> Estimate Std.Error z.value P(>|z|) R.hat n.eff + #> Y ~ + #> X 1.269 0.094 13.448 0.000 1.015 342 + #> Z 0.432 0.070 6.167 0.000 1.013 351 + #> W 0.749 0.070 10.757 0.000 1.007 753 + #> X:Z 1.300 0.096 13.509 0.000 1.008 536 + #> X:W 0.753 0.105 7.198 0.000 1.011 511 + #> Z:W 0.265 0.091 2.896 0.004 1.013 319 + #> X:Z:W 2.229 0.054 40.929 0.000 1.000 7409 + + + # ------------------------------------------------------------------------------ + # ORDERED INDICATORS + # 
------------------------------------------------------------------------------ + + m1 <- ' + # Outer Model + X =~ x1 + x2 + x3 + Z =~ z1 + z2 + z3 + Y =~ y1 + y2 + y3 + + # Inner Model + Y ~ X + Z + X:Z + ' + + + rthreshold <- \(k, offset = runif(1, min = -1, max = 1), sigma = 0.35) { + t <- seq_len(k) - mean(seq_len(k)) + offset + t <- t + runif(k, min = -sigma, max = sigma) + c(-Inf, t, Inf) + } + + + cut_data <- function(data, k = 5, choose = NULL) { + if (is.null(choose)) + choose <- colnames(data) + + standardize <- \(x) (x - mean(x)) / sd(x) + + thresholds <- list() + for (var in choose) { + x <- standardize(data[[var]]) + t <- rthreshold(k) + y <- cut(x, breaks = t, ordered_result = TRUE) + + min.x <- min(x) + max.x <- max(x) + + data[[var]] <- y + thresholds[[var]] <- t[t >= min.x & t <= max.x] + } + + list(data = data, thresholds = thresholds) + } + + + choose <- colnames(oneInt) + set.seed(2837290) + CUTS <- cut_data(oneInt, choose = choose) + oneInt2 <- CUTS$data + compiled <- compile_stan_model(m1, ordered = choose) + stan <- modsem_stan(compiled_model = compiled, data = oneInt2, + ordered = choose, iter = 10000) + thresholds <- CUTS$thresholds + + thresholds.table <- NULL + parTable <- parameter_estimates(stan) + for (col in choose) { + tau.true <- thresholds[[col]] + tau.true <- tau.true[is.finite(tau.true)] + mask <- parTable$lhs == col & parTable$op == "|" + tau.est <- parTable[mask, "est"] + tau.lower <- parTable[mask, "ci.lower"] + tau.upper <- parTable[mask, "ci.upper"] + pars <- paste0(col, "|t", seq_along(tau.true)) + + rows <- data.frame(parameter = pars, true = tau.true, + est = tau.est, diff = tau.true - tau.est, + ci.lower = tau.lower, ci.upper = tau.upper, + ok = tau.true >= tau.lower & tau.true <= tau.upper) + thresholds.table <- rbind(thresholds.table, rows) + } + + print(modsemParTable(thresholds.table)) + testthat::expect_true(sum(thresholds.table$ok) / NROW(thresholds.table) >= 0.95) # 95% confidence + + # 
---------------------------------------------------------------------------- + # Reliability corrected single items + # ---------------------------------------------------------------------------- + + m1 <- ' + X =~ x1 + x2 + x3 + Z =~ z1 + z2 + z3 + Y =~ y1 + y2 + y3 + + Y ~ X + Z + X:Z + ' + + # First we compile the STAN model, this can be slow + # and is therefore done once, such that the compiled + # STAN code can be reused for the same model syntax later + # Fit the model based on the compiled STAN code + fit.2way <- modsem_stan(model.syntax = m1, rcs = TRUE, + data = oneInt, iter = 2000, chains = 2) + # We can get a summary + summary(fit.2way) + + set.seed(82423) + n <- 500 + X <- rnorm(n, mean = 0, sd = sqrt(1.2)) + Z <- rnorm(n, mean = 0, sd = sqrt(1.4)) + + Psi <- matrix(c(0.6, 0.2, 0.2, 0.4), nrow = 2, ncol = 2) + Zeta <- mvtnorm::rmvnorm(n, mean = c(0, 0), sigma = Psi) + + Theta <- matrix(c(0.2, 0.15, 0.15, 0.2), nrow = 2) + Epsilon <- mvtnorm::rmvnorm(n, mean = c(0, 0), sigma = Theta) + + Y1 <- 0.7 * X + 0.3 * Z + Zeta[, 1L] + Y2 <- 0.3 * X + 0.7 * Z + Zeta[, 2L] + + x1 <- X + 1.2 + rnorm(n, mean = 0, sd = sqrt(0.2)) + x2 <- 0.8 * X + 0.9 + rnorm(n, mean = 0, sd = sqrt(0.2)) + + z1 <- Z + 1.2 + rnorm(n, mean = 0, sd = sqrt(0.2)) + z2 <- 0.8 * Z + 0.9 + rnorm(n, mean = 0, sd = sqrt(0.2)) + + y1 <- Y1 + 1.2 + rnorm(n, mean = 0, sd = sqrt(0.2)) + y2 <- 0.8 * Y1 + 0.9 + Epsilon[, 1L] + + y3 <- Y2 + 1.2 + rnorm(n, mean = 0, sd = sqrt(0.2)) + y4 <- 0.8 * Y2 + 0.9 + Epsilon[, 2L] + + data <- data.frame(x1, x2, z1, z2, y1, y2, y3, y4) + + mod <- ' + X =~ x1 + x2 + Z =~ z1 + z2 + Y1 =~ y1 + y2 + Y2 =~ y3 + y4 + + Y1 ~ X + Z + Y2 ~ X + Z + + y2 ~~ y4 + ' + + fit.lms <- modsem(mod, data, method = "lms") + fit.stan <- modsem_stan(mod, data, iter = 10000) +}