# R/bruceR_multivariate.R In psychbruce/bruceR: BRoadly Useful Collections and Extensions of R functions

#### Documented in AlphaCFACONSECCOUNTEFAMEANMODERECODERESCALESTDSUM

#### Variable Computing ####

#' Recode a variable
#'
#' @param var Variable (numeric vector, character vector, or factor).
#' @param recodes Character string: e.g., \code{"lo:1=0; c(2,3)=1; 4=2; 5:hi=3; else=999"}.
#' @examples
#' d=data.table(var=c(NA, 0, 1, 2, 3, 4, 5, 6))
#' d
#'
#' d[,":="(var.recoded=RECODE(var, "lo:1=0; c(2,3)=1; 4=2; 5:hi=3; else=999"))]
#' d
#' @export
RECODE=function(var, recodes) {
car::recode(var, recodes)
}

#' Rescale likert scales (e.g., from 5-point to 7-point)
#' @param var Variable (numeric vector).
#' @param from Numeric vector, the range of old scale (e.g., \code{1:5}).
#' If not set, it will compute the range of \code{var}.
#' @param to Numeric vector, the range of new scale (e.g., \code{1:7}).
#' @examples
#' d=data.table(var=rep(1:5, 2))
#' d[,":="(var1=RESCALE(var, to=1:7),
#'         var2=RESCALE(var, from=1:5, to=1:7))]
#' d  # var1 is equal to var2
#' @export
RESCALE=function(var, from=range(var), to) {
(var-median(from)) / (max(from)-median(from)) * (max(to)-median(to)) + median(to)
}

#' A tool box for multivariate computing
#'
#' @description
#' Easily compute the sum, mean, or other indexes of a scale.
#' Reverse scoring can also be easily implemented, without generating extra variables
#' (\code{\link{Alpha}} uses a similar method to deal with reverse scoring)!
#'
#' Three ways to specify the variable list:
#' \enumerate{
#'   \item \code{\strong{var + items}}: use the common and unique parts of variable names.
#'   \item \code{\strong{vars}}: manually define the variable list.
#'   \item \code{\strong{varrange}}: use the start and stop positions.
#' }
#' @param data \code{data.frame} or \code{data.table}.
#' @param var \strong{[option 1]} Common part across multiple variables (e.g., \code{"RSES", "SWLS"}).
#' @param items \strong{[option 1]} Unique part across multiple variables (e.g., \code{1:10}).
#' @param vars \strong{[option 2]} Character vector specifying the variable list (e.g., \code{c("x1", "x2", "x3")}).
#' @param varrange \strong{[option 3]} Character with \code{":"} specifying the start and stop positions of variables (e.g., \code{"A1:E5"}).
#' @param value [only for \code{COUNT}] The value to be counted.
#' @param rev [optional] Reverse-scoring variables. It can be
#' 1) a numeric vector specifying the positions of reverse-scoring variables (not recommended) or
#' 2) a character vector directly specifying the variable list (recommended).
#' @param likert [optional] Range of likert scale (e.g., \code{1:5}, \code{c(1,5)}).
#' If not provided, it will be automatically estimated from the given data (BUT you should use this carefully).
#' @param na.rm Ignore missing values. Default is \code{TRUE}.
#' @param values [only for \code{CONSEC}] Values to be counted as consecutive identical values. Default is all numbers (\code{0:9}).
#' @examples
#' d=data.table(x1=1:5,
#'              x4=c(2,2,5,4,5),
#'              x3=c(3,2,NA,NA,5),
#'              x2=c(4,4,NA,2,5),
#'              x5=c(5,4,1,4,5))
#' d
#' ## I deliberately set this order to show you
#' ## the difference between "vars" and "varrange".
#'
#' d[,":="(na=COUNT(d, "x", 1:5, value=NA),
#'         n.2=COUNT(d, "x", 1:5, value=2),
#'         sum=SUM(d, "x", 1:5),
#'         m1=MEAN(d, "x", 1:5),
#'         m2=MEAN(d, vars=c("x1", "x4")),
#'         m3=MEAN(d, varrange="x1:x2", rev="x2", likert=1:5),
#'         cons1=CONSEC(d, "x", 1:5),
#'         cons2=CONSEC(d, varrange="x1:x5")
#'         )]
#' d
#' ## It has already changed.
#'
#' ## NOTE: ":=" is indeed a special function in the 'data.table' package.
#' ## See a similar function "mutate()" in the 'dplyr' package: ?dplyr::mutate
#' ## For data.table, you need NOT to re-assign the tranformed data object,
#' ## because it can automatically update the variables in situ!
#' @name %%COMPUTE%%
#' @aliases COUNT SUM MEAN STD CONSEC
NULL

convert2vars=function(data,
var=NULL, items=NULL,
vars=NULL,
varrange=NULL,
rev=NULL) {
if(!is.null(varrange)) {
dn=names(data)
varrange=strsplit(varrange, ":")[[1]]
vars=dn[which(dn==varrange[1]):which(dn==varrange[2])]
}
if(is.null(vars)) vars=paste0(var, items)
if(is.numeric(rev)) rev=paste0(var, rev)  # bug fixed on 2019-09-28
if(is.character(rev)) rev=which(vars %in% rev)
vars.raw=vars
vars=paste(deparse(substitute(data)), vars, sep="$") return(list(vars.raw=vars.raw, vars=vars, rev=rev)) } #' @describeIn grapes-grapes-COMPUTE-grapes-grapes \strong{Count} a certain value across multiple variables #' @export COUNT=function(data, var=NULL, items=NULL, vars=NULL, varrange=NULL, value=NA) { Count=function(...) sum(c(...), na.rm=TRUE) v.r=convert2vars(data, var, items, vars, varrange) vars=v.r$vars
if(is.na(value))
varlist=paste0("is.na(", vars, ")")
else
varlist=paste0(vars, "==", value)
eval(parse(text=paste0("mapply(Count, ", paste(varlist, collapse=", "), ")")))
}

#' @describeIn grapes-grapes-COMPUTE-grapes-grapes Compute the \strong{mode} across multiple variables
#' @export
MODE=function(data, var=NULL, items=NULL, vars=NULL, varrange=NULL) {
getmode=function(v) {
uniqv=unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
Mode=function(...) getmode(c(...))
varlist=convert2vars(data, var, items, vars, varrange)$vars eval(parse(text=paste0("mapply(Mode, ", paste(varlist, collapse=", "), ")"))) } #' @describeIn grapes-grapes-COMPUTE-grapes-grapes Compute the \strong{sum} across multiple variables #' @export SUM=function(data, var=NULL, items=NULL, vars=NULL, varrange=NULL, rev=NULL, likert=NULL, na.rm=TRUE) { Sum=function(...) sum(..., na.rm=na.rm) v.r=convert2vars(data, var, items, vars, varrange, rev) vars=v.r$vars
rev=v.r$rev if(!is.null(rev) & is.null(likert)) { ranges=apply(as.data.frame(data)[,v.r$vars.raw], 2, range)
likert=c(min(ranges[1,], na.rm=TRUE), max(ranges[2,], na.rm=TRUE))
warning("The range of likert scale was automatically estimated from the given data. If you are not sure about this, please specify the 'likert' parameter. See ?SUM")
}
pre=rep("", length(vars))
pre[rev]=ifelse(is.null(likert), "", paste0(sum(range(likert)), "-"))
varlist=paste0(pre, vars)
eval(parse(text=paste0("mapply(Sum, ", paste(varlist, collapse=", "), ")")))
}

#' @describeIn grapes-grapes-COMPUTE-grapes-grapes Compute the \strong{mean} across multiple variables
#' @export
MEAN=function(data, var=NULL, items=NULL, vars=NULL, varrange=NULL,
rev=NULL, likert=NULL,
na.rm=TRUE) {
Mean=function(...) mean(c(...), na.rm=na.rm)
v.r=convert2vars(data, var, items, vars, varrange, rev)
vars=v.r$vars rev=v.r$rev
if(!is.null(rev) & is.null(likert)) {
ranges=apply(as.data.frame(data)[,v.r$vars.raw], 2, range) likert=c(min(ranges[1,], na.rm=TRUE), max(ranges[2,], na.rm=TRUE)) warning("The range of likert scale was automatically estimated from the given data. If you are not sure about this, please specify the 'likert' parameter. See ?MEAN") } pre=rep("", length(vars)) pre[rev]=ifelse(is.null(likert), "", paste0(sum(range(likert)), "-")) varlist=paste0(pre, vars) eval(parse(text=paste0("mapply(Mean, ", paste(varlist, collapse=", "), ")"))) } #' @describeIn grapes-grapes-COMPUTE-grapes-grapes Compute the \strong{standard deviation} across multiple variables #' @export STD=function(data, var=NULL, items=NULL, vars=NULL, varrange=NULL, rev=NULL, likert=NULL, na.rm=TRUE) { Std=function(...) sd(c(...), na.rm=na.rm) v.r=convert2vars(data, var, items, vars, varrange, rev) vars=v.r$vars
rev=v.r$rev if(!is.null(rev) & is.null(likert)) { ranges=apply(as.data.frame(data)[,v.r$vars.raw], 2, range)
likert=c(min(ranges[1,], na.rm=TRUE), max(ranges[2,], na.rm=TRUE))
warning("The range of likert scale was automatically estimated from the given data. If you are not sure about this, please specify the 'likert' parameter. See ?STD")
}
pre=rep("", length(vars))
pre[rev]=ifelse(is.null(likert), "", paste0(sum(range(likert)), "-"))
varlist=paste0(pre, vars)
eval(parse(text=paste0("mapply(Std, ", paste(varlist, collapse=", "), ")")))
}

#' @describeIn grapes-grapes-COMPUTE-grapes-grapes Compute the \strong{consecutive identical digits} across multiple variables (especially useful in detecting careless responding)
#' @import stringr
#' @export
CONSEC=function(data, var=NULL, items=NULL,
vars=NULL,
varrange=NULL,
values=0:9) {
Conseq=function(string, number=values) {
# Consecutive Identical Digits
pattern=paste(paste0(number, "{2,}"), collapse="|")
ifelse(grepl(pattern, string), max(nchar(str_extract_all(string=string, pattern=pattern, simplify=TRUE))), 0)
}
v.r=convert2vars(data, var, items, vars, varrange)
vars=v.r$vars varlist=vars eval(parse(text=paste0("mapply(Conseq, paste0(", paste(varlist, collapse=", "), "))"))) } #### Reliability, EFA, and CFA #### #' Reliability analysis (Cronbach's \eqn{\alpha} and corrected item-total correlation) ## @import jmv #' @inheritParams %%COMPUTE%% #' @examples #' Alpha(bfi, "E", 1:5) # "E1" & "E2" should be reverse scored; see ?bfi #' Alpha(bfi, "E", 1:5, rev=1:2) # right #' @export Alpha=function(data, var, items, vars=NULL, rev=NULL) { if(is.null(vars)) vars=paste0(var, items) if(!is.null(rev)) rev=paste0(var, rev) jmv::reliability(data, vars=eval(vars), revItems=eval(rev), meanScale=TRUE, sdScale=TRUE, itemRestCor=TRUE, alphaItems=TRUE) } #' Exploratory factor analysis (EFA) #' #' Based on \code{jmv::\link[jmv]{efa}}. ## @import jmv #' @inheritParams %%COMPUTE%% #' @param vartext Character string specifying the model (e.g., \code{"X[1:5] + Y[c(1,3)] + Z"}). #' @param method \code{"eigen"} (default), \code{"parallel"}, or \code{"fixed"}, the way to determine the number of factors. #' @param extraction \code{"pa"} (default), \code{"ml"}, or \code{"minres"}, #' using "prinicipal axis", "maximum likelihood", or "minimum residual" as the factor extraction method, respectively. #' @param rotation \code{"varimax"} (default), \code{"oblimin"}, or \code{"none"}, the rotation method. #' @param nFactors An integer (default is 1) fixing the number of factors. #' #' Only relevant when \code{method="fixed"}. #' @param hideLoadings A number (0~1, default is 0.3) for hiding factor loadings below this value. #' @note It does not have the extraction method "Principal Components". You may still use SPSS. #' @seealso #' \code{jmv::\link[jmv]{efa}} #' @examples #' EFA(bfi, "E[1:5] + A[1:5] + C[1:5] + N[1:5] + O[1:5]", method="fixed", nFactors=5) #' @export EFA=function(data, vartext, method="eigen", extraction="pa", rotation="varimax", nFactors=1, hideLoadings=0.3) { jmv::efa(data, vars=eval(expand_vars(vartext)), nFactorMethod=method, # "eigen", "parallel", "fixed" extraction=extraction, # "pa", "ml", "minres" rotation=rotation, # "none", "varimax", "quartimax", "promax", "oblimin", "simplimax" minEigen=1, nFactors=nFactors, hideLoadings=hideLoadings, sortLoadings=TRUE, screePlot=TRUE, eigen=TRUE, factorCor=TRUE, factorSummary=TRUE, modelFit=TRUE, kmo=TRUE, bartlett=TRUE) } ## Expand multiple variables with complex string formats ## Input: "X[1:5] + Y[c(1,3)] + Z" ## Output: expand_vars=function(vartext) { vartexts=gsub(" ", "", strsplit(vartext, "\\+")[[1]]) vars=c() for(vartext.i in vartexts) { if(grepl("\$|\$", vartext.i)==TRUE) { vars.i=eval(parse(text=paste0("paste0('", gsub("\\]", ")", gsub("\\[", "',", vartext.i))))) } else { vars.i=vartext.i } vars=c(vars, vars.i) } return(vars) } ## CFA model formula transformation modelCFA.trans=function(style=c("jmv", "lavaan"), model, highorder="") { # model: free style input model=gsub("^\\s+|\\s+$", "", model)
model=strsplit(gsub(" ", "", strsplit(model, "(;|\\n)+")[[1]]), "=~")
# jmv style
model.jmv=list()
for(i in 1:length(model)) {
var=model[[i]][[2]]
vars=expand_vars(var)
model.jmv[[i]]=list(label=model[[i]][[1]],
vars=vars)
}
# lavaan style
model.lav=c()
for(i in 1:length(model.jmv)) {
model.i=paste(model.jmv[[i]]$label, paste(model.jmv[[i]]$vars, collapse=" + "), sep=" =~ ")
model.lav=c(model.lav, model.i)
}
model.lav=paste(model.lav, collapse="\n")
# high-order CFA (only for lavaan)
factors=sapply(model.jmv, function(x) x$label) if(highorder!="") model.lav=paste(model.lav, paste(highorder, "=~", paste(factors, collapse=" + ")), paste(highorder, "~~", highorder), sep="\n") # output if(style=="jmv") return(model.jmv) if(style=="lavaan") return(model.lav) } #' Confirmatory factor analysis (CFA) #' #' Based on \code{jmv::\link[jmv]{cfa}} and \code{lavaan::\link[lavaan]{cfa}}. ## @import jmv #' @import lavaan #' @import semPlot #' @inheritParams %%COMPUTE%% #' @param model Model formula. See examples. #' @param highorder High-order factor. Default is \code{""}. #' @param orthogonal Default is \code{FALSE}. If \code{TRUE}, all covariances among latent variables are set to zero, and only "lavaan" style will be output. #' @param missing Default is \code{"listwise"}. Alternative is \code{"fiml"} (using "Full Information Maximum Likelihood" method to estimate the model). #' @param style \code{"jmv"}, \code{"lavaan"}, or both (default). #' #' If the model has high-order factors, only "lavaan" style will be output. #' @param CI \code{TRUE} or \code{FALSE} (default), provide confidence intervals for the model estimates. #' @param MI \code{TRUE} or \code{FALSE} (default), provide modification indices for the parameters not included in the model. #' @param plot \code{TRUE} (default) or \code{FALSE}, provide a path diagram of the model. #' @seealso #' \code{jmv::\link[jmv]{cfa}}, \code{lavaan::\link[lavaan]{cfa}} #' @examples #' data.cfa=lavaan::HolzingerSwineford1939 #' CFA(data.cfa, "Visual =~ x[1:3]; Textual =~ x[c(4,5,6)]; Speed =~ x7 + x8 + x9") #' CFA(data.cfa, model=" #' Visual =~ x[1:3] #' Textual =~ x[c(4,5,6)] #' Speed =~ x7 + x8 + x9 #' ", highorder="Ability") #' #' data.bfi=psych::bfi #' data.bfi=data.bfi[complete.cases(data.bfi),] #' CFA(data.bfi, "E =~ E[1:5]; A =~ A[1:5]; C =~ C[1:5]; N =~ N[1:5]; O =~ O[1:5]") #' @export CFA=function(data, model="A =~ a[1:5]; B =~ b[c(1,3,5)]; C =~ c1 + c2 + c3", highorder="", orthogonal=FALSE, missing="listwise", style=c("jmv", "lavaan"), CI=FALSE, MI=FALSE, plot=FALSE) { model.jmv=modelCFA.trans("jmv", model) model.lav=modelCFA.trans("lavaan", model, highorder) if(orthogonal==TRUE | highorder!="") style="lavaan" if(plot==TRUE & "lavaan" %notin% style) style=append(style, "lavaan") Print("#### Latent variable definitions ####") cat(model.lav, "\n\n") results=list() # jmv style if("jmv" %in% style) { fit.jmv=jmv::cfa(data=data, factors=model.jmv, resCov=NULL, constrain="facVar", # or "facInd" # 'facVar' fixes the factor variances to 1 # 'facInd' fixes each factor to the scale of its first indicator ci=CI, mi=MI, # modification indices stdEst=TRUE, resCovEst=TRUE, # pathDiagram=plot, fitMeasures=c("cfi", "tli", "rmsea", "srmr", "aic", "bic"), miss=missing) # fiml (default), listwise # cat(r$modelSyntax)
cat("#### jamovi style output ####\n")
print(fit.jmv)
results=c(results, fit.jmv=fit.jmv)
}

# lavaan style
if("lavaan" %in% style) {
fit.lav=lavaan::cfa(data=data, model=model.lav,
std.lv=TRUE,
# TRUE: fixing the factor residual variances to 1
# FALSE: fixing the factor loading of the first indicator to 1
orthogonal=orthogonal,
missing=missing) # fiml, listwise (default)
cat("#### lavaan style output ####\n\n")
summary(fit.lav, fit.measures=TRUE, standard=TRUE)
if(MI) print(modificationIndices(fit.lav))