inst/developer/function_ideas/umx_build_umxACEcov_fixed.R

#
#   Copyright 2007-2022 Timothy C. Bates
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
# 
#        https://www.apache.org/licenses/LICENSE-2.0
# 
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

#' Run a Cholesky with covariates  ("fixed" / definition variables in the means style)
#'
#' Often, it is appropriate to include covariates in models.
#' A simple method is to regress covariates from the data using [lm()].
#' This is a 'fixed' effects approach.
#' [umx_residualize()] makes this easier, even on twin data, and with complex regression formulas.
#' 
#' While these estimates are unbiased, modeling this regression in the means element of the twin model
#' allows correct tests for significance. Also, if DVs are not continuous, the lm-based approach
#' cannot be used.
#' 
#' For this reason, we have implemented umxACE_cov_fixed, which allows including covariates as definition variables.
#' The following figure shows how the ACE model with fixed covariates appears as a path diagram:
#'
#' \if{html}{\figure{ACEcovOnMeans.png}{options: width="50\%" alt="Figure: ACEcovOnMeans.png"}}
#' \if{latex}{\figure{ACEcovOnMeans.pdf}{options: width=7cm}}
#' 
#' @details
#' On the plus side, there is no distributional assumption for this method. A downside of this approach is that all 
#' covariates must be non-NA, thus dropping any rows where one or more covariates are missing.
#' This is wasteful of data, but often cannot be avoided (though see note below).
#' 
#' \emph{note}: An alternative is the [umxACEcov()] 'random' option. This model adds covariates to
#' the expected covariance matrix, thus allowing all data to be preserved.
#' The (BIG) downside is that this method has a strong assumption of multivariate normality.
#' Covariates like age, which are perfectly correlated in twins cannot be used.
#' Covariates like sex, which are ordinal, violate the normality assumption.
#' 
#' @param name The name of the model (defaults to"ACEcov").
#' @param selDVs The variables to include from the data (do not include sep).
#' @param selCovs The covariates to include from the data (do not include sep).
#' @param dzData The DZ dataframe.
#' @param mzData The MZ dataframe.
#' @param sep Separator text between basename for twin variable names. Often "_T".
#' Used to expand selDVs into full column names, i.e., "dep" --> c("dep_T1", "dep_T2").
#' @param dzAr The DZ genetic correlation (defaults to .5, vary to examine assortative mating).
#' @param dzCr The DZ "C" correlation (defaults to 1: set to .25 to make an ADE model).
#' @param addStd Whether to add the algebras to compute a std model (defaults to TRUE).
#' @param addCI Whether to add intervals to compute CIs (defaults to TRUE).
#' @param boundDiag = Whether to bound the diagonal of the a, c, and e matrices.
#' @param equateMeans Whether to equate the means across twins (defaults to TRUE).
#' @param bVector Whether to compute row-wise likelihoods (defaults to FALSE).
#' @param weightVar (optional) Variable containing the weights to apply to data.
#' @param autoRun Whether to run the model (default), or just to create it and return without running.
#' @param tryHard Default ('no') uses normal mxRun. "yes" uses mxTryHard. Other options: "ordinal", "search"
#' @param optimizer (optionally) set the optimizer. Default (NULL) does nothing.
#' @return - [mxModel()] of subclass mxModel.ACEcov
#' @seealso umx_residualize umxACE
#' @family Twin Modeling Functions
#' @export
#' @md
#' @examples
#' \dontrun{
#' require(umx)
#' data(twinData) # ?twinData from Australian twins.
#' # Pick the variables
#' selDVs  = "ht"
#' selCovs = "age"
#' mzData <- twinData[twinData$zygosity %in% "MZFF", ]
#' dzData <- twinData[twinData$zygosity %in% "DZFF", ]
#' m1 = umxACE_cov_fixed(selDVs = selDVs, selCovs = selCovs, sep = "",
#' 	     dzData = dzData, mzData = mzData)
#' m2 = umxACE(selDVs = selDVs, sep = "", dzData = dzData, mzData = mzData)
#' # =======================
#' # = lm-based equivalent =
#' # =======================
#' df_res = umx_residualize(ht ~ age, suffixes = c("1", "2"), data = twinData)
#' mzData = df_res[df_res$zygosity %in% "MZFF", ]
#' dzData = df_res[df_res$zygosity %in% "DZFF", ]
#' m3 = umxACE("lm_based", selDVs = selDVs, sep = "", dzData = dzData, mzData = mzData)
#' # ===============================
#' # = Example with two covariates =
#' # ===============================
#' selDVs  = "wt"
#' selCovs = c("age", "cohort")
#' twinData$cohort1 = twinData$cohort2 = as.numeric(as.factor(twinData$cohort))
#' mzData <- twinData[twinData$zygosity %in% "MZFF", ]
#' dzData <- twinData[twinData$zygosity %in% "DZFF", ]
#' m1 = umxACE_cov_fixed(selDVs = selDVs, selCovs = selCovs, sep = "",
#' 	     dzData = dzData, mzData = mzData)
#' m1 = umxACE(selDVs = selDVs, sep = "", dzData = dzData, mzData = mzData)
#' }
umxACE_cov_fixed <- function(name = "ACEcov", selDVs, selCovs = NULL, dzData, mzData, sep = NULL, dzAr = .5, dzCr = 1, addStd = TRUE, addCI = TRUE, boundDiag = 0, weightVar = NULL, equateMeans = TRUE, bVector = FALSE, optimizer = NULL, autoRun = getOption("umx_auto_run"), tryHard = c("no", "yes", "ordinal", "search")) {
		tryHard = match.arg(tryHard)	
		nSib = 2 # number of siblings in a twin pair
		if(dzCr == .25 && name == "ACEcov"){ name = "ADEcov"}
		xmu_twin_check(selDVs= c(selDVs, selCovs), dzData = dzData, mzData = mzData, optimizer = optimizer, sep = sep, enforceSep=TRUE)

		if(is.null(selCovs)){
			stop("You need to give me some covariates (if there are none, just use umxACE)")
		}
		
		baseDV_names  = selDVs
		baseCov_names = selCovs
		selDVs  = umx_paste_names(selDVs , sep = sep, suffixes = 1:nSib)
		selCovs = umx_paste_names(selCovs, sep = sep, suffixes = 1:nSib)
		nCov = length(baseCov_names)
		nDVs = length(baseDV_names); # number of dependent variables ** per INDIVIDUAL ( so times-2 for a family)**
		used = c(selDVs, selCovs)
		if(!is.null(weightVar)){
			used = c(used, weightVar)
		}
		# Drop unused columns
		mzData = mzData[, used]
		dzData = dzData[, used]
		
		# Drop rows with missing covariates
		OK = complete.cases(mzData[, selCovs])
		if(sum(!OK)>0){
			message(sum(!OK), " cases in mzData do not have complete covariates, and are being dropped from the model.")
			mzData = mzData[OK,]
		}

		OK = complete.cases(dzData[, selCovs])
		if(sum(!OK)>0){
			message(sum(!OK), " cases in dzData do not have complete covariates, and are being dropped from the model.")
			dzData = dzData[OK,]
		}

		# Compute numbers of ordinal and binary variables
		isFactor = umx_is_ordered(mzData[, selDVs])                      # T/F list of factor columns
		isOrd    = umx_is_ordered(mzData[, selDVs], ordinal.only = TRUE) # T/F list of ordinal (excluding binary)
		isBin    = umx_is_ordered(mzData[, selDVs], binary.only  = TRUE) # T/F list of binary columns
		nFactors = sum(isFactor)
		nOrdVars = sum(isOrd) # total number of ordinal columns
		nBinVars = sum(isBin) # total number of binary columns

		factorVarNames = names(mzData)[isFactor]
		ordVarNames    = names(mzData)[isOrd]
		binVarNames    = names(mzData)[isBin]
		contVarNames   = names(mzData)[!isFactor]

		if(nFactors > 0 & is.null(sep)){
			stop("Please set sep.\n",
			"Why: You have included ordinal or binary variables. I need to know which variables are for twin 1 and which for twin2.\n",
			"The way I do this is enforcing some naming rules. For example, if you have 2 variables:\n",
			" obesity and depression called: 'obesity_T1', 'dep_T1', 'obesity_T2' and 'dep_T2', you should call umxACE with:\n",
			"selDVs = c('obesity','dep'), sep = '_T' \n",
			"sep is just one word, appearing in all variables (e.g. '_T').\n",
			"This is assumed to be followed by '1' '2' etc...")
		}

		if(!is.null(weightVar)){
			# weight variable provided: check it exists in each frame
			if(!umx_check_names(weightVar, data = mzData, die = FALSE) | !umx_check_names(weightVar, data = dzData, die = FALSE)){
				stop("The weight variable must be included in the mzData and dzData",
					 " frames passed into umxACE when \"weightVar\" is specified",
					 "\n mzData contained:", paste(names(mzData), collapse = ", "),
					 "\n and dzData contain:", paste(names(dzData), collapse = ", "),
					 "\nbut I was looking for ", weightVar, " as the moderator."
				)
			}
			mzWeightMatrix = mxMatrix(name = "mzWeightMatrix", type = "Full", nrow = nrow(mzData), ncol = 1, free = FALSE, values = mzData[, weightVar])
			dzWeightMatrix = mxMatrix(name = "dzWeightMatrix", type = "Full", nrow = nrow(dzData), ncol = 1, free = FALSE, values = dzData[, weightVar])
			mzData = mzData[, selDVs]
			dzData = dzData[, selDVs]
			bVector = TRUE
		} else {
			# no weights
		}

		# =====================================
		# = Add means and var matrices to top =
		# =====================================
		# Figure out start values for a, c, and e
		varStarts = umx_var(mzData[, selDVs[1:nDVs], drop = FALSE], format= "diag", ordVar = 1, use = "pairwise.complete.obs")
		
		if(nDVs == 1){
			# sqrt to switch from var to path coefficient scale
			varStarts = sqrt(varStarts)/3
		} else {
			varStarts = t(chol(diag(varStarts/3))) # Divide variance up equally, and set to Cholesky form.
		}
		varStarts = matrix(varStarts, nDVs, nDVs)

		# Mean starts
		obsMZmeans   = umx_means(mzData[, selDVs], ordVar = 0, na.rm = TRUE)
		meanDimNames = list("means", selDVs)		

		# ===============================
		# = Notes: Ordinal requires:    =
		# ===============================
		# 1. Set to mxFactor
		# 2. For Binary vars:
		#   1. Means of binary vars fixedAt 0
		#   2. A + C + E for binary vars is constrained to 1 
		# 3. For Ordinal vars, first 2 thresholds fixed
        #   # TODO
		#   1. WLS as an option.
		# 	2. Simple test if results are similar for an ACE model of 1 variable
		if(nFactors == 0) {			
			# =======================================================
			# = Handle all continuous case                          =
			# =======================================================
			message("All variables continuous")

			# =====================================================================
			# = Create Matrices for Covariates and linear Regression Coefficients =
			# =====================================================================
			# make a def matrix containing covariates
			# http://ibg.colorado.edu/cdrom2016/maes/UnivariateAnalysis/twoa/twoACEma.R
			# http://ibg.colorado.edu/cdrom2016/maes/UnivariateAnalysis/twoa/twoACEja.R
			# http://ibg.colorado.edu/cdrom2016/maes/UnivariateAnalysis/twoa/twoACEca.R

			# copies to debug
			# nSib            = 2
			# baseDV_names    = c("ht", "wt")
			# baseCov_names   = c("age", "sex")
			# baseDV_names    = c("ht")
			# baseCov_names   = c("age")
			# nDVs            = length(baseDV_names)
			# nCov            = length(baseCov_names)
			# selDVs          = umx_paste_names(baseDV_names, "_T")
			# selCovs         = umx_paste_names(baseCov_names, "_T")
			# obsMZmeans      = rep(0, length(selDVs))

			# ================
			# = Bits for top =
			# ================
			# 1. betas is an [nCov, nDVs] matrix
			# TODO: add intercept to incoming cov list
			# TODO support quadratic betas on means
			
			T1Covs = selCovs[1:nCov]
			T2Covs = selCovs[(nCov+1):length(selCovs)]
			betaLabels = paste0("b", rep(1:nCov, each = nDVs), "_", rep(baseDV_names, nCov))
 			betaDims = list(paste0("bcov", 1:nCov), paste0("var", 1:(nDVs/nSib)))
			betas = umxMatrix("betas", "Full", nrow = nCov, ncol = nDVs, free = TRUE, values = .01, labels = betaLabels, dimnames = betaDims)

			# 2. Intercepts is a [1, nDVs*nSib] matrix (not yet equated across twins...)
			Intercepts = umxMatrix("Intercepts", "Full", nrow = 1, ncol = (nDVs * nSib), free = TRUE, values = obsMZmeans, dimnames = list("int", selDVs))

			top = mxModel("top", betas, Intercepts)
			MZ = mxModel("MZ",
				umxMatrix("defCovT1", "Full", nrow = 1, ncol = nCov, free = FALSE, labels = paste0("data.", T1Covs), dimnames = list("defCovT1", T1Covs)),
				umxMatrix("defCovT2", "Full", nrow = 1, ncol = nCov, free = FALSE, labels = paste0("data.", T2Covs), dimnames = list("defCovT2", T2Covs)),
				mxAlgebra(name = "expMean", top.Intercepts + cbind(defCovT1 %*% top.betas, defCovT2 %*% top.betas), dimnames = list(NULL, selDVs)),
				mxExpectationNormal("top.expCovMZ", "expMean"),
				mxFitFunctionML(vector = bVector), mxData(mzData, type = "raw")
			)
			DZ = mxModel("DZ", 
				umxMatrix("defCovT1", "Full", nrow = 1, ncol = nCov, free = FALSE, labels = paste0("data.", T1Covs), dimnames = list("defCovT1", T1Covs)),
				umxMatrix("defCovT2", "Full", nrow = 1, ncol = nCov, free = FALSE, labels = paste0("data.", T2Covs), dimnames = list("defCovT2", T2Covs)),
				mxAlgebra(name = "expMean", top.Intercepts + cbind(defCovT1 %*% top.betas, defCovT2 %*% top.betas), dimnames = list(NULL, selDVs)),
				mxExpectationNormal("top.expCovDZ", "expMean"),
				mxFitFunctionML(vector = bVector), mxData(dzData, type = "raw")
			)
		} else if(sum(isBin) == 0){
			# ==================================================
			# = Handle 1 or more ordinal variables (no binary) =
			# ==================================================
			message("umxACE found ", (nOrdVars/nSib), " pair(s) of ordinal variables:", omxQuotes(ordVarNames), " (No binary)")
			if(length(contVarNames) > 0){
				message(length(contVarNames)/nSib, " pair(s) of continuous variables:", omxQuotes(contVarNames))
			}else{
				# message("No continuous variables found.")
			}
			# Means: all free, start cont at the measured value, ord @0
			meansMatrix = mxMatrix(name = "expMean", "Full" , nrow = 1, ncol = (nDVs * nSib), free = TRUE, values = obsMZmeans, dimnames = meanDimNames)
			# Thresholds
			# for better guessing with low-frequency cells
			allData = rbind(mzData, dzData)
			# threshMat is is a matrix, or a list of 2 matrices and an algebra
			threshMat = umxThresholdMatrix(allData, sep = sep, threshMatName = "threshMat", verbose = FALSE)
			mzExpect  = mxExpectationNormal("top.expCovMZ", "top.expMean", thresholds = "top.threshMat")
			dzExpect  = mxExpectationNormal("top.expCovDZ", "top.expMean", thresholds = "top.threshMat")			
			top = mxModel("top", umxLabel(meansMatrix), threshMat)
			MZ  = mxModel("MZ", mzExpect, mxFitFunctionML(vector = bVector), mxData(mzData, type = "raw") )
			DZ  = mxModel("DZ", dzExpect, mxFitFunctionML(vector = bVector), mxData(dzData, type = "raw") )
		} else if(sum(isBin) > 0){
			# =======================================================
			# = Handle case of at least 1 binary variable           =
			# =======================================================

			message("umxACE found ", sum(isBin)/nSib, " pairs of binary variables:", omxQuotes(binVarNames))
			message("\nI am fixing the latent means and variances of these variables to 0 and 1")
			if(nOrdVars > 0){
				message("There were also ", nOrdVars/nSib, " pairs of ordinal variables:", omxQuotes(ordVarNames))			
			}
			if(length(contVarNames) > 0){
				message("\nand ", length(contVarNames)/nSib, " pairs of continuous variables:", omxQuotes(contVarNames))	
			}else{
				message("No continuous variables")
			}
	
			# ===========================================================================
			# = Means: bin fixed, others free, start cont at the measured value, ord @0 =
			# ===========================================================================
			# Fill with zeros: default for ordinals and binary...
			meansFree = (!isBin) # fix the binary variables at zero
			meansMatrix = mxMatrix(name = "expMean", "Full" , nrow = 1, ncol = nDVs*nSib, free = meansFree, values = obsMZmeans, dimnames = meanDimNames)

			# = Thresholds =
			# For better guessing with low-freq cells
			allData = rbind(mzData, dzData)
			# threshMat may be a three item list of matrices and algebra
			threshMat = umxThresholdMatrix(allData, sep = sep, threshMatName = "threshMat", verbose = TRUE)

			mzExpect  = mxExpectationNormal("top.expCovMZ", "top.expMean", thresholds = "top.threshMat")
			dzExpect  = mxExpectationNormal("top.expCovDZ", "top.expMean", thresholds = "top.threshMat")

			top = mxModel("top", umxLabel(meansMatrix), threshMat)
			MZ  = mxModel("MZ", mzExpect, mxFitFunctionML(vector = bVector), mxData(mzData, type = "raw") )
			DZ  = mxModel("DZ", dzExpect, mxFitFunctionML(vector = bVector), mxData(dzData, type = "raw") )


			# ===================================
			# = Constrain Ordinal variance @1  =
			# ===================================
			# Algebra to pick out the ord vars
			# TODO check this way of using twin 1 to pick where the bin vars are is robust...
			the_bin_cols = which(isBin)[1:nDVs] # columns in which the bin vars appear for twin 1, i.e., c(1,3,5,7)
			binBracketLabels = paste0("Vtot[", the_bin_cols, ",", the_bin_cols, "]")

			top = mxModel(top,
				# Algebra to compute total variances and standard deviations
				mxAlgebra(name = "Vtot", A + C+ E), # Total variance (redundant but is OK)
				mxMatrix(name  = "binLabels"  , "Full", nrow = (nBinVars/nSib), ncol = 1, labels = binBracketLabels),
				mxMatrix(name  = "Unit_nBinx1", "Unit", nrow = (nBinVars/nSib), ncol = 1),
				mxConstraint(name = "constrain_Bin_var_to_1", binLabels == Unit_nBinx1)
			)
		} else {
			stop("You appear to have something other than I expected in terms of binary, ordinal and continuous variable mix")
		}
		# nb: means not yet equated across twins

	# Finish building top
	top = mxModel(top,
		# "top" defines the algebra of the twin model, which MZ and DZ slave off of
		# NB: top already has the means model and thresholds matrix added if necessary  - see above
		# Additive, Common, and Unique environmental paths
		umxMatrix("a", type = "Lower", nrow = nDVs, ncol = nDVs, free = TRUE, values = varStarts, byrow = TRUE),
		umxMatrix("c", type = "Lower", nrow = nDVs, ncol = nDVs, free = TRUE, values = varStarts, byrow = TRUE),
		umxMatrix("e", type = "Lower", nrow = nDVs, ncol = nDVs, free = TRUE, values = varStarts, byrow = TRUE), 
	
		mxMatrix(name = "dzAr", type = "Full", 1, 1, free = FALSE, values = dzAr),
		mxMatrix(name = "dzCr", type = "Full", 1, 1, free = FALSE, values = dzCr),
		# Multiply each path coefficient by its inverse to get variance component
		# Quadratic multiplication to add common_loadings
		mxAlgebra(name = "A", a %*% t(a)), # additive genetic variance
		mxAlgebra(name = "C", c %*% t(c)), # common environmental variance
		mxAlgebra(name = "E", e %*% t(e)), # unique environmental variance
		mxAlgebra(name = "ACE", A+C+E),
		mxAlgebra(name = "AC" , A+C  ),
		mxAlgebra(name = "hAC", (dzAr %x% A) + (dzCr %x% C)),
		mxAlgebra(name = "expCovMZ",
					rbind(cbind(ACE, AC),
					      cbind(AC , ACE)), dimnames = list(selDVs, selDVs)),
		mxAlgebra(name = "expCovDZ",
					rbind(cbind(ACE, hAC),
					      cbind(hAC, ACE)), dimnames = list(selDVs, selDVs))
	)
	# =======================================
	# = 		Assemble models into supermodel =
	# =======================================

	if(!bVector){
		model = mxModel(name, MZ, DZ, top,
			mxFitFunctionMultigroup(c("MZ", "DZ"))
		)
	} else {
		# bVector is TRUE
		# To weight objective functions in OpenMx, you specify a container model that applies the weights
		# m1 is the model with no weights, but with "vector = TRUE" option added to the FIML objective.
		# This option makes FIML return individual likelihoods for each row of the data (rather than a single -2LL value for the model)
		# You then optimize weighted versions of these likelihoods by building additional models containing 
		# weight data and an algebra that multiplies the likelihoods from the first model by the weight vector
		model = mxModel(name, MZ, DZ, top,
			mxModel("MZw", mzWeightMatrix,
				mxAlgebra(-2 * sum(mzWeightMatrix * log(MZ.objective) ), name = "mzWeightedCov"),
				mxFitFunctionAlgebra("mzWeightedCov")
			),
			mxModel("DZw", dzWeightMatrix,
				mxAlgebra(-2 * sum(dzWeightMatrix * log(DZ.objective) ), name = "dzWeightedCov"),
				mxFitFunctionAlgebra("dzWeightedCov")
			),
			mxFitFunctionMultigroup(c("MZw", "DZw"))
		)				
	}
	if(!is.null(boundDiag)){
		if(!is.numeric(boundDiag)){
			stop("boundDiag must be a digit or vector of numbers. You gave me a ", class(boundDiag))
		} else {				
			newLbound = model$top$matrices$a@lbound
			if(length(boundDiag) > 1 ){
				if(length(boundDiag) != length(diag(newLbound)) ){
					stop("Typically boundDiag is 1 digit: if more, must be size of diag(a)")
				}
			}
			diag(newLbound) = boundDiag; 
			model$top$a$lbound = newLbound
			model$top$c$lbound = newLbound
			model$top$e$lbound = newLbound
		}
	}
	if(addStd){
		newTop = mxModel(model$top,
			mxMatrix(name  = "I", "Iden", nDVs, nDVs), # nDVs Identity matrix
			mxAlgebra(name = "Vtot", A + C+ E),        # Total variance
			# TODO test that these are identical in all cases
			# mxAlgebra(vec2diag(1/sqrt(diag2vec(Vtot))), name = "SD"), # Total variance
			mxAlgebra(name = "SD"   , solve(sqrt(I * Vtot))), # Total variance
			mxAlgebra(name = "a_std", SD %*% a), # standardized a
			mxAlgebra(name = "c_std", SD %*% c), # standardized c
			mxAlgebra(name = "e_std", SD %*% e)  # standardized e
		)
		model = mxModel(model, newTop)
		if(addCI){
			model = mxModel(model, mxCI(c('top.a_std', 'top.c_std', 'top.e_std')))
		}
	}
	# Equate means for twin1 and twin 2 by matching labels in the first and second halves of the means labels matrix
	if(equateMeans){
		# in basic ACE models, this acts on expMean.
		# Here, we equate halves of the [1* (nDVs * nSib)] Intercepts matrix
		model = omxSetParameters(model,
		  labels    = paste0("Intercepts_r1c", (nDVs + 1):(nDVs * nSib)), # c("Intercepts14", "Intercepts15", "Intercepts16"),
		  newlabels = paste0("Intercepts_r1c", 1:nDVs)                    # c("Intercepts11", "Intercepts12", "Intercepts13")
		)
	}
	# Trundle through and make sure values with the same label have the same start value... means for instance.
	model = omxAssignFirstParameters(model)
	model = as(model, "MxModelACE") # set class so that S3 plot() dispatches.
	model = xmu_safe_run_summary(model, autoRun = autoRun, tryHard = tryHard)
	return(model)
} # end umxACE
tbates/umx documentation built on Sept. 19, 2024, 1:10 a.m.