make_h2s_matrix = function(formula,data, h2_divisions) {
# get the names of the random effects, then build a grid with one row per random effect and one column per candidate combination of variance-component proportions
model_setup = make_model_setup(formula,data,list())
RE_setup = model_setup$RE_setup
# add names to RE_setup if needed
n_RE = length(RE_setup)
for(i in 1:n_RE){
if(is.null(names(RE_setup)[i]) || names(RE_setup)[i] == ''){
names(RE_setup)[i] = paste0('RE.',i)
}
}
RE_names = names(RE_setup)
if(length(h2_divisions) < n_RE){
if(length(h2_divisions) != 1) stop('Must provide either 1 h2_divisions parameter, or 1 for each random effect')
h2_divisions = rep(h2_divisions,n_RE)
}
if(is.null(names(h2_divisions))) {
names(h2_divisions) = RE_names
}
h2s_matrix = expand.grid(lapply(RE_names,function(re) seq(0,1,length.out = h2_divisions[[re]]+1)))
colnames(h2s_matrix) = RE_names
h2s_matrix = t(h2s_matrix[rowSums(h2s_matrix) < 1,,drop=FALSE])
colnames(h2s_matrix) = NULL
return(h2s_matrix)
}
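# Example (a sketch of the grid this builds): with two random effects and
# h2_divisions = 2 per effect, the lattice is {0,0.5,1} x {0,0.5,1}, keeping
# only points whose variance proportions sum to < 1:
#   grid = expand.grid(lapply(c('animal','block'),function(re) seq(0,1,length.out = 3)))
#   t(grid[rowSums(grid) < 1,,drop=FALSE])  # 3 columns: (0,0), (0.5,0), (0,0.5)
# The residual variance proportion at each grid point is 1 - colSums(h2s_matrix).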
# 1. F model - specify formula (with ID), ID, full data with all individuals, K, var_F prior, F_h2 prior
# 2. Add traits - for each, specify formula (with ID), reduced data with ID, var_Y prior, resid_h2 prior. Could be different trait models for each set of traits
# - check that no Lambda model has been added yet; if one has, clear it
# - The IDs here define the missing data map for columns
# - must have same random effects as F model, but fixed model can be different
# 3. Add Lambda model. Easiest would be a list of formulas all with same data, each with a prior, plus Lambda prior function and parameters
# 4. Initialize sampling matrices. May be able to combine across trait sets and F
# 5. Initialize variables
# 6. Define posterior functions
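# A hedged usage sketch of steps 1-5 (object names, data, and priors are
# illustrative placeholders, not a tested example):
#   h2s = make_h2s_matrix(~ (1|ID), pheno_data, h2_divisions = 20)  # 20 grid points
#   state = MegaLMM_F(~ (1|ID), data = pheno_data, ID = 'ID',
#                     relmat = list(ID = K_kinship), K = 15, h2s_matrix = h2s,
#                     F_h2_prior = rep(1/ncol(h2s), ncol(h2s)))
#   state = MegaLMM_add_trait_matrix(state, Y = Y_block1, fixed_formula = ~ 1,
#                                    data = block1_data,
#                                    resid_h2_prior = rep(1/ncol(h2s), ncol(h2s)))
#   state = MegaLMM_add_Lambda_model(state, design_matrices = list(),
#                                    Lambda_prior = my_Lambda_prior,  # hypothetical prior list
#                                    Lambda_beta_prior = list(V = 1, nu = 3))
#   state = MegaLMM_preliminary_calculations(state, verbose = TRUE)
#   state = MegaLMM_initialize_variables(state)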
MegaLMM_F = function(formula, data, ID = "ID", relmat = NULL,
K, h2s_matrix,
tot_F_var_prior = list(V = 18/20, nu = 20),
F_h2_prior,
output_folder = 'MegaLMM_model',
X2 = NULL, U2 = NULL, V2 = NULL,
B2_F_prior = list(
sampler = sample_B2_prec_horseshoe,
prop_0 = 0.1
)) {
try(dir.create(output_folder,recursive = T, showWarnings = FALSE),silent=T)
# first check ID
if(!ID %in% colnames(data)) stop(sprintf('ID: "%s" missing from data',ID))
if(length(data[[ID]]) != length(unique(data[[ID]]))) stop(sprintf('ID: "%s" not unique in data',ID))
model_setup = make_model_setup(formula,data,relmat)
# X2 is the "fixed effects" for F
X2_F = model_setup$lmod$X
b2_F = ncol(X2_F)
n = nrow(data)
# alternatively, (or in addition) these can be specified directly with X2 or U2*V2
if(!is.null(X2)) {
if(nrow(X2) != n) stop("nrow(X2) != nrow(data)")
X2_F = cbind(X2_F,X2)
b2_F = ncol(X2_F)
}
U2_F = NULL
V2_F = NULL
if(!is.null(U2) || !is.null(V2)) {
if(is.null(U2) || is.null(V2)) stop("must specify both U2 and V2 if either is specified")
if(nrow(U2) != n) stop("nrow(U2) != nrow(data)")
if(ncol(U2) != nrow(V2)) stop("sizes of U2 and V2 are not compatible. U2 %*% V2 is calculated")
if(ncol(X2_F)>0) {
# expand V2 with an identity block for the existing X2_F columns, then append U2
V2_F = as.matrix(Matrix::bdiag(diag(1,ncol(X2_F)),V2))
X2_F = cbind(X2_F,U2)
b2_F = ncol(X2_F)
} else {
# no other X2 columns, so U2 and V2 can be used directly
U2_F = U2
V2_F = V2
}
}
if((ncol(X2_F) > 0 || !is.null(U2_F)) && is.null(B2_F_prior)) stop("B2_F_prior is needed")
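# Illustration of the low-rank specification above (a sketch; 'M' is a
# hypothetical n x m marker matrix): a dense term M %*% beta can be passed as
# U2 %*% V2 with far fewer columns than m:
#   sM = svd(M)
#   r = sum(sM$d > 1e-8)
#   U2 = sM$u[,1:r] %*% diag(sM$d[1:r],nrow = r)  # n x r
#   V2 = t(sM$v[,1:r])                            # r x m, so U2 %*% V2 reproduces M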
# -------- Random effects ---------- #
# RE_setup is a list like from BGLR that describes the random effects (for both residuals and factors)
RE_setup = model_setup$RE_setup
# add names to RE_setup if needed
n_RE = length(RE_setup)
for(i in 1:n_RE){
if(is.null(names(RE_setup)[i]) || names(RE_setup)[i] == ''){
names(RE_setup)[i] = paste0('RE.',i)
}
}
RE_names = names(RE_setup)
# combine Z matrices
Z = do.call(cbind,lapply(RE_setup,function(x) x$Z))
Z = as(Z,'dgCMatrix')
# find RE indices
RE_lengths = sapply(RE_setup,function(x) ncol(x$Z))
RE_starts = cumsum(c(0,RE_lengths)[1:n_RE])
names(RE_starts) = RE_names
RE_indices = lapply(RE_names,function(re) RE_starts[re] + 1:RE_lengths[re])
names(RE_indices) = RE_names
# define run-time constants that later steps read from MegaLMM_state$run_parameters;
# the numerical tolerances here are assumed defaults
run_parameters = list(K = K, drop0_tol = 1e-10, K_eigen_tol = 1e-10)
# h2s_matrix is supplied directly (see make_h2s_matrix() above); check that it
# is compatible with the random effects of the formula
if(nrow(h2s_matrix) != n_RE) stop('h2s_matrix must have one row per random effect')
if(is.null(rownames(h2s_matrix))) rownames(h2s_matrix) = RE_names
# # # -------------------------------------------#
# # # identify groups of traits with same pattern of missingness
# # # ideally, want to be able to restrict the number of sets. Should be possible to merge sets of columns together.
#
# Missing_data_map = list(list(
# Y_obs = 1:n,
# Y_cols = 1:p
# ))
# Missing_row_data_map = list(list(
# Y_obs = 1:n,
# Y_cols = 1:p
# ))
# # # -------------------------------------------#
# # # Fixed factors
# if(is.null(Lambda_fixed)) {
# fixed_factors = rep(F,run_parameters$K)
# } else{
# Lambda_fixed = as.matrix(Lambda_fixed)
# if(ncol(Lambda_fixed) != p) stop("wrong dimensions of Lambda_fixed")
# if(nrow(Lambda_fixed) >= run_parameters$K) stop("nrow(Lambda_fixed) >= K")
# fixed_factors = rep(F,run_parameters$K)
# fixed_factors[1:nrow(Lambda_fixed)] = T
# }
# Kr = run_parameters$K - sum(fixed_factors)
priors = list()
priors$tot_Y_var = list(V = c(),nu = c())
priors$tot_F_var = tot_F_var_prior
if(length(priors$tot_F_var$V) == 1) {
priors$tot_F_var$V = rep(priors$tot_F_var$V,K)
priors$tot_F_var$nu = rep(priors$tot_F_var$nu,K)
}
priors$h2_priors_resids = matrix(0,nrow = ncol(h2s_matrix),ncol = 0)
priors$h2_priors_factors = F_h2_prior
if(length(F_h2_prior) != ncol(h2s_matrix)) stop("F_h2_prior must have one prior weight per column of h2s_matrix")
trait_sets = list(
list(
setID = 1,
Y = NULL,
rows = 1:nrow(data),
cols = NULL,
colnames = NULL,
X1_cols = NULL,
use_X2 = FALSE, # always FALSE here: the factor-level X2_F/U2_F designs are handled separately (Qt1_X2_F, Qt1_U2_F)
Y_model = NULL
)
)
run_variables = list(
K = K,
t = 0,
n = n,
# r_RE = r_RE,
RE_names = RE_names,
b2 = b2_F,
F_formula = formula,
F_data = data,
ID = ID,
trait_sets = trait_sets
# Kr = Kr
)
data_matrices = list(
X1 = matrix(0,n,0),
X2 = X2_F,
U2 = U2_F,
V2 = V2_F,
Z = Z,
# ZL = ZL,
# ZL_list = list(ZL),
RE_setup = RE_setup,
# RE_L_list = list(RE_L), # List of matrices necessary to back-transform U_F and U_R (RE_L*U_F and RE_L*U_R) to get original random effects
RE_indices = RE_indices,
h2s_matrix = h2s_matrix
)
current_state = list(
Eta = matrix(NA,nrow = nrow(data),ncol = 0,dimnames = list(data[[ID]],NULL))
)
#
# run_parameters$observation_model = observation_model
# run_parameters$observation_model_parameters = observation_model_parameters
# run_parameters$traitnames = traitnames
# ----------------------------- #
# -- create MegaLMM_state object - #
# ----------------------------- #
MegaLMM_state = list(
current_state = current_state,
output_folder = output_folder,
data_matrices = data_matrices,
priors = priors,
run_parameters = run_parameters,
run_variables = run_variables
)
class(MegaLMM_state) = append('MegaLMM_state',class(MegaLMM_state))
MegaLMM_state$Posterior = list(
# posteriorSample_params = posteriorSample_params,
# posteriorMean_params = posteriorMean_params,
# posteriorFunctions = posteriorFunctions,
total_samples = 0,
folder = file.path(output_folder,'Posterior'),
files = c()
)
MegaLMM_state
}
# 2. Add traits - for each, specify formula (with ID), reduced data with ID, var_Y prior, resid_h2 prior. Could be different trait models for each set of traits
# - check that no Lambda model has been added yet; if one has, clear it
# - The IDs here define the missing data map for columns
# - must have same random effects as F model, but fixed model can be different
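# The implied model (see the Eta_mean and F constructions in
# MegaLMM_initialize_variables below):
#   Eta = X1 %*% B1 + [X2 %*% B2_R] + F %*% Lambda + ZL %*% U_R + E
#   F   = X2 %*% B2_F + ZL %*% U_F + E_F
# where the bracketed term is included only when use_X2 = TRUE.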
MegaLMM_add_trait_matrix = function(MegaLMM_state,Y,fixed_formula,data,
tot_Y_var_prior = list(V = 0.5, nu = 3),
resid_h2_prior,
use_X2 = FALSE,
B2_R_prior = list(
sampler = sample_B2_prec_horseshoe,
prop_0 = 0.1
),
center = T,scale = T) {
# Set ID
setID = length(MegaLMM_state$run_variables$trait_sets) + 1
# check model
if(length(lme4::findbars(fixed_formula))>0) stop('do not include random effects in fixed_formula. Random effects are inherited from the model for F')
# check ID
ID = MegaLMM_state$run_variables$ID
F_data = MegaLMM_state$run_variables$F_data
if(!ID %in% colnames(data)) stop(sprintf('ID: "%s" missing from data',ID))
if(length(data[[ID]]) != length(unique(data[[ID]]))) stop(sprintf('ID: "%s" not unique in data',ID))
if(!all(data[[ID]] %in% F_data[[ID]])) stop("all IDs in data must be present in F_data")
# clear Lambda_prior (if it exists) as this needs to be re-set with new traits
MegaLMM_state$data_matrices$Lambda_X = NULL
MegaLMM_state$priors$Lambda_prior = NULL
# row/column names of Y
if(is.null(rownames(Y))) {
rownames(Y) = data[[ID]]
} else if(!all(rownames(Y) == data[[ID]])) {
stop(sprintf('rownames(Y) must equal data[["%s"]]',ID))
}
if(is.null(colnames(Y))) {
# add column names of the form SetID.1-t
colnames(Y) = sprintf('Set%03d.%02d',setID,1:ncol(Y))
}
# expand Y to full size of F_data
Y_full = matrix(NA,nrow = nrow(F_data),ncol = ncol(Y),dimnames = list(F_data[[ID]],colnames(Y)))
Y_full[rownames(Y),] = Y
# tot_Y_var priors
priors = MegaLMM_state$priors
if(length(tot_Y_var_prior$V) == 1) {
tot_Y_var_prior$V = rep(tot_Y_var_prior$V,ncol(Y))
tot_Y_var_prior$nu = rep(tot_Y_var_prior$nu,ncol(Y))
}
# priors$tot_Y_var$V = c(priors$tot_Y_var$V,tot_Y_var_prior$V)
# priors$tot_Y_var$nu = c(priors$tot_Y_var$nu,tot_Y_var_prior$nu)
# resid_h2_priors
# priors$h2_priors_resids = cbind(resid_h2_prior,matrix(resid_h2_prior,nrow = length(resid_h2_prior),ncol = ncol(Y)))
# design matrices
mf = model.frame(fixed_formula,data)
if(any(is.na(mf))) stop('NAs in variables in fixed_formula in data')
X1_Y = model.matrix(fixed_formula,mf)
X1_full = matrix(NA,nrow = nrow(F_data),ncol = ncol(X1_Y),dimnames = list(F_data[[ID]],colnames(X1_Y)))
X1_full[rownames(Y),] = X1_Y
# collect trait info
traitSet_info = list(
setID = setID,
Y = Y,
rows = which(rownames(Y_full) %in% rownames(Y)),
cols = ncol(MegaLMM_state$current_state$Eta) + 1:ncol(Y),
colnames = colnames(Y),
X1_cols = ncol(MegaLMM_state$data_matrices$X1) + 1:ncol(X1_full),
use_X2 = use_X2,
tot_Eta_prec_rate = with(tot_Y_var_prior,V * nu),
tot_Eta_prec_shape = with(tot_Y_var_prior,nu - 1),
h2_priors_resids = matrix(resid_h2_prior,nrow = length(resid_h2_prior),ncol = ncol(Y)),
B2_R_prior = B2_R_prior
)
traitSet_info$sample_Eta = partial_matrix_model_sampler
setup = partial_matrix_model_setup(Y_full,center=center,scale=scale)
Eta = setup$Eta
traitSet_info$sampler_parameters = setup$parameters
# add to MegaLMM_state
MegaLMM_state$run_variables$trait_sets = append(MegaLMM_state$run_variables$trait_sets,list(traitSet_info))
MegaLMM_state$current_state$Eta = cbind(MegaLMM_state$current_state$Eta,Eta)
MegaLMM_state$run_variables$p = ncol(MegaLMM_state$current_state$Eta)
MegaLMM_state$data_matrices$X1 = cbind(MegaLMM_state$data_matrices$X1,X1_full)
# MegaLMM_state$priors = priors
MegaLMM_state
}
MegaLMM_add_Lambda_model = function(MegaLMM_state,design_matrices,
Lambda_prior,
Lambda_beta_prior) {
n_matrices = length(design_matrices)
if(n_matrices == 0) {
Lambda_X = matrix(0,MegaLMM_state$run_parameters$K,ncol = 0)
Lambda_X_groups = c()
} else {
if(!all(sapply(design_matrices,function(x) all(rownames(x) == colnames(MegaLMM_state$current_state$Eta))))) {
stop('all design matrices must have rownames matching the column names of MegaLMM_state$current_state$Eta')
}
Lambda_X = do.call(cbind,design_matrices)
Lambda_X_groups = do.call(c,lapply(1:n_matrices,function(i) rep(i,ncol(design_matrices[[i]]))))
}
if(ncol(Lambda_X) == 0 || length(Lambda_beta_prior$V) == 1) {
Lambda_beta_prior$V = rep(Lambda_beta_prior$V,n_matrices)
Lambda_beta_prior$nu = rep(Lambda_beta_prior$nu,n_matrices)
}
Lambda_beta_rate = Lambda_beta_prior$V * Lambda_beta_prior$nu
Lambda_beta_shape = Lambda_beta_prior$nu - 1
MegaLMM_state$priors$Lambda_prior = Lambda_prior
MegaLMM_state$priors$Lambda_prior$Lambda_beta_rate = Lambda_beta_rate
MegaLMM_state$priors$Lambda_prior$Lambda_beta_shape = Lambda_beta_shape
MegaLMM_state$priors$Lambda_prior$Lambda_X_groups = Lambda_X_groups
MegaLMM_state$data_matrices$Lambda_X = Lambda_X
MegaLMM_state
}
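# Example (sketch): a single Lambda design matrix; its rownames must match
# colnames(MegaLMM_state$current_state$Eta). 'trait_info' and 'my_Lambda_prior'
# are hypothetical:
#   X_traits = model.matrix(~ 0 + group, trait_info)
#   rownames(X_traits) = colnames(state$current_state$Eta)
#   state = MegaLMM_add_Lambda_model(state, design_matrices = list(X_traits),
#                                    Lambda_prior = my_Lambda_prior,
#                                    Lambda_beta_prior = list(V = 1, nu = 3))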
MegaLMM_preliminary_calculations = function(MegaLMM_state,ncores = 1,verbose = FALSE ) {
run_parameters = MegaLMM_state$run_parameters
data_matrices = MegaLMM_state$data_matrices
priors = MegaLMM_state$priors
h2s_matrix = MegaLMM_state$data_matrices$h2s_matrix
RE_setup = data_matrices$RE_setup
F_data = MegaLMM_state$run_variables$F_data
ID = MegaLMM_state$run_variables$ID
trait_sets = MegaLMM_state$run_variables$trait_sets
# trait_sets[[1]] is the factors
# rows should be any rows with non-missing data
trait_sets[[1]]$rows = unique(do.call(c,lapply(trait_sets,function(x) x$rows)))
# Missing_row_data_map = MegaLMM_state$run_variables$Missing_row_data_map
n = MegaLMM_state$run_variables$n
p = MegaLMM_state$run_variables$p
n_RE = length(RE_setup)
RE_names = names(RE_setup)
n_matrices = 2*ncol(h2s_matrix)
pb = 0
if(verbose) {
print(sprintf("Pre-calculating random effect inverse matrices for %d groups of traits and %d sets of random effect weights", length(trait_sets), ncol(h2s_matrix)))
pb = txtProgressBar(min=0,max = 2 + length(trait_sets) * n_matrices,style=3)
}
X1 = data_matrices$X1
X2 = data_matrices$X2
U2 = data_matrices$U2
Z = data_matrices$Z
# function to ensure that covariance matrices are sparse and symmetric
fix_K = function(x) forceSymmetric(drop0(x,tol = run_parameters$drop0_tol))
# construct RE_L and ZL for each random effect
for(i in 1:length(RE_setup)){
re_name = names(RE_setup)[i]
RE_setup[[i]] = within(RE_setup[[i]],{
if(!'ZL' %in% ls()){
if('K' %in% ls() && !is.null(K)){
id_names = rownames(K)
if(is.null(id_names)) id_names = 1:nrow(K)
if(is(K,'Matrix') && isDiagonal(K)) {
L = as(diag(1,nrow(K)),'dgCMatrix')
K_inv = as(diag(1/diag(K)),'dgCMatrix')
} else {
ldl_k = LDLt(K)
large_d = ldl_k$d > run_parameters$K_eigen_tol
r_eff = sum(large_d)
# if need to use reduced rank model, then use D of K in place of K and merge L into Z
# otherwise, use original K, set L = Diagonal(1,r)
if(r_eff < length(ldl_k$d)) {
K = K_inv = as(diag(1,r_eff),'dgCMatrix')
L = t(ldl_k$P) %*% t(sqrt(ldl_k$d[large_d])*t(ldl_k$L[,large_d]))
if(is(L,'dgeMatrix')) L = as.matrix(L)
} else{
L = as(diag(1,nrow(K)),'dgCMatrix')
K_inv = as(with(ldl_k,t(P) %*% crossprod(diag(1/sqrt(d)) %*% solve(L)) %*% P),'dgCMatrix')
}
rm(list=c('ldl_k','large_d','r_eff'))
}
if(is.null(rownames(K))) rownames(K) = 1:nrow(K)
rownames(K_inv) = rownames(K)
} else if ('K_inv' %in% ls() && !is.null(K_inv)){
if(is.null(rownames(K_inv))) rownames(K_inv) = 1:nrow(K_inv)
id_names = rownames(K_inv)
K = solve(K_inv)
rownames(K) = rownames(K_inv)
L = as(diag(1,nrow(K)),'dgCMatrix')
} else{
K = as(diag(1,ncol(Z)),'dgCMatrix')
rownames(K) = colnames(Z)
id_names = rownames(K)
K_inv = K
L = as(diag(1,nrow(K)),'dgCMatrix')
}
rownames(L) = paste(id_names,re_name,sep='::')
K = fix_K(K)
ZL = Z %*% L
}
})
}
if(verbose) setTxtProgressBar(pb,getTxtProgressBar(pb)+1)
ZL = do.call(cbind,lapply(RE_setup,function(re) re$ZL))
if(nnzero(ZL)/length(ZL) < .25) {
ZL = as(ZL,'dgCMatrix')
} else{
ZL = as.matrix(ZL)
}
if(length(RE_setup) > 1) {
RE_L = do.call(bdiag,lapply(RE_setup,function(re) re$L))
rownames(RE_L) = do.call(c,lapply(RE_setup,function(re) rownames(re$L)))
} else{
RE_L = RE_setup[[1]]$L
}
if(nnzero(RE_L)/length(RE_L) < 0.25) {
RE_L = as(RE_L,'dgCMatrix')
} else{
RE_L = as.matrix(RE_L)
}
r_RE = sapply(RE_setup,function(re) ncol(re$ZL))
# cholesky decompositions (RtR) of each K_inverse matrix
chol_Ki_mats = lapply(RE_setup,function(re) {
K_inv = re$K_inv
if(is(K_inv,'Matrix')) {
if(isDiagonal(K_inv)) {
sparseMatrix(i=1:nrow(K_inv),j=1:nrow(K_inv),x=sqrt(diag(K_inv)))
} else{
as(chol(forceSymmetric(K_inv)),'dgCMatrix')
}
} else {
as(chol(K_inv),'dgCMatrix')
}
})
if(verbose) setTxtProgressBar(pb,getTxtProgressBar(pb)+1)
Qt_list_rows = matrix(FALSE,nrow = length(F_data[[ID]]),ncol = 0)
Qt_list = list()
QtX1_list = list()
QtX2_R_list = list()
ZL_list = list()
RE_L_list = list()
chol_V_list_list = list()
chol_ZtZ_Kinv_list_list = list()
svd_K1 = NULL
for(set in seq_along(trait_sets)){
if(verbose>1) print(sprintf('Set %d',set))
rows = trait_sets[[set]]$rows
cols = trait_sets[[set]]$cols
X1_cols = trait_sets[[set]]$X1_cols
if(length(rows) == 0) next
# check if this set matches any previous set - if so, we can then re-use its matrices
Qt_list_ID = 1
if(set > 1) {
while(Qt_list_ID <= length(Qt_list)) {
if(all(Qt_list_rows[rows,Qt_list_ID]) && all(!Qt_list_rows[-rows,Qt_list_ID])) {
# we found a match
break
}
Qt_list_ID = Qt_list_ID + 1
}
}
trait_sets[[set]]$Qt_list_ID = Qt_list_ID
if(Qt_list_ID <= length(Qt_list)) {
Qt = Qt_list[[Qt_list_ID]]
if(trait_sets[[set]]$use_X2 && (length(QtX2_R_list) < Qt_list_ID || is.null(QtX2_R_list[[Qt_list_ID]]))) {
QtX2_R_set = Qt %**% X2[rows,,drop=FALSE] # X2 (from data_matrices) is the second fixed-effect design, shared by the residual models
QtX2_R_list[[Qt_list_ID]] = QtX2_R_set
}
# update progress bar
if(verbose) setTxtProgressBar(pb,getTxtProgressBar(pb)+n_matrices)
next
}
# add new column to Qt_list_rows
Qt_list_rows = cbind(Qt_list_rows,FALSE)
Qt_list_rows[rows,Qt_list_ID] = TRUE
# find Qt = svd(ZLKLtZt)$u
if(ncol(ZL) < 0.9*length(rows)) {
# a faster way of taking the SVD of ZLKZLt, particularly if ncol(ZL) < nrow(ZL). Probably no benefit if ncol(K) > nrow(ZL)
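# identity used: write K = U D U' (svd; K is symmetric so u == v). Then
# ZL K ZL' = (ZL U) D (ZL U)'. With the QR decomposition ZL U = Q R this is
# Q (R D R') Q', and an SVD of the small matrix R D R' = Us Ds Us' gives
# ZL K ZL' = (Q Us) Ds (Q Us)', so Qt = t(Q %*% Us) at O(n r^2) cost instead
# of an O(n^3) SVD of the full n x n matrix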
if(is.null(svd_K1)){
svd_K1 = svd(RE_setup[[1]]$K)
}
qr_ZU = qr(RE_setup[[1]]$ZL[rows,,drop=FALSE] %*% svd_K1$u)
R_ZU = drop0(qr.R(qr_ZU,complete=F),tol=run_parameters$drop0_tol)
Q_ZU = drop0(qr.Q(qr_ZU,complete=T),tol=run_parameters$drop0_tol)
RKRt = R_ZU %*% diag(svd_K1$d) %*% t(R_ZU)
svd_RKRt = svd(RKRt)
RKRt_U = svd_RKRt$u
if(ncol(Q_ZU) > ncol(RKRt_U)) RKRt_U = bdiag(RKRt_U,diag(1,ncol(Q_ZU)-ncol(RKRt_U)))
Qt = t(Q_ZU %**% RKRt_U)
} else{
ZKZt = with(RE_setup[[1]],ZL[rows,,drop=FALSE] %*% K %*% t(ZL[rows,,drop=FALSE]))
if(is(ZKZt,'Matrix') && isDiagonal(ZKZt)) {
nn = nrow(ZKZt)
result = list(d = diag(ZKZt),u = sparseMatrix(i=1:nn,j=1:nn,x=1))
} else{
result = svd(ZKZt)
}
Qt = t(result$u)
}
Qt = as(drop0(as(Qt,'dgCMatrix'),tol = run_parameters$drop0_tol),'dgCMatrix')
if(nnzero(Qt)/length(Qt) > 0.5) Qt = as.matrix(Qt) # only store as sparse if it is sparse
QtZL_matrices_set = lapply(RE_setup,function(re) Qt %*% re$ZL[rows,,drop=FALSE])
# QtZL_set = do.call(cbind,QtZL_matrices_set[RE_names])
# if(nnzero(QtZL_set)/length(QtZL_set) > 0.5) QtZL_set = as(QtZL_set,'dgCMatrix')
QtX1_set = Qt %**% X1[rows,X1_cols,drop=FALSE]
QtX1_keepColumns = logical(0)
if(ncol(QtX1_set)>0) QtX1_keepColumns = !(1:ncol(QtX1_set) %in% caret::findLinearCombos(QtX1_set)$remove) # assess which columns of X1 are identifiable in this data set
QtX1_set = QtX1_set[,QtX1_keepColumns,drop=FALSE] # drop unidentifiable columns
Qt_list[[Qt_list_ID]] = Qt
QtX1_list[[Qt_list_ID]] = list( X1 = QtX1_set,
keepColumns = QtX1_keepColumns
)
ZKZts_set = list()
for(i in 1:n_RE){
ZKZts_set[[i]] = forceSymmetric(drop0(QtZL_matrices_set[[i]] %*% RE_setup[[i]]$K %*% t(QtZL_matrices_set[[i]]),tol = run_parameters$drop0_tol))
ZKZts_set[[i]] = as(as(ZKZts_set[[i]],'CsparseMatrix'),'dgCMatrix')
if(nnzero(ZKZts_set[[i]])/length(ZKZts_set[[i]]) > 0.5) {
ZKZts_set[[i]] = as.matrix(ZKZts_set[[i]])
}
}
chol_V_list_list[[Qt_list_ID]] = make_chol_V_list(ZKZts_set,h2s_matrix,run_parameters$drop0_tol,
verbose,pb,setTxtProgressBar,getTxtProgressBar,ncores)
# convert any to dense if possible
for(i in 1:length(chol_V_list_list[[Qt_list_ID]])){
chol_V_list_list[[Qt_list_ID]][[i]] = drop0(chol_V_list_list[[Qt_list_ID]][[i]],tol = run_parameters$drop0_tol)
if(nnzero(chol_V_list_list[[Qt_list_ID]][[i]])/length(chol_V_list_list[[Qt_list_ID]][[i]]) > 0.25){
chol_V_list_list[[Qt_list_ID]][[i]] = as.matrix(chol_V_list_list[[Qt_list_ID]][[i]])
}
}
if(verbose>1) print(sprintf('Set %d S',set))
ZL_list[[Qt_list_ID]] = ZL
chol_Ki_mats_set = chol_Ki_mats
ZtZ_set = as(forceSymmetric(drop0(crossprod(ZL_list[[Qt_list_ID]][rows,]),tol = run_parameters$drop0_tol)),'dgCMatrix')
if(length(RE_setup) == 1) {
S = simultaneous_diagonalize(ZtZ_set,solve(chol_Ki_mats[[1]]))$S
if(nnzero(S)/length(S) > 0.5) {
S = as.matrix(S) # only store as sparse if it is sparse
} else {
S = as(S,'dgCMatrix')
}
ZL_list[[Qt_list_ID]] = ZL_list[[Qt_list_ID]] %**% S
ZtZ_set = Diagonal(ncol(ZL),colSums(ZL_list[[Qt_list_ID]][rows,]^2))
# ZtZ_set = as(forceSymmetric(drop0(crossprod(ZL_list[[Qt_list_ID]][x,]),tol = run_parameters$drop0_tol)),'dgCMatrix') # Not needed because must be symmetric
RE_L_list[[Qt_list_ID]] = RE_setup[[1]]$L %**% S
chol_Ki_mats_set[[1]] = as(diag(1,nrow(ZtZ_set)),'dgCMatrix')
}
if(verbose>1) print(sprintf('Set %d chol_ZtZ_Kinv_list_list',set))
if(length(RE_setup) == 1 && isDiagonal(ZtZ_set) && all(sapply(chol_Ki_mats_set,isDiagonal))) {
# in the case of 1 random effect with a diagonal covariance matrix, we can skip the expensive calculations:
# both terms of ZtZ/(1-h2) + K_inv/h2 are diagonal, so the Cholesky factor is just the
# element-wise square root (diag(chol_Ki)^2 recovers diag(K_inv))
chol_ZtZ_Kinv_list_list[[Qt_list_ID]] = lapply(1:ncol(h2s_matrix),function(i) {
nn = nrow(ZtZ_set)
sparseMatrix(i=1:nn,j=1:nn,x = sqrt(1/(1-h2s_matrix[1,i])*diag(ZtZ_set) + 1/h2s_matrix[1,i]*diag(chol_Ki_mats_set[[1]])^2))
})
if(verbose) setTxtProgressBar(pb,getTxtProgressBar(pb)+ncol(h2s_matrix))
} else if(set == 1 || sum(trait_sets[[set]]$h2_priors_resids[-1,]) > 0) {
# Note: set >1 only applies to the resids (factors always use set==1).
# The per-set resid h2 prior is checked; its first row corresponds to the all-zero h2 grid point.
chol_ZtZ_Kinv_list_list[[Qt_list_ID]] = make_chol_ZtZ_Kinv_list(chol_Ki_mats_set,h2s_matrix,ZtZ_set,
run_parameters$drop0_tol,
verbose,pb,setTxtProgressBar,getTxtProgressBar,ncores)
} else{
# if the prior is concentrated at h2s==0, then we don't need to sample U. All values will be 0.
chol_ZtZ_Kinv_list_list[[Qt_list_ID]] = lapply(1:ncol(h2s_matrix),function(x) matrix(1,0,0))
if(verbose) setTxtProgressBar(pb,getTxtProgressBar(pb)+ncol(h2s_matrix))
}
}
if(verbose) close(pb)
# Qt matrices for factors are only used with row set 1
rows = trait_sets[[1]]$rows
Qt1_U2_F = if(is.null(U2)) NULL else Qt_list[[1]] %**% U2[rows,,drop=FALSE]
Qt1_X2_F = Qt_list[[1]] %**% X2[rows,,drop=FALSE]
MegaLMM_state$run_variables = c(MegaLMM_state$run_variables,
list(
Qt_list = Qt_list,
QtX1_list = QtX1_list,
QtX2_R_list = QtX2_R_list,
Qt1_U2_F = Qt1_U2_F,
Qt1_X2_F = Qt1_X2_F,
chol_V_list_list = chol_V_list_list,
chol_ZtZ_Kinv_list_list = chol_ZtZ_Kinv_list_list
))
MegaLMM_state$run_variables$trait_sets = trait_sets
MegaLMM_state$data_matrices$ZL = ZL
MegaLMM_state$data_matrices$RE_setup = RE_setup
MegaLMM_state$data_matrices$ZL_list = ZL_list
MegaLMM_state$data_matrices$RE_L_list = RE_L_list
MegaLMM_state$run_variables$r_RE = r_RE
return(MegaLMM_state)
}
MegaLMM_initialize_variables = function(MegaLMM_state,...){
run_parameters = MegaLMM_state$run_parameters
run_variables = MegaLMM_state$run_variables
data_matrices = MegaLMM_state$data_matrices
priors = MegaLMM_state$priors
trait_sets = run_variables$trait_sets
if(!'Qt_list' %in% names(run_variables)) stop('run MegaLMM_preliminary_calculations() first')
current_state = with(c(run_parameters,run_variables,data_matrices,priors),{
# MegaLMM_state$current_state = list()
# MegaLMM_state$current_state$trait_sets_current_state = list()
Eta = MegaLMM_state$current_state$Eta
# Factor discrete variances
# F_h2 is an n_RE x K matrix: one column of variance-component proportions per factor
F_h2_index = sample(c(1:ncol(h2s_matrix))[h2_priors_factors>0],K,replace=T)
F_h2 = h2s_matrix[,F_h2_index,drop=FALSE]
tot_F_prec = matrix(1,nrow=1,ncol=K)
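# note: with byrow = TRUE, consecutive rnorm() draws fill across the K columns,
# so the length-K sd vector sqrt(F_h2[1,]/tot_F_prec) is recycled column-wise,
# giving each factor its own variance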
U_F = matrix(rnorm(sum(r_RE) * K, 0, sqrt(F_h2[1,] / tot_F_prec)),ncol = K, byrow = T)
rownames(U_F) = colnames(ZL)
# Factor fixed effects
B2_F = 0*matrix(rnorm(b2 * K),b2,K)
rownames(B2_F) = colnames(X2)
F = X2 %*% B2_F + ZL %*% U_F + matrix(rnorm(n * K, 0, sqrt((1-colSums(F_h2)) / tot_F_prec)),ncol = K, byrow = T)
F = as.matrix(F)
Lambda_mean = matrix(0,K,p)
colnames(Lambda_mean) = colnames(Eta)
trait_sets_current_state = list()
for(set in seq_along(trait_sets)) {
if(length(trait_sets[[set]]$cols) == 0) next
trait_sets_current_state[[set]] = with(trait_sets[[set]],{
# trait_set = trait_sets[[set]]
# MegaLMM_state$current_state$trait_sets_current_state[[set]] = list()
# if(length(trait_sets[[set]]$cols) == 0) next
# p = length(trait_set$cols)
# traitnames = trait_set$colnames
p = length(cols)
traitnames = colnames
Lambda_prec = matrix(1,K,p)
Lambda = matrix(rnorm(K*p,0,sqrt(1/Lambda_prec)),nr = K,nc = p)
colnames(Lambda) = traitnames
tot_Eta_prec = matrix(rgamma(p,shape = tot_Eta_prec_shape,rate = tot_Eta_prec_rate),nrow = 1)
colnames(tot_Eta_prec) = traitnames
resid_h2_index = matrix(apply(h2_priors_resids,2,function(x) sample(c(1:ncol(h2s_matrix))[x>0],1)),nrow=1)
resid_h2 = h2s_matrix[,resid_h2_index,drop=FALSE]
U_R = matrix(rnorm(sum(r_RE) * p, 0, sqrt(resid_h2[1,] / tot_Eta_prec)),ncol = p, byrow = T)
colnames(U_R) = traitnames
rownames(U_R) = colnames(ZL)
# Fixed effects
b1 = ncol(X1)
B1 = matrix(rnorm(b1*p), ncol = p)
colnames(B1) = traitnames
b2_R = ifelse(use_X2,b2,0)
B2_R = 0*matrix(rnorm(b2_R*p),b2_R,p)
colnames(B2_R) = traitnames
if(b2_R > 0) rownames(B2_R) = colnames(X2) # only when X2 is actually used; otherwise B2_R has 0 rows
XB = X1[,X1_cols,drop=FALSE] %**% B1[X1_cols,,drop=FALSE]
if(use_X2) XB = XB + X2 %*% B2_R
var_Eta = rep(1,p)
Eta_mean = XB + F %**% Lambda + ZL_list[[Qt_list_ID]] %**% U_R
list(
cols = cols,
Eta = Eta[,cols],
Eta_mean = Eta_mean,
Lambda = Lambda,
tot_Eta_prec = tot_Eta_prec,
resid_h2_index = resid_h2_index,
resid_h2 = resid_h2,
U_R = U_R,
B1 = B1,
B2_R = B2_R,
XB = XB,
var_Eta = var_Eta
)
})
trait_sets_current_state[[set]] = trait_sets[[set]]$sample_Eta(trait_sets_current_state[[set]],trait_sets[[set]]$sampler_parameters)
# trait_sets_current_state[[set]] = trait_sets[[set]]$B2_R_prior$sampler(list(current_state = trait_sets_current_state[[set]],
# data_matrices = data_matrices,
# = trait_sets[[set]]$B2_R_prior))
}
list(
K = K,
Kr = K,
tot_F_prec = tot_F_prec,
F_h2_index = F_h2_index,
F_h2 = F_h2,
U_F = U_F,
F = F,
B2_F = B2_F,
Lambda_mean = Lambda_mean,
trait_sets_current_state = trait_sets_current_state,
nrun = 0,
total_time = 0
)
})
MegaLMM_state$current_state = assemble_current_state(current_state)
# Initialize parameters for Lambda_prior, B_prior, and QTL_prior (may be model-specific)
# MegaLMM_state$current_state = MegaLMM_state$priors$Lambda_prior$sampler(MegaLMM_state)
# save the initial RNG state
MegaLMM_state$current_state$RNG = list(
Random.seed = .Random.seed,
RNGkind = RNGkind()
)
# ----------------------------- #
# --- Initialize MegaLMM_state --- #
# ----------------------------- #
# Posterior = reset_Posterior(MegaLMM_state$Posterior,MegaLMM_state)
# MegaLMM_state$Posterior = Posterior
#
# MegaLMM_state$Posterior$posteriorSample_params = unique(c(MegaLMM_state$Posterior$posteriorSample_params,observation_model_state$posteriorSample_params))
# MegaLMM_state$Posterior$posteriorMean_params = unique(c(MegaLMM_state$Posterior$posteriorMean_params,observation_model_state$posteriorMean_params))
return(MegaLMM_state)
}
assemble_matrix = function(current_state,trait_sets,matrix,by = 'cols') {
if(by=='cols') {
new_matrix = do.call(cbind,lapply(current_state$trait_sets_current_state[trait_sets],function(x) x[[matrix]]))
} else{
new_matrix = do.call(rbind,lapply(current_state$trait_sets_current_state[trait_sets],function(x) x[[matrix]]))
}
new_matrix
}
assemble_current_state = function(current_state,
sets = 1:length(current_state$trait_sets_current_state),
matrices = c('Lambda','B1','B2_R','U_R','tot_Eta_prec','resid_h2_index','resid_h2','XB','Eta_mean','Eta')) {
for(i in seq_along(matrices)) {
current_state[[matrices[i]]] = assemble_matrix(current_state,sets,matrices[i],'cols')
}
current_state
}
partition_data_missing = function(Y,max_NA_groups = ncol(Y), starting_groups = NULL, verbose=FALSE) {
Y_missing = is.na(Y)
Y_missing_mat = as.matrix(Y_missing)
non_missing_rows = unname(which(rowSums(!Y_missing)>0))
# if(is.null(max_NA_groups)) max_NA_groups = MegaLMM_state$run_parameters$max_NA_groups
# if(is.null(verbose)) verbose = MegaLMM_state$run_parameters$verbose
# create a base map
Missing_data_map = list(list(
Y_obs = non_missing_rows,
Y_cols = 1:ncol(Y_missing)
))
# if no groups allowed, just return base map
if(max_NA_groups <= 1) {
return(list(Missing_data_map = Missing_data_map, Missing_data_map_list = NULL,map_results=c()))
}
if(is.infinite(max_NA_groups)) {
Y_col_obs = lapply(1:ncol(Y_missing_mat),function(x) {
obs = which(!Y_missing_mat[,x],useNames=F)
names(obs) = NULL
obs
})
non_missing_rows = unname(which(rowSums(!Y_missing_mat)>0))
unique_Y_col_obs = unique(c(list(non_missing_rows),Y_col_obs))
unique_Y_col_obs_str = lapply(unique_Y_col_obs,paste,collapse='')
Y_col_obs_index = sapply(Y_col_obs,function(x) which(unique_Y_col_obs_str == paste(x,collapse='')))
Missing_data_map = lapply(1:max(unique(Y_col_obs_index)),function(i) {
Y_cols = which(Y_col_obs_index==i)
if(length(Y_cols) == 0) {
return(list(
Y_obs = non_missing_rows,
Y_cols = Y_cols
))
} else{
return(list(
Y_obs = unname(which(rowSums(!Y_missing[,Y_cols,drop=FALSE])>0)), # now find the rows with any non-missing data in this set of columns
Y_cols = Y_cols
))
}
})
return(list(
Missing_data_map = Missing_data_map,
Missing_data_map_list = NULL,
map_results = c()
))
}
# if a starting map is provided, initialize this
if(!is.null(starting_groups)) {
if(length(starting_groups) != ncol(Y_missing)) stop("starting_groups must be a vector of length ncol(Y).")
starting_groups = as.numeric(as.factor(starting_groups))
Missing_data_map = lapply(unique(starting_groups),function(i) {
Y_cols = which(starting_groups == i)
Y_obs = unname(which(rowSums(!Y_missing[,Y_cols,drop=FALSE])>0))
list(
Y_obs = Y_obs,
Y_cols = Y_cols
)
})
if(max(sapply(Missing_data_map,function(x) length(x$Y_obs))) < length(non_missing_rows)) {
Missing_data_map = c(list(list(
Y_obs = non_missing_rows,
Y_cols = c()
)), Missing_data_map)
}
}
# otherwise, use kmeans to sequentially split the group with the most non-excluded NAs,
# saving the sequence of maps so the user can select the best one
Missing_data_map_list = list(
Missing_data_map
)
iter = 1
map_results = data.frame(
map = iter,
N_groups = length(Missing_data_map),
max_group_size = length(non_missing_rows),
total_kept_NAs = sum(sapply(Missing_data_map,function(x) {
if(length(x$Y_cols) == 0) return(0)
sum(Y_missing_mat[x$Y_obs,x$Y_cols])
}))
)
if(verbose) print(map_results)
while(length(Missing_data_map) < max_NA_groups && iter < 2*max_NA_groups){
iter = iter+1
# count the number of non-excluded NAs in each group
group_NAs = sapply(Missing_data_map,function(x) {
if(length(x$Y_cols)<= 1) return(0) # don't split a group with only one column
sum(Y_missing_mat[x$Y_obs,x$Y_cols])
})
if(max(group_NAs) == 0) break
target = which.max(group_NAs) # group with the most NAs
group = Missing_data_map[[target]]
# split this cluster into two
if(length(group$Y_cols) == 2) {
clusters = list(cluster = c(1,2))
} else{
clusters = with(group,kmeans(t(Y_missing[Y_obs,Y_cols]),centers=2))
}
new_groups = lapply(seq_len(max(clusters$cluster)),function(i) {
Y_cols = group$Y_cols[which(clusters$cluster==i)]
Y_obs = unname(which(rowSums(!Y_missing_mat[,Y_cols,drop=FALSE])>0)) # now find the rows with any non-missing data in this set of columns
# if((length(non_missing_rows)-length(Y_obs))/length(non_missing_rows) <= min_perc_drop) {
# Y_obs = non_missing_rows # if Y_obs is too big, not worth keeping this cluster
# # this might make an infinite loop!
# }
list(
Y_obs = Y_obs,
Y_cols = Y_cols
)
})
new_groups = new_groups[order(sapply(new_groups,function(x) length(x$Y_obs)),decreasing=T)]
if(target == 1) {
if(length(new_groups[[1]]$Y_obs) == length(non_missing_rows)) {
Missing_data_map[1] = new_groups[1]
new_groups = new_groups[-1]
} else{
Missing_data_map[[1]]$Y_cols = numeric()
}
Missing_data_map = c(Missing_data_map,new_groups)
} else{
Missing_data_map = c(Missing_data_map[-target],new_groups)
}
Missing_data_map_list[[iter]] = Missing_data_map
results_i = data.frame(
map = iter,
N_groups = length(Missing_data_map),
max_group_size = max(sapply(Missing_data_map,function(x) length(x$Y_obs))[-1]),
total_kept_NAs = sum(sapply(Missing_data_map,function(x) {
if(length(x$Y_cols) == 0) return(0)
sum(Y_missing_mat[x$Y_obs,x$Y_cols])
}))
)
if(verbose) print(results_i)
map_results = rbind(map_results,results_i)
}
return(list(
Missing_data_map = Missing_data_map_list[[length(Missing_data_map_list)]],
Missing_data_map_list = Missing_data_map_list,
map_results = map_results
))
}
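# Example (a hedged sketch): partition a trait matrix with block-wise
# missingness into at most 3 column groups sharing observation sets:
#   Y = matrix(rnorm(100*6),100,6)
#   Y[1:20,1:2] = NA      # traits 1-2 unmeasured in rows 1-20
#   Y[81:100,5:6] = NA    # traits 5-6 unmeasured in rows 81-100
#   part = partition_data_missing(Y, max_NA_groups = 3, verbose = TRUE)
#   str(part$Missing_data_map)   # list of Y_obs/Y_cols groups
#   part$map_results             # NAs retained at each candidate split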