my_detectCores = function() {
ncores = try(suppressWarnings(as.numeric(system('printenv SLURM_CPUS_PER_TASK',intern=T))),silent=T)
if(length(ncores) == 0 || is.na(ncores)) ncores = parallel::detectCores()
ncores
}
optimize_n_threads = function(MegaLMM_state,n_threads,times = 10) {
exprs = lapply(n_threads,function(threads) bquote({set_MegaLMM_nthreads(.(threads));sample_latent_traits(MegaLMM_state)}))
names(exprs) = n_threads
res=microbenchmark::microbenchmark(list = exprs,times = times)
summary_res = summary(res)
list(optim = as.numeric(as.character(summary_res$expr[order(summary_res$mean)[1]])),results = summary_res)
}
make_sparse = function(X,type = 'dgCMatrix') {
X = as(as(as(X,'generalMatrix'),'CsparseMatrix'),type)
X
}
make_model_setup = function(formula,data,relmat = NULL) {
# ensure data has rownames
if(is.null(rownames(data))) rownames(data) = 1:nrow(data)
# check that RE's have approporiate levels in data
RE_levels = list() # a list of levels for each of the random effects
if(is.null(relmat)) relmat = list()
for(re in names(relmat)) {
# check that K is a matrix, then convert to Matrix
if(is.list(relmat[[re]]) && !is.data.frame(relmat[[re]])){
if(is.data.frame(relmat[[re]]$K)) relmat[[re]]$K = as.matrix(relmat[[re]]$K)
if(is.matrix(relmat[[re]]$K)) relmat[[re]]$K = Matrix(relmat[[re]]$K,sparse=T)
if(is.null(rownames(relmat[[re]]$K))) stop(sprintf('K %s must have rownames',re))
RE_levels[[re]] = rownames(relmat[[re]]$K)
} else{
if(is.data.frame(relmat[[re]])) relmat[[re]] = as.matrix(relmat[[re]])
if(is.matrix(relmat[[re]])) relmat[[re]] = Matrix(relmat[[re]],sparse=T)
if(is.null(rownames(relmat[[re]]))) stop(sprintf('K %s must have rownames',re))
RE_levels[[re]] = rownames(relmat[[re]])
}
}
for(re in names(RE_levels)){
if(!re %in% colnames(data)) stop(sprintf('Column "%s" required in data',re))
data[[re]] = as.factor(data[[re]]) # ensure 'data[[re]]' is a factor
if(!all(data[[re]] %in% RE_levels[[re]])) stop(sprintf('Levels of random effect %s missing.',re))
data[[re]] = factor(data[[re]],levels = RE_levels[[re]]) # add levels to data[[re]]
}
# Use lme4 to evaluate formula in data
# ensure there is a response
response = 'y'
while(response %in% all.vars(formula)){
response = paste0(response,response)
}
formula = sprintf('%s~%s',response,as.character(formula)[2])
data[[response]] = 1
lmod <- lme4::lFormula(formula,data=data,#weights=weights,
control = lme4::lmerControl(check.nobs.vs.nlev = 'ignore',check.nobs.vs.nRE = 'ignore'))
# compute RE_setup
RE_terms = lmod$reTrms
# construct the RE_setup list
# contains:
# Z: n x r design matrix
# K: r x r PSD covariance matrix
RE_setup = list()
for(i in 1:length(RE_terms$cnms)){
term = names(RE_terms$cnms)[i]
n_factors = length(RE_terms$cnms[[i]]) # number of factors for this grouping factor
# extract combined Z matrix
combined_Zt = RE_terms$Ztlist[[i]]
Zs_term = tapply(1:nrow(combined_Zt),gl(n_factors,1,nrow(combined_Zt),labels = RE_terms$cnms[[i]]),function(x) Matrix::t(combined_Zt[x,,drop=FALSE]))
# extract K from relmat. If missing, assign to NULL
K = NULL
if(term %in% names(relmat)) {
K = relmat[[term]]
if(is(K,'dsCMatrix')) K = make_sparse(K,'dgCMatrix')
if(nnzero(K)/length(K) > .5) K = as.matrix(K) # if not really sparse
}
if(!is.null(K)) {
if(!all(colnames(Zs_term[[1]]) %in% rownames(K))) stop('rownames of K not lining up with Z')
colnames(K) = rownames(K)
K = K[colnames(Zs_term[[1]]),colnames(Zs_term[[1]])]
} else {
K = make_sparse(diag(1,ncol(Zs_term[[1]])),'dgCMatrix')
rownames(K) = colnames(K) = colnames(Zs_term[[1]])
}
# make an entry in RE_setup for each random effect
for(j in 1:n_factors){
# name of variance component
name = term
if(n_factors > 1) name = paste(name,RE_terms$cnms[[i]][[j]],sep='.')
while(name %in% names(RE_setup)) name = paste0(name,'.1') # hack for when same RE used multiple times
# Z matrix
Z = make_sparse(Zs_term[[j]],'dgCMatrix')
RE_setup[[name]] = list(
term = term,
Z = Z,
K = K
)
}
# add names to RE_setup if needed
n_RE = length(RE_setup)
for(i in 1:n_RE){
if(is.null(names(RE_setup)[i]) || names(RE_setup)[i] == ''){
names(RE_setup)[i] = paste0('RE.',i)
}
}
}
return(list(lmod = lmod, RE_setup = RE_setup))
}
make_Missing_data_map = function(MegaLMM_state,max_NA_groups = NULL, starting_groups = NULL, verbose=NULL) {
Y_missing = MegaLMM_state$run_parameters$observation_model_parameters$observation_setup$Y_missing
Y_missing_mat = as.matrix(Y_missing)
non_missing_rows = unname(which(rowSums(!Y_missing)>0))
if(is.null(max_NA_groups)) max_NA_groups = MegaLMM_state$run_parameters$max_NA_groups
if(is.null(verbose)) verbose = MegaLMM_state$run_parameters$verbose
# create a base map
Missing_data_map = list(list(
Y_obs = non_missing_rows,
Y_cols = 1:ncol(Y_missing)
))
# if no groups allowed, just return base map
if(max_NA_groups <= 1) {
return(list(Missing_data_map = Missing_data_map, Missing_data_map_list = NULL,map_results=c()))
}
if(is.infinite(max_NA_groups)) {
Y_col_obs = lapply(1:ncol(Y_missing_mat),function(x) {
obs = which(!Y_missing_mat[,x],useNames=F)
names(obs) = NULL
obs
})
non_missing_rows = unname(which(rowSums(!Y_missing_mat)>0))
unique_Y_col_obs = unique(c(list(non_missing_rows),Y_col_obs))
unique_Y_col_obs_str = lapply(unique_Y_col_obs,paste,collapse='')
Y_col_obs_index = sapply(Y_col_obs,function(x) which(unique_Y_col_obs_str == paste(x,collapse='')))
Missing_data_map = lapply(1:max(unique(Y_col_obs_index)),function(i) {
Y_cols = which(Y_col_obs_index==i)
if(length(Y_cols) == 0) {
return(list(
Y_obs = non_missing_rows,
Y_cols = Y_cols
))
} else{
return(list(
Y_obs = unname(which(rowSums(!Y_missing[,Y_cols,drop=FALSE])>0)), # now find the rows with any non-missing data in this set of columns
Y_cols = Y_cols
))
}
})
return(list(
Missing_data_map = Missing_data_map,
Missing_data_map_list = NULL,
map_results = c()
))
}
# if a starting map is provided, initialize this
if(!is.null(starting_groups)) {
if(length(starting_groups) != ncol(Y_missing)) stop("starting_groups must be vector with length ncol(Y).")
starting_groups = as.numeric(as.factor(starting_groups))
Missing_data_map = lapply(unique(starting_groups),function(i) {
Y_cols = which(starting_groups == i)
Y_obs = unname(which(rowSums(!Y_missing[,Y_cols,drop=FALSE])>0))
list(
Y_obs = Y_obs,
Y_cols = Y_cols
)
})
if(max(sapply(Missing_data_map,function(x) length(x$Y_obs))) < length(non_missing_rows)) {
Missing_data_map = c(list(list(
Y_obs = non_missing_rows,
Y_cols = c()
)), Missing_data_map)
}
}
# otherwise, use kmeans to sequentially split map with the most non-excluded NAs
# save the sequence of maps so the user can select the best
Missing_data_map_list = list(
Missing_data_map
)
iter = 1
map_results = data.frame(
map = iter,
N_groups = length(Missing_data_map),
max_group_size = length(non_missing_rows),
total_kept_NAs = sum(sapply(Missing_data_map,function(x) {
if(length(x$Y_cols) == 0) return(0)
sum(Y_missing_mat[x$Y_obs,x$Y_cols])
}))
)
if(verbose) print(map_results)
while(length(Missing_data_map) < max_NA_groups && iter < 2*max_NA_groups){
iter = iter+1
# count the number of non-excluded NAs in each group
group_NAs = sapply(Missing_data_map,function(x) {
if(length(x$Y_cols)<= 1) return(0) # don't split a group with only one column
sum(Y_missing_mat[x$Y_obs,x$Y_cols])
})
if(max(group_NAs) == 0) break
target = which.max(group_NAs) # group with the most NAs
group = Missing_data_map[[target]]
# split this cluster into two
if(length(group$Y_cols) == 2) {
clusters = list(cluster = c(1,2))
} else{
clusters = with(group,kmeans(t(Y_missing[Y_obs,Y_cols]),centers=2))
}
new_groups = lapply(seq_len(max(clusters$cluster)),function(i) {
Y_cols = group$Y_cols[which(clusters$cluster==i)]
Y_obs = unname(which(rowSums(!Y_missing_mat[,Y_cols,drop=FALSE])>0)) # now find the rows with any non-missing data in this set of columns
# if((length(non_missing_rows)-length(Y_obs))/length(non_missing_rows) <= min_perc_drop) {
# Y_obs = non_missing_rows # if Y_obs is too big, not worth keeping this cluster
# # this might make an infinite loop!
# }
list(
Y_obs = Y_obs,
Y_cols = Y_cols
)
})
new_groups = new_groups[order(sapply(new_groups,function(x) length(x$Y_obs)),decreasing=T)]
if(target == 1) {
if(length(new_groups[[1]]$Y_obs) == length(non_missing_rows)) {
Missing_data_map[1] = new_groups[1]
new_groups = new_groups[-1]
} else{
Missing_data_map[[1]]$Y_cols = numeric()
}
Missing_data_map = c(Missing_data_map,new_groups)
} else{
Missing_data_map = c(Missing_data_map[-target],new_groups)
}
Missing_data_map_list[[iter]] = Missing_data_map
results_i = data.frame(
map = iter,
N_groups = length(Missing_data_map),
max_group_size = max(sapply(Missing_data_map,function(x) length(x$Y_obs))[-1]),
total_kept_NAs = sum(sapply(Missing_data_map,function(x) {
if(length(x$Y_cols) == 0) return(0)
sum(Y_missing_mat[x$Y_obs,x$Y_cols])
}))
)
if(verbose) print(results_i)
map_results = rbind(map_results,results_i)
}
return(list(
Missing_data_map = Missing_data_map_list[[length(Missing_data_map_list)]],
Missing_data_map_list = Missing_data_map_list,
map_results = map_results
))
}
set_Missing_data_map = function(MegaLMM_state,Missing_data_map) {
MegaLMM_state$run_variables$Missing_data_map = Missing_data_map
# find the values that are excluded by the groups in Missing_data_map
Y_missing_mat = matrix(1,MegaLMM_state$run_variables$n,MegaLMM_state$run_variables$p)
for(map in Missing_data_map) {
Y_missing_mat[map$Y_obs,map$Y_cols] = 0
}
# create an object that indexes the non-missing values by row
Y_row_obs = lapply(1:nrow(Y_missing_mat),function(x) {
obs = which(!Y_missing_mat[x,],useNames=F)
names(obs) = NULL
obs
})
non_missing_cols = unname(which(colSums(!Y_missing_mat)>0))
unique_Y_row_obs = unique(c(list(non_missing_cols),Y_row_obs))
unique_Y_row_obs_str = lapply(unique_Y_row_obs,paste,collapse=',')
Y_row_obs_index = sapply(Y_row_obs,function(x) which(unique_Y_row_obs_str == paste(x,collapse=',')))
Missing_row_data_map = lapply(seq_along(unique_Y_row_obs),function(i) {
x = unique_Y_row_obs[[i]]
return(list(
Y_cols = x,
Y_obs = which(Y_row_obs_index == i)
))
})
# expand ZL_list and RE_L_list for missing data map
MegaLMM_state$data_matrices$ZL_list = lapply(seq_along(Missing_data_map),function(x) MegaLMM_state$data_matrices$ZL_list[[1]])
MegaLMM_state$data_matrices$RE_L_list = lapply(seq_along(Missing_data_map),function(x) MegaLMM_state$data_matrices$RE_L_list[[1]])
MegaLMM_state$run_variables$Missing_row_data_map = Missing_row_data_map
return(MegaLMM_state)
}
# finds a matrix S that simultaneously diagonalizes A and B
# following algorithm here: https://math.stackexchange.com/questions/1079627/simultaneously-diagonalization-of-two-matrices
simultaneous_diagonalize = function(A,Binvsq) {
# sBAB = svd(forceSymmetric(t(Binvsq) %*% A %*% Binvsq))
# O = sBAB$u
# S = Binvsq %*% O
# return(list(S=S,d=sBAB$d))
eigenBAB = eigen(forceSymmetric(t(Binvsq) %*% A %*% Binvsq))
O = eigenBAB$vectors
S = Binvsq %*% O
return(list(S=S,d=eigenBAB$values))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.