#' Phase I : module generation
#'
#' Run first phase of the linker method where K modules of similarly expressed target genes and
#' relate them to a linear combination of very few regulators, according to the selected model. `LINKER_init()`
#' evaluate kmeans on a train set to generate a initial set of clusters containing drivers and target genes.
#' `LINKER_ReassignGenesToClusters()` reassigning genes based on closed match to new regulatory programs.
#' This functions are run inside the `LINKER_run` function, so it is not recommended to run it on its own.
#' `LINKER_corrClust()` go through two steps within a loop, learning regulatory program of modules and reassigning
#' genes. `LINKER_extract_modules()` extracts all the modules, genes and relevant information. `LINKER_EvaluateTestSet()`
#' fits the selected model with the test data. `LINKER_LearnRegulatoryPrograms()` learns the regulatory program of the modules.
#'
#' @param lognorm_est_counts Matrix of log-normalized estimated counts of the gene expression
#' data (Nr Genes x Nr samples)
#' @param target_filtered_idx Index array of the target genes on the lognorm_est_counts matrix if
#' SummarizedExperiment object is not provided.
#' @param regulator_filtered_idx Index array of the regulatory genes on the lognorm_est_counts matrix if
#' SummarizedExperiment object is not provided.
#' @param nassay if SummarizedExperiment object is passed as input to lognorm_est_counts, name of the
#' assay containing the desired matrix. Default: 1 (first item in assay's list).
#' @param regulator if SummarizedExperiment object is passed as input to lognorm_est_counts, name of the
#' rowData() variable to build target_filtered_idx and regulator_filtered_idx. This variable must be one
#' for driver genes and zero for target genes. Default: 'regulator'
#' @param mode Chosen method(s) to link module eigengenes to regulators. The available options are
#' 'VBSR', 'LASSOmin', 'LASSO1se', 'LASSOparam' and 'LM'. Default set to 'VBSR'
#' @param used_method Method selected for use. Default set to MEAN.
#' @param Nr_bootstraps Number of bootstrap of Phase I. By default, 1.
#' @param Lambda Lambda variable for Lasso models.
#' @param alpha Alpha variable for Lasso models.
#' @param pmax Maximum numbers of regulators that we want.
#' @param train_size Fraction of samples selected for the train samples. Default: 0.8.
#' @param corrClustNrIter Number of iteration for the phase I part of the method.
#' @param FDR The False Discovery Rate correction used for the modules and graphs GRN uncovering. By default, 0.05.
#' @param only_train whether to use only training samples within LINKER run. Default: FALSE
#'
#' @return list object containing the modules generated with the selected parameters and the stats associated to them.
#'
#' @examples
#'
#' ## This example is very similar to the `LINKER_run()` function.
#' ## Again, we are going to load the expression matrix dataset
#' lognorm_est_counts_p <- paste0(system.file('extdata', package='TraRe'),
#' '/expression_rewiring_example.txt')
#' lognorm_est_counts <- as.matrix(read.delim(lognorm_est_counts_p,
#' header=TRUE,row.names=1))
#'
#' ## Load gene info, its an array of regulators' names.
#' gene_info_p <- paste0(system.file('extdata',package='TraRe'),
#' '/geneinfo_rewiring_example.txt')
#' gene_info <- read.delim(gene_info_p,header=TRUE)
#' regulators <- gene_info[gene_info[,'regulator'] == 1,'uniq_isos']
#'
#' regulator_filtered_idx <- which(rownames(lognorm_est_counts)%in%regulators)
#' target_filtered_idx <- which(!rownames(lognorm_est_counts)%in%regulators)
#'
#' ## We recommend to use the default values of the function.
#' ## For the sake of time, we will select faster (and worse) ones.
#'
#' linkerphase1 <- TraRe::LINKER_runPhase1(lognorm_est_counts,
#' target_filtered_idx=target_filtered_idx,
#' regulator_filtered_idx=regulator_filtered_idx,
#' NrModules=10,mode='LASSOmin',used_method = "MEAN",
#' corrClustNrIter=10,Nr_bootstraps=1)
#' # saveRDS(linkerphase1,paste0(system.file('extdata',package='TraRe'),
#' # '/linker_phase_one.example.rds'))
#'
#' @export LINKER_runPhase1
LINKER_runPhase1 <- function(lognorm_est_counts, target_filtered_idx, regulator_filtered_idx, nassay = 1, regulator = "regulator",
NrModules, Lambda = 5, alpha = 1 - 1e-06, pmax = 10, train_size = 0.8, mode = "VBSR", used_method = "MEAN", corrClustNrIter = 100,
Nr_bootstraps = 1, FDR = 0.05,only_train=FALSE) {
# Check for SummarizedExperiment Object
if (inherits(lognorm_est_counts, "SummarizedExperiment")) {
# Generate target and regulator indexes
genenames <- rownames(lognorm_est_counts)
geneinfo <- SummarizedExperiment::rowData(lognorm_est_counts)
target_filtered_idx <- which(genenames %in% rownames(geneinfo)[geneinfo$regulator == 0])
regulator_filtered_idx <- which(genenames %in% rownames(geneinfo)[geneinfo$regulator == 1])
# Get lognorm_est_counts expression matrix.
lognorm_est_counts <- SummarizedExperiment::assays(lognorm_est_counts)[[nassay]]
}
# Creating the parameters structure
Parameters <- list(Lambda = Lambda, pmax = pmax, alpha = alpha, mode = mode, used_method = used_method)
sample_size <- dim(lognorm_est_counts)[2]
train_size_sampled <- round(train_size * sample_size)
EvaluateTestSet <- list()
bootstrap_modules <- list()
bootstrap_results <- list()
for (boost_idx in seq_len(Nr_bootstraps)) {
train_samples <- sample(seq_len(sample_size), train_size_sampled, replace = FALSE)
validation_samples <- setdiff(seq_len(sample_size), train_samples)
# Scale by driver genes
Regulator_data_train <- t(scale(t(lognorm_est_counts[regulator_filtered_idx, train_samples])))
Regulator_data_validation <- t(scale(t(lognorm_est_counts[regulator_filtered_idx, validation_samples])))
# Scale by samples within targets
MA_matrix_Var_train <- t(scale(t(lognorm_est_counts[target_filtered_idx, train_samples])))
MA_matrix_Var_validation <- t(scale(t(lognorm_est_counts[target_filtered_idx, validation_samples])))
LINKERinit <- LINKER_init(MA_matrix_Var = MA_matrix_Var_train, RegulatorData = Regulator_data_train, NrModules = NrModules,
corrClustNrIter = corrClustNrIter, Parameters = Parameters, FDR = FDR)
tmp <- LINKER_corrClust(LINKERinit)
bootstrap_results[[boost_idx]] <- tmp
if (!only_train){
EvaluateTestSet[[boost_idx]] <- LINKER_EvaluateTestSet(bootstrap_results[[boost_idx]], MA_matrix_Var_validation, Regulator_data_validation,
used_method = Parameters$used_method)
print(matrixStats::colMeans2(EvaluateTestSet[[boost_idx]]))
}
R.utils::printf("Bootstrap %d, NrModules %d:\n", boost_idx, bootstrap_results[[boost_idx]]$NrModules)
}
if (!only_train){
return(list(bootstrapResults = bootstrap_results, bootstrapTestStats = EvaluateTestSet))
}
return(list(bootstrapResults = bootstrap_results))
}
#' @export
#' @rdname LINKER_runPhase1
#' @param MA_matrix_Var Matrix of log-normalized estimated counts of the gene expression data, centered and scaled, containing
#' only the train samples.
#' @param RegulatorData Expression matrix containing only the regulators of the train samples.
#' @param Parameters List of parameters containig lambda, pmax, alpha, mode and used method.
#' @param NrModules Number of modules that are a priori to be found (note that the final number of modules
#' discovered may differ from this value). By default, 100 modules.
LINKER_init <- function(MA_matrix_Var, RegulatorData, NrModules, corrClustNrIter = 21, Parameters, FDR) {
if (nrow(MA_matrix_Var) > NrModules) {
# K-means++-style initialization
rnd_cent <- matrix()
# First one at random
rnd_cent <- (MA_matrix_Var[sample(seq_len(nrow(MA_matrix_Var)), 1), ])
if (NrModules > 0) {
if (NrModules > 1) {
# the second value is the least correlated from the first one
corr_dists <- apply(MA_matrix_Var, 1, function(x) ((x %*% rnd_cent)/(length(x) - 1))^2)
rnd_cent <- cbind(as.matrix(rnd_cent), as.matrix(MA_matrix_Var[which(corr_dists == min(corr_dists))[1], ]))
rnd_cent <- t(rnd_cent)
if (NrModules > 2) {
# compute the rest of the centers
Px <- numeric(length = nrow(MA_matrix_Var))
for (center_idx in 3:NrModules) {
for (i in seq_len(nrow(MA_matrix_Var))) {
gene <- MA_matrix_Var[i, ]
corr_dists <- apply(rnd_cent, 1, function(x) ((x %*% gene)/(length(x) - 1))^2)
Px[i] <- 2 - max(corr_dists)
}
if (sum(is.finite(Px)) != length(Px)) {
message("asdf")
}
rnd_cent <- rbind(rnd_cent, MA_matrix_Var[sample(seq_len(nrow(MA_matrix_Var)), 1, prob = Px), ])
}
}
}
}
ModuleVectors <- rnd_cent
Data <- MA_matrix_Var
Clusters <- numeric()
for (jj in seq_len(5)) {
for (i in seq_len(nrow(MA_matrix_Var))) {
CurrentGeneVector <- Data[i, , drop = FALSE]
Correlations <- (stats::cor(t(CurrentGeneVector), t(ModuleVectors)))
corr <- data.matrix(Correlations, rownames.force = NA)
MaxCorrelation <- max(corr, na.rm = TRUE)
MaxPosition <- which(signif(corr, digits = 7) == signif(MaxCorrelation, digits = 7))
MaxPosition <- MaxPosition[1] # this is new, to avoid two different reassignements
Clusters[i] <- MaxPosition
}
ClusterIDs <- unique(Clusters)
for (idx in seq_along(ClusterIDs)) {
genesInModule <- which(Clusters == ClusterIDs[idx])
cx <- MA_matrix_Var[genesInModule, ]
if (length(genesInModule) > 1) {
if (Parameters$used_method == "LINKER") {
clusterSVD <- svd(cx)
y <- clusterSVD$v[, 1]
if (stats::cor(matrixStats::colMeans2(cx), y) < 0) {
y <- -y
}
} else {
ModuleVectors[idx, ] <- matrixStats::colMeans2(cx)
}
} else {
ModuleVectors[idx, ] <- cx
}
}
}
} else {
stop("The number of modules is too large compared to the total number of genes.")
}
ModuleMembership <- as.numeric(Clusters)
names(ModuleMembership) <- rownames(MA_matrix_Var)
return(list(MA_matrix_Var = MA_matrix_Var, RegulatorData = RegulatorData, ModuleMembership = ModuleMembership, Parameters = Parameters,
corrClustNrIter = corrClustNrIter, FDR = FDR))
}
#' @export
#' @rdname LINKER_runPhase1
#' @param Data Data Matrix of log-normalized estimated counts of the gene expression data, centered and scaled, containing
#' only the train samples.
#' @param Beta Coefficient on which the decision of reassigning genes is based.
#' @param Clusters Number of modules that are a priori to be found (note that the final number of modules discovered may differ from this value).
LINKER_ReassignGenesToClusters <- function(Data, RegulatorData, Beta, Clusters) {
MIN_NUM_GENES_PER_MODULE <- 2
RegulatorData_rownames <- rownames(RegulatorData)
Data_rownames <- rownames(Data)
NrGenes <- nrow(Data)
NrSamples <- ncol(Data)
NrReassignGenes <- 0
## reassigning genes based on the Beta getting the predictor data
X <- RegulatorData
# creating the cluster 'centroids'
X1 <- data.matrix(X)
ModuleVectors <- Beta %*% X1
GeneNames <- rownames(Data)
#Compute genes to centroid correlations
g_c_corr <- t(stats::cor(t(ModuleVectors),t(Data)))
#Compute new assigns based on correlations
nc <- as.numeric(apply(g_c_corr,1,function(x) which.max(x[!is.na(x)])))
# Remove cluster with too few genes. Avoids singularities. Could be solved imposing priors. future work
for (i in unique(nc)) {
NrGenesInCluster <- sum(nc == i)
if (NrGenesInCluster < MIN_NUM_GENES_PER_MODULE) {
# I need to reassign these genes
genesInModule <- which(nc == i)
# remove the cluster
ModuleVectors[i, ] <- 0
for (j in genesInModule) {
CurrentGeneVector <- Data[j, , drop = FALSE]
# Correlations = abs(cor(t(CurrentGeneVector),t(ModuleVectors)))
Correlations <- (stats::cor(t(CurrentGeneVector), t(ModuleVectors)))
corr <- data.matrix(Correlations, rownames.force = NA)
MaxCorrelation <- max(corr, na.rm = TRUE)
MaxPosition <- which(signif(corr, digits = 7) == signif(MaxCorrelation, digits = 7))
MaxPosition <- MaxPosition[1] # this is new, to avoid two different reassignements
nc[j] <- MaxPosition
}
}
}
NrReassignGenes <- length(which(nc != Clusters))
result <- list(NrReassignGenes = NrReassignGenes, Clusters = nc)
return(result)
}
#' @export
#' @rdname LINKER_runPhase1
#' @param LINKERinit Initialization object obtained from `LINKER_init()`.
LINKER_corrClust <- function(LINKERinit) {
NrIterations <- LINKERinit$corrClustNrIter
if (nrow(LINKERinit$RegulatorData) == 1) {
stop("Only one driver is detected. More than one driver is needed.\n")
}
Data <- LINKERinit$MA_matrix_Var
Clusters <- LINKERinit$ModuleMembership
RegulatorData <- LINKERinit$RegulatorData
Parameters <- LINKERinit$Parameters
RegulatorData_rownames <- rownames(RegulatorData)
Data_rownames <- rownames(Data)
# main loop We want to end with the regulatory programs, hence we loop for Reassign, regulatory
# STEP 1: learning the regulatory program for each cluster
regulatoryPrograms <- LINKER_LearnRegulatoryPrograms(Data, Clusters, RegulatorData, Lambda = Parameters$Lambda, alpha = Parameters$alpha,
pmax = Parameters$pmax, mode = Parameters$mode, used_method = Parameters$used_method, FDR = LINKERinit$FDR)
jj <- 1
while (jj < NrIterations) {
# STEP 2: reassigning genes based on closed match to new regulatory programs
ReassignGenesToClusters <- LINKER_ReassignGenesToClusters(Data, RegulatorData, regulatoryPrograms$Beta, Clusters)
jj <- jj + 1
NrReassignGenes <- ReassignGenesToClusters$NrReassignGenes
Clusters <- ReassignGenesToClusters$Clusters
#check if Clusters exists
if (!length(Clusters)){
stop('Unable to infer any GRN module.')
}
# STEP 1: learning the regulatory program for each cluster
regulatoryPrograms <- LINKER_LearnRegulatoryPrograms(Data, Clusters, RegulatorData, Lambda = Parameters$Lambda, alpha = Parameters$alpha,
pmax = Parameters$pmax, mode = Parameters$mode, used_method = Parameters$used_method, FDR = LINKERinit$FDR)
}
# update results structure
ModuleMembership <- as.matrix(Clusters)
rownames(ModuleMembership) <- rownames(Data)
colnames(ModuleMembership) <- c("ModuleNr")
result <- list(NrModules = length(unique(Clusters)), RegulatoryPrograms = regulatoryPrograms$Beta, AllRegulators = rownames(RegulatorData),
AllGenes = rownames(Data), ModuleMembership = ModuleMembership)
training_stats <- LINKER_EvaluateTestSet(result, Data, RegulatorData, used_method = LINKERinit$Parameters$used_method)
result <- list(NrModules = length(unique(Clusters)), RegulatoryPrograms = regulatoryPrograms$Beta, AllRegulators = rownames(RegulatorData),
AllGenes = rownames(Data), ModuleMembership = ModuleMembership, trainingStats = training_stats)
return(result)
}
#' @export
#' @rdname LINKER_runPhase1
#' @param results Matrix of log-normalized estimated counts of the gene expression data (Nr Genes x Nr samples).
LINKER_extract_modules <- function(results) {
modules <- list()
enriched_idx <- 1
NrBootstraps <- length(results$bootstrapResult)
for (idx_bootstrap in seq_len(NrBootstraps)) {
NrModules <- results$bootstrapResult[[idx_bootstrap]]$NrModules
boot_results <- results$bootstrapResult[[idx_bootstrap]]
boot_idx <- sort(unique(boot_results$ModuleMembership[, ]))
for (Module_number in seq_len(NrModules)) {
Module_target_genes_full_name <- boot_results$AllGenes[which(boot_results$ModuleMembership[, ] == boot_idx[Module_number])]
Module_target_gene_list <- vapply(Module_target_genes_full_name, function(x) strsplit(x, "\\|"), FUN.VALUE = list(""))
if (length(Module_target_gene_list[[1]]) == 1) {
# NOT FULL GENECODE NAME, ONLY ONE NAME PER GENE!
Module_target_genes <- Module_target_genes_full_name
} else {
# FULL GENECODE ANNOTATION!
Module_target_genes <- vapply(Module_target_gene_list, function(x) x[[6]], FUN.VALUE = "")
Module_target_genes <- unname(Module_target_genes)
}
Modules_regulators_full_name <- names(which(boot_results$RegulatoryPrograms[Module_number, ] != 0))
if (length(Modules_regulators_full_name) == 0) {
next
}
Modules_regulators_list <- vapply(Modules_regulators_full_name, function(x) strsplit(x, "\\|"), FUN.VALUE = list(""))
if (length(Modules_regulators_list[[1]]) == 1) {
# NOT FULL GENECODE NAME, ONLY ONE NAME PER GENE!
Modules_regulators <- Modules_regulators_full_name
} else {
# FULL GENECODE ANNOTATION!
Modules_regulators <- vapply(Modules_regulators_list, function(x) x[[6]], FUN.VALUE = "")
Modules_regulators <- unname(Modules_regulators)
}
modules[[enriched_idx]] <- list(target_genes = Module_target_genes_full_name, regulators = Modules_regulators_full_name,
regulatory_program = boot_results$RegulatoryPrograms[Module_number, ], training_stats = boot_results$trainingStats[Module_number,
], test_stats = results$bootstrapTestStats[[idx_bootstrap]][Module_number], assigned_genes = which(boot_results$ModuleMembership[,
] == Module_number), bootstrap_idx = idx_bootstrap)
enriched_idx <- enriched_idx + 1
}
}
return(modules)
}
#' @export
#' @rdname LINKER_runPhase1
#' @param LINKERresults List containing the number of clusters, regulatoryprogram, name of regulators and all genes and module membership.
#' @param MA_Data_TestSet Matrix of log-normalized estimated counts of the gene expression data, centered and scaled, containing
#' only the test samples.
#' @param RegulatorData_TestSet Expression matrix containing only the regulators of the test samples.
LINKER_EvaluateTestSet <- function(LINKERresults, MA_Data_TestSet, RegulatorData_TestSet, used_method = "MEAN") {
nrSamples <- ncol(MA_Data_TestSet)
RegulatorNames <- rownames(RegulatorData_TestSet)
# Iterating over the Modules
stats <- mat.or.vec(LINKERresults$NrModules, 7)
Rsquare <- mat.or.vec(LINKERresults$NrModules, 1)
RsquareAjusted <- mat.or.vec(LINKERresults$NrModules, 1)
modules <- list()
for (i in seq_len(LINKERresults$NrModules)) {
# check regulator presence
currentRegulators <- RegulatorNames[which(LINKERresults$RegulatoryPrograms[i, ] != 0)]
stats[i, 1] <- length(currentRegulators)
# checking the presence of the clusters
currentClusterGenes <- LINKERresults$AllGenes[which(LINKERresults$ModuleMembership[, 1] == i)]
stats[i, 2] <- length(currentClusterGenes)
# predict cluster expression in test set, always calculate but report the totel percentage weight that is represented
currentWeights <- LINKERresults$RegulatoryPrograms[i, which(LINKERresults$RegulatoryPrograms[i, ] != 0)]
modules[[i]] <- currentClusterGenes[currentClusterGenes %in% rownames(MA_Data_TestSet)]
# drop=FALSE, this solves the problem when you have only one regulator, so the previous version is not needed.
predictions <- (t(RegulatorData_TestSet[currentRegulators, , drop = FALSE])) %*% (currentWeights) # need to make sure that the first argument remains a matrix.
predictions <- data.matrix(predictions)
if (length(modules[[i]]) != 0) {
if (length(currentClusterGenes) > 1) {
cx <- MA_Data_TestSet[currentClusterGenes, ]
if (used_method == "LINKER") {
module_SVD <- svd(cx)
outcome <- module_SVD$v[, 1]
if (stats::cor(predictions, outcome) < 0) {
outcome <- -outcome
}
varEx <- module_SVD$d[1]^2/sum(module_SVD$d^2)
} else {
outcome <- matrixStats::colMeans2(cx)
varEx <- 0
}
} else {
outcome <- MA_Data_TestSet[currentClusterGenes, ]
varEx <- 0
}
module_data <- MA_Data_TestSet[currentClusterGenes, ]
if (nrow(t(module_data)) == 1) {
module_data <- t(module_data)
}
inmodule_corr <- abs(stats::cor(t(module_data), outcome))
meanInModuleCor <- mean(inmodule_corr)
homogeneity <- abs(stats::cor(t(module_data), t(module_data)))
homogeneity <- (sum(homogeneity) - dim(homogeneity)[1])/((dim(homogeneity)[1] - 1) * (dim(homogeneity)[1] - 1))
# using explained variance as metric, since mean square error is not enough, no baseline interpretation possible
SStot <- sum((outcome - mean(outcome))^2)
SSres <- sum((predictions - outcome)^2)
Rsquare <- 1 - (SSres/SStot)
RsquareAjusted <- 1 - (1 - Rsquare) * ((nrSamples - 1)/(nrSamples - length(currentRegulators)))
stats[i, 3] <- meanInModuleCor
stats[i, 4] <- Rsquare
stats[i, 5] <- RsquareAjusted
stats[i, 6] <- homogeneity
stats[i, 7] <- varEx
} else {
}
}
dimnames(stats) <- list(rownames(stats, do.NULL = FALSE, prefix = "Module_"), c("nrReg", "nrGen", "MeanInModuleCorr", "Rsquare",
"RsquareAdjusted", "homogeneity", "condition"))
return(stats)
}
#' @export
#' @rdname LINKER_runPhase1
#' @param Data Matrix of log-normalized estimated counts of the gene expression data, centered and scaled, containing
#' only the train samples.
#' @param Clusters Clusters generated from the linkerinit function.
LINKER_LearnRegulatoryPrograms <- function(Data, Clusters, RegulatorData, Lambda, alpha, pmax, mode, used_method = "MEAN", FDR) {
RegulatorData_rownames <- rownames(RegulatorData)
Data_rownames <- rownames(Data)
NrClusters <- length(unique(Clusters))
NrGenes <- nrow(Data)
NrSamples <- ncol(Data)
y_all <- mat.or.vec(NrClusters, NrSamples)
ClusterIDs <- unique(Clusters)
ClusterIDs <- sort(ClusterIDs, decreasing = FALSE)
cnt <- seq_len(NrClusters)
# This will register nr of cores/threads, keep this here so the user can decide how many cores based on their hardware.
#parallClass <- BiocParallel::bpparam()
#parallClass$workers <- NrCores
# Calculate Bettas
BetaY_all <- lapply(seq_len(NrClusters), function(i){
#CalcBettas <- function(i, Data, Clusters, RegulatorData, Lambda, alpha, mode, used_method) {
CurrentClusterPositions <- which(Clusters %in% ClusterIDs[i])
nrGenesInClusters <- length(CurrentClusterPositions)
if (nrGenesInClusters > 1) {
if (used_method == "LINKER") {
gene_svd <- svd(Data[CurrentClusterPositions, ])
y <- gene_svd$v[, 1]
if (stats::cor(matrixStats::colMeans2(Data[CurrentClusterPositions, ]), y) < 0) {
y <- -y
}
} else {
y <- matrixStats::colMeans2(Data[CurrentClusterPositions, ])
}
} else {
y <- Data[CurrentClusterPositions, ]
}
X <- RegulatorData
if (mode == "LASSOmin") {
fit <- glmnet::cv.glmnet(t(X), y, alpha = alpha)
nonZeroLambdas <- fit$lambda[which(fit$nzero > 0)]
nonZeroCVMs <- fit$cvm[which(fit$nzero > 0)]
if (length(which(nonZeroCVMs == min(nonZeroCVMs, na.rm = TRUE))) == 0) {
# for now: just print a warning, *although* this error WILL cause LINKER to crash in a few steps.
warnMessage <- paste0("\nOn cluster ", i, " there were no cv.glm results that gave non-zero coefficients.")
warning(warnMessage)
bestNonZeroLambda <- fit$lambda.min
} else {
bestNonZeroLambda <- nonZeroLambdas[which(nonZeroCVMs == min(nonZeroCVMs, na.rm = TRUE))]
}
b_o <- stats::coef(fit, s = fit$lambda.min)
b_opt <- c(b_o[2:length(b_o)]) # removing the intercept.
} else if (mode == "LASSO1se") {
fit <- glmnet::cv.glmnet(t(X), y, alpha = alpha)
nonZeroLambdas <- fit$lambda[which(fit$nzero > 0)]
nonZeroCVMs <- fit$cvm[which(fit$nzero > 0)]
if (length(which(nonZeroCVMs == min(nonZeroCVMs, na.rm = TRUE))) == 0) {
# for now: just print a warning, *although* this error WILL cause LINKER to crash in a few steps.
warnMessage <- paste0("\nOn cluster ", i, " there were no cv.glm results that gave non-zero coefficients.")
warning(warnMessage)
bestNonZeroLambda <- fit$lambda.min
} else {
bestNonZeroLambda <- nonZeroLambdas[which(nonZeroCVMs == min(nonZeroCVMs, na.rm = TRUE))]
}
b_o <- stats::coef(fit, s = fit$lambda.1se)
b_opt <- c(b_o[2:length(b_o)]) # removing the intercept.
} else if (mode == 'LASSOparam'){
fit <- glmnet::cv.glmnet(t(X), y, alpha = alpha)
fitquants <- stats::quantile(fit$lambda,probs=seq(0,1,1/9))
nonZeroLambdas <- fit$lambda[which(fit$nzero > 0)]
nonZeroCVMs <- fit$cvm[which(fit$nzero > 0)]
if (length(which(nonZeroCVMs == min(nonZeroCVMs, na.rm = TRUE))) == 0) {
# for now: just print a warning, *although* this error WILL cause LINKER to crash in a few steps.
warnMessage <- paste0("\nOn cluster ", i, " there were no cv.glm results that gave non-zero coefficients.")
warning(warnMessage)
bestNonZeroLambda <- fit$lambda.min
} else {
bestNonZeroLambda <- nonZeroLambdas[which(nonZeroCVMs == min(nonZeroCVMs, na.rm = TRUE))]
}
if (Lambda>0){
b_o <- stats::coef(fit, s = fitquants[Lambda])
}else{
b_o <- stats::coef(fit, s = 0)
}
b_opt <- c(b_o[2:length(b_o)]) # removing the intercept.
} else if (mode == "VBSR") {
res <- vbsr(y, t(X), n_orderings = 15, family = "normal")
betas <- res$beta
max_beta <- max(abs(betas))
betas[res$pval > FDR/(nrow(RegulatorData) * nrow(Data))] <- 0 #Bonferroni
b_opt <- betas
b_o <- 0
} else if (mode == "LM") {
b_opt <- numeric(length = nrow(RegulatorData))
for (i in seq_len(nrow(RegulatorData))) {
x <- X[i, ]
fit <- stats::lm(y ~ x)
s <- summary(fit)
if (s$coefficients[2, "Pr(>|t|)"] < FDR/(nrow(RegulatorData) * nrow(Data))) {
b_opt[i] <- s$coefficients[2, 1]
} else {
b_opt[i] <- 0
}
}
b_o <- 0
} else {
message("MODE NOT RECOGNIZED")
}
return(list(b_opt, y, b_o[1]))
})
# Initialize
#BetaY_all <- BiocParallel::bplapply(seq_len(NrClusters), CalcBettas, Data, Clusters, RegulatorData, Lambda, alpha, mode, used_method,
# BPPARAM = parallClass)
# Transform list of bettas into matrix, as .combine=cbind in foreach
BetaY_all <- do.call(cbind, BetaY_all)
Beta <- do.call(cbind, BetaY_all[1, ])
Beta <- t(Beta)
colnames(Beta) <- RegulatorData_rownames
rownames(Beta) <- gsub("result.", "Module_", rownames(Beta))
y_all <- do.call(cbind, BetaY_all[2, ])
y_all <- t(y_all)
rownames(y_all) <- gsub("result.", "Module_", rownames(y_all))
intercept <- do.call(cbind, BetaY_all[3, ])
intercept <- t(intercept)
intercept <- as.numeric(intercept)
# calculating some statistics
prediction <- (Beta %*% RegulatorData + intercept)
error <- y_all - prediction
result <- list(Beta = Beta, error = error)
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.