#'
#' \code{getMainEffects}
#' @param dats \code{labelledDataFrame.frame} with variables in columns and samples in rows.
#' @param depVarName \code{string} name of the classLabel variable.
#' @param indVarNames Vector of independent variable names, to name output matrices
#' @param reg.fam \code{string} glm regression family name.
#' @param nCovs \code{numeric} of covariates included.
#' @param excludeMainEffects \code{logical} indicating whether to exclude main effect terms.
#' @return regainMatrix \code{matrix} of variable by variable regression coefficients.
#' @export
regain <- function(dats, depVarName, indVarNames, reg.fam="binomial", nCovs=0, excludeMainEffects=F){
# get the main effect raw coefs, std errs, pvals, standard coefs, and convergence indicators
mains <- getMainEffects(dats,
indVarNames=indVarNames,
depVarName=depVarName,
reg.fam=reg.fam,
nCovs=nCovs)
# get the interaction effect raw coefs, std errs, pvals, standard coefs, and convergence indicators
ints <- getInteractionEffects(dats,
indVarNames=indVarNames,
depVarName=depVarName,
reg.fam=reg.fam,
nCovs=nCovs,
excludeMainEffects=excludeMainEffects)
tmp <- ints$conv.mat # matrix of convergence indicators for interaction models - 0's on diagonal
diag(tmp) <- as.numeric(mains["converged",]) # load diagonal with main effect convergence indicators
conv.all.mat <- tmp
tmp <- ints$coef.mat # matrix of raw coefs for interaction models - 0's on diagonal
diag(tmp) <- as.numeric(mains["coef",]) # load diagonal with main effect raw coefs
coef.all.mat <- tmp
tmp <- ints$stderr.mat # matrix of standard errors for interaction model coefs - 0's on diagonal
diag(tmp) <- as.numeric(mains["se",]) # load diagonal with standard errors for main effect coefs
stderr.all.mat <- tmp
tmp <- ints$pval.mat # matrix of pvals for interaction model coefs - 0's on diagonal
diag(tmp) <- as.numeric(mains["pval",]) # load diagonal with pvals for main effect coefs
pval.all.mat <- tmp
tmp <- ints$stdbeta.mat # matrix of standardized interaction model coefs - 0's on diagonal
diag(tmp) <- as.numeric(mains["stdbeta",]) # load diagonal with standardized main effect coefs
stdbeta.all.mat <- tmp
list(convergence=conv.all.mat, raw.coefs=coef.all.mat, stderrs=stderr.all.mat, pvals=pval.all.mat, stdbetas=stdbeta.all.mat)
}
#'
#' \code{getMainEffects}
#' Wrapper for fitMainEffectModel with sapply over independent variable names.
#' Does not include covariates, so if you have covariates use
#' indVarNames = colnames(dats)[1:(ncol(dats) - nCovs - 1)]
#' if the columns are organized as: independent vars, covars, pheno
#' @param dats \code{data.frame} with variables in columns and samples in rows.
#' @param indVarNames vector of independent variable names to loop over.
#' @param depVarName \code{string} name of the classLabel variable.
#' @param regressionFamily \code{string} glm regression family name.
#' @param nCovs \code{numeric} of included covariates.
#' @return mainEffectValues \code{vector} of main effect values.
getMainEffects <- function(dats, indVarNames=NULL, depVarName=NULL,
reg.fam="binomial", nCovs=0){
theMains <- sapply(indVarNames,
function(x) fitMainEffectModel(dats, variableName=x,
depVarName=depVarName,
regressionFamily=reg.fam,
numCovariates=nCovs),
simplify="matrix", USE.NAMES=T)
return(theMains)
}
#'
#' \code{fitMainEffectModel}
#' Get the main effect of a variable using generalized linear regression .
#' @param labelledDataFrame \code{data.frame} with variables in columns and samples in rows.
#' @param variableName \code{string} name of the variable to consider.
#' @param depVarName \code{string} name of the classLabel variable.
#' @param regressionFamily \code{string} glm regression family name.
#' @param numCovariates \code{numeric} of included covariates.
#' @return \code{data.frame} with variable, convergence status, beta coefficient,
#' p-value, standard error and standardized beta columns.
fitMainEffectModel <- function(labelledDataFrame, variableName, depVarName,
regressionFamily, numCovariates) {
if (numCovariates > 0) {
covarsStart <- ncol(labelledDataFrame) - numCovariates
covarNames <- colnames(labelledDataFrame)[covarsStart:(ncol(labelledDataFrame) - 1)]
covarModelParts <- NULL
for (i in 1:length(covarNames)) {
covarModelParts <- c(covarModelParts, paste("`", covarNames, "`", sep = ""))
}
formulaString <- paste(depVarName, " ~ `", variableName,
"` + ", paste(covarModelParts, collapse = "+"), sep = "")
regressionFormula <- as.formula(formulaString)
} else {
regressionFormula <- as.formula(paste(depVarName, "~",
paste("`", variableName, "`", sep = ""),
sep = " "))
}
regressionModel <- glm2::glm2(regressionFormula, data = labelledDataFrame,
family = regressionFamily, na.action=na.omit)
#interceptCoeff <- summary(regressionModel)$coefficients[1, "Estimate"]
mainCoeff <- summary(regressionModel)$coefficients[2, "Estimate"]
mainStdErr <- summary(regressionModel)$coefficients[2, "Std. Error"]
if (regressionFamily == "binomial") {
#interceptPval <- summary(regressionModel)$coefficients[1, "Pr(>|z|)"]
mainPval <- summary(regressionModel)$coefficients[2, "Pr(>|z|)"]
mainStat <- summary(regressionModel)$coefficients[2, "z value"]
}
else {
#interceptPval <- summary(regressionModel)$coefficients[1, "Pr(>|t|)"]
mainPval <- summary(regressionModel)$coefficients[2, "Pr(>|t|)"]
mainStat <- summary(regressionModel)$coefficients[2, "t value"]
}
data.frame(converged = regressionModel$converged,
coef = mainCoeff,
se = mainStdErr,
pval = mainPval,
stdbeta = mainStat)
}
#'
#' \code{getInteractionEffects}
#'
#' @param dats \code{labelledDataFrame.frame} with variables in columns and samples in rows.
#' @param depVarName \code{string} name of the classLabel variable.
#' @param reg.fam \code{string} glm regression family name.
#' @param nCovs \code{numeric} of covariates included.
#' @param excludeMainEffects \code{logical} indicating whether to exclude main effect terms.
#' @param indVarNames Vector of independent variable names, to name output matrices
#' @return \code{data.frame} with variable, convergence status, beta coefficient,
#' p-value, standard error and standardized beta columns.
# i.e. indVarNames = colnames(dats)[1:(ncol(dats) - nCovs - 1)] if the columns are organized as: independent vars, covars, pheno
#
getInteractionEffects <- function(dats, depVarName, reg.fam="binomial", nCovs=0, excludeMainEffects=F, indVarNames){
numVars <- length(indVarNames)
conv.mat <- matrix(0, nrow=numVars, ncol=numVars)
coef.mat <- matrix(0, nrow=numVars, ncol=numVars)
stderr.mat <- matrix(0, nrow=numVars, ncol=numVars)
pval.mat <- matrix(0, nrow=numVars, ncol=numVars)
stdbeta.mat <- matrix(0, nrow=numVars, ncol=numVars)
for(i in 1:(numVars - 1)){
idx1 <- i
for(j in (i + 1):numVars){
idx2 <- j
getInts <- fitInteractionModel(dats,
variableIndices=c(idx1, idx2),
depVarName=depVarName,
regressionFamily=reg.fam,
numCovariates=nCovs,
excludeMainEffects=excludeMainEffects)
conv.mat[i, j] <- getInts$converged
coef.mat[i, j] <- getInts$coef
stderr.mat[i, j] <- getInts$stderr
pval.mat[i, j] <- getInts$pval
stdbeta.mat[i, j] <- getInts$stdbeta
}
}
conv.mat <- conv.mat + t(conv.mat)
colnames(conv.mat) <- indVarNames
row.names(conv.mat) <- indVarNames
coef.mat <- coef.mat + t(coef.mat)
colnames(coef.mat) <- indVarNames
row.names(coef.mat) <- indVarNames
stderr.mat <- stderr.mat + t(stderr.mat)
colnames(stderr.mat) <- indVarNames
row.names(stderr.mat) <- indVarNames
pval.mat <- pval.mat + t(pval.mat)
colnames(pval.mat) <- indVarNames
row.names(pval.mat) <- indVarNames
stdbeta.mat <- stdbeta.mat + t(stdbeta.mat)
colnames(stdbeta.mat) <- indVarNames
row.names(stdbeta.mat) <- indVarNames
list(conv.mat=conv.mat, coef.mat=coef.mat, stderr.mat=stderr.mat, pval.mat=pval.mat, stdbeta.mat=stdbeta.mat)
}
#'
#' \code{fitInteractionModel}
#'
#' @param labelledDataFrame \code{labelledDataFrame.frame} with variables in columns and samples in rows.
#' @param variableIndices \code{vector} of column indices of variable pairs.
#' @param depVarName \code{string} name of the classLabel variable.
#' @param regressionFamily \code{string} glm regression family name.
#' @param numCovariates \code{numeric} of covariates included.
#' @param excludeMainEffects \code{logical} indicating whether to exclude main effect terms.
#' @return \code{data.frame} with variable, convergence status, beta coefficient,
#' p-value, standard error and standardized beta columns.
fitInteractionModel <- function(labelledDataFrame, variableIndices, depVarName,
regressionFamily, numCovariates,
excludeMainEffects) {
variable1Idx <- variableIndices[1]
variable2Idx <- variableIndices[2]
variableNames <- colnames(labelledDataFrame)[1:(ncol(labelledDataFrame) - 1)]
variable1Name <- variableNames[variable1Idx]
variable2Name <- variableNames[variable2Idx]
if (excludeMainEffects) {
interactionTerm <- paste("`", variable1Name, "`", ":", "`",
variable2Name, "`", sep = "")
} else {
interactionTerm <- paste("`", variable1Name, "`", "*", "`",
variable2Name, "`", sep = "")
}
if (numCovariates > 0) {
covarsStart <- ncol(labelledDataFrame) - numCovariates
covarNames <- colnames(labelledDataFrame)[covarsStart:(ncol(labelledDataFrame)-1)]
covarsModelParts <- paste(covarNames, collapse = " + ")
regressionFormula <- as.formula(paste(depVarName, "~",
paste(interactionTerm, " + ",
covarsModelParts, sep = ""),
sep = " "))
}
else {
regressionFormula <- as.formula(paste(depVarName, "~", interactionTerm,
sep = " "))
}
# use glm2: Fits generalized linear models using the same model specification
# as glm in the stats package, but with a modified default fitting method that
# provides greater stability for models that may fail to converge using glm
# bcw - 10/18/15
regressionModel <- glm2::glm2(regressionFormula,
family = binomial(link = "logit"),
data = labelledDataFrame, na.action=na.omit)
if (numCovariates > 0) {
if (excludeMainEffects) {
interactionTermIndex <- 1 + numCovariates + 1
}
else {
interactionTermIndex <- 3 + numCovariates + 1
}
}
else {
if (excludeMainEffects) {
interactionTermIndex <- 2
}
else {
interactionTermIndex <- 4
}
}
#print(summary(regressionModel))
coeffs <- summary(regressionModel)$coefficients
#print(nrow(coeffs))
#print(interactionTermIndex)
# catch NA problems with interaction term fit
if (nrow(coeffs)<interactionTermIndex){
interactionCoeff <- NA
interactionStdErr <- NA
interactionPval <- NA
interactionStat <- NA
} else {
interactionCoeff <- coeffs[interactionTermIndex, "Estimate"]
interactionStdErr <- coeffs[interactionTermIndex, "Std. Error"]
if (regressionFamily == "binomial") {
interactionPval <- coeffs[interactionTermIndex, "Pr(>|z|)"]
interactionStat <- coeffs[interactionTermIndex, "z value"]
}
else {
interactionPval <- summary(regressionModel)$coefficients[interactionTermIndex, "Pr(>|t|)"]
interactionStat <- summary(regressionModel)$coefficients[interactionTermIndex, "t value"]
}
} # end catching fit NA problems
data.frame(converged = regressionModel$converged,
coef = interactionCoeff,
stderr = interactionStdErr,
pval = interactionPval,
stdbeta = interactionStat)
}
#' \code{EpistasisRank}
#'
#' @family feature selection functions
#' @param G \code{matrix} genetic association interaction network.
#' @param Gamma_vec \code{numeric} gamma vector, either a constant value or a vector
#' @param magnitude.sort default (T) is to rank variable scores by their magnitude
#' so negative effects are also ranked. Using False will force negative effects to be
#' ranked last.
#' @return rankTable \code{data.frame} with variable, EpistasisRank,
#' diagonal and degree columns. Sorted.
#' @export
EpistasisRank <- function(G = NULL, Gamma_vec = 0.85, magnitude.sort=T){
n <- nrow(G)
geneNames <- colnames(G)
Gdiag <- diag(G)
Gtrace <- sum(Gdiag)
#colsum <- colSums(G)
diag(G) <- 0
Gtrace <- Gtrace * n
colsumG <- colSums(G)
#rowSumG <- rowSums(G)
rowsum_denom <- matrix(0, n, 1)
for (i in 1:n) {
localSum <- 0
for (j in 1:n) {
factor <- ifelse(G[i, j] != 0, 1, 0)
localSum <- localSum + (factor * colsumG[j])
}
rowsum_denom[i] <- localSum
}
if (length(Gamma_vec) == 1) {
gamma_vec <- rep(Gamma_vec, n)
} else {
gamma_vec <- Gamma_vec
}
gamma_matrix <- matrix(nrow = n, ncol = n, data = rep(gamma_vec, n))
if (Gtrace) {
b <- ((1 - gamma_vec) / n) + (Gdiag / Gtrace)
}
else {
b <- ((1 - gamma_vec) / n)
}
D <- matrix(nrow = n, ncol = n, data = c(0))
diag(D) <- 1 / colsumG
I <- diag(n)
temp <- I - gamma_matrix * G %*% D
r <- solve(temp, b)
ERank <- r / sum(r)
rankTable <- data.frame(gene = geneNames, ER = ERank)
# interactions and main effects can be negative/disease protective, so you might
# want to see this in the top of the list.
if (magnitude.sort){ # large magnitudes (>0 and <0) will be at the top
rankTable[order(abs(rankTable$ER), decreasing = TRUE), ]
}else{ # negative scores will be at the bottom
rankTable[order(rankTable$ER, decreasing = TRUE), ]
}
}
#' Rank variables by EpistasisKatz algorithm.
#'
#' This function can be use as original Katz algorithm or as EpistasisKatz
#' that incorporates prior knowledge.
#'
#' \code{EpistasisKatz}
#'
#' @param A \code{matrix} network of features either in adjacency or interaction format.
#' @param alpha \code{numeric} a vector with numeric values.
#' @param beta \code{numeric} either a vector of constant values or prior knowledge.
#' @param magnitude.sort default (T) is to rank variable scores by their magnitude
#' so negative effects are also ranked. Using False will force negative effects to be
#' ranked last.
#' @return features ranking with features name.
#' @export
EpistasisKatz <- function(A = NULL, alpha = NULL, beta = NULL, magnitude.sort=T) {
lam.dom <- eigen(A)$values[1]
if (is.null(alpha)){
alpha <- .8/lam.dom # 80% of max threshold
}
if (is.null(beta)){
beta <- diag(A)
}
n <- nrow(A)
I <- diag(1, n);
A.nodiag <- A - diag(A)
KRank <- solve(I - alpha * A.nodiag,beta)
geneNames <- colnames(A)
rankTable <- data.frame(gene = geneNames, KR = KRank)
if (magnitude.sort){ # large magnitudes (>0 and <0) will be at the top
rankTable[order(abs(rankTable$KR), decreasing = TRUE), ]
}else{ # negative scores will be at the bottom
rankTable[order(rankTable$KR, decreasing = TRUE), ]
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.