#for i in `cat panelsApr2018`;
#do echo "cd /SAN/neuroscience/WT_BRAINEAC/ml/tournamentnewDB/; Rscript -e \"library(caret);library(G2PML);panel=\\\"$i\\\";saveRDS(featureSelection(genes=getGelGenes(panel=panel),k=5,repeats=40),\\\"$i.fs.rds\\\")\"" | qsub -S /bin/bash -N $i -l h_rt=48:0:0 -l tmem=8G,h_vmem=8G -o
#/SAN/neuroscience/WT_BRAINEAC/ml/tournamentnewDB/$i.fs.o -e /SAN/neuroscience/WT_BRAINEAC/ml/tournamentnewDB/$i.fs.e ; done
#for i in `cat panelsApr2018`; do echo
#"cd /SAN/neuroscience/WT_BRAINEAC/ml/tournamentnewDB/; Rscript -e \"library(caret);library(G2PML);panel=\\\"$i\\\";saveRDS(ensembleLearnKFold(panel=\\\"$i\\\",fs=featureSelection(genes=getGelGenes(panel=panel),k=3,repeats=20),nboot=20,auto=T,k=5,maxTrials=5),\\\"$i.Apr2018.rds\\\")\"" | qsub -S /bin/bash -N $i -l h_rt=72:0:0 -l tmem=8G,h_vmem=8G
#-o /SAN/neuroscience/WT_BRAINEAC/ml/tournamentnewDB/$i.apr.o -e /SAN/neuroscience/WT_BRAINEAC/ml/tournamentnewDB/$i.apr.e ; done
#' Which features are most important for your gene list?
#'
#' \code{featureSelection} computes the best features that discriminate between
#' your list of disease genes and control genes. Uses bootstrapping to form
#' balanced sets of disease and non-disease genes then selects the best
#' features based on a random forest algorithm implemented through the
#' \code{\link{caret::rfe}} function.
#'
#' @param genes chr vector. Gene symbols - can be returned from
#' \code{\link{getGenesFromPanelApp}}.
#' @param seed num scalar. Random seed for reproducibility.
#' @param sizes int vector. Sizes to be used in the recursive feature
#' elimination \code{\link{caret::rfe}}.
#' @param k int scalar. Factor by which to split training set for k-fold cross
#' validation.
#' @param trnProp num scalar. Between 0-1 - proportion of disease genes to keep
#' when bootstrapping.
#' @param repeats int scalar. Number of times you want to bootstrap/iterate. For
#' each iteration, \code{featureSelection} will compute rfe on an random
#' proportion (trnProp) of disease genes and a random size-matched set of
#' controls.
#' @param filter Useful for removing ML data features from the analysis. Is a vector of Strings
#'
#'
#' @return list of length repeats. Each element contains an rfe class object
#' fitted for a set of randomly sampled disease and control genes.
#' @export
#'
#' @examples
#' genes <- getGenesFromPanelApp(disorder="Neurology and neurodevelopmental disorders",
#' panel="Parkinson Disease and Complex Parkinsonism", color = "green")
#' featureSelection(genes, controls = "allgenome")
#'
#' @seealso For more details on rfe:
#' \url{http://topepo.github.io/caret/recursive-feature-elimination.html}
featureSelection = function(genes=NULL,
seed=12345,
sizes = c(5,10,20),
k=5,
controls="allgenome",
trnProp=0.8,
repeats=10,
filter=NULL){
allgenes = genes
cat("We'll work with",length(allgenes),"disease genes\n")
mldata = fromGenes2MLData(genes=allgenes,
which.controls=controls,
filter=filter)
mask = !(colnames(mldata) %in% c("gene","condition"))
mldata[,mask] = scale(mldata[,mask])
set.seed(seed)
fsruns = NULL
gcondition = mldata$condition
for(run in 1:repeats){
the_n = floor(length(genes)*trnProp)
lmldata = mldata[c(sample(which(mldata$condition == "Disease"),the_n),
sample(which(mldata$condition == "Nondisease"),the_n)),]
condition = lmldata$condition
lmldata$gene = NULL
lmldata$condition = NULL
nzv = caret::nearZeroVar(lmldata, saveMetrics= TRUE)
lmldata = lmldata[,!nzv$zeroVar]
descrCor = cor(lmldata) # builds correlation matrix
highlyCorDescr = caret::findCorrelation(descrCor, cutoff = .9,
names = T, verbose = F)
lmldata = lmldata[,!(colnames(lmldata) %in% highlyCorDescr)]
fcondition = as.factor(condition)
ctrl = caret::rfeControl(functions=caret::rfFuncs,
method="cv",
number=k,
verbose=T,
saveDetails=T)
lmProfile = caret::rfe(x=lmldata,
y = fcondition,
sizes = sizes,
rfeControl=ctrl)
mask = !unlist(apply(lmldata,2,function(x){ length(unique(x)) == TRUE}))
if(sum(!mask) > 0){
cat("Removing",paste0(colnames(lmldata)[!mask],collapse=", "),
"from data before going for glm\n")
lmldata = lmldata[,mask]
}
lmProfile$optVariables = lmProfile$optVariables[lmProfile$optVariables %in% colnames(lmldata)]
fit = glm(formula=fcondition ~ . ,data=lmldata[,lmProfile$optVariables],family ="binomial")
lmProfile$myFit = fit
fsruns[[run]] = lmProfile
}
return(fsruns)
}
#' Which features are most important for your gene list?
#'
#' \code{getVarsFromFS} converts the output of \code{\link{featureSelection}}
#' function to a list of features that were most important across bootstrapping
#'
#' @param fsdata rfe object. Output of \code{\link{featureSelection}}.
#' @param r num scalar. Between 0-1. Minimum proportion of bootstraps that must
#' have selected that feature as important.
#' @param counts lgl scalar. If TRUE, will return counts each "important"
#' feature appeared amongst bootstrap iterations. If FALSE, the names of the
#' features in the same order.
#'
#' @return Either counts or names of the most important features determined by
#' rfe.
#' @export
#'
#' @example
#' getVarsFromFS(allfsdata, r = 0.6, counts = F)
getVarsFromFS = function(fsdata,r=0.6,counts=F){
availfiles = 0
vars = NULL
k = length(fsdata)
for(i in 1:k){
fsdatal = fsdata[[i]]
vars = c(vars,names(sort(table(fsdatal$variables$var[fsdatal$variables$Variables == fsdatal$bestSubset]),
decreasing=T)))
}
minAppearance = floor(r*k)
fcounts = table(vars)
fcounts = fcounts[fcounts >= minAppearance]
fcounts = sort(fcounts,decreasing=T)
if(counts)
return(fcounts)
return(unique(names(fcounts)))
}
#'Which direction and to what extent do your important features affect the
#'gene's likelihood of being disease causing?
#'
#'\code{getVarsFromFS} extracts from the output of
#'\code{\link{featureSelection}} the most important features based a voting
#'system across bootstraps. Then for the most-voted features obtain the sign and
#'effect sizes of each feature.
#'
#'@inheritParams getVarsFromFS
#'
#'@param allfsdata list of rfe objects. Output of \code{\link{featureSelection}}.
#'@param panel chr scalar. The GE panel name from which input genes were
#' extracted.
#'
#'@return list with all counts, signs, effect sizes and mean effect sizes of all
#' important features across all bootstraps
#'
#'@export
#'
#'@examples
#'getVarsMetaDataFromFS(allfsdata, r = 0.6, panel = "Unknown")
getVarsMetaDataFromFS = function(allfsdata,r=0.6,panel="Unknown"){
allfeatures = NULL
allsigns = NULL
alleffects = NULL
allmeaneffect = NULL
alleffectstab = NULL
allmeaneffectstab = NULL
availfiles = 0
cat("Working now with",panel,"\n")
features = NULL
meaneffect = NULL
folds = length(allfsdata)
selected = getVarsFromFS(allfsdata,r=r)
fcounts = getVarsFromFS(allfsdata,r=r,T)
allfeatures = fcounts
signs = NULL
effects = NULL
models = NULL
folds = length(allfsdata)
for(k in 1:folds){
fsdata = allfsdata[[k]]
if(!is.null(fsdata$ga)){
models[[as.character(k)]] = fsdata$fit
fit = fsdata$fit$finalModel
}else{
models[[as.character(k)]] = fsdata$myFit
fit = fsdata$myFit
}
for(sel in selected){
if(sel %in% names(fit$coefficients)){
mask = names(fit$coefficients) == sel
if(is.null(signs[[sel]])){
signs[[sel]] = NULL
if(!is.na(fit$coefficients[mask]))
signs[[sel]] = as.character(sign(fit$coefficients[mask]))
else
signs[[sel]] = "0"
signs = as.list(signs)
}else{
older = signs[[sel]]
if(!is.na(fit$coefficients[mask]))
signs[[sel]] = paste0(older,":",as.character(sign(fit$coefficients[mask])))
else
signs[[sel]] = paste0(older,":0")
}
mask = names(fit$effects) == sel
if(sum(mask)){
if(is.null(effects[[sel]])){
effects[[sel]] = NULL
if(!is.na(fit$effects[mask])){
effects[[sel]] = as.character(signif(fit$effects[mask],3))
meaneffect[[sel]] = c(meaneffect[[sel]],fit$effects[mask])
}
else
effects[[sel]] = "0"
effects = as.list(effects)
meaneffect = as.list(meaneffect)
}else{
older = effects[[sel]]
if(!is.na(fit$effects[mask])){
effects[[sel]] = paste0(older,":",as.character(signif(fit$effects[mask],3)))
meaneffect[[sel]] = c(meaneffect[[sel]],fit$effects[mask])
}
else
effects[[sel]] = paste0(older,":0")
}
}
}
}
}
allsigns = unlist(signs)
alleffects = unlist(effects)
mask = order(abs(unlist(lapply(meaneffect,mean))),decreasing=T)
allmeaneffect = unlist(lapply(meaneffect,mean))[mask]
localdata = as.matrix(unlist(allmeaneffect))
localdata = cbind(rownames(localdata),localdata)
allmeaneffectstab = rbind(allmeaneffectstab,
cbind(rep(panel,
nrow(localdata)),
localdata,
rep(folds,nrow(localdata))))
#print(allmeaneffectstab)
localdata = as.matrix(unlist(alleffects))
localdata = cbind(rownames(localdata),localdata)
localdata = cbind(rep(panel,nrow(localdata)),
localdata)
alleffectstab = rbind(alleffectstab,localdata)
#print(alleffectstab)
colnames(alleffectstab) = c("panel","predictor","foldeffects")
colnames(allmeaneffectstab) = c("panel","feature","meaneffect","coverage")
return(list(features=allfeatures,signs=allsigns,effects=alleffects,meaneffects=allmeaneffectstab))
}
#'Which direction and to what extent do your important features affect the
#'gene's likelihood of being disease causing?
#'
#'\code{getVarsFromFS} extracts from the output of
#'\code{\link{featureSelection}} the most important features based a voting
#'system across bootstraps. Then for the most-voted features obtain the sign and
#'effect sizes of each feature.
#'
#'@inheritParams getVarsFromFS
#'
#'@param allfsdata list of rfe objects. Output of \code{\link{featureSelection}}.
#'@param panel chr scalar. The GE panel name from which input genes were
#' extracted.
#'
#'@return list with all counts, signs, effect sizes and mean effect sizes of all
#' important features across all bootstraps
#'
#'@export
#'
#'@examples
#'getVarsMetaDataFromFS(allfsdata, r = 0.6, panel = "Unknown")
getVarsMetaDataFromCaretFS = function(allfsdata,r=0.6,panel="Unknown"){
allfeatures = NULL
allsigns = NULL
alleffects = NULL
allmeaneffect = NULL
alleffectstab = NULL
allmeaneffectstab = NULL
availfiles = 0
cat("Working now with",panel,"\n")
features = NULL
meaneffect = NULL
folds = length(allfsdata)
selected = getVarsFromFS(allfsdata,r=r)
fcounts = getVarsFromFS(allfsdata,r=r,T)
allfeatures = fcounts
signs = NULL
effects = NULL
models = NULL
folds = length(allfsdata)
for(k in 1:folds){
fsdata = allfsdata[[k]]
for(sel in selected){
if(is.null(meaneffect))
meaneffect[[sel]] = NULL
if(is.null(effects))
effects[[sel]] = NULL
if(is.null(signs))
signs[[sel]] = NULL
mvalue = mean(fsdata$variables$Disease[fsdata$variables$var == sel])
if(!is.na(mvalue)){
if(!(sel %in% names(meaneffect)))
meaneffect[[sel]] = mvalue
else
meaneffect[[sel]] = paste0(meaneffect[[sel]],":",mvalue)
if(!(sel %in% names(signs)))
signs[[sel]] = sign(mvalue)
else
signs[[sel]] = paste0(signs[[sel]],":",sign(mvalue))
}
}
}
allsigns = unlist(signs)
allmeaneffect = unlist(meaneffect)
localdata = as.matrix(unlist(allmeaneffect))
localdata = cbind(rownames(localdata),localdata)
allmeaneffectstab = rbind(allmeaneffectstab,
cbind(rep(panel,
nrow(localdata)),
localdata,
unlist(lapply(localdata[,2],function(x){ mean(as.numeric(unlist(strsplit(x,":"))))})),
rep(folds,nrow(localdata))))
colnames(allmeaneffectstab) = c("panel","feature","allmeaneffects","meaneffect","coverage")
return(list(features=allfeatures,signs=allsigns,meaneffects=allmeaneffectstab))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.