R/mostVariable.R

Defines functions mostVariable mostVariableCT

#' Select the most variable probeset per gene using cell type averages
#'
#' Samples belonging to the same cell type are averaged into a single column
#' and probeset variance is computed across those cell type means. For every
#' gene symbol only the probeset with the highest variance is kept. Probesets
#' whose gene symbol contains a `|` (multiple matching genes) are left out of
#' the selection and appended back unchanged at the end. Specific to the
#' project's cell type data.
#'
#' @param whichFile Either a character path passed to `ogbox::read.exp`, or an
#'   already loaded table that has a `Gene.Symbol` column.
#' @param outFile Optional path. When not `NULL` the result is written there
#'   with `write.csv` (no row names).
#' @param cellTypeColumn Name of the column of `design` that holds the cell
#'   type of each sample.
#' @param design Either a character path passed to `ogbox::read.design`, or an
#'   already loaded design table with a `sampleName` column.
#' @param threshold Probesets whose `threshFun` summary over the cell type
#'   means is below this value are removed.
#' @param threshFun Function summarising one row of cell type means into the
#'   value compared against `threshold`.
#' @return Invisibly, the rows of the input table that survive the filtering:
#'   selected single-gene probesets in decreasing variance order, followed by
#'   the multi-gene probesets. Rows with an empty or `NA` gene symbol are
#'   removed.
#' @export
mostVariableCT = function(whichFile,outFile=NULL,cellTypeColumn, design,threshold = 6,threshFun =max){
    if (is.character(whichFile)){
        allDataPre = ogbox::read.exp(whichFile)
    } else{
        allDataPre = whichFile
    }

    if (is.character(design)){
        design = ogbox::read.design(design)
    }

    # sepExpr is defined elsewhere; its second element is used as the
    # expression part of the table
    list[,exprData]= sepExpr(allDataPre)

    # expression columns and design rows must name exactly the same samples
    assertthat::assert_that(all(make.names(colnames(exprData)) %in% make.names(design$sampleName)))
    assertthat::assert_that(all(make.names(design$sampleName) %in% make.names(colnames(exprData))))

    # reorder expression columns so that they follow the design rows
    exprData = exprData[match(make.names(design$sampleName),make.names(colnames(exprData)))]

    cellTypes = trimNAs(unique(design[,cellTypeColumn]))

    # one vector of per-probeset means for every cell type
    cellTypeExpr = lapply(cellTypes,function(x){
        apply(exprData[,design[,cellTypeColumn] %in% x,drop=FALSE],1,mean)
    })
    exprData = as.data.frame(cellTypeExpr)

    # remove the probesets whose summary expression is below the threshold.
    # drop=FALSE keeps exprData a data.frame when there is a single cell type
    rowmax = apply(exprData, 1, threshFun)
    discludeGenes = (rowmax<threshold)
    allDataPre = allDataPre[!discludeGenes,]
    exprData = exprData[!discludeGenes,,drop=FALSE]

    # ignore multiple matching probesets during the most variable selection
    isMulti = grepl('[|]',allDataPre$Gene.Symbol)
    allDataMulti = allDataPre[isMulti,]
    exprData = exprData[!isMulti,,drop=FALSE]
    allDataPre = allDataPre[!isMulti,]

    # sort by decreasing variance so that duplicated() keeps the most
    # variable probeset of every gene
    decreasingVar = order(apply(exprData,1,var), decreasing = TRUE)
    allDataPre = allDataPre[decreasingVar,]
    allDataPre = allDataPre[!duplicated(allDataPre$Gene.Symbol),]

    # add the multiple matching probesets back
    allDataPre = rbind(allDataPre,allDataMulti)

    # drop probesets without a gene symbol. NA is tested explicitly because an
    # NA in a logical row index would insert an all-NA row into a data.frame
    hasSymbol = !is.na(allDataPre$Gene.Symbol) & allDataPre$Gene.Symbol != ''
    allDataPre = allDataPre[hasSymbol,]
    if(!is.null(outFile)){
        write.csv(allDataPre, file = outFile, row.names=FALSE)
    }
    invisible(allDataPre)
}

#' Select the most variable probeset of every gene
#'
#' Generic counterpart of `mostVariableCT`: it takes an object and returns an
#' object. Probesets whose `threshFun` summary is below `threshold` are
#' removed, the remaining ones are sorted by decreasing variance of their
#' expression values and only the first (most variable) probeset of every
#' gene is kept.
#'
#' @param allDataPre Table holding gene annotation and expression columns. It
#'   is split with `sepExpr` (defined elsewhere); the second element of that
#'   result is used as the expression part.
#' @param genes Name of the single column that identifies the gene.
#' @param threshold Probesets whose `threshFun` summary is below this value
#'   are removed.
#' @param threshFun Function summarising one row of expression values into the
#'   value compared against `threshold`.
#' @return The filtered rows of `allDataPre` in decreasing variance order, one
#'   row per gene. Rows with an empty or `NA` gene identifier are removed.
#' @export
mostVariable = function(allDataPre,genes = 'Gene.Symbol', threshold = 6, threshFun = max){
    list[,exprData]= sepExpr(allDataPre)

    # remove the probesets whose summary expression is below the threshold
    rowmax = apply(exprData, 1, threshFun)
    discludeGenes = (rowmax<threshold)
    allDataPre = allDataPre[!discludeGenes,]
    exprData = exprData[!discludeGenes,]

    # sort by decreasing variance so that duplicated() keeps the most
    # variable probeset of every gene
    decreasingVar = order(apply(exprData,1,var), decreasing = TRUE)
    allDataPre = allDataPre[decreasingVar,]

    # [[ ]] extracts the column as a vector for data.frame and data.table
    # alike, so no class specific branch is needed
    geneIDs = allDataPre[[genes]]

    # keep the first occurrence of every gene and drop rows without a gene.
    # NA is tested explicitly because an NA in a logical row index would
    # insert an all-NA row into a data.frame
    keepRows = !duplicated(geneIDs) & !is.na(geneIDs) & geneIDs != ''
    allDataPre = allDataPre[keepRows,]
    return(allDataPre)
}
oganm/brainGenesManuscript documentation built on Aug. 24, 2022, 7:48 p.m.