R/filtering_functions.R

Defines functions filter_correlations declare_filtering_presets filter_sample_by_class_to_ignore ExpObjVetting filter_experiment

Documented in declare_filtering_presets ExpObjVetting filter_correlations filter_experiment filter_sample_by_class_to_ignore

#' filter_experiment(ExpObj = NULL, featmaxatleastPPM = 0, featcutoff = c(0, 0), discard_SDoverMean_below = NULL, samplesToKeep = NULL, featuresToKeep = NULL, normalization = "relabund", asPPM = TRUE, PPM_normalize_to_bases_sequenced = FALSE, flush_out_empty_samples = FALSE, GenomeCompletenessCutoff = NULL, PctFromCtgscutoff = NULL)
#'
#' Filters a SummarizedExperiment object by several criteria.
#' @export

filter_experiment <- function(ExpObj = NULL, featmaxatleastPPM = 0, featcutoff = c(0, 0), discard_SDoverMean_below = NULL, samplesToKeep = NULL, featuresToKeep = NULL, normalization = "relabund", asPPM = TRUE, PPM_normalize_to_bases_sequenced = FALSE, flush_out_empty_samples = FALSE, GenomeCompletenessCutoff = NULL, PctFromCtgscutoff = NULL){

    #Get only samples you asked for
    if (!(is.null(samplesToKeep))){
        samplesToKeep <- samplesToKeep[samplesToKeep %in% colnames(ExpObj)]
        ExpObj <- ExpObj[, samplesToKeep]
        #Also fix total counts vector in ExpObj metadata
        for (mdTU in c("TotalBasesSequenced", "TotalBasesSequencedinAnalysis")){
            MTU <- t(as.matrix(metadata(ExpObj)[[mdTU]]["NumBases", samplesToKeep]))
            rownames(MTU) <- "NumBases"
            colnames(MTU) <- samplesToKeep
            metadata(ExpObj)[[mdTU]] <- MTU
        }
    }

    #Get appropriate object with which to work
    if (as.character(class(ExpObj)[1]) == "SummarizedExperiment"){
        rawcts <- as.matrix(assays(ExpObj)$BaseCounts)
        if (PPM_normalize_to_bases_sequenced) {
            totbases <- metadata(ExpObj)$TotalBasesSequenced
        } else {
            totbases <- metadata(ExpObj)$TotalBasesSequencedinAnalysis
            #totbases <- t(data.frame(NumBases = colSums(rawcts)))
        }
    } else {
        stop("Object must be a SummarizedExperiment object.")
    }

    #If setting featmaxatleastPPM or featcutoff to anything other than the defaults, then return as PPM.
    if (any(featcutoff != c(0, 0))){
        asPPM <- TRUE
    }

    #Flush out empty rows
    #Don't do this anymore
    #ExpObj <- ExpObj[(rowSums(rawcts) > 0), ]

    #Flush out empty Samples
    if (flush_out_empty_samples){
        emptysamples <- names(which(colSums(rawcts) == 0) == TRUE)
        if (length(emptysamples) > 0){
            flog.info(paste("Samples", paste0(emptysamples, collapse = ", "), "are empty and will be discarded."))
            validsamples <- names(which(colSums(rawcts) > 0) == TRUE)
            ExpObj <- ExpObj[ , validsamples]
        }
    }

    countmat <- as.matrix(assays(ExpObj)$BaseCounts)
    countmat <- countmat[!is.na(row.names(countmat)),]

    if (normalization == "compositions"){
        asPPM <- FALSE
    }

    #Transform to PPM if applicable
    if (asPPM == TRUE){
        #transform into PPM
        getPPM <- function(Sample = NULL){
            PPMs <- (countmat[ , Sample] / totbases["NumBases", Sample]) * 1000000
            return(PPMs)
        }

        countmat2 <- sapply(1:ncol(countmat), function(x){ getPPM(Sample = colnames(countmat)[x])} )

        colnames(countmat2) <- colnames(countmat)
        #Round to integer
        for (c in 1:ncol(countmat2)){
            countmat2[, c] <- round(countmat2[, c], 0)
        }
        countmatrix <- countmat2

        #Regenerate an experiment object
        assays(ExpObj)$BaseCounts <- countmatrix

    } else {

        if (normalization == "compositions"){
            countmat2 <- sapply(1:ncol(countmat), function(x){ as.numeric(compositions::clr(countmat[,x])) })
            colnames(countmat2) <- colnames(countmat)
            rownames(countmat2) <- rownames(countmat)
            countmatrix <- countmat2
        } else {
            countmatrix <- countmat
        }
    }

    #Discard features which do not match certain criteria
    if (!(missing(featcutoff))){
        if (length(featcutoff) != 2){
            stop("Please specify the minimum PPM in what percentage of the samples you want with a numerical vector of size two. For example, featcutoff=c(2000,10) would discard features which are not at least 2000 PPM in at least 10% of samples.")
        }
        thresholdPPM <- featcutoff[1]
        sampcutoffpct <- min(featcutoff[2], 100)

        prop_above_threshold <- function(countmatrix = NULL, feat = NULL, thresholdPPM = NULL) {
            proportionPassing <- length(which(countmatrix[feat, ] >= thresholdPPM)) / ncol(countmatrix)
            return(proportionPassing)
        }

        featuresToKeep2 <- rownames(countmatrix)[which((sapply(1:nrow(countmatrix), function(x) { prop_above_threshold(countmatrix = countmatrix, feat = x, thresholdPPM = thresholdPPM) })) > (sampcutoffpct/100))]

        ExpObj <- ExpObj[featuresToKeep2, ]
        countmatrix <- countmatrix[featuresToKeep2, ]

        cutoffmsg <- paste("Feature must be >", thresholdPPM, "PPM in at least", sampcutoffpct, "% of samples", sep=" ")
        flog.info(cutoffmsg)
    }

    if (featmaxatleastPPM > 0){
        featuresToKeep2 <- rownames(countmatrix)[which(rowMax(countmatrix) >= featmaxatleastPPM)]
        ExpObj <- ExpObj[featuresToKeep2, ]
        countmatrix <- countmatrix[featuresToKeep2, ]
        minPPMmsg <- paste("Highest feature must be >", featmaxatleastPPM, "PPM", sep=" ")
        flog.info(minPPMmsg)
    }

    if (!is.null(discard_SDoverMean_below)){
        dfm <- (rowSds(countmatrix) / rowMeans(countmatrix))
        featuresToKeepSDflt <- names(dfm[dfm > discard_SDoverMean_below])
        ExpObj <- ExpObj[featuresToKeepSDflt, ]
        countmatrix <- countmatrix[featuresToKeepSDflt, ]
        SDoverMeanmsg <- paste("Feature must have >", discard_SDoverMean_below, "SDs over mean", sep=" ")
        flog.info(SDoverMeanmsg)
    }

    if (all(c(("PctFromCtgs" %in% names(assays(ExpObj))), (!is.null(PctFromCtgscutoff))))){
        if(length(PctFromCtgscutoff) != 2){
            stop("Please specify the minimum percentage of information coming from contigs in what percentage of the samples you want with a numerical vector of size two. For example, PctFromCtgscutoff = c(90, 10) would discard features whose taxonomic information does not come at least 90% from contigs in at least 10% of samples.")
        }
        minPctFromCtgs <- PctFromCtgscutoff[1]
        sampcutoffpct <- min(PctFromCtgscutoff[2], 100)
        PctFromCtgsmatrix <- assays(ExpObj)[["PctFromCtgs"]]

        prop_above_minCtg <- function(x) {
            validsamples <- names(which(countmatrix[x, ] > 0))
            proportionPassing <- length(which(PctFromCtgsmatrix[x, validsamples] >= minPctFromCtgs)) / length(validsamples)
            return(proportionPassing)
        }

        featuresToKeep3 <- rownames(PctFromCtgsmatrix)[which((sapply(1:nrow(PctFromCtgsmatrix), function(x) { prop_above_minCtg(x) })) > (sampcutoffpct / 100))]

        ExpObj <- ExpObj[featuresToKeep3, ]
        countmatrix <- countmatrix[featuresToKeep3, ]
    }

    if (all(c(("GenomeCompleteness" %in% names(assays(ExpObj))), (!is.null(GenomeCompletenessCutoff))))){
        if(length(GenomeCompletenessCutoff) != 2){
            stop("Please specify the minimum Genome Completeness in what percentage of the samples you want with a numerical vector of size two. For example, GenomeCompletenessCutoff = c(10, 5) would discard features whose genome completeness is smaller than 10% from contigs in at least 5% of samples.")
        }
        minGenomeCompleteness <- (GenomeCompletenessCutoff[1] / 100)
        sampcutoffpct <- min(GenomeCompletenessCutoff[2], 100)
        GenomeCompletenessmatrix <- assays(ExpObj)[["GenomeCompleteness"]]

        prop_above_minGComp <- function(x) {
            validsamples <- names(which(countmatrix[x, ] > 0))
            proportionPassing <- length(which(GenomeCompletenessmatrix[x, validsamples] >= minGenomeCompleteness)) / length(validsamples)
            return(proportionPassing)
        }

        featuresToKeep4 <- rownames(GenomeCompletenessmatrix)[which((sapply(1:nrow(GenomeCompletenessmatrix), function(x) { prop_above_minGComp(x) })) > (sampcutoffpct / 100))]

        ExpObj <- ExpObj[featuresToKeep4, ]
        countmatrix <- countmatrix[featuresToKeep4, ]
    }

    #Get only features you asked for
    if (!(is.null(featuresToKeep))){
        featurespresent <- rownames(countmatrix)
        wantedfeatures <- featuresToKeep[featuresToKeep %in% featurespresent]
        #Check that you still have the features you're asking for
        if (length(wantedfeatures) > 0){
            ExpObj <- ExpObj[wantedfeatures, ]
        } else {
            flog.warn("There are no features you requested surviving the criteria.")
        }
    }

    #filter out unwanted feature/sample stratification if SummarizedExperiment object contains that kind of information
    if (all(c("allfeaturesbytaxa_index", "allfeaturesbytaxa_matrix") %in% names(metadata(ExpObj)))){
        #Obtain current index and matrix
        allfeaturesbytaxa_index <- metadata(ExpObj)$allfeaturesbytaxa_index
        allfeaturesbytaxa_matrix <- metadata(ExpObj)$allfeaturesbytaxa_matrix

        #Eliminate from index irrelevant samples
        allfeaturesbytaxa_index <- subset(allfeaturesbytaxa_index, Sample %in% colnames(ExpObj))

        #Eliminate from index irrelevant features
        allfeaturesbytaxa_index <- subset(allfeaturesbytaxa_index, Accession %in% rownames(ExpObj))

        #Subset sparse matrix to contain only relevant rows
        allfeaturesbytaxa_matrix <- allfeaturesbytaxa_matrix[allfeaturesbytaxa_index$RowNumber, ]

        #Rename rows with consecutive numbers
        allfeaturesbytaxa_index$RowNumber <- 1:nrow(allfeaturesbytaxa_index)
        rownames(allfeaturesbytaxa_index) <- allfeaturesbytaxa_index$RowNumber
        rownames(allfeaturesbytaxa_matrix) <- allfeaturesbytaxa_index$RowNumber

        #Eliminate irrelevant columns from sparse matrix
        matcSums <- Matrix::colSums(allfeaturesbytaxa_matrix)
        matcSums <- matcSums[matcSums > 0]
        taxaToKeep <- names(matcSums)
        allfeaturesbytaxa_matrix <- allfeaturesbytaxa_matrix[ , taxaToKeep]

        #Stick back into ExpObj
        metadata(ExpObj)$allfeaturesbytaxa_index <- allfeaturesbytaxa_index
        metadata(ExpObj)$allfeaturesbytaxa_matrix <- allfeaturesbytaxa_matrix
    }

    return(ExpObj)
}


#' ExpObjVetting(ExpObj = NULL)
#'
#' Performs vetting of a SummarizedExperiment object for use in several functions
#' @export

ExpObjVetting <- function(ExpObj = NULL, samplesToKeep = NULL, featuresToKeep = NULL, glomby = NULL, variables_to_fix = NULL, class_to_ignore = NULL){

        #Get appropriate object to work with
        if (as.character(class(ExpObj)[1]) != "SummarizedExperiment"){
            stop("This function can only take a SummarizedExperiment object as input.")
        }
        obj <- ExpObj

        if (!(is.null(glomby))){
            obj <- agglomerate_features(ExpObj = obj, glomby = glomby)
        }

        #Exclude samples and features if specified
        if (!(is.null(samplesToKeep))){
            samplesToKeep <- samplesToKeep[samplesToKeep %in% colnames(obj)]
            obj <- obj[, samplesToKeep]
        }

        if (!(is.null(featuresToKeep))){
            featuresToKeep <- featuresToKeep[featuresToKeep %in% rownames(obj)]
            obj <- obj[featuresToKeep, ]
        }

        obj <- suppressWarnings(filter_sample_by_class_to_ignore(SEobj = obj, variables = variables_to_fix, class_to_ignore = class_to_ignore))

    return(obj)
}


#' filter_sample_by_class_to_ignore(SEobj = NULL, variables = NULL, class_to_ignore = NULL)
#'
#' Filters a SummarizedExperiment object by several criteria.
#' @export

filter_sample_by_class_to_ignore <- function(SEobj = NULL, variables = NULL, class_to_ignore = NULL){

    ptb <- as.data.frame(colData(SEobj))
    allmetadata <- metadata(SEobj)
    Samples <- rownames(ptb)

    if (!is.null(class_to_ignore)){
        #Start off with all of them
        valid_samples <- rownames(ptb)
        variables <- colnames(ptb)[colnames(ptb) %in% variables]
        #Keep omitting samples which do not fit the criteria, as long as variables are contained in current metadata
        if (all(c((!is.null(variables)), (length(variables) > 0)))){
            for (v in 1:length(variables)){
                valid_samples <-  valid_samples[valid_samples %in% (rownames(ptb)[!(ptb[ , variables[v]] %in% class_to_ignore)])]
            }
        }
        if (length(valid_samples) < 1){
            flog.warn("There are no samples matching the criteria. Returning original object with all samples")
        } else {
            omitted_samples <- rownames(ptb)[!(rownames(ptb) %in% valid_samples)]
            flog.info(paste("A total of", length(omitted_samples), "samples were omitted for containing", paste0(class_to_ignore, collapse = ", "), "within metadata variables", paste0(variables, collapse = ", ")))
            SEobj <- SEobj[ , valid_samples]
            ptb <- as.data.frame(colData(SEobj))
        }
    }

    #Fix categories of metadata within Experiment object
    for (colm in 1:ncol(ptb)){
        numtest <- length(which(is.na(as.numeric(ptb[, colm])) == TRUE))
        if (numtest == 0){
            ptb[, colm] <- as.numeric(ptb[, colm])
        }
    }

    ExpObj <- SummarizedExperiment(assays = assays(SEobj), rowData = rowData(SEobj), colData = ptb)
    metadata(ExpObj) <- allmetadata

    return(ExpObj)
}


#' declare_filtering_presets(analysis = NULL, applyfilters = NULL, featcutoff = NULL, GenomeCompletenessCutoff = NULL, PctFromCtgscutoff = NULL, maxl2fc = NULL, minl2fc = NULL)
#'
#' Performs vetting of a SummarizedExperiment object for use in several functions
#' @export
declare_filtering_presets <- function(analysis = NULL, applyfilters = NULL, featcutoff = NULL, GenomeCompletenessCutoff = NULL, PctFromCtgscutoff = NULL, maxl2fc = NULL, minl2fc = NULL, minabscorrcoeff = NULL){

    if ((analysis != "LKT") && (!(is.null(GenomeCompletenessCutoff)))){
        warning("Genome completeness only makes sense for taxa. Please choose a taxonomic (non functional) analysis.")
        GenomeCompletenessCutoff <- NULL
    }

    presetlist <- list()

    if (!is.null(applyfilters)){
        if (applyfilters == "stringent"){
            if (analysis == "LKT"){
                presetlist$featcutoff <- c(2000, 15)
                presetlist$GenomeCompletenessCutoff <- c(30, 10)
                presetlist$PctFromCtgscutoff <- c(70, 50)
                presetlist$minl2fc <- 2
            } else {
                presetlist$featcutoff <- c(50, 15)
                presetlist$minl2fc <- 2.5
            }
            presetlist$minabscorrcoeff <- 0.8
        } else if (applyfilters == "moderate"){
            if (analysis == "LKT"){
                presetlist$featcutoff <- c(250, 15)
                presetlist$GenomeCompletenessCutoff <- c(10, 5)
                presetlist$PctFromCtgscutoff <- c(70, 50)
                presetlist$minl2fc <- 1
            } else {
                presetlist$featcutoff <- c(5, 5)
                presetlist$minl2fc <- 1
            }
            presetlist$minabscorrcoeff <- 0.6
        } else if (applyfilters == "light"){
            if (analysis == "LKT"){
                presetlist$featcutoff <- c(50, 5)
                presetlist$GenomeCompletenessCutoff <- c(5, 5)
                presetlist$PctFromCtgscutoff <- c(70, 50)
                presetlist$minl2fc <- 1
            } else {
                presetlist$featcutoff <- c(0, 0)
                presetlist$minl2fc <- 1
            }
            presetlist$minabscorrcoeff <- 0.4
        }
    }

    #Replace with any values explicitly set by the user
    argstoset <- c("featcutoff", "GenomeCompletenessCutoff", "PctFromCtgscutoff", "maxl2fc", "minl2fc", "minabscorrcoeff")[!unlist(lapply(list(featcutoff, GenomeCompletenessCutoff, PctFromCtgscutoff, maxl2fc, minl2fc, minabscorrcoeff), is.null))]

    if (length(argstoset) > 0){
        for (ats in argstoset){
            presetlist[[ats]] <- get(ats)
        }
    }

    #Generate a filtration message
    presetlist$filtermsg <- NULL
    #Discard features which do not match certain criteria
    if (!(is.null(presetlist$featcutoff))){
        presetlist$thresholdPPM <- presetlist$featcutoff[1]
        presetlist$sampcutoffpct <- min(presetlist$featcutoff[2], 100)
        presetlist$filtermsg <- paste("Feature must be >", presetlist$thresholdPPM, "PPM in at least ", presetlist$sampcutoffpct, "% of samples", sep = "")
    } else {
        presetlist$filtermsg <- NULL
        presetlist$featcutoff <- c(0, 0)
    }

    if (!(is.null(presetlist$PctFromCtgscutoff))){
        presetlist$thresholdPctFromCtgs <- presetlist$PctFromCtgscutoff[1]
        presetlist$sampcutoffpctPctFromCtgs <- min(presetlist$PctFromCtgscutoff[2], 100)
        presetlist$filtermsg <- paste(presetlist$filtermsg, (paste("Taxonomy information must come from >", presetlist$thresholdPctFromCtgs, "% contigs in at least ", presetlist$sampcutoffpctPctFromCtgs, "% of samples", sep = "")), sep = "\n")
    }

    if (!(is.null(presetlist$GenomeCompletenessCutoff))){
        presetlist$thresholdGenomeCompleteness <- presetlist$GenomeCompletenessCutoff[1]
        presetlist$sampcutoffpctGenomeCompleteness <- min(presetlist$GenomeCompletenessCutoff[2], 100)
        presetlist$filtermsg <- paste(presetlist$filtermsg, (paste("Taxon genome completeness must be >", presetlist$thresholdGenomeCompleteness, "% in at least ", presetlist$sampcutoffpctGenomeCompleteness, "% of samples", sep = "")), sep = "\n")
    }

    return(presetlist)
}

#' filter_correlations(corrmat = NULL, mincorrelcoeff = NULL)
#' Given a pairwise correlation matrix, eliminate features which do not present an absolute correlation coefficient smaller than mincorrelcoeff with all other features other than itself.
#'
#' @export
filter_correlations <- function(corrmat = NULL, mincorrelcoeff = NULL){

     if(nrow(corrmat) != ncol(corrmat)){
         stop("Correlation matrix must have equal numbers of rows and columns.")
     }

     featsIwant <- NULL

     for (rw in 1:nrow(corrmat)){
         featint <- rownames(corrmat)[rw]
         #print(paste("Checking:", featint))
         correlations <- corrmat[which(rownames(corrmat) != featint), featint]

         if(max(abs(correlations)) >= mincorrelcoeff){
             feat <- featint
         } else {
             feat <- NULL
         }

         featsIwant <- append(featsIwant, feat)

     }

     corrmat <- corrmat[featsIwant, featsIwant]

     return(corrmat)
 }
johnmcculloch/JAMS_BW documentation built on March 29, 2024, 7:56 p.m.