R/filter_by_quantile.R

Defines functions add_percent_zero_q1_q4 filter_genes_zero_expression_all_samples filter_genes_zero_expression filter_exp_by_quant_mean_FC calculate_mean_q4_minus_mean_q1 calculate_IQR filter_dnam_by_quant_diff

Documented in filter_dnam_by_quant_diff filter_exp_by_quant_mean_FC filter_genes_zero_expression

#' @title Select regions with variations in DNA methylation
#' levels above a threshold
#' @description
#' For each region, computes the interquartile range (IQR) of the DNA methylation (DNAm) 
#' levels and requires the IQR to be above a threshold
#' @param dnam DNA methylation matrix or SummarizedExperiment object
#' @param min.IQR.threshold
#' Threshold for minimal interquantile range (difference between the 75th and 25th percentiles)
#' of the DNAm
#' @param cores
#' Number of CPU cores to be used in the analysis. Default: 1
#' @export
#' @examples
#' data("dna.met.chr21")
#' dna.met.chr21.filtered <- filter_dnam_by_quant_diff(
#'   dna.met.chr21
#' )
#' @return
#' A subset of the original matrix only with the
#' rows passing the filter threshold.
#' @importFrom SummarizedExperiment assay<-
filter_dnam_by_quant_diff <- function(
    dnam,
    min.IQR.threshold = 0.2,
    cores = 1
){
    
    is.se <- is(dnam,"SummarizedExperiment")
    if (is.se) {
        matrix <- assay(dnam)
    } else {
        matrix <- dnam
    }
    
    # remove all NA rows
    keep.rows <- which(rowSums(is.na(matrix)) != ncol(matrix))
    if(length(keep.rows) < nrow(matrix)){
        message("Removing rows with NAs for all samples")
        matrix <- matrix[keep.rows,]
    }
    
    IQR <- calculate_IQR(matrix)
    tab <- plyr::count(IQR$IQR > min.IQR.threshold)
    colnames(tab)[1] <- "Status"
    tab$Status[which(tab$Status == FALSE)] <- "Regions below threshold"
    tab$Status[which(tab$Status == TRUE)] <- "Regions above threshold"
    print(tab)
    
    diff.regions <- c(
        IQR %>%
            filter(IQR > min.IQR.threshold) %>%
            pull(.data$ID) %>%
            as.character()
    )
    matrix <- matrix[diff.regions,,drop = FALSE]
    
    if (is.se) {
        dnam <- dnam[rownames(matrix),]
        assay(dnam) <- matrix
    } else {
        dnam <- matrix
    }
    dnam
}

#' @examples
#' library(dplyr)
#' 10 %>% rnorm %>%
#'   matrix(nrow = 1,dimnames = list(c("row1"), LETTERS[1:10])) %>%
#'   calculate_IQR
#' @noRd
calculate_IQR <- function(matrix){
    check_package("matrixStats")
    tibble::tibble(
        "ID" = rownames(matrix),
        "IQR" = matrixStats::rowIQRs(matrix, na.rm = TRUE)
    )
}

#' @examples
#' library(dplyr)
#' 10 %>% rnorm %>%
#'   matrix(nrow = 1,dimnames = list(c("row1"), LETTERS[1:10])) %>%
#'   calculate_mean_q4_minus_mean_q1
#' @noRd
calculate_mean_q4_minus_mean_q1 <- function(
    matrix, 
    cores = 1,
    dnam.group.threshold = 0.25
){
    parallel <- register_cores(cores)
    
    plyr::adply(.data = matrix,.margins = 1,.fun = function(row){
        qs <- quantile(row, probs =  c(dnam.group.threshold,1 - dnam.group.threshold), na.rm = TRUE)
        qs.mean <- tapply(row, findInterval(row, qs), mean, rm.na = TRUE)
        tibble::tibble("diff.mean" = max(qs.mean) - min(qs.mean))
    },.parallel = parallel, .progress = "time",.id = "ID")
}


#' @title Select genes with variations above a threshold
#' @description For each gene, compares the mean gene expression
#' levels in samples in high expression (Q4)
#' vs. samples with low gene expression (Q1),
#' and requires the fold change to be above a certain threshold.
#' @param exp Gene expression matrix or SumarizedExperiment object
#' @param fold.change
#' Threshold for fold change of mean gene
#' expression levels in samples with high
#' (Q4) and low (Q1) gene expression levels. Defaults to 1.5.
#' @param cores
#' Number of CPU cores to be used in the analysis. Default: 1
#' @export
#' @examples
#' data("gene.exp.chr21.log2")
#' gene.exp.chr21.log2.filtered <- filter_exp_by_quant_mean_FC(
#'   gene.exp.chr21.log2
#' )
#' @return
#' A subset of the original matrix only with the rows passing
#' the filter threshold.
filter_exp_by_quant_mean_FC <- function(
    exp,
    fold.change = 1.5,
    cores = 1
){
    
    if(is(exp,"SummarizedExperiment")){
        matrix <- assay(exp)
    } else {
        matrix <- exp
    }
    
    
    parallel <- register_cores(cores)
    diff.genes <- plyr::adply(matrix,.margins = 1,.fun = function(row){
        quant <-  quantile(row, na.rm = TRUE)
        quant.fold.change <- data.frame("q4_div_q1" = quant[4] / quant[2])
        
        low.cutoff <- quant[2]
        upper.cutoff <- quant[4]
        
        mean.q1 <- row[row <= low.cutoff] %>% mean(na.rm = TRUE)
        mean.q4 <- row[row >= upper.cutoff] %>% mean(na.rm = TRUE)
        data.frame(
            "diff_fold_change" = mean.q4 / mean.q1,
            stringsAsFactors = FALSE
        )
    }, .progress = "time",.parallel = parallel)
    
    
    tab <- plyr::count(diff.genes$diff_fold_change > fold.change)
    colnames(tab)[1] <- "Status"
    tab$Status[which(tab$Status == FALSE)] <- "Genes below threshold"
    tab$Status[which(tab$Status == TRUE)] <- "Genes above threshold"
    print(tab)
    
    diff.genes <- c(
        diff.genes %>%
            filter(.data$diff_fold_change > fold.change) %>%
            pull(.data$X1) %>%
            as.character()
    )
    exp[diff.genes,,drop = FALSE]
}


#' @title Remove genes with gene expression level equal to 0 in a
#' substantial percentage of the samples
#' @param exp Gene expression matrix or SumarizedExperiment object
#' @param max.samples.percentage Max percentage of samples with gene
#' expression as 0, for genes to be selected.
#' If max.samples.percentage 100, remove genes with 0 for 100\% samples.
#' If max.samples.percentage 25, remove genes with 0 for more
#' than 25\% of the samples.
#'
#' @export
#' @return A subset of the original matrix only with the rows
#' passing the filter threshold.
filter_genes_zero_expression <- function(
    exp,
    max.samples.percentage = 0.25
){
    
    if (is(exp,"SummarizedExperiment")) {
        matrix <- assay(exp)
    } else {
        matrix <- exp
    }
    
    na.or.zeros <- matrix == 0 | is.na(matrix)
    percent.na.or.zeros <- rowSums(na.or.zeros) / ncol(matrix)
    
    genes.keep <- (percent.na.or.zeros < max.samples.percentage) %>% which %>% names
    message("Removing ", nrow(matrix) - length(genes.keep), " out of ", nrow(matrix), " genes")
    exp[genes.keep,, drop = FALSE]
}

#' @title
#' Remove genes with gene expression level equal to 0 or NA in a all samples
#' @param exp Gene expression matrix or a Summarized Experiment object
#' @noRd
#' @examples
#' data("gene.exp.chr21.log2")
#' gene.exp.chr21.log2.filtered <- filter_genes_zero_expression_all_samples(
#'   gene.exp.chr21.log2
#' )
#' @return
#' A subset of the original matrix only with the rows
#' passing the filter threshold.
filter_genes_zero_expression_all_samples <- function(
    exp
){
    if(is(exp,"SummarizedExperiment")){
        exp <- assay(exp)
    }
    idx.all.zero <- rowSums(exp == 0, na.rm = TRUE) == ncol(exp)
    idx.all.na <- rowSums(is.na(exp)) == ncol(exp)
    
    # do not keep if it is all zero or all NA
    genes.keep <- rownames(exp)[!(idx.all.zero | idx.all.na)] %>% na.omit()
    if(length(genes.keep) < nrow(exp) & length(genes.keep) > 0){
        message(
            "Removing ", nrow(exp) - length(genes.keep),
            " out of ", nrow(exp), " genes"
        )
    }
    exp <- exp[genes.keep,,drop = FALSE]
}



#' @description  Give dnam and expression outputs the % of samples with
#' 0 in Q1 and Q4. This function is useful if the analysis is performed using
#' residuals. This function will add the following column to the input data
#' "% of 0 target genes (Q1 and Q4)".
#' @noRd
add_percent_zero_q1_q4 <- function(
    region.target,
    dnam,
    exp
){
    
    if (is(exp,"SummarizedExperiment")) {
        exp <- assay(exp)
    }
    
    if (is(dnam,"SummarizedExperiment")) {
        dnam <- assay(dnam)
    }
    
    aux <- plyr::adply(
        unique(region.target[,c("regionID","target")]),
        .margins = 1,
        .fun = function(row) {
            
            rna.target <- exp[rownames(exp) == row$target, , drop = FALSE]
            met <- dnam[rownames(dnam) == as.character(row$regionID), ]
            
            data <- data.frame(
                rna.target = rna.target %>% as.numeric,
                met = met %>% as.numeric
            )
            
            quant.met <-  quantile(data$met,na.rm = TRUE)
            low.cutoff <- quant.met[2]
            upper.cutoff <- quant.met[4]
            
            data.high.low <- data %>%
                filter(.data$met <= low.cutoff | .data$met >= upper.cutoff)
            data.high.low$metGrp <- ifelse(data.high.low$met <= low.cutoff, 0, 1)
            
            pct.zeros.in.quant.samples <- sum(
                data.high.low$rna.target == 0,
                na.rm = TRUE) / nrow(data.high.low)
            
            data.frame("% of 0 target genes (Q1 and Q4)" = paste0(
                round(pct.zeros.in.quant.samples * 100,digits = 2),
                " %")
            )
        }
    )
    region.target$`% of 0 target genes (Q1 and Q4)` <- aux$X..of.0.target.genes..Q1.and.Q4.[
        match(paste0(region.target$regionID,region.target$target),paste0(aux$regionID,aux$target))
    ]
    return(region.target)
}
TransBioInfoLab/MethReg documentation built on July 28, 2023, 9:17 p.m.