R/bgeePipelineFunctions.R

Defines functions countToTpm cutoff_info plot_distributions calculate_abundance_cutoff

# these functions should be part of
# presenceAbsence.R they mainly correspond to
# duplicated code between the pipeline and this
# package.  They do not need to be exported We
# decided to move them in a different file to
# easily update them when the pipeline is updated.
# XXX We should maybe centralise these functions
# somewhere outside of the package and the bgee
# pipeline

#' @title Calculate TPM cutoff
#'
#' @description This function calculate the TPM cutoff.
#' This cutoff will correspond to the minimal value of TPM for which the ratio
#' of genes and intergenic regions is equal to `intergenic_cutoff`
#' (by default = 0.05)
#' or lower (first test if at least 1 TPM value has this property):
#'
#' @param counts TPM information for both genic and intergenic regions
#' @param selected_coding TPM information for both protein_coding regions
#' @param selected_intergenic TPM information for intergenic regions
#' @param intergenic_cutoff value of the ratio between prop of
#' intergenic present and protein coding present (called r in this function)
#'
#' @return the TPM cutoff and the value of r (proportion of intergenic regions
#' considered as present )
#'
#' @author Julien Roux
#' @author Julien Wollbrett
#'
#' @noMd
#' @noRd
#'
calculate_abundance_cutoff <- function(counts,
    selected_coding,
    selected_intergenic,
    intergenic_cutoff = 0.05) {
    ## r = (number of intergenic regions with TPM values
    ## higher than x * number of coding regions) /
    ## (number of coding regions with TPM values higher
    ## than x * number of intergenic regions) = 0.05
    ## What is value of x (cutoff)? calculate the
    ## distribution of r for a range of TPMs, then
    ## select the closest value to `cutoff`
    ## Counting how many intergenic regions have equal
    ## or higher value of TPM for every value of TPM For
    ## each gene's TPM (sorted), calculate r
    summed_intergenic <-
        vapply(unique(sort(counts$abundance[selected_coding])),
        function(x) {
            return(sum(counts$abundance[selected_intergenic] >=
            x))
        }, numeric(1))
    ## It is not necessary to do the same for coding
    ## regions: for a sorted vector the number of
    ## position + 1.  Here, it is a bit trickier since
    ## we do not consider all coding TPM values, but the
    ## unique ones, so we use the rle function to know
    ## the lengths of the runs of similar values and sum
    ## them
    summed_coding <-
        c(0, cumsum(rle(sort(
            counts$abundance[selected_coding]
        ))$lengths))
    summed_coding <- summed_coding[-(length(summed_coding))]
    summed_coding <- sum(selected_coding) - summed_coding
    
    ## Now we can calculate r
    r <-
        (summed_intergenic / sum(selected_intergenic)) / (summed_coding / sum(selected_coding))
    
    prot_coding_present <- 100 - (intergenic_cutoff * 100)
    
    ## Select the minimal value of TPM for which the
    ## ratio of genes and intergenic regions is equal to
    ## `intergenic_cutoff` or lower (first test if at
    ## least 1 TPM value has this property):
    if (sum(r < intergenic_cutoff) == 0) {
        TPM_cutoff <- sort(unique(counts$abundance[selected_coding]))[which(r ==  min(r))[1]]
        r_cutoff <- min(r)
        warning(
            "There is no TPM cutoff for which ",
            prot_coding_present,
            "% of the expressed genes would be coding. TPM cutoff is fixed at
the first value with maximum coding/intergenic ratio. r=",
            r_cutoff,
            "at TPM=",
            TPM_cutoff,
            "\n"
        )
    } else {
        TPM_cutoff <- sort(unique(counts$abundance[selected_coding]))[which(r < intergenic_cutoff)[1]]
        r_cutoff <- intergenic_cutoff
        message("TPM cutoff for which ",prot_coding_present,
            "% of the expressed genes are coding found at TPM = ", TPM_cutoff)
    }
    return(c(TPM_cutoff, r_cutoff))
}

#' @title plot distribution of TPMs
#'
#' @description Plotting of the distribution of TPMs for coding and intergenic
#' regions + cutoff
#'
#' @param counts TPM information for both genic and intergenic regions
#' @param selected_coding TPM information for protein_coding regions
#' @param selected_intergenic TPM information for intergenic regions
#' @param cutoff TPM cutoff below which calls are considered as absent
#' @param myUserMetadata A Reference Class UserMetadata object.
#' This object has to be edited before running kallisto @seealso UserMetadata.R
#'
#' @author Julien Roux
#' @author Julien Wollbrett
#'
#' @noMd
#' @noRd
#'
#' @return plot distribution of TPMs for coding and intergenic regions + cutoff
#'
plot_distributions <- function(counts,
    selected_coding, selected_intergenic,
    cutoff, myUserMetadata) {
    ## Plotting of the distribution of TPMs for coding
    ## and intergenic regions + cutoff Note: this code
    ## is largely similar to plotting section in
    ## rna_seq_analysis.R
    
    par(mar = c(5, 6, 1, 1))  ## bottom, left, top and right margins
    dens <- density(log2(na.omit(counts$abundance) + 10 ^ -6))
    
    ## protein-coding genes only (had to take care of
    ## NAs strange behavior)
    dens_coding <- density(log2(counts$abundance[selected_coding] +
        10 ^ -6))
    ## Normalize density for number of observations
    dens_coding$y <-
        dens_coding$y * sum(selected_coding) / length(counts$abundance)
    
    ## intergenic
    dens_intergenic <-
        density(log2(counts$abundance[selected_intergenic] + 10 ^ -6))
    dens_intergenic$y <-
        dens_intergenic$y * sum(selected_intergenic) / length(counts$abundance)
    
    ## Plot whole distribution
    title <- basename(myUserMetadata@rnaseq_lib_path)
    plot(
        dens,
        ylim = c(0, max(dens$y) * 1.1),
        xlim = c(-23, 21),
        lwd = 2,
        main = title,
        bty = "n",
        axes = FALSE,
        xlab = ""
    )
    axis(2, las = 1)
    
    axis(
        1,
        at = seq(-30 , 30, by = 10),
        line = 0,
        mgp = c(3, 0.5, 0),
        cex.axis = 0.8
    )
    mtext(
        expression(log[2]('TPM'+10 ^ -6)),
        1,
        adj = 1,
        padj = 0,
        line = 0.2,
        at = par("usr")[1],
        col = "black",
        cex = 0.8
    )
    
    
    ## Plot the TPM cutoff abline(v=cutoff, col='gray',
    ## lty=1, lwd=2)
    arrows(
        log2(cutoff + 1e-05),
        par("usr")[3],
        log2(cutoff + 1e-05),
        par("usr")[4] / 2,
        col = "gray",
        lty = 1,
        lwd = 2,
        angle = 160,
        length = 0.1
    )
    
    ## Add subgroups distributions (coding, intergenic,
    ## etc): protein-coding genes
    lines(dens_coding,
        col = "firebrick3",
        lwd = 2,
        lty = 2)
    ## intergenic
    lines(dens_intergenic,
        col = "dodgerblue3",
        lwd = 2,
        lty = 2)
    
    ## legend
    legend(
        "topright",
        c(
            "all",
            "selected protein-coding genes",
            "selected intergenic regions"
        ),
        lwd = 2,
        col = c("black",
                "firebrick3", "dodgerblue3"),
        lty = c(1, 2,
                2),
        bty = "n"
    )
    return()
}

# this function is not exactly the same than in the
# bgee pipeline.  We removed the max_intergenic
# variable. In the pipeline this variable was used
# to define the reference intergenic regions.  We
# do not need it anymore in this package as we
# precomputed the list of reference intergenic
# regions we also remove TPM_final_cutoff,
# FPKM_cutoff, FPKM_final_cutoff

#' @title Cutoff information
#'
#' @description Calculate summary statistics to export in cutoff info file
#'
#' @param counts TPM information for both genic and intergenic regions
#' @param column Name of the column containing presence/absence information
#' @param TPM_cutoff TPM cutoff below which calls are considered as absent
#' @param r_cutoff Proportion of intergenic regions considered as present
#' @param myUserMetadata A Reference Class UserMetadata object.
#' This object has to be edited before running kallisto @seealso UserMetadata.R
#'
#' @return summary statistics to export in cutoff info file
#'
#' @author Julien Roux
#' @author Julien Wollbrett
#'
#' @noMd
#' @noRd
#'
cutoff_info <- function(counts,
    column, abundance_cutoff, r_cutoff,
    myUserMetadata) {
    ## Calculate summary statistics to export in cutoff
    ## info file
    genic_present <- sum(counts[[column]][counts$type ==
        "genic"] == "present") / sum(counts$type == "genic") * 100
    number_genic_present <- sum(counts[[column]][counts$type ==
        "genic"] == "present")
    
    coding_present <- sum(counts[[column]][counts$biotype %in%
        "protein_coding"] == "present") / sum(counts$biotype %in%
        "protein_coding") * 100
    number_coding_present <-
        sum(counts[[column]][counts$biotype %in%
        "protein_coding"] == "present")
    
    intergenic_present <- sum(counts[[column]][counts$type ==
        "intergenic"] == "present") / sum(counts$type ==
        "intergenic") * 100
    number_intergenic_present <-
        sum(counts[[column]][counts$type == "intergenic"] == "present")
    
    ## Export cutoff_info_file
    to_export <- c(
        basename(myUserMetadata@rnaseq_lib_path),
        abundance_cutoff,
        genic_present,
        number_genic_present,
        sum(counts$type == "genic"),
        coding_present,
        number_coding_present,
        sum(counts$biotype %in% "protein_coding"),
        intergenic_present,
        number_intergenic_present,
        sum(counts$type == "intergenic"),
        r_cutoff
    )
    names(to_export) <- c(
        "libraryId",
        "cutoffTPM",
        "proportionGenicPresent",
        "numberGenicPresent",
        "numberGenic",
        "proportionCodingPresent",
        "numberPresentCoding",
        "numberCoding",
        "proportionIntergenicPresent",
        "numberIntergenicPresent",
        "numberIntergenic",
        "ratioIntergenicCodingPresent"
    )
    return(to_export)
}

#' @title Recalculate TPMs
#'
#' @description Recalculate TPMs using functions from
#' ```https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/```
#'
#' @param counts A list of extimated counts
#' @param effLen A list of effective length
#'
#' @return A list of recalculated TPMs
#'
#' @noMd
#' @noRd
#'
countToTpm <- function(counts, effLen) {
    rate <- log(counts) - log(effLen)
    denom <- log(sum(exp(rate)))
    exp(rate - denom + log(1e+06))
}

Try the BgeeCall package in your browser

Any scripts or data that you put into this service are public.

BgeeCall documentation built on Dec. 12, 2020, 2 a.m.