R/oncoMix_two_component_mixture_models.R
In oncomix: Identifying Genes Overexpressed in Subsets of Tumors from Tumor-Normal mRNA Expression Data

Documented in mixModelParams plotGeneHist scatterMixPlot toMatrix

#' Convert SummarizedExperiment or Dataframe to Matrix
#'
#' This internal function converts SummarizedExperiment objects and dataframes
#' (both S3 and S4) to matrices of expression values. Used within oncomix
#' functions to convert all matrix-like objects to the matrix class.
#'
#' @param m Can be a matrix, a data.frame, a DataFrame, or
#' SummarizedExperiment object.
#' @export
#' @keywords internal
#' @importFrom SummarizedExperiment assay
#' @return A matrix of expression values
#' @examples
#' m <- as.data.frame(matrix(data=rgamma(n=150, shape=2, rate=2),
#' nrow=10, ncol=15))
#' m <- toMatrix(m)

toMatrix <- function(m){
    if(is.matrix(m)){
        return(m)
    }
    if(is(m, "SummarizedExperiment")){
        m <- assay(m)
        return(m)
    }
    if(is.data.frame(m) || is(m, "DataFrame")){
        m <- as.matrix(m)
        return(m)
    }
}



#' Generate the parameters for two 2-component Gaussian mixture models
#' with equal variances
#'
#' This function allows you to generate the parameters for two 2-component
#' Gaussian mixture model with equal variances from 2 matrices of data with
#' a priori labels (eg tumor vs normal.) This application was originally
#' intended for matrices of gene expression data treated with 2 conditions.
#'
#' @param exprNml A dataframe (S3 or S4), matrix, or SummarizedExperiment object
#'   containing normal data with patients as columns and genes as rows.
#' @param exprTum A dataframe (S3 or S4), matrix, or SummarizedExperiment object
#'   containing tumor data with patients as columns and genes as rows.
#' @keywords oncomix, mixture-model, two-component
#' @return Returns a dataframe, each element of which contains the 12 mixture
#' model parameters for each gene in an n x 12 matrix, where n is the number
#' of genes.
#' @importFrom mclust Mclust mclustBIC
#' @export
#' @examples
#' exprNml <- as.data.frame(matrix(data=rgamma(n=150, shape=2, rate=2),
#' nrow=10, ncol=15))
#' colnames(exprNml) <- paste0("patientN", seq_len(ncol(exprNml)))
#' rownames(exprNml) <- paste0("gene", seq_len(nrow(exprNml)))
#'
#' exprTum <- as.data.frame(matrix(data=rgamma(n=150, shape=4, rate=3),
#' nrow=10, ncol=15))
#' colnames(exprTum) <- paste0("patientT", seq_len(ncol(exprTum)))
#' rownames(exprTum) <- paste0("gene", seq_len(nrow(exprTum)))
#'
#' mmParams <- mixModelParams(exprNml, exprTum)

mixModelParams <- function(exprNml, exprTum) {

    exprNml <- toMatrix(exprNml)
    exprTum <- toMatrix(exprTum)

    paramsNormal <- apply(exprNml, 1, function(x) {
        y <- mclust::Mclust(data=x, G=2, modelNames="E", verbose=FALSE)
        z <- c(nMu=y$parameters$mean,
            nVar=y$parameters$variance$sigmasq,
            nPi1=y$parameters$pro[1])
        return(z)})

    paramsTumor <- apply(exprTum, 1, function(x) {
        y <- mclust::Mclust(data=x, G=2, modelNames="E", verbose=FALSE)
        z <- c(tMu=y$parameters$mean,
            tVar=y$parameters$variance$sigmasq,
            tPi1=y$parameters$pro[1])
        return(z)})

    params <- rbind(paramsNormal, paramsTumor)
    rownames(params)[c(1:2, 5:6)] = c("nMu1","nMu2","tMu1", "tMu2")
    deltaMu2 <- params["tMu2",]-params["nMu2",]
    deltaMu1 <- params["tMu1",]-params["nMu1",]
    boundaryTumor <- (params["tMu2",] + params["tMu1",]) / 2

    siCalc <- function(vectNml, boundTumor){ #calculate the sel. idx
        x <- sum(boundTumor > vectNml) / length(vectNml)
        return(x)
    }
    SI <- sapply(seq_len(nrow(exprNml)),
            function(i) siCalc(exprNml[i, ], boundaryTumor[i]))

    params <- rbind(params, deltaMu2, deltaMu1, SI)
    mmParamsDf <- data.frame(t(params))
    score <- NA
    mmParamsDf$score <- mmParamsDf$SI*{
        (mmParamsDf$deltaMu2-mmParamsDf$deltaMu1)-
        (mmParamsDf$nVar+mmParamsDf$tVar)}
    mmParamsDfS <- mmParamsDf[with(mmParamsDf, order(-score)), ]
    return(mmParamsDfS)
}


#' Plot a histogram of gene expression values from
#' tumor and adjacent normal tissue.
#'
#' This function allows you to plot a histogram of gene expression values from
#' tumor and adjacent normal tissue with the option of including the best
#' fitting Gaussian curve.
#'
#' @param mmParams The output from the getMixModelParams function.
#' @param exprNml A dataframe (S3 or S4), matrix, or SummarizedExperiment object
#'   containing normal data with patients as columns and genes as rows.
#' @param exprTum A dataframe (S3 or S4), matrix, or SummarizedExperiment object
#'   containing tumor data with patients as columns and genes as rows.
#' @param isof The gene isoform to visualize
#' @keywords oncoMix, visualization, Gaussian, two-component
#' @return Returns a histogram of the gene expression
#' values from the two groups.
#' @export
#' @examples
#' exprNml <- as.data.frame(matrix(data=rgamma(n=150, shape=2, rate=2),
#' nrow=10, ncol=15))
#' colnames(exprNml) <- paste0("patientN", seq_len(ncol(exprNml)))
#' rownames(exprNml) <- paste0("gene", seq_len(nrow(exprNml)))
#'
#' exprTum <- as.data.frame(matrix(data=rgamma(n=150, shape=4, rate=3),
#' nrow=10, ncol=15))
#' colnames(exprTum) <- paste0("patientT", seq_len(ncol(exprTum)))
#' rownames(exprTum) <- paste0("gene", seq_len(nrow(exprTum)))

#' mmParams <- mixModelParams(exprNml, exprTum)
#' isof <- rownames(mmParams)[1]
#' plotGeneHist(mmParams, exprNml, exprTum, isof)
#' @seealso \code{\link{mixModelParams}}

plotGeneHist <- function(mmParams, exprNml, exprTum, isof){

    exprNml <- toMatrix(exprNml)
    exprTum <- toMatrix(exprTum)

    tidyDf <- as.data.frame(cbind(as.numeric(c(exprTum[isof,], exprNml[isof,])),
        as.factor(c(rep("tumor",ncol(exprTum)), rep("normal",ncol(exprNml))))),
        stringsAsFactors=FALSE)
    colnames(tidyDf) <- c("expr", "type")
    expr <- type <- ..density.. <- NULL # Setting the variables to NULL first
    p1 <- ggplot(tidyDf, aes(x=expr, color=as.factor(type),
        fill=as.factor(type),
        group=as.factor(type))) +
    theme_classic() +
    geom_histogram(data=subset(tidyDf,type == 1),fill="#F8766D",
        alpha=0.2, aes(y=..density..), binwidth = .2) +
    geom_histogram(data=subset(tidyDf,type == 2),fill="#00BFC4",
        alpha=0.2, aes(y=..density..), binwidth = .2) +
    geom_rug(alpha=0.3, show.legend=FALSE) +
    theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank(),
        legend.position="none",
        plot.title=element_text(size=12),
        axis.text=element_text(size=8),
        axis.title=element_text(size=8)) +
    ggtitle(paste0(isof, " : SI = ",
        round(mmParams[isof,"SI"],4))) +
    xlab(expression(Log[2] *"(TPM Reads)")) +
    stat_function(fun="dnorm", colour="#F8766D",
        args=list(mean=mmParams[isof,"nMu2"],
        sd=sqrt(mmParams[isof,"nVar"]))) +
    stat_function(fun="dnorm", colour="#F8766D",
        args=list(mean=mmParams[isof,"nMu1"],
        sd=sqrt(mmParams[isof,"nVar"]))) +
    stat_function(fun="dnorm", colour="#00BFC4",
        args=list(mean=mmParams[isof,"tMu2"],
        sd=sqrt(mmParams[isof,"tVar"]))) +
    stat_function(fun="dnorm", colour="#00BFC4",
        args=list(mean=mmParams[isof,"tMu1"],
        sd=sqrt(mmParams[isof,"tVar"])))
    print(p1)
}


#' Generate a scatter plot with the output from mixModelParams
#'
#' This function allows you to generate the parameters for two 2-component
#' mixture models with equal variances
#'
#' @param mmParams The output from the mixModelParams function. Will utilize
#' the deltaMu2 and deltaMu1 rows.
#' @param selIndThresh This is the selectivity index threshold to use. All
#' genes with SI values above this threshold will be highlighted in purple.
#' Specify either selIndThresh or geneLabels (not both simultaneously).
#' @param geneLabels A character vector of gene names used to label the
#' genes with that name on the scatter plot. Specify either selIndThresh
#' or geneLabels (not both simultaneously).
#' @keywords oncoMix, visualization, two-component
#' @return Returns a ggplot scatter object that can be plotted
#' @import ggplot2 ggrepel stats
#' @importFrom RColorBrewer brewer.pal
#' @export
#' @examples
#' exprNml <- as.data.frame(matrix(data=rgamma(n=150, shape=2, rate=2),
#' nrow=10, ncol=15))
#' colnames(exprNml) <- paste0("patientN", seq_len(ncol(exprNml)))
#' rownames(exprNml) <- paste0("gene", seq_len(nrow(exprNml)))
#'
#' exprTum <- as.data.frame(matrix(data=rgamma(n=150, shape=4, rate=3),
#' nrow=10, ncol=15))
#' colnames(exprTum) <- paste0("patientT", seq_len(ncol(exprTum)))
#' rownames(exprTum) <- paste0("gene", seq_len(nrow(exprTum)))
#'
#' mmParams <- mixModelParams(exprNml, exprTum)
#' scatterMixPlot(mmParams)
#' @seealso \code{\link{mixModelParams}}

scatterMixPlot <- function(mmParams, selIndThresh=1, geneLabels=NULL){
    mmParams <- as.data.frame(mmParams)
    oneOverAlpha <- diff(range(mmParams$deltaMu2))
    alpha1 <- 1/oneOverAlpha

    quants <- c(0.01, 0.10, 0.50, 0.90, 0.99) #add in the quantiles
    colorsRed <- RColorBrewer::brewer.pal(n=length(quants), name="Reds")

    deltaMu2Quant <- stats::quantile(mmParams[,"deltaMu2"], quants)
    deltaMu1Quant <- stats::quantile(1/(abs(mmParams[,"deltaMu1"]) + alpha1),
                        quants)
    deltaMu2 <- deltaMu1 <- NULL
    x <- ggplot(data=as.data.frame(mmParams),
        aes(x=deltaMu2, y=1/(abs(deltaMu1) + alpha1))) +
        theme_classic() +
        geom_hline(yintercept=deltaMu1Quant,
            col=colorsRed, size=c(1,1,1,1,1)) +
        geom_vline(xintercept=deltaMu2Quant,
            col=colorsRed, size=c(1,1,1,1,1)) +
        geom_point(alpha=0.5) +
        xlab(expression(paste(Delta, mu[2]))) +
        ylab(expression(paste(frac(1, paste(Delta, mu[1], " + ", alpha)))))
    if(selIndThresh < 1){
        mmParamsSi <- mmParams[mmParams$SI > selIndThresh,]
        x <- x + geom_point(data=as.data.frame(mmParamsSi),
            aes(x=deltaMu2, y=1/(abs(deltaMu1)+alpha1)),
            size=10, alpha=0.1,
            col=colorsRed[length(colorsRed)],
            fill=colorsRed[length(colorsRed)]) +
        ggtitle(bquote(Distribution~of~Mixture~Model~
            Parameters*","~alpha~"="~.(round(alpha1,2))*", SI >"~
            .(selIndThresh)))
    } else if(!is.null(geneLabels)){
        mmParamsSi <- mmParams[geneLabels,]
        mmParamsSi$geneLabels <- geneLabels
        x <- x + geom_point(data=as.data.frame(mmParamsSi),
            aes(x=deltaMu2, y=1/(abs(deltaMu1)+alpha1)),
            size=10, alpha=0.1,
            col=colorsRed[length(colorsRed)],
            fill=colorsRed[length(colorsRed)]) +
        ggrepel::geom_text_repel(data=mmParamsSi, aes(x=deltaMu2,
            y=1/(abs(deltaMu1)+alpha1)),
            label=rownames(mmParamsSi)) +
        ggtitle(bquote(Distribution~of~Mixture~Model~
            Parameters*","~alpha~"="~.(round(alpha1,2))))
    } else {
        x <- x + ggtitle(bquote(Distribution~of~Mixture~Model~
            Parameters*","~alpha~"="~.(round(alpha1,2))))
    }
    return(x)
}

#' Identify genes that meet pre-specified quantiles
#'
#' This function allows you to subset genes that are above pre-specified
#' quantiles and that most closely resemble the distribution of oncogenes.
#'
#' @param mmParams The output from the mixModelParams function.
#' @param deltMu2Thr The percentile threshold for the deltaMu2 statistic.
#' All genes exceeding this percentile threshold will be selected.
#' @param deltMu1Thr The percentile threshold for the deltaMu1 statistic.
#' All genes exceeding this percentile threshold will be selected.
#' @param siThr The threshold for the selectivity index statistic
#' (between 0-1). All genes exceeding this threshold will be selected.
#' @keywords subsetting
#' @return Returns a dataframe containing all genes meeting the prespecified
#' thresholds.
#' @import ggplot2 ggrepel stats
#' @export
#' @examples
#' exprNml <- as.data.frame(matrix(data=rgamma(n=150, shape=2, rate=2),
#' nrow=10, ncol=15))
#' colnames(exprNml) <- paste0("patientN", seq_len(ncol(exprNml)))
#' rownames(exprNml) <- paste0("gene", seq_len(nrow(exprNml)))
#'
#' exprTum <- as.data.frame(matrix(data=rgamma(n=150, shape=4, rate=3),
#' nrow=10, ncol=15))
#' colnames(exprTum) <- paste0("patientT", seq_len(ncol(exprTum)))
#' rownames(exprTum) <- paste0("gene", seq_len(nrow(exprTum)))
#'
#' mmParams <- mixModelParams(exprNml, exprTum)
#' topGeneQuants(mmParams)
#' @seealso \code{\link{mixModelParams}}

topGeneQuants <- function(mmParams, deltMu2Thr=90, deltMu1Thr=10, siThr=.99){
    mmParamsDf <- as.data.frame(mmParams)
    deltaMu2Quant <- stats::quantile(mmParamsDf$deltaMu2, deltMu2Thr*.01)
    deltaMu1Quant <- stats::quantile(abs(mmParamsDf$deltaMu1),
        {100-deltMu1Thr}*.01)
    mmParamsDf <- as.data.frame(mmParams)
    tfVect <- mmParamsDf$deltaMu2 > deltaMu2Quant &
        abs(mmParamsDf$deltaMu1) < deltaMu1Quant & mmParamsDf$SI > siThr
    mmParamsDfQuantSubset <- mmParamsDf[tfVect,]
    return(mmParamsDfQuantSubset)
}


#' Human Breast Cancer RNA-sequencing data from TCGA - Tumor Tissue
#'
#' These RNA-sequencing expression data were obtained from the Cancer Genome
#' Atlas project (now the Genomic Data Commons (GDC)). The sequencing data were
#' generated from breast carcinoma samples from 113 patients. Quantification of
#' RNA expression values was performed using standard GDC pipelines. The
#' expression values are reported in transcripts per million reads. Out of an
#' initial 73,599 RNA transcripts, 700 are included as part of this dataset.
#' These 700 transcripts represent a random subset of the transcripts with at
#' least 20% non-zero expression values across all patient samples. Rows contain
#' anonymized patient identifiers, while columns contain UCSC gene symbols.
#'
#' @name exprTumIsof
#' @docType data
#' @author Daniel Pique \email{daniel.pique@med.einstein.yu.edu}
#' @references \url{https://gdc.cancer.gov/}
#' @keywords RNA expression, Breast Tumor

NULL


#' Human Breast Cancer RNA-sequencing data from TCGA - Adj. Normal Tissue
#'
#' These RNA-sequencing expression data were obtained from the Cancer Genome
#' Atlas project (now the Genomic Data Commons (GDC)). The sequencing data were
#' generated from adjacent normal breast tissue data from 113 patients with
#' breast cancer. Quantification of RNA expression values was performed using
#' standard GDC pipelines. The expression values are reported in transcripts per
#' million reads. Out of an initial 73,599 RNA transcripts, 700 are included as
#' part of this dataset. These 700 transcripts represent a random subset of the
#' transcripts with at least 20% non-zero expression values across all patient
#' samples. Rows contain anonymized patient identifiers, while columns contain
#' UCSC gene symbols.
#'
#'
#' @name exprNmlIsof
#' @docType data
#' @author Daniel Pique \email{daniel.pique@med.einstein.yu.edu}
#' @references \url{https://gdc.cancer.gov/}
#' @keywords RNA expression, Breast Tissue

NULL

#' Oncogene Database Mapping Gene Symbol to UCSC ID (kgID)
#'
#' These data were downloaded in September 2017 from the url listed below and
#' represent a mapping between gene symbols in ongene, an oncogene database
#' curated from the scientific literature, and gene identifiers from the
#' University of California, Santa Cruz's genomic database.
#'
#' @name queryRes
#' @docType data
#' @author Min Zhao \email{mzhao@usc.edu.au}
#' @references \url{http://ongene.bioinfo-minzhao.org/ongene_human.txt}
#' @keywords Oncogenes, Database

NULL

Any scripts or data that you put into this service are public.

oncomix documentation built on Nov. 8, 2020, 5:39 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

oncomix
Identifying Genes Overexpressed in Subsets of Tumors from Tumor-Normal mRNA Expression Data

R/oncoMix_two_component_mixture_models.R
In oncomix: Identifying Genes Overexpressed in Subsets of Tumors from Tumor-Normal mRNA Expression Data

Defines functions scatterMixPlot plotGeneHist mixModelParams toMatrix

Documented in mixModelParams plotGeneHist scatterMixPlot toMatrix

Try the oncomix package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

oncomix Identifying Genes Overexpressed in Subsets of Tumors from Tumor-Normal mRNA Expression Data

R/oncoMix_two_component_mixture_models.R In oncomix: Identifying Genes Overexpressed in Subsets of Tumors from Tumor-Normal mRNA Expression Data

Defines functions scatterMixPlot plotGeneHist mixModelParams toMatrix

Documented in mixModelParams plotGeneHist scatterMixPlot toMatrix

Try the oncomix package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

oncomix
Identifying Genes Overexpressed in Subsets of Tumors from Tumor-Normal mRNA Expression Data

R/oncoMix_two_component_mixture_models.R
In oncomix: Identifying Genes Overexpressed in Subsets of Tumors from Tumor-Normal mRNA Expression Data