#' Detect constitutive and alternative exons
#'
#' Cluster genes according to reciprocal correlations, then iteratively cut tree (bottom up)
#' until one cluster is the most expressed or tied for expression across all samples.
#' Uses \code{\link[amap:hcluster]{hcluster}} from amap package for clustering
#'
#' @param data A data.frame or numeric matrix with one row per exon and one column per sample, containing the expression values of each exon.
#' @param exonID A data.frame with the following columns. \describe{
#' \item{geneID}{Gene ID of each exon.}
#' \item{exonID}{ID of each exon.}
#' }
#' @param signDigits An integer. Number of digits rounded from expression scores for ranking calculations.
#' @param exceptions Integer. Sets the maximum number of samples in which the constitutive exon group does not need to be the most expressed one. Defaults to 10\% of the dataset.
#' @param distMethod The distance metric used by the clustering algorithm (default: correlation), see function \code{\link[amap:hcluster]{hcluster}} from package amap for more information.
#' @param link Agglomeration method used by the clustering algorithm (default: complete), see function \code{\link[amap:hcluster]{hcluster}} from package amap for more information.
#' @param nbproc Integer. Number of subprocess for parallelization (default: 1), see function \code{\link[amap:hcluster]{hcluster}} from package amap for more information
#'
#' @return A data.frame with the following columns. \describe{
#' \item{geneID}{Gene ID of each exon.}
#' \item{clusters}{Numeric ID for each exon group detected.}
#' \item{exonID}{Exon ID.}
#' \item{splicing_category}{Gene class. Can be either spliced (isoforms detected), unspliced (no isoforms detected) or single_expressed_exon (only one exon present in the gene).}
#' \item{constitutive}{Identifies whether the exon group is constitutive (main gene) or facultative (isoform).}
#' \item{transcriptID}{Identifier for each exon group detected. Ends in '_con' for constitutive and '_fac' for facultative exons.}
#' }
#' @examples
#' ExonAssignment = ClusterExons(data = SIRV_data$tpm, exonID = SIRV_data$ID)
#'
#' @export
ClusterExons <- function(data = NULL, exonID = NULL,
exceptions = ceiling(ncol(data)*.1), signDigits = 3,
distMethod = "correlation", link = "complete", nbproc = 1) {
ExonAssTable <- list()
exceptions <- exceptions/ncol(data)
data <- cbind(exonID, as.data.frame(data))
row.names(data) <- data$exonID
for (gID in unique(data$geneID)){
# Subset only exons within one gene ID, removing ID columns
Evalues <- data[which(data$geneID%in%gID),-grep(pattern = "ID",x = names(data))]
# Annotate single-exon genes
if (nrow(Evalues)<2) {
ExonAssTable[[gID]] <- as.data.frame(matrix(row.names(Evalues), ncol = 1))
names(ExonAssTable[[gID]]) <- "exonID"
ExonAssTable[[gID]]$splicing_category <- "single_expressed_exon"
ExonAssTable[[gID]]$clusters <- 0
ExonAssTable[[gID]]$clusterranks <- 1
ExonAssTable[[gID]] <- ExonAssTable[[gID]][c("clusters", "exonID", "clusterranks", "splicing_category")]
} else {
# Generate main clustering for threshold detection
tree <- amap::hcluster(Evalues, method = distMethod, link = link, nbproc = nbproc)
for (nExonClusters in nrow(Evalues):1){
# Evaluate relative times it each exon group is ranked as first
Evalues$clusters <- cutree(tree, k = nExonClusters)
clusterranks <- plyr::ddply(Evalues, plyr::.(clusters), plyr::colwise(median))
# If there is no constitutive exon group, mark as unspliced
if (nrow(clusterranks)==1){
ExonAssTable[[gID]] <- as.data.frame(matrix(row.names(Evalues), ncol = 1))
names(ExonAssTable[[gID]]) <- "exonID"
ExonAssTable[[gID]]$clusters <- 0
ExonAssTable[[gID]]$clusterranks <- 1
ExonAssTable[[gID]]$splicing_category <- "unspliced"
ExonAssTable[[gID]] <- ExonAssTable[[gID]][c("clusters", "exonID", "clusterranks", "splicing_category")]
break} else {
clusterranks <- apply(clusterranks[,-1], 2, function(x){
rank(-round(x, digits = signDigits), ties.method = "min", na.last = T)
})
row.names(clusterranks) <- unique(Evalues$cluster)
clusterranks <- apply(clusterranks, 1, function(x){sum(x==1)/ncol(clusterranks)})
# Control that there is only one exon group which consistently ranks 1st or tied allowing for exceptions
if (sum((clusterranks)>=(1-exceptions))==1) {
# create table with exon subcluster assignments
matchmaker <- Evalues[,"clusters", drop=F]
# store subcluster ranks
clusters <- as.data.frame(clusterranks)
#annotate subcluster ranks with subcluster IDs
clusters$clusters <- c(1:nrow(clusters))
# merge exon with subcluster ID and ranks
matchmaker <- plyr::join(matchmaker, clusters, by="clusters", type="left")
# add exon names
row.names(matchmaker) <- row.names(Evalues)
matchmaker$exonID <- row.names(matchmaker)
ExonAssTable[[gID]] <- matchmaker
ExonAssTable[[gID]]$splicing_category <- "spliced"
ExonAssTable[[gID]] <- ExonAssTable[[gID]][c("clusters", "exonID", "clusterranks", "splicing_category")]
break} else {next}}}
}
}
ExonAssTable <- plyr::ldply(ExonAssTable, rbind)
names(ExonAssTable)[1] <- "geneID"
# assign constitutive/specific status
ExonAssTable$constitutive <- ifelse((ExonAssTable$clusterranks>=(1-exceptions)),"constitutive","facultative")
# merge cluster assignments into unique IDs with designation of constitutiveness
ExonAssTable$transcriptID <- as.factor(paste0(
ExonAssTable$geneID, "_t",
ExonAssTable$clusters,
ifelse(ExonAssTable$constitutive=="constitutive", "_con","_fac"),
sep=""))
# code ID variables as factors
ExonAssTable$geneID <- as.factor(ExonAssTable$geneID)
ExonAssTable$splicing_category <- as.factor(ExonAssTable$splicing_category)
ExonAssTable$constitutive <- as.factor(ExonAssTable$constitutive)
ExonAssTable[,-which(colnames(ExonAssTable)=="clusterranks")]
}
#' Average expression values based on unique eigenexon IDs
#'
#' @param data A data.frame or numeric matrix with one row per exon and one column per sample, containing the expression values of each exon.
#' @param spliceID A data.frame. Output from the \code{\link{ClusterExons}} function.
#' @param splicingRatios Logical. If FALSE, expression from all entries is reported on the same scale. If TRUE, expression from splicing entries is normalized by their gene's constitutive expression score, generating splicing ratios.
#' @param NAcorrection: Logical. Applicable only if \code{splicingRatios = TRUE}. If \code{TRUE}, splicing ratios higher than 1 are set to 1 and NA/NaN/infinity values are set to 0. This accounts for experimental error in measurements.
#'
#' @return A data.frame with one column per sample and one row per exon group.
#' If \code{splicingRatios = F}, entries report the average expression of all exons in the exon group.
#' If \code{splicingRatios = T}, constitutive exon groups report the average expression of their exons and facultative exons report their average expression normalized to the average expression of their gene's constitutive exons group (splicing ratio).
#'
#' @examples
#' ExonAssignment = ClusterExons(data = SIRV_data$tpm, exonID = SIRV_data$ID)
#' AverageExons(SIRV_data$tpm, ExonAssignment)
#'
#' @export
AverageExons <- function(data = NULL, spliceID = NULL, splicingRatios = F, NAcorrection = F){
spliceID = spliceID[,c("geneID", "transcriptID", "constitutive")]
data = cbind(spliceID, data.frame(data))
if (splicingRatios == F) {
out <- plyr::ddply(.data = data, .variables = plyr::.(transcriptID), plyr::numcolwise(median), na.rm = T)
out[order(out$transcriptID),]
} else {
## split into constitutives and facultatives
ConTranscripts <- data[which(data$constitutive=="constitutive"),]
FacTranscripts <- data[which(data$constitutive!="constitutive"),]
## average exon values within transcripts
ConTranscripts <- plyr::ddply(ConTranscripts, .variables = plyr::.(geneID, transcriptID), plyr::numcolwise(median), na.rm = T)
FacTranscripts <- plyr::ddply(FacTranscripts, .variables = plyr::.(geneID, transcriptID), plyr::numcolwise(median), na.rm = T)
FacSplicing <- apply(FacTranscripts, 1, function(Fac){
Spl <- as.numeric(Fac[-grep(pattern = "ID", x = names(FacTranscripts))])
Con <- ConTranscripts[which(Fac["geneID"]==ConTranscripts$geneID),-grep(pattern = "ID", x = names(ConTranscripts))]
Spl/Con
})
FacSplicing <- cbind(FacTranscripts[,c("geneID","transcriptID")], plyr::ldply(FacSplicing))
if (NAcorrection == T) {
# set NA/NaN and infinity scores to 0, set scores greater than 1 to 1
FacSplicing[,sapply(FacSplicing, is.numeric)] <- apply(FacSplicing[,sapply(FacSplicing, is.numeric)],c(1,2),function(x){
as.numeric(ifelse(is.na(x), 0, x))})
FacSplicing[,sapply(FacSplicing, is.numeric)] <- apply(FacSplicing[,sapply(FacSplicing, is.numeric)],c(1,2),function(x){
as.numeric(ifelse(x>1, 1, x))})
}
out <- rbind(ConTranscripts, FacSplicing)
out[order(out$transcriptID),]
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.