Nothing
spliceVariants <- function(y, geneID, dispersion=NULL, group=NULL, estimate.genewise.disp=TRUE, trace=FALSE)
# Identify genes with splice variants using a negative binomial model
# We assume that the data come in a matrix (possibly and/or a DGEList), counts summarized at exon level, with gene information available
# Davis McCarthy and Gordon Smyth
# Created 4 February 2011. Last modified 3 Oct 2016.
{
if( is(y, "DGEList") ) {
y.mat <- y$counts
if( is.null(group) )
group <- y$samples$group
lib.size <- y$samples$lib.size * y$samples$norm.factors
}
else {
y.mat <- as.matrix(y)
if( is.null(group) )
stop("y is a matrix and no group argument has been supplied. Please supply group argument.")
lib.size <- colSums(y)
}
geneID <- as.vector(unlist(geneID))
# Order genes by geneID: we need some way to reorganise the data---output cannot possibly be same dimension as input so this is a sensible way to organise things
o <- order(geneID)
geneID <- geneID[o]
y.mat <- y.mat[o,]
uniqIDs <- unique(geneID)
# Organise the dispersion values to be used in the NB models
if( is.null(dispersion) ) {
if(estimate.genewise.disp) {
dispersion <- estimateExonGenewiseDisp(y.mat, geneID, group)
genewise.disp <- TRUE
if(trace)
cat("Computing genewise dispersions from exon counts.\n")
}
else {
dispersion <- estimateCommonDisp(DGEList(counts=y.mat, group=group))
genewise.disp <- FALSE
if(trace)
cat("Computing common dispersion across all exons.\n")
}
}
else {
if( length(dispersion)==length(uniqIDs) ) {
if(is.null(names(dispersion)))
stop("names(dispersion) is NULL. All names of dispersion must be unique geneID.\n")
matches <- match(uniqIDs, names(dispersion))
if( any(is.na(matches)) | any(duplicated(matches)))
stop("names(dispersion) of dispersion do not have a one-to-one mapping to unique geneID. All names of dispersion must be unique geneID.\n")
dispersion <- dispersion[matches]
genewise.disp <- TRUE
}
if( length(dispersion)==1 ) {
genewise.disp <- FALSE
}
}
# Can't get any results if there are no counts for an exon, so remove all-zero exons
keep <- rowSums(y.mat) > 0
exons <- y.mat[keep,]
rownames(exons) <- geneID[keep]
uniqIDs <- unique(geneID[keep])
na.vec <- rep_len(NA_real_, length(uniqIDs))
if(genewise.disp)
dispersion <- dispersion[names(dispersion) %in% uniqIDs]
else {
dispersion <- rep_len(dispersion, length(uniqIDs))
names(dispersion) <- uniqIDs
}
# We want to know how many exons each gene has
dummy <- rowsum(rep_len(1, nrow(exons)), rownames(exons))
nexons <- as.vector(dummy)
names(nexons) <- rownames(dummy)
mm <- match(uniqIDs,names(nexons))
nexons <- nexons[mm]
if(trace)
cat("Max number exons: ",max(nexons),"\n")
# Genes with the same number of exons have the same design matrix, allowing some parallelization of computations
splicevars.out <- data.frame(logFC= na.vec, logCPM = na.vec, LR = na.vec, PValue = na.vec)
rownames(splicevars.out) <- uniqIDs
abundance <- na.vec
# For loop iterates over number of exons for genes, starting at 2 (can't have splice variants if only one exon!)
for(i.exons in sort(unique(nexons))) {
# Select the genes with the right number of exons for this iteration
this.genes <- nexons==i.exons
full.index <- rownames(exons) %in% uniqIDs[this.genes]
if( any(this.genes) ) {
gene.counts.mat <- matrix(t(exons[full.index,]), nrow=sum(this.genes), ncol=ncol(exons)*i.exons, byrow=TRUE)
expanded.lib.size <- rep.int(lib.size, i.exons)
if(i.exons==1) {
abundance[this.genes] <- aveLogCPM(gene.counts.mat, lib.size=expanded.lib.size)
splicevars.out$LR[this.genes] <- 0
splicevars.out$PValue[this.genes] <- 1
}
else {
exon.this <- factor(rep(1:i.exons, each=ncol(exons)))
group.this <- as.factor(rep.int(group, i.exons))
# Define design matrices for this group of genes
X.full <- model.matrix(~ exon.this + group.this + exon.this:group.this )
X.null <- model.matrix(~ exon.this + group.this )
coef <- (ncol(X.null)+1):ncol(X.full)
# Fit NB GLMs to these genes
fit.this <- glmFit(gene.counts.mat, X.full, dispersion[this.genes], offset=0, prior.count=0)
abundance[this.genes] <- aveLogCPM(gene.counts.mat, lib.size=expanded.lib.size)
results.this <- glmLRT(fit.this, coef=coef)
if(sum(this.genes)==1) {
splicevars.out$LR[this.genes] <- results.this$table$LR[1]
splicevars.out$PValue[this.genes] <- results.this$table$PValue[1]
}
else {
splicevars.out$LR[this.genes] <- results.this$table$LR
splicevars.out$PValue[this.genes] <- results.this$table$PValue
}
}
}
}
splicevars.out$logCPM <- abundance
# Create a list with the exons divided up neatly by geneID (a bit slow using in-built fn)
# Not really necessary, so leave out for the time being
# exon.list <- split(as.data.frame(exons), rownames(exons))
# names(exon.list) <- uniqIDs
if(!genewise.disp)
dispersion <- dispersion[1]
new("DGEExact",list(table=splicevars.out, comparison=NULL, genes=data.frame(GeneID=uniqIDs), dispersion=dispersion))
}
estimateExonGenewiseDisp <- function(y, geneID, group=NULL)
# Function to estimate a common dispersion from exon count data
# Created by Davis McCarthy, 29 July 2011.
# Last modified 7 Aug 2019.
{
# Check objects coming in
if( is(y, "DGEList") ) {
y.mat <- y$counts
if( is.null(group) )
group <- y$samples$group
}
else {
y.mat <- as.matrix(y)
if( is.null(group) )
stop("y is a matrix and no group argument has been supplied. Please supply group argument\n")
}
geneID <- as.vector(unlist(geneID))
# Cannot maintain order of the argument y, so order on geneID so that we have some sensible organisation
o <- order(geneID)
geneID <- geneID[o]
y.mat <- y.mat[o,]
# Sum counts from all exons for each gene to get gene-level counts and form DGEList object
gene.counts <- rowsum(y.mat, geneID)
genewise.disp <- rep_len(NA_real_, nrow(gene.counts))
names(genewise.disp) <- rownames(gene.counts)
gene.data <- DGEList(counts=gene.counts, group=group)
# Cannot properly compute dispersion if there are no counts for the gene
used <- rowSums(gene.data$counts) > 0
# Need to first estimate a common dispersion
gene.data <- estimateCommonDisp(gene.data[used,])
# Next estimate tagwise dispersion for each gene, with trend. Default prop.used=2/3, grid.length=200
gene.data <- estimateTagwiseDisp(gene.data, trend="movingave")
# For those gene which have sufficient (>0) counts, assign estimated dispersion
genewise.disp[used] <- gene.data$tagwise.dispersion
# For those genes with zero counts, assign maximum estimated dispersion value
genewise.disp[!used] <- max(gene.data$tagwise.dispersion)
genewise.disp
}
plotExonUsage <- function(y, geneID, group=NULL, transform="none", counts.per.million=TRUE, legend.coords=NULL, ...)
# Plots exon usage from a matrix, DGEList or list of exon counts
# Created by Davis McCarthy, 30 July 2011.
# Last modified 2 Aug 2011.
{
if( is(y,"DGEList") ) {
ind <- rownames(y$counts) %in% geneID
counts <- y$counts
if(counts.per.million)
counts <- cpm(counts)
exon.mat <- counts[ind,]
group <- y$samples$group
}
else {
if( is(y,"list") ) {
exon.mat <- y[[geneID]]
if(counts.per.million)
stop("Counts per million cannot easily be computed when y is a list. Please use either a DGEList object or a matrix of counts.\n")
if(is.null(group))
stop("Group argument must be supplied.\n")
}
else {
if(is.null(group))
stop("Group argument must be supplied.\n")
if(counts.per.million)
y <- cpm(y)
ind <- rownames(y) %in% geneID
exon.mat <- y[ind,]
}
}
transform <- match.arg(transform, c("none", "log2", "sqrt"))
if(transform=="none") {
if(counts.per.million)
ylab <- "Counts per million"
else
ylab <- "Read counts"
}
if(transform=="log2") {
exon.mat <- log2(exon.mat + 0.5)
if(counts.per.million)
ylab <- "log2( Counts per million )"
else
ylab <- "log2( Read counts )"
}
if(transform=="sqrt") {
exon.mat <- sqrt(exon.mat)
if(counts.per.million)
ylab <- "sqrt( Counts per million )"
else
ylab <- "sqrt( Read counts )"
}
if(length(group)!=ncol(exon.mat))
stop("Length of group vector does not match number of samples (columns of exon matrix).\n")
cols <- 1:nlevels(group)
plot(exon.mat[,1], type="b", col=cols[group[1]], panel.first=grid(), ylab=ylab, xlab="Exon", main=paste("GeneID: ",geneID), ylim=c(0, max(exon.mat)), ...)
for( i in 2:ncol(exon.mat) )
lines(exon.mat[,i], type="b", col=cols[group[i]], ...)
if(is.null(legend.coords)) {
legend.coords <- rep_len(NA_real_,2L)
legend.coords[1] <- 0.8*nrow(exon.mat)
legend.coords[2] <- 0.9*max(exon.mat)
}
legend(legend.coords[1], legend.coords[2], levels(group), col=cols, lwd=2)
invisible(exon.mat)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.