knitr::opts_chunk$set(message=FALSE, warning=FALSE)
## require() or library() statements require(Biobase) require(ggplot2) require(BS831) OMPATH <- Sys.getenv("OMPATH") print(OMPATH) SHOW_ANSWERS <- TRUE
Let us load the data, show the matrix size, and the count range.
data("HNSC_htseq_raw_counts_AEvsG1vsG3") dge <- HNSC_htseq_raw_counts_AEvsG1vsG3 print(dim(dge)) quantile(exprs(dge),probs=seq(0,1,.1))
We now transform the counts into transcripts per million (TPM) according to the formula:
$$ TPM_{gene_i} = \frac{\frac{Counts_{gene_i}}{Length_{gene_i}/1000}}{\sum_{i=1}^G\frac{Counts_{gene_i}}{Length_{gene_i}/1000}} \times 1E6 $$ Where $G$ is the total number of genes.
Let us begin by uploading the information about gene lengths.
## retrieve transcript length information len <- read.csv(file.path(Sys.getenv("OMPATH"),"data/GC_lengths_GRCh38.87.csv"),row.names=1) ## match gene length info with count matrix cmn <- intersect(featureNames(dge),rownames(len)) len <- len[cmn,] dge <- dge[cmn,] if ( any(featureNames(dge)!=rownames(len)) ) stop( "featureNames(dge)!=rownames(len)" )
With this information at hand, we can perform the TPM transformation based on the above formula.
## Normalize expression by TPM dgeTPM <- dge ## per 1000 len$KB <- len$Length/1000 ## reads per transcript length (in KB) expPKB <- apply( exprs(dgeTPM), 2, function(x){ x / len$KB } ) ## Divide by total exprs(dgeTPM) <- apply( expPKB, 2, function(x) { x / sum(x) * 1E6} ) ## show
Let us show few expression matrix entries before TPM transformation ...
exprs(dge)[order(rowSums(exprs(dgeTPM)),decreasing=TRUE),][1:5,1:5]
.. and after.
exprs(dgeTPM)[order(rowSums(exprs(dgeTPM)),decreasing=TRUE),][1:5,1:5]
Are all "library sizes" the same after TPM transformation?
lSize <- colSums(exprs(dgeTPM)) ## are all sizes the same? table(lSize)
SS <- data.frame( MED=matrixStats::rowMedians(exprs(dgeTPM)), MAD=matrixStats::rowMads(exprs(dgeTPM))) ggplot(SS,aes(MED,MAD)) + geom_point() + geom_smooth() + labs(title="linear axes") suppressWarnings( ggplot(SS,aes(MED,MAD)) + geom_point() + geom_smooth() + scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2')) + labs(title="log2 axes")
dgeTPMlog2 <- log2(exprs(dgeTPM)+1) SS2 <- data.frame( MED=matrixStats::rowMedians(dgeTPMlog2), MAD=matrixStats::rowMads(dgeTPMlog2)) ggplot(SS2,aes(MED,MAD)) + geom_point() + geom_smooth()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.