##
# minReadsPerGene=3 <- is needed so that it works for iSeq runs
#' compute and plot the GC content and gene-length associated bias in RNA-seq data
#'
#' @param dsFile
#' @param dsName
#' @param param
#' @param qcSummaryDir
#' @param refBuildMap
#' @param minReadsPerSample
#' @param maxReadsPerSample
#' @param minReadsPerGene
#' @param minPresentFraction
#'
#' @return
#' @export
#'
#' @examples
#' ## needs the following modules loaded module add Tools/samtools QC/fastp Aligner/kallisto
#' options(error=recover)
#' require(ezRun)
#' dsFile = "/srv/gstore/projects/p3082/ISeq_20190425_iSeq35_o5404_DataDelivery/dataset.tsv"
#' ezComputeBias(dsFile)
ezComputeBias = function(dsFile, dsName=NULL, param=NULL, qcSummaryDir="/srv/GT/analysis/p2220/RNA-seq-bias-results", refBuildMap=getRefBuildMap(), minReadsPerSample=30000,
maxReadsPerSample=1e6,
minReadsPerGene=3, minPresentFraction=0.2, toMail='', tag = ''){
inputMeta = ezRead.table(file=dsFile)
inputMeta = inputMeta[ inputMeta$"Read Count" > minReadsPerSample, ]
inputMeta$Name = rownames(inputMeta)
outMeta = inputMeta
outMeta$'Count [File]' = paste0(rownames(outMeta), "-count.txt")
outMeta$'bootstrappedCount [File]' = paste0(rownames(outMeta), "-bootstrap.h5")
outMeta$'runInfo [File]' = paste0(rownames(outMeta), "-runInfo.json")
outMeta$'BAM [File]' = paste0(rownames(outMeta), "-tr.bam")
outMeta$'CovStat [File]' = paste0(rownames(outMeta), "-genebody_coverage.rds")
if (is.null(param)){
param = list()
param[['cores']] = '8'
param[['ram']] = '30'
param[['scratch']] = '100'
param[['node']] = ''
param[['process_mode']] = 'SAMPLE'
param[['paired']] = 'false'
param[['strandMode']] = 'both'
param[['refFeatureFile']] = 'genes.gtf'
param[['bootstrap-samples']] = '10'
param[['seed']] = '42'
param[['fragment-length']] = '150'
param[['sd']] = '70'
param[['bias']] = 'true'
param[['pseudobam']] = 'true'
param[['transcriptFasta']] = ''
param[['transcriptTypes']] = 'protein_coding'
param[['trimAdapter']] = 'true'
param[['cut_front']] = 'false'
param[['trim_front1']] = '0'
param[['trim_tail1']] = '0'
param[['cut_front']] = 'false'
param[['cut_front_window_size']] = '4'
param[['cut_front_mean_quality']] = '20'
param[['cut_tail']] = 'true'
param[['cut_tail_window_size']] = '4'
param[['cut_tail_mean_quality']] = '15'
param[['cut_right']] = 'false'
param[['cut_right_window_size']] = '4'
param[['cut_right_mean_quality']] = '20'
param[['average_qual']] = '20'
param[['max_len1']] = '0'
param[['max_len2']] = '0'
param[['poly_x_min_len']] = '10'
param[['length_required']] = '20'
param[['mail']] = toMail
param[['dataRoot']] = '/srv/gstore/projects'
param[['backgroundExpression']] = minReadsPerGene ## low numbers needed for the iSeq runs with
param$sigThresh = minReadsPerGene
param[['transcriptTypes']] = 'protein_coding'
param$normMethod = "logMean"
param$expressionName = "est_counts"
param$nReads = maxReadsPerSample
}
if(!ezIsSpecified(param$nReads)){
param$nReads = maxReadsPerSample
}
if(param[['paired']]){
param[['fragment-length']] = 0
param[['sd']] = 0
}
paramList = param
if(is.na(paramList$refBuild)){
paramList$refBuild = refBuildMap[inputMeta$Species[1]]
if (is.na(paramList$refBuild)){
stop(paste("refBuild not defined for species:", refBuildMap[inputMeta$Species[1]]))
}
}
if (is.null(dsName)){
## assumes the dsFile has the format: /srv/gstore/projects/<project>/<read folder>
parentDirs = strsplit(dirname(dsFile), "/")[[1]]
dsName = paste0(parentDirs[5], "_", parentDirs[6])
plates = paste(unique(inputMeta$`PlateName [Characteristic]`), collapse="_")
if (!is.null(plates) && plates != ""){
dsName = paste0(plates, "_", dsName)
}
}
message(dsName)
resultDir = file.path(qcSummaryDir, dsName)
setwdNew(resultDir)
## align and count with kallisto + coverage profiles
for (i in 1:nrow(inputMeta)){
message(i)
covStatFile = file.path(resultDir, outMeta[i, "CovStat [File]"])
countFile = file.path(resultDir, outMeta$`Count [File]`[i])
if (file.exists(covStatFile) && file.exists(countFile)){
next
}
scratchDir = paste0("/scratch/", dsName, "-", rownames(inputMeta)[i], "-", ezTime())
setwdNew(scratchDir)
EzAppKallisto$new()$run(input=inputMeta[i, ], output=outMeta[i, ], param=paramList)
ezSystem(paste("mv", outMeta$`Count [File]`[i], resultDir))
ga = ezReadGappedAlignments(outMeta[i, "BAM [File]"])
tcList = coverage(ga)
transcriptLengthTotal <- elementNROWS(tcList)
sampledTranscriptCov <- ezMclapply(tcList,
function(x){as.integer(x[round(seq(1, length(x), length.out=101))])},
mc.preschedule=TRUE, mc.cores=min(as.integer(param$cores), 4))
sampledTranscriptCov <- do.call(cbind, sampledTranscriptCov)
trUse = colSums(sampledTranscriptCov) > 0
sampledTranscriptCov = sampledTranscriptCov[ , trUse, drop=FALSE]
trLength = transcriptLengthTotal[trUse]
lengthClasses = ezCut(trLength, breaks=c(399, 1000, 1500, 4000),
labels=c("less than 400nt", "400 to 999nt",
"1000 to 1500nt", "1501 to 4000", "above 4000nt"))
genebody_coverage = list()
for (lc in levels(lengthClasses)){
isInLc = lengthClasses == lc
if (sum(isInLc) > 50){
ltc = sampledTranscriptCov[ , isInLc, drop=FALSE]
avgCov = colMeans(ltc)
relativeCov = ezScaleColumns(ltc, 1/colSums(ltc)) ## normalize so that every transcripts adds the same weight
avgCovQuant = unique(quantile(avgCov, c(0.3, 0.95)))
if (length(avgCovQuant) == 2){
covClasses = ezCut(avgCov, breaks=avgCovQuant,
labels=c("low expressed", "medium expressed",
"high expressed"))
genebody_coverage[[lc]] = list()
for (cc in levels(covClasses)){
genebody_coverage[[lc]][[cc]] = rowMeans(relativeCov[ , covClasses == cc, drop=FALSE ])
}
}
}
}
saveRDS(genebody_coverage, covStatFile)
setwd(resultDir)
unlink(scratchDir, recursive = TRUE)
}
## compute the stats from the
param = ezParam(paramList)
setwd(qcSummaryDir)
countDs = outMeta
countDs$featureLevel = "isoform"
countDs = countDs[file.exists(file.path(resultDir, countDs$`Count [File]`)), ]
input = EzDataset(meta=countDs, dataRoot=resultDir)
rawData = loadCountDataset(input, param)
rawData = subset(rawData, select=colSums(assays(rawData)$counts >= minReadsPerGene) >= 100)
counts = assays(rawData)$counts
nGenes = colSums(counts >= param$sigThresh)
meta = ezFrame(colData(rawData))
assays(rawData)$signal = ezNorm(assays(rawData)$counts,
presentFlag=assays(rawData)$counts >= 3,
method=param$normMethod)
seqAnno <- data.frame(rowData(rawData), row.names=rownames(rawData),
check.names = FALSE, stringsAsFactors=FALSE)
x = assays(rawData)$signal
stopifnot(!is.na(x))
isPresent = rowMeans(x > 1) >= minPresentFraction
logSig = log2(x + param$backgroundExpression)
logRatio = logSig - rowMeans(logSig)
isHighGc = seqAnno$gc > 0.6 & isPresent
isLowGc = seqAnno$gc < 0.4 & isPresent
gcEffect = apply(logRatio, 2, function(x){mean(x[isHighGc], na.rm =TRUE) - mean(x[isLowGc] ,na.rm = TRUE)})
gcTypes = ezFrame(lowGc=isLowGc, highGc=isHighGc)
widthTypes = data.frame("width < 800nt"=as.numeric(rowData(rawData)$featWidth) < 800,
"width > 4000nt"=as.numeric(rowData(rawData)$featWidth) > 4000,
check.names=FALSE)
widthEffect = apply(logRatio, 2, function(x){mean(x[widthTypes$`width > 4000nt`], na.rm =TRUE) - mean(x[widthTypes$`width < 800nt`] ,na.rm = TRUE)})
if (!is.null(meta$`PlatePosition [Characteristic]`)){
ptLabels = sub(".*_", "", meta$`PlatePosition [Characteristic]`)
} else {
ptLabels = 1:nrow(meta)
}
countDs$"Read Count"[countDs$"Read Count" > param$maxReadsPerSample] = param$maxReadsPerSample
widthEffect = shrinkToRange(widthEffect, c(-1, 1))
gcEffect = shrinkToRange(gcEffect, c(-1, 1))
## plot the 3'-enrichment
biasScores = sapply(countDs$`CovStat [File]`, function(rdsFile){
gbCov = readRDS(file.path(resultDir, rdsFile))
midScore = sum(gbCov$`1000 to 1500nt`$`medium expressed`[80:101])
longScore = sum(gbCov$`1501 to 4000`$`medium expressed`[80:101])
c(midScore, longScore)
})
rownames(biasScores) = c("medium genes", "long genes")
colnames(biasScores) = rownames(countDs)
biasScores = t(biasScores)
biasScores = shrinkToRange(biasScores, c(0.1, 0.5))
pdf(file=paste0(dsName, "-bias.pdf"), width=16, height=8)
par(mfrow=c(2,4))
par(pty="s")
smoothScatter(widthEffect, gcEffect, xlim=c(-1, 1), ylim=c(-1, 1), main=paste(dsName, tag), nrpoints = 0, nbin=40, bandwidth = 0.05,
cex.main=0.8, colramp=colorRampPalette(c("white", blues9[1:6])))
text(widthEffect, gcEffect, labels = ptLabels, cex=0.8)
dsty = density(gcEffect, bw=0.05, from=-1, to=1)
plot(dsty$y, dsty$x, type="l", xlab="density", ylab="", main=countDs$LibraryPrepKit[1], cex.main=0.8)
plateRow = substr(ptLabels, start = 1, stop=1)
plateCol = substr(ptLabels, start = 2, stop=3)
mat = ezMatrix(NA, rows=toupper(letters[1:8]), cols=as.character(1:12))
for (i in which(plateRow %in% rownames(mat) & plateCol %in% colnames(mat))) {
mat[plateRow[i], plateCol[i]] = gcEffect[i]
}
lim = c(-1, 1)
image(1:ncol(mat), 1:nrow(mat), t(shrinkToRange(mat, lim)), col=getBlueRedScale(), breaks = seq(from = lim[1], to = lim[2], length.out = 257), axes=FALSE,
xlim=c(0.5, ncol(mat)+0.5), ylim=c(0.5, nrow(mat)+0.5), xlab="", ylab="")
axis(1, 1:12, labels = colnames(mat))
axis(2, 1:8, labels = rownames(mat), las=2)
logWidthBreaks = c(9.5, 10.5, 11.5)
gcBreaks = c(0.42, 0.48, 0.53, 0.57, 0.62)
#gcBreaks = c(0.4, 0.43, 0.46, 0.49, 0.53, 0.57, 0.61, 0.64, 0.67)
ctsCorr = ezCorrectBias(counts, seqAnno$gc, seqAnno$featWidth, logWidthBreaks = logWidthBreaks, gcBreaks = gcBreaks, widthOffset=100)
lwClasses = ezCut(ctsCorr$logWidthShrinked, breaks = logWidthBreaks)
gcClasses = ezCut(ctsCorr$gcShrinked, gcBreaks)
mat = ctsCorr$binScoresFilled
lim = c(-1.5, 1.5)
image(1:nrow(mat), 1:ncol(mat), shrinkToRange(mat, lim), col=getBlueRedScale(), breaks = seq(from = lim[1], to = lim[2], length.out = 257), axes=FALSE,
xlim=c(0.5, nrow(mat)+0.5), ylim=c(0.5, ncol(mat)+0.5), ylab="GC", xlab="width")
## second row of plots
plot(1:length(gcEffect), gcEffect, type="n", ylim=c(-1, 1))
text(1:length(gcEffect), gcEffect, labels = ptLabels, cex=0.8)
plot(1:length(gcEffect), gcEffect, type="n", ylim=c(-1, 1), xlim=c(1, length(gcEffect)*1.1))
text(1:length(gcEffect), gcEffect, labels = names(gcEffect), cex=0.8, pos = 4)
smoothScatter(biasScores[ , "medium genes"], biasScores[ , "long genes"], xlim=c(0.1, 0.5), ylim=c(0.1, 0.5),
xlab="medium genes", ylab="long genes", main=dsName, nrpoints = 0, nbin=40, bandwidth = 0.05,
cex.main=0.8, colramp=colorRampPalette(c("white", blues9[1:6])))
abline(h=0.2, v=0.2, col="gray")
text(biasScores[ , "medium genes"], biasScores[ , "long genes"], labels = ptLabels, cex=0.8)
readCount = countDs[names(nGenes), "Read Count"] * exp(runif(length(nGenes), min=-0.1, max=0.1))
plot(readCount,
nGenes, log="xy",
xlab="nReads", ylab="nGenes", main=dsName, type="n")
text(readCount,
nGenes, labels=ptLabels, cex=0.8)
dev.off()
}
getRefBuildMap = function(){
refBuildMap = c(
"Mus musculus (house mouse)" = "Mus_musculus/GENCODE/GRCm39/Annotation/Release_M26-2021-04-20",
"Homo sapiens (human)"="Homo_sapiens/GENCODE/GRCh38.p13/Annotation/Release_37-2021-05-04",
"human"="Homo_sapiens/GENCODE/GRCh38.p13/Annotation/Release_37-2021-05-04",
"homo"="Homo_sapiens/GENCODE/GRCh38.p13/Annotation/Release_37-2021-05-04",
"Equus caballus (horse)"="Equus_caballus/Ensembl/EquCab2/Annotation/Release_89-2017-06-07",
"Canis familiaris"="Canis_familiaris/Ensembl/CanFam3.1/Annotation/Release_104-2021-06-01",
"Oryctolagus cuniculus (rabbit)"="Oryctolagus_cuniculus/Ensembl/OryCun2.0/Annotation/Release_89-2017-06-08",
"Escherichia coli K12"="Escherichia_coli/Ensembl/K12_MG1655_ASM584v2/Annotation/Release_2019-01-31",
"E Coli"="Escherichia_coli/Ensembl/K12_MG1655_ASM584v2/Annotation/Release_2019-01-31",
"Fusarium oxysporum"="Fusarium_oxysporum/Misc/Fo5176/Annotation/Release_5176-2019-03-18",
"Canis lupus familiaris"="Canis_familiaris/Ensembl/CanFam3.1/Annotation/Release_104-2021-06-01",
"Cricetulus griseus (Chinese hamster)"="Cricetulus_griseus/CHO/CHO-K1/Annotation/Version-2015-12-11/Genes",
"Felis silvestris catus"="Felis_catus/Ensembl/Felis_catus_6.2/Annotation/Release_89-2017-06-07",
"Candida glabrata"="Candida_albicans/CGD/SC5314_Assembly22/Annotation/Release_2018-11-13",
"Toxoplasma gondii"="Toxoplasma_gondiiME49/ToxoDB/TgondiiME49_R33/Annotation/Release_33-2017-07-28",
"Sus Linnaeus (Pig)"="Sus_scrofa/Ensembl/Sscrofa11.1/Annotation/Release_90-2017-11-17",
"Canis familiaris"="Canis_familiaris/Ensembl/CanFam3.1/Annotation/Release_89-2017-06-07",
"Sus scrofa"="Sus_scrofa/Ensembl/Sscrofa11.1/Annotation/Release_90-2017-11-17",
"Bos taurus"="Bos_taurus/Ensembl/UMD_v3.1/Annotation/Release_92-2018-05-30",
"Rattus norvegicus"="Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Release_98-2019-11-11",
"Gallus gallus (chicken)"="Gallus_gallus/Ensembl/Gallus_gallus-5.0/Annotation/Release_89-2017-06-08",
"Danio rerio (zebrafish)"="Danio_rerio/Ensembl/GRCz11/Annotation/Release_96-2019-04-22",
"Drosophila melanogaster (fruit fly)"="Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Release_89-2017-06-07",
"Caenorhabditis elegans (nematode)"="Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Release_90-2017-08-31",
"Saccharomyces cerevisiae (bakers yeast)"="Saccharomyces_cerevisiae/Ensembl/R64/Annotation/Version-2015-08-04",
"Oryza sativa (rice)"="Oryza_sativa_japonica/Ensembl/IRGSP_1.0/Annotation/Release_42-2019-01-28",
"Arabidopsis thaliana (thale cress)"="Arabidopsis_thaliana/TAIR/TAIR10/Annotation/Release_01-2018-10-09",
"Zea mays"="Sus_scrofa/Ensembl/Sscrofa11.1/Annotation/Release_90-2017-11-17",
"Triticum aestivum"="Triticum_aestivum/URGI/IWGSCv3/Annotation/Version-2016-05-09",
"Escherichia coli"="Escherichia_coli/Ensembl/K12_MG1655_ASM584v2/Annotation/Release_2019-01-31"
)
return(refBuildMap)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.