#' @title Fetch sequences from ncRNAmapping
#'
#' @description This method returns a data frame containing the sequence and UID from mapped ncRNAs
#' @param ncRNAMappingDF candidates of interest
#' @param genomeIndexFilePath_ncRNA path to the fasta file where the STAR index is based on
#' @return data.frame containing the UID and sequence
#' @docType methods
#' @export
fetchNcRNAMappingSeq = function( ncRNAMappingDF, genomeIndexFilePath_ncRNA ){
require(Biostrings)
if( sum( !c("UID","mapStart","mapEnd") %in% colnames(ncRNAMappingDF) ) > 0 ){
stop("Please provide: UID, mapStart, mapEnd column in ncRNAMappingDF!")
}
ncRNASeqs= readDNAStringSet(filepath = file.path( genomeIndexFilePath_ncRNA,list.files(path=genomeIndexFilePath_ncRNA, pattern = ".fa$")))
names(ncRNASeqs) = sub(" .*","", names(ncRNASeqs))
ncRNAMappingDF$ID = sub("_minus","",ncRNAMappingDF$UID)
ncRNAMappingDF$ID = sub("_plus","",ncRNAMappingDF$ID)
ncRNAMappingSeqDFL = vector("list", dim(ncRNAMappingDF)[1])
for( i in 1:dim(ncRNAMappingDF)[1] ){
x = ncRNAMappingDF[i,]
currID = x$ID
currSeq = ncRNASeqs[which(currID == names(ncRNASeqs))]
currSubstr = subseq(BStringSet(currSeq), start=as.integer(x$mapStart), end=ifelse( as.integer(x$mapEnd) > width(currSeq), width(currSeq), as.integer(x$mapEnd) ) )
ncRNAMappingSeqDFL[[i]] = data.frame("UID"=x$UID, "Sequence"=as.character(currSubstr), "Sense"=TRUE, "ID"=x$ID)
}
ncRNAMappingSeqDF = do.call(rbind, ncRNAMappingSeqDFL)
return(ncRNAMappingSeqDF)
}
#' @title Calculate Normalized Read Count Coverage
#'
#' @description This method returns a data frame containing normalized coverage values for each position. It requires the output of calcReadCountCoveragePerCondition or generateCoverageDF_contigs as data.frame or list
#' @param coverageL output of calcReadCountCoveragePerCondition/generateCoverageDF_contigs
#' @param deseqds DESeq object of the current data
#' @return data.frame containing the normalized read counts for each position in order to create a histogram of read counts for each ncRNA
#' @docType methods
#' @export
calcNormReadCountCoverage = function( coverageL, deseqds ){
#Converting list to data.frame
coverageDF = do.call(rbind, coverageL)
if( sum(colnames(coverageDF) == "Sample") == 0 ){
stop(paste0("Column 'Sample' must be present in coverageDF!"))
}
coverageDFL = split(coverageDF, factor( coverageDF$Sample, levels=unique(coverageDF$Sample), ordered=FALSE ) )
if( sum(names(coverageDFL) == dds$sampleName) != length(dds$sampleName) ){
stop("SampleNames must be the same in the DESeq object and the coverageDF")
}
coverageDF = do.call(rbind,mapply(function(x,y){
x$ReadCountNorm = x$ReadCount / y
return(x)
}, coverageDFL, sizeFactors(deseqds), SIMPLIFY = FALSE) )
return(coverageDF)
}
#' @title Splits coverage list per condition
#'
#' @description Splits the list obtained by calcReadCountCoveragePerCondition/generateCoverageDF_contigs by condition and calculates the mean read coverage per condition
#' @param coverageL output of calcReadCountCoveragePerCondition/generateCoverageDF_contigs
#' @param deseqds DESeq object of the current data
#' @param samplesInfo describing the experiment (produced by configuration template)
#' @return data.frame containing the normalized read counts for each position in order to create a histogram of read counts for each ncRNA
#' @docType methods
#' @export
splitReadCountCoveragePerCondition = function( coverageL, deseqds, samplesInfo ){
coverageDF = RNASeqUtility::calcNormReadCountCoverage(coverageL, deseqds)
#Mapping condition to sample groups
coverageDF = do.call(rbind, lapply( split(coverageDF,coverageDF$Sample), function(x){
x$condition = samplesInfo[ which(samplesInfo$sampleName == x$Sample[1]),]$condition
return(x)
}))
#Splitting the data frame by condition and calculating the mean coverage over samples
coverageDFL = split( coverageDF, coverageDF$condition)
coverageDFL = lapply(coverageDFL, function(x){
ncRNAClassL = split(x, x$ID)
ncRNAClassL_mod = lapply( ncRNAClassL , function(y){
coordL = split(y, y$Idx)
coordL_modL = lapply( coordL, function(z){
z$ReadMeans = mean( z$ReadCount, na.omit=TRUE )
z$ReadMeansNorm = mean( z$ReadCountNorm, na.omit=TRUE )
return(z)
})
coordL_mod = do.call(rbind,coordL_modL)
return(coordL_mod)
})
return(do.call(rbind, ncRNAClassL_mod))
})
return( do.call(rbind, coverageDFL) )
}
#' @title Generate coverage data frame for specific candidates
#'
#' @description This method returns a data frame containing coverage values for each position. It is intended for genome mapping results. For ncRNA mappings please see: generateCoverageDF_ncRNAmappings
#' @param contigsGR GRanges file containing the coordinates of the contigs and their contigID (specified as contigID in the metadata)
#' @param contigResTable Selected candidates from the result of differential expression analysis from DESeq2 combined with the minimum annotation dataframe
#' @param bedgraphMapping_plusL the cmdResult object of the bedgraph plus mapping (genomeCovNcRNAMapping_plusL)
#' @param bedgraphMapping_minusL the cmdResult object of the bedgraph minus mapping (genomeCovNcRNAMapping_minusL)
#' @param sampleNames names of the samples from samplesInfo table
#' @param annotationGR An otpional annotation GRanges object, when provided the annotation will be used as reference interval
#' @return data.frame containing the read counts for each position in order to create a histogram of read counts for each ncRNA
#' @docType methods
#' @export
generateCoverageDF_contigs = function(contigsGR, contigResTable, bedgraphMapping_plusL, bedgraphMapping_minusL, sampleNames, annotationGR){
require(rtracklayer)
require(GenomicRanges)
colsRequired = c("ID","UID","biotype","Name","log2FoldChange","pvalue","padj","strandness")
if( sum( colsRequired %in% colnames(resAnnot) ) != length(colsRequired) ){
stop( paste0( "Columns: ",paste0(colsRequired,collapse=", ")," have to be provided!"))
}
#Combining to GRanges object
roisContigs = contigsGR[ contigsGR$contigID %in% contigResTable$UID ]
elementMetadata(roisContigs) = merge( elementMetadata( roisContigs ), contigResTable, by.x="contigID", by.y="UID" ,sort=FALSE)
roisContigsL = GenomicRanges::split(roisContigs, roisContigs$ID)
bedgraphncRNAMapping_plusL = lapply( bedgraphMapping_plusL, function(x){
rtracklayer::import(file.path(getOutFilePath(getCLIApplication(x)), getOutResultName(getOutResultReference(x))),
asRangedData=FALSE, format="bedGraph")
} )
bedgraphncRNAMapping_minusL = lapply( bedgraphMapping_minusL, function(x){
rtracklayer::import(file.path(getOutFilePath(getCLIApplication(x)), getOutResultName(getOutResultReference(x))),
asRangedData=FALSE, format="bedGraph")
} )
if(missing(annotationGR)){
roiL = vector("list", length(bedgraphncRNAMapping_plusL))
for(i in c(1:length(bedgraphncRNAMapping_plusL))){
bgminus = bedgraphncRNAMapping_minusL[[i]]
bgplus = bedgraphncRNAMapping_plusL[[i]]
roiL_contig = lapply( roisContigsL , function(x){
if( as.character(strand(x)) == "+" ){
rnaOI = GenomicRanges::subsetByOverlaps(bgplus,x )
}else{
rnaOI = GenomicRanges::subsetByOverlaps(bgminus,x )
}
if(length(rnaOI) != 0){
#Filling the gaps in the coverage file by creating a one base based annotation with zero score!
rnaOI_fill = GRanges( as.character(seqnames(rnaOI))[1],
IRanges(
start=start(rnaOI)[1]:(end(rnaOI)[length(rnaOI)]-1),
end=(start(rnaOI)[1]+1):end(rnaOI)[length(rnaOI)] ),
score=rep(0, end(rnaOI)[length(rnaOI)]-start(rnaOI)[1] ) )
fillGR = GenomicRanges::setdiff( rnaOI_fill, rnaOI )
if( length(fillGR) > 0 ){
fillGR$score = 0
rnaOI = IRanges::append( rnaOI, fillGR)
}
}
rnaOICov = rep(rnaOI$score, width(rnaOI))#
if( length(rnaOICov) == 0 ){
rnaOICovDF = NULL
} else{
rnaOICovDF = data.frame( "Idx"=1:length(rnaOICov),
"ReadCount"=rnaOICov,
"group"=x$biotype,
"Name"=x$Name,
"ID"=x$ID,
"regulation"=ifelse( x$log2FoldChange > 0, "up", "down"),
"log2FoldChange"=x$log2FoldChange,
"pvalue"=x$pvalue,
"padj"=x$padj,
"strandness"=x$strandness
)
}
return(rnaOICovDF)
} )
roi_ncOtherDF = do.call(rbind, roiL_contig)
roi_ncOtherDF$Sample = ifelse(missing(sampleNames),paste0("Sample",i), sampleNames[i])
roiL[[i]] = roi_ncOtherDF
}
return(roiL)
} else{
currMap = GenomicRanges::findOverlaps(annotationGR, roisContigs)
currMap = currMap[ !queryHits(currMap) %in% queryHits(currMap)[which(duplicated( queryHits(currMap)))] ]
roisContigs = roisContigs[subjectHits(currMap)]
annotationGR = annotationGR[queryHits(currMap)]
roiL = vector("list", length(bedgraphncRNAMapping_plusL))
for(i in c(1:length(bedgraphncRNAMapping_plusL))){
bgminus = bedgraphncRNAMapping_minusL[[i]]
bgplus = bedgraphncRNAMapping_plusL[[i]]
roicL = vector("list", length(roisContigs))
for(k in 1:length(roisContigs)){
contig = roisContigs[k]
refAnnot = annotationGR[k]
if( as.character(strand(contig)) == "+" ){
rnaOI = GenomicRanges::subsetByOverlaps(bgplus,refAnnot )
}else{
rnaOI = GenomicRanges::subsetByOverlaps(bgminus,refAnnot )
}
if(length(rnaOI) == 0){
#No coverage information available in this library!
return(data.frame())
}
if( start(rnaOI)[1] > start(refAnnot) ){
toAdd = (length( start(refAnnot):start(rnaOI)[1] ))
beforeToAdd = GRanges( seqnames(rnaOI)[1], IRanges( (start(rnaOI)[1]-toAdd+1), (start(rnaOI)[1]-1) ), score=0 )
rnaOI = IRanges::append(rnaOI, beforeToAdd)
} else{
start(rnaOI)[1] = start(refAnnot)
}
if(end(refAnnot) > end(rnaOI)[length(rnaOI)]){
toAdd = length( end(rnaOI)[length(rnaOI)]:end(refAnnot) )
afterToAdd = GRanges( seqnames(rnaOI)[1], IRanges( (end(rnaOI)[length(rnaOI)]+1),
(end(rnaOI)[length(rnaOI)]+toAdd-1 ) ), score=0 )
rnaOI = IRanges::append(rnaOI, afterToAdd)
} else{
end(rnaOI)[length(end(rnaOI))] = end(refAnnot)
}
#Filling the gaps in the coverage file by creating a one base based annotation with zero score!
refAnnot_fill = GRanges( as.character(seqnames(refAnnot))[1],
IRanges(
start=start(refAnnot):(end(refAnnot)-1),
end=(start(refAnnot)+1):end(refAnnot) ),
score=rep(0,width(refAnnot)-1))
fillGR = GenomicRanges::setdiff( refAnnot_fill, rnaOI )
if( length(fillGR) > 0 ){
fillGR$score = 0
rnaOI = IRanges::append( rnaOI, fillGR)
}
rnaOI = sort(rnaOI)
rnaOICov = rep(rnaOI$score, width(rnaOI))#
rnaOICovDF = data.frame( "Idx"=1:length(rnaOICov),
"ReadCount"=rnaOICov,
"group"=contig$biotype,
"Name"=contig$Name,
"ID"=contig$ID,
"regulation"=ifelse( contig$log2FoldChange > 0, "up", "down"),
"log2FoldChange"=contig$log2FoldChange,
"pvalue"=contig$pvalue,
"padj"=contig$padj,
"strandness"=contig$strandness
)
roicL[[k]] = rnaOICovDF
}
roiDF = do.call(rbind, roicL)
roiDF$Sample = ifelse(missing(sampleNames),paste0("Sample",i), sampleNames[i])
roiL[[i]] = roiDF
}
return(roiL)
}
}
#' @title Generate coverage data frame for specific candidates
#'
#' @description This method returns a data frame containing coverage values for each position. It is intended for ncRNAmapping results. For contigs please see: generateCoverageDF_contigs
#' @param ncRNAMappingsDF Selected candidates from the result of differential expression analysis from DESeq2 combined with the minimum annotation dataframe
#' @param bedgraphMapping_plusL the cmdResult object of the bedgraph plus mapping (genomeCovNcRNAMapping_plusL)
#' @param bedgraphMapping_minusL the cmdResult object of the bedgraph minus mapping (genomeCovNcRNAMapping_minusL)
#' @param sampleNames names of the samples from samplesInfo table
#' @return data.frame containing the read counts for each position in order to create a histogram of read counts for each ncRNA
#' @docType methods
#' @export
generateCoverageDF_ncRNAmappings = function( ncRNAMappingsDF, bedgraphMapping_plusL, bedgraphMapping_minusL, sampleNames ){
require(rtracklayer)
require(GenomicRanges)
colsRequired = c("ID","UID","biotype","Name","log2FoldChange","pvalue","padj","strandness")
if( sum( colsRequired %in% colnames(resAnnot) ) != length(colsRequired) ){
stop( paste0( "Columns: ",paste0(colsRequired,collapse=", ")," have to be provided!"))
}
ncRNAMappingsDF$ID = sub("_plus.*$","",ncRNAMappingsDF$UID)
ncRNAMappingsDF$ID = sub("_minus.*$","",ncRNAMappingsDF$ID)
bedgraphncRNAMapping_plusL = lapply( bedgraphMapping_plusL, function(x){
rtracklayer::import(file.path(getOutFilePath(getCLIApplication(x)), getOutResultName(getOutResultReference(x))),
asRangedData=FALSE, format="bedGraph")
} )
bedgraphncRNAMapping_minusL = lapply( bedgraphMapping_minusL, function(x){
rtracklayer::import(file.path(getOutFilePath(getCLIApplication(x)), getOutResultName(getOutResultReference(x))),
asRangedData=FALSE, format="bedGraph")
} )
ncRNAMappingsDFL = split(ncRNAMappingsDF, ncRNAMappingsDF$UID)
roiL = vector("list", length(bedgraphncRNAMapping_plusL))
for(i in c(1:length(bedgraphncRNAMapping_plusL))){
bgminus = bedgraphncRNAMapping_minusL[[i]]
bgplus = bedgraphncRNAMapping_plusL[[i]]
roi_ncOtherL = lapply( ncRNAMappingsDFL , function(x){
#First decide if plus or minus according to UID
if( length(grep("minus",x$UID)) == 0 ){
currGR = bgplus[seqnames( bgplus ) == x$ID]
}else{
currGR = bgminus[seqnames( bgminus ) == x$ID]
}
if(length(currGR) == 0) {
#Too less reads, in the mapping file are no reads available
return(data.frame())
}
#Filling the gaps in the coverage file by creating a one base based annotation with zero score!
rnaOI_fill = GRanges( as.character(seqnames(currGR))[1],
IRanges(
start=start(currGR)[1]:(end(currGR)[length(currGR)]-1),
end=(start(currGR)[1]+1):end(currGR)[length(currGR)] ),
score=rep(0, end(currGR)[length(currGR)]-start(currGR)[1] ) )
fillGR = GenomicRanges::setdiff( rnaOI_fill, currGR )
if( length(fillGR) > 0 ){
fillGR$score = 0
currGR = IRanges::append( currGR, fillGR)
}
rnaOICov = rep(currGR$score, width(currGR))#
rnaOICovDF = data.frame( "Idx"=1:length(rnaOICov),
"ReadCount"=rnaOICov,
"group"=x$biotype,
"Name"=x$Name,
"ID"=x$ID,
"regulation"=ifelse( x$log2FoldChange > 0, "up", "down"),
"log2FoldChange"=x$log2FoldChange,
"pvalue"=x$pvalue,
"padj"=x$padj,
"strandness"=x$strandness )
return(rnaOICovDF)
} )
roi_ncOtherDF = do.call(rbind, roi_ncOtherL)
roi_ncOtherDF$Sample = ifelse(missing(sampleNames),paste0("Sample",i), sampleNames[i])
roiL[[i]] = roi_ncOtherDF
}
return(roiL)
}
#' @title Generate Count table
#'
#' @description This method combines the read counts after counting them with mergeBed from bam files
#' @param bamToBedAndMergencRNAL bed files after mapping
#' @return read count table
#' @docType methods
#' @export
generateCountFromMappingDF = function( bamToBedAndMergencRNAL, samplesInfo ){
require(rtracklayer)
require(GenomicRanges)
ncRNAReadCountL = lapply( bamToBedAndMergencRNAL, function(x){
x = x[[2]]
dir = getOutFilePath(getCLIApplication(x))
return( rtracklayer::import( file.path( dir,getOutResultName(getOutResultReference(x))), format="BED",asRangedData=FALSE ) )
})
#Create a reference dataframe containing all entries (Ensembl ids plus strand as ID)
overallTable = unique(do.call( rbind, lapply( ncRNAReadCountL, function(x){
xdf = GenomicRanges::as.data.frame(x)
colnames(xdf) = c("transcript_id", "mapStart","mapEnd","mapWidth","mapStrand","readCount","mapQ")
xdf$UID = paste0(xdf$transcript_id, ifelse(xdf$mapStrand == "-", "_minus", "_plus") )
return(xdf[,c("transcript_id","UID")])
}) ) )
rownames(overallTable) = 1:dim(overallTable)[1]
#Now merge with counted reads
readsCountDF = do.call( cbind, lapply( ncRNAReadCountL, function(x){
xdf = GenomicRanges::as.data.frame(x)
colnames(xdf) = c("transcript_id", "mapStart","mapEnd","mapWidth","mapStrand","readCount","mapQ")
xdf$UID = paste0(xdf$transcript_id, ifelse(xdf$mapStrand == "-", "_minus", "_plus") )
#Merge The duplicated UIDs
xdf = do.call(rbind, lapply( split(xdf, xdf$UID), function(x){
xnew = x[1,]
xsize = dim(x)[1]
if( xsize != 1 ){
xnew$mapStart = min(as.numeric(x$mapStart))
xnew$mapEnd = min(as.numeric(x$mapEnd))
xnew$readCount = sum(as.numeric(x$readCount),na.rm = TRUE)
xnew$mapQ = mean(as.numeric(x$mapQ),na.rm = TRUE)
}
return(xnew)
}))
xdf_merged = merge(overallTable, xdf[,c("UID", "readCount","mapStart","mapEnd")], by="UID", all.x=TRUE)
return(xdf_merged)
} ) )
#Change for mapStart take minimum and map End take maximum? -> else rowMeans should be ok too, but round!
mapStart = readsCountDF[,grep("mapStart", colnames(readsCountDF)) ]
mapStart = apply(mapStart, 1, min, na.rm=TRUE)
mapEnd = readsCountDF[,grep("mapEnd", colnames(readsCountDF)) ]
mapEnd = apply(mapEnd, 1, min, na.rm=TRUE)
readsCountDF = readsCountDF[, c(1, grep("readCount", colnames(readsCountDF))) ]
colnames(readsCountDF)[1] = "UID"
colnames(readsCountDF)[2:length(colnames(readsCountDF))] = as.character(samplesInfo$sampleName)
#Changing all NA values to zero
readsCountDF[,2:length(colnames(readsCountDF))] = apply(readsCountDF[,2:length(colnames(readsCountDF))], 2, function(x){
ifelse( is.na(x), 0, x)
})
readsCountDF$mapStart = mapStart
readsCountDF$mapEnd = mapEnd
return( readsCountDF )
}
#' @title Summarize contig Annotation
#'
#' @description This method summarizes the contig annotation table by first checking for range based annotation then for rfam annotation then for repeat annotation
#' @param data.frame contig annotation table (contigAnnotTable_fin.csv)
#' @return summarized contig annotation
#' @docType methods
#' @export
summarizeContigAnnotation = function(contigDF){
currentlySupportedCols = c("contigID","g_gene_biotype","g_gene_type_ext","g_gene_name","r_repClass","r_repName","inf_targetName","inf_descriptionOfTarget")
if( sum(colnames( contigDF ) %in% currentlySupportedCols) == length(colnames( contigDF )) ){
stop( paste0("Please provide following columns in your input:\n ", paste0(currentlySupportedCols, collapse=" ") ) )
}
gene_type = contigDF[,c("contigID","g_gene_biotype","g_gene_type_ext","g_gene_name")]
gene_type$g_gene_biotype = with(gene_type, ifelse(is.na(g_gene_biotype),"none", g_gene_biotype) )
gene_type$g_gene_name = with(gene_type, ifelse(is.na(g_gene_name),"not_annotated", g_gene_name) )
#get the best feature annotation
#start with range based annotation, then go to repeat masker if infernal is ? -> else infernal first then repeat masker
featureAnnotType = c()
featureAnnotName = c()
annotationType = c()
featureAnnotType = with(contigDF, ifelse(
#outer if statement
contigDF$f_featureAnnot_short == "unknown", ifelse(!is.na(contigDF$inf_inc) & contigDF$inf_inc == "!" ,
sub( ".*small nucleolar RNA.*","snoRNA", sub(".*microRNA.*","miRNA",contigDF$inf_descriptionOfTarget, ignore.case=TRUE), ignore.case=TRUE ),
ifelse(!is.na(contigDF$r_repClass),
contigDF$r_repClass, "unknown")),
#outer else statement
contigDF$f_featureAnnot_short
) )
featureAnnotName = with(contigDF, ifelse(
#outer if statement
contigDF$f_featureAnnot_short == "unknown", ifelse(!is.na(contigDF$inf_inc) & contigDF$inf_inc == "!" ,
contigDF$inf_targetName,
ifelse(!is.na(contigDF$r_repClass),
contigDF$r_repName, "unknown")),
#outer else statement
contigDF$f_feature_name
) )
annotationType = with(contigDF, ifelse(
#outer if statement
contigDF$f_featureAnnot_short == "unknown", ifelse(!is.na(contigDF$inf_inc) & contigDF$inf_inc == "!" ,
"infernal",
ifelse(!is.na(contigDF$r_repClass),
"repeatMaskr", "unknown")),
#outer else statement
"featureAnnot"
) )
# Long form for readability but this would need to be iterated with for loop -> slow in R!:
# if( contigDF$f_featureAnnot_short == "unknown" ){
#
# if( contigDF$inf_inc == "!" ){
# featureAnnotType = c(featureAnnotType,sub( ".*small nucleolar RNA.* ","snoRNA", sub(".*microRNA.*","miRNA",contigDF$inf_descriptionOfTarget) ))
# featureAnnotName = c(featureAnnotName,contigDF$inf_targetName)
# } else if( !is.na(contigDF$r_repClass) ){
# featureAnnotType = c(featureAnnotType,contigDF$r_repClass)
# featureAnnotName = c(featureAnnotName,contigDF$r_repName)
# } else{
# featureAnnotType = c(featureAnnotType,"unknown")
# featureAnnotName = c(featureAnnotName,"unknown")
# }
#
# } else{
# featureAnnotType = c(featureAnnotType,contigDF$f_featureAnnot_short)
# featureAnnotName = c(featureAnnotName,contigDF$f_feature_name)
# }
gene_type$featureAnnotType = featureAnnotType#ifelse( is.na(featureAnnotType), "none",annotationType)
gene_type$featureAnnotName = featureAnnotName
gene_type$annotationType = annotationType#ifelse( is.na(annotationType), "none",annotationType)
return(gene_type)
}
#' @title Randomly extract reads from fastq
#'
#' @description Extract a random subset of reads from fastq files. The files have to be in the
#' same directory, since the method gets all fastq files from a directory and iterates through them.
#' @param numberOfReads The number of reads one wants to extract (default: 50000)
#' @param rawDataOldDir The path to the fastq files from which the random subset is extracted
#' @param rawDataNewDir The path to the new sampled fastq files
#' @return returns TRUE
#' @docType methods
#' @export
extractRandomReadSet = function(numberOfReads=50000, rawDataOldDir, rawDataNewDir){
require(ShortRead)
setwd(rawDataOldDir)
allFastq = list.files(pattern=".fastq$")
tmp = lapply(allFastq, function(x){
message("... Sampling reads ....")
sampler <- FastqSampler(x, numberOfReads)
set.seed(123); currSample = yield(sampler)
currSample
writeFastq(currSample, file.path(rawDataNewDir,x) )
message( paste0("writing to:\n", file.path(rawDataNewDir,x) ))
})
return(TRUE)
}
#' @title Helper method for clustering in MappAssClust_ncRNA_Template.R object
#'
#' @description Not intended for general use, just a helper method for MappAssClust_ncRNA_Template.R
#' The method will merge the contig file generated from multiIntersect perl script with
#' all the contigs that have been intersected with -wb option to the multiIntersected file and merged afterwards.
#' Please see MappAssClust_ncRNA_Template.R, default inputs: multiIntersectBed_perl_CLI_cmdRes, mergeBedFile_CLI_cmdResL
#'
#' @param contigFile CmdResult object is needed as input
#' @param sampleBedFiles CmdResult object is needed as input
#' @return data.frame (Unclustered)
#' @docType methods
#' @export
##Code for Count table generation for clustering
generateCountUnclusteredTable = function( contigFile, sampleBedFiles ){
overallContigPath = file.path( getOutFilePath(getCLIApplication(contigFile)), getOutResultName(getOutResultReference(contigFile)) )
overallContig = read.table(overallContigPath, sep="\t", header=FALSE)
colnames(overallContig) = c("chr","start","end","ID","score","strand")
overallContig = overallContig[order(overallContig$ID),]
overallContig = overallContig[,c("chr","start","end","ID","strand")]
mergedDF = do.call( cbind, mapply( function(x, y){
mergedFilePath = file.path( getOutFilePath(getCLIApplication(x)), getOutResultName(getOutResultReference(x)) )
mergedFile = read.table(mergedFilePath, sep="\t", header=FALSE)
colnames(mergedFile) = c("chr","start","end","ReadCount","Uniqueness","strand","ID")
mergedFile = mergedFile[,c("ReadCount","Uniqueness","ID")]
#Could be that there are the same IDs that have not been merged, due to the -d restriction (but the contigs are longer due to group intersection )..
#Therefore these have to be merged -> this unfortunately may take some time since R is not the fastest with iterating
idL = split(mergedFile, mergedFile$ID )
mergedFile = do.call(rbind,lapply( idL, function(x){
x[1,"ReadCount"] = sum( x[,1] );
x[1,"Uniqueness"] = mean( x[,2] );
return(x[1,])
} ) )
mergedTable = merge(x = overallContig, y = mergedFile[,c("ReadCount","Uniqueness","ID")], by = "ID", all.x = TRUE)
mergedTable = mergedTable[order(mergedTable$ID),]
mergedTable = mergedTable[,c("ReadCount","Uniqueness")]
colnames( mergedTable ) = paste0(colnames( mergedTable ),"_",y)
return(mergedTable)
}, sampleBedFiles, samplesInfo$sampleName, SIMPLIFY = FALSE ))
overallContig = cbind(overallContig, mergedDF)
return(overallContig)
}
#' @title Generate ncRNA database for rnastarMapping
#'
#' @description This method downloads ensembl ncRNA data if provided with ftp adress, modifies the fasta file,
#' subsets the fasta file to sequences of a given and generates a STAR index file if wanted.
#'
#' @param genomeDBdir Location to where the database should be saved
#' @param ncRNAsize maximum sequence length of the fasta file (default 400nt)
#' @param nrThreads Number of threads for the rnastar program (default 6)
#' @param ensemblFTP ftp adress e.g. ftp://ftp.ensembl.org/pub/release-78/fasta/mus_musculus/ncrna/, however for convenience for mouse and human the ftp sites are stored and
#' either 'mouse' or 'human' may be used for parametrization
#' @param generateIdx Specification if an rnastar index should be created (default FALSE)
#' @return all commands generated
#' @docType methods
#' @export
#CODE FOR ncRNA DATABASE CREATION FOR RNASTAR mapping
#ftp://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/ncrna/
createSizeSelectedEnsemblncRNADB = function(genomeDBdir="/tmp/mmu", ncRNAsize=400, nrThreads=6, ensemblFTP="mouse", generateIdx=FALSE){
dir.create(genomeDBdir)
if( ensemblFTP == "mouse" ){
ensemblFTP = "ftp://ftp.ensembl.org/pub/release-78/fasta/mus_musculus/ncrna/"
}else if (ensemblFTP == "human"){
ensemblFTP = "ftp://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/ncrna/"
}
cmd1 = paste0("wget --directory-prefix ",genomeDBdir," ",ensemblFTP,"*")
message(paste0("Feching files from ensembl ", cmd1))
system(cmd1)
archiveFN = list.files(path=genomeDBdir, pattern = ".*.gz")
cmd2 = paste0("gunzip ", file.path(genomeDBdir,archiveFN))
message(paste0("Unzip file:\n", cmd2))
system(cmd2)
fastaFN = list.files(path=genomeDBdir, pattern = ".*.fa")
fastaModFN = paste0(sub(".fa$","",fastaFN),".mod.fa")
cmd3 = paste0( "python ",file.path( system.file( package="RNASeqUtility"), "data", "EliminateNewLinesFasta.py"), " -i ",file.path(genomeDBdir,fastaFN),
" -o ",file.path(genomeDBdir,fastaModFN) )
message(paste0("Modifying fasta file to single line only:\n", cmd3))
system(cmd3)
fastaModFN_restricted = paste0(sub(".fa$","",fastaModFN),".",ncRNAsize,"nt",".fa")
restrictedDir = file.path(genomeDBdir, paste0("restr",ncRNAsize,"nt"))
dir.create(restrictedDir)
cmd4 = paste0( "awk \'!/^>/ { next } { getline seq } length(seq) <= ",ncRNAsize," { print $0 \"\\n\" seq }\' ", file.path(genomeDBdir, fastaModFN), " > ", file.path(restrictedDir, fastaModFN_restricted) )
message(paste0("Restricting fasta to:\n",ncRNAsize," sequences ", cmd4))
system(cmd4)
setwd(restrictedDir)
cmd5 = paste0( "STAR --runThreadN ", nrThreads," --runMode genomeGenerate --genomeDir ",restrictedDir," --genomeFastaFiles ",file.path(restrictedDir, fastaModFN_restricted))
if(generateIdx){
message(paste0("Creating IndexDB with star:\n", cmd5))
system(cmd5)
}
commands = c(cmd1,cmd2,cmd3,cmd4,cmd5)
#return the new index directory
return( commands )
}
#' @title Clustering of contigs
#'
#' @description Contigs that contain reads that match to multiple locations are clustered to the contig with the highest read count.
#' If more contigs have the same read count, the longest of them is chosen and then the first one in the list.
#' Starting with the first contig (sorted by read count and length) in the unclustered list:
#' All contigs containing a read of the chosen contig are reported.
#' If these contigs are composed by x% (readCompositionIdentity) of reads from the
#' representative contig, they are removed from the list, including the representative contig.
#' The representative contig is stored in the clustered list.
#' Then the next contig of the unclustered list is chosen, until the unclustered list is empty.
#'
#' @param contigForCountingGR_unclustered GRanges object of unclustered contigs
#' @param allReads GRanges object of the single reads (obtained from each sample in bed format with bamToBed)
#' @param readCompositionIdentity The percentage of read similarity of contigs that need to be reached in order to get clustered (default 0.95)
#' When 0 is specified than one shared read leads to clustering/removing of non-representative contigs.
#' @return clustered GRanges object
#' @docType methods
#' @export
#This clustering removes contigs that have reads in a representative contig!
clusterMultiMappingReads_stringent = function( contigForCountingGR_unclustered, allReads, readCompositionIdentity = 0.95 ){
# tmp1 = allReads
# tmp2 = contigForCountingGR_unclustered
#
# allReads = import(bamFileL[[1]],
# format="BED", asRangedData=FALSE)#alignedbwt_s.bed
# start(allReads) = start(allReads) - 1 #Strange behaviour in import of rtracklayer!
# currBamFile = 1
contigForCountingGR_unclustered = contigForCountingGR_unclustered[order(contigForCountingGR_unclustered$uniqueness, contigForCountingGR_unclustered$readCount, decreasing=TRUE)]
overlapMap = GenomicRanges::findOverlaps( contigForCountingGR_unclustered, allReads )
if( length(unique(queryHits(overlapMap))) != length(contigForCountingGR_unclustered) ){
warning("Some contigs have zero reads in in one sample within a group, therefore these contigs are removed from the analysis!
One could avoid this by intersecting all bam files with the contig files")
contigForCountingGR_unclustered = contigForCountingGR_unclustered[ unique(queryHits(overlapMap) )]
overlapMap = GenomicRanges::findOverlaps( contigForCountingGR_unclustered, allReads )
}
allClustered = TRUE
clusteredContigs = GRanges()
while(allClustered){
if( length( overlapMap) == 0 ){
break;
}
idx = 1
currContig = queryHits(overlapMap[idx])
currReads = subjectHits( overlapMap[ which(queryHits(overlapMap) == currContig) ] ) #Take first entry of overlap matrix to compare
contigReadNames = unique(allReads[currReads]$name) #get all subject Hits (reads) for this contig
allReadPositions = which( allReads$name %in% contigReadNames ) #gets the positions in the BED file (all Reads) for the reads of the contig!
contigClusterIdx = queryHits(overlapMap)[subjectHits(overlapMap) %in% allReadPositions]#finds all other contigs containing these reads
contigCluster = contigForCountingGR_unclustered[contigClusterIdx]
toBeClusteredTo = which(contigCluster$readCount == max(contigCluster$readCount))# Fetching the contig with the highest read count!
toBeClusteredTo = toBeClusteredTo[which( width(contigForCountingGR_unclustered[contigClusterIdx[toBeClusteredTo]]) == max( width(contigForCountingGR_unclustered[contigClusterIdx[toBeClusteredTo]]) ) )][1]#from the contigs with the highest read count taking the longest one!
#Adding the contig to the set of clustered contigs!
clusteredContigs = append(clusteredContigs, contigCluster[toBeClusteredTo])
#contigForCountingGR_unclustered[contigClusterIdx[toBeClusteredTo]]
readsFromRepContig = subjectHits(overlapMap[queryHits(overlapMap) == contigClusterIdx[toBeClusteredTo]]) #These are all reads from the representative contig!
repContigReadNames = unique(allReads[readsFromRepContig]$name)
allReadPositionsRepContig = which( allReads$name %in% repContigReadNames )
#From the representative contig all reads are obtained and eliminated from the overlapMap (therefore if another contig would consist 100% of the same reads it will get eliminated)
#Otherwise the contig will be kept and may be of interest -> one could delete some further contigs by x percent...
overlapMap_filt = overlapMap[which( !subjectHits(overlapMap) %in% allReadPositionsRepContig)]#This would be the filtered overlapMap!
#check for contig leftovers...
#Filtering by 95% read composition identity!
leftoverContigs = table( queryHits(overlapMap_filt[ queryHits( overlapMap_filt ) %in% unique(contigClusterIdx)]))
filtered_leftoverContigs = names(leftoverContigs[ifelse( 1 - (leftoverContigs / (contigCluster[toBeClusteredTo]$readCount + leftoverContigs) ) >= readCompositionIdentity , TRUE, FALSE )]) #left with less than 95% of the reads in main cluster!
overlapMap_filt = overlapMap_filt[which(!queryHits(overlapMap_filt) %in% filtered_leftoverContigs)]
overlapMap = overlapMap_filt
}
return(clusteredContigs)
}
#' @title Deprecated version of clustering
#'
#' @description This method is deprecated do not use!
#' @param contigForCountingGR_unclustered
#' @param allReads
#' @return clusteredContigs
#' @docType methods
#' @export
#This clustering removes contigs that have reads in a representative contig!
clusterMultiMappingReads_stringent_deprecated = function( contigForCountingGR_unclustered, allReads ){
overlapMap = findOverlaps( contigForCountingGR_unclustered, allReads )
if( length(unique(queryHits(overlapMap))) != length(contigForCountingGR_unclustered) ){
stop("The Clusters MUST Map all the reads provided! (This is achieved by intersection of all files for contig assembly!)as")
}
allClustered = TRUE
clusteredContigs = GRanges()
while(allClustered){
if( length( overlapMap) == 0 ){
break;
}
currHits = subjectHits( overlapMap[ 1 ] )
readNamesOfContig = unique(allReads[currHits]$name)
allReadPositions = which( allReads$name %in% readNamesOfContig )
contigClusterIdx = queryHits(overlapMap)[subjectHits(overlapMap) %in% allReadPositions]
readsToDelete = subjectHits(overlapMap)[ which(queryHits(overlapMap) %in% contigClusterIdx) ] #the other reads (with other names) that overlap this locus have to be deleted too...
contigCluster = contigForCountingGR_unclustered[contigClusterIdx]
toBeClusteredTo = which(contigCluster$readCount == max(contigCluster$readCount))[1]#else taking the first one currently
clusteredContigs = append(clusteredContigs, contigForCountingGR_unclustered[contigClusterIdx[toBeClusteredTo]])
contigForCountingGR_unclustered = contigForCountingGR_unclustered[ -contigClusterIdx ] # delete all other locations with same read name!
if(length(contigForCountingGR_unclustered) == 0 ){
allClustered = FALSE
} else{
allReadsToDelete = allReads[readsToDelete]
readToDeleteIndex = which(allReads$name %in% unique(allReads[readsToDelete]$name)) #checking for other positions of these reads from that cluster!
allReadsToDelete = allReads[readToDeleteIndex]
unclusteredContigsToDelete = queryHits(findOverlaps( contigForCountingGR_unclustered, allReadsToDelete ))
if(length(unclusteredContigsToDelete) != 0){#could be that we have already captured all reads -> This is the case when all reads overlapping a cluster map to the same positions (example miRNA that maps to 3 positions)
contigForCountingGR_unclustered = contigForCountingGR_unclustered[-unclusteredContigsToDelete]
}
allReads = allReads[-readToDeleteIndex]
overlapMap = findOverlaps( contigForCountingGR_unclustered, allReads )
}
}
return(clusteredContigs)
}
#
annotDFCreation = function( annotationTableSub, gffDetailColumn=9 ){
require(R.oo)
splittedAnnot = R.oo::trim(unlist( strsplit( annotationTableSub[,9] , ";") ))
resVect = unlist( lapply( splittedAnnot, function(x){
resVecttmp = unlist( strsplit( x, " " ) )
resVect = resVecttmp[2]
names(resVect) = resVecttmp[1]
return(resVect)
}) )
resVect = resVect[ !duplicated(names( resVect )) ]
eval( parse(text=paste0(names( resVect ), " = c()" )) )
rm(splittedAnnot)
counter = c(0)
for( annotRow in annotationTableSub$V9 ){
splittedAnnot = R.oo::trim(unlist( strsplit( annotRow, ";") ))
resVectCurr = unlist( lapply( splittedAnnot, function(x){
resVecttmp = unlist( strsplit( x, " " ) )
resVect = resVecttmp[2]
names(resVect) = resVecttmp[1]
return(resVect)
}) )
resVectEval = resVect
if( length(resVectCurr) != 0 ){
for( i in 1:length(resVectCurr) ){
resVectEval[names(resVectCurr)[i]] = resVectCurr[i]
}
resVectEval[ !names(resVectEval) %in% names(resVectCurr)] = NA
} else{
resVectEval = rep(NA, length(resVect))
names(resVectEval) = names(resVect)
}
eval( parse(text=paste0(names( resVectEval ), " = c(", names(resVectEval),",\"", resVectEval,"\")" ) ) )
}
eval( parse(text= paste0( "resDF = data.frame( ",paste0("\"", names(resVect), "\"=",eval(names(resVect)),collapse="," ), ")") ))
return(resDF)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.