######################################################
## Generate various feature types from TxDb objects ##
######################################################
genFeatures <- function(txdb, featuretype = "all", reduce_ranges, upstream = 1000, downstream = 0, verbose = TRUE) {
pkg <- c("GenomeInfoDb", "GenomicFeatures")
checkPkg(pkg, quietly = FALSE)
## Check validity of inputs
supported_features <- c("tx_type", "promoter", "intron", "exon", "cds", "fiveUTR", "threeUTR", "intergenic")
if (tolower(featuretype[1]) == "all") featuretype <- supported_features
if (class(txdb) != "TxDb") stop("Argument 'txdb' needs to be assigned object of class TxDb!")
if (any(!featuretype %in% supported_features)) stop("featuretype supports any of the following values: ", paste(supported_features, collapse = ", "))
## Empty GRangesList to store results
featuresGRl <- GRangesList()
GenomeInfoDb::seqinfo(featuresGRl) <- GenomeInfoDb::seqinfo(txdb)
## Empty GRanges object for feature types that don't exist
gr_empty <- GRanges()
mcols(gr_empty) <- S4Vectors::DataFrame(feature_by = character(), featuretype_id = character(), featuretype = character())
GenomeInfoDb::seqinfo(gr_empty) <- GenomeInfoDb::seqinfo(txdb)
## Gene/transcript id mappings (required for some features)
ids <- mcols(GenomicFeatures::transcripts(txdb, columns = c("tx_id", "tx_name", "gene_id")))
ge_id <- unstrsplit(ids$gene_id)
names(ge_id) <- ids$tx_id
## Transcript ranges: each 'tx_type' as separate GRanges object (reduced by gene)
if ("tx_type" %in% featuretype) {
tx <- GenomicFeatures::transcripts(txdb, columns = c("tx_name", "gene_id", "tx_type"))
if (length(tx) == 0) { # If range set is empty, append 'gr_empty'.
featuresGRl <- c(featuresGRl, GRangesList("transcript" = gr_empty))
} else {
mcols(tx)$tx_type <- ifelse(is.na(mcols(tx)$tx_type), "transcript", mcols(tx)$tx_type)
tx_type <- unique(mcols(tx)$tx_type)
for (i in seq_along(tx_type)) {
tmp <- tx[mcols(tx)$tx_type == tx_type[i]]
if (reduce_ranges == TRUE) {
tmp <- reduce(split(tmp, as.character(mcols(tmp)$gene_id)))
tmp <- unlist(tmp)
feature_id <- paste(names(tmp), ":", tx_type[i], "_red", sep = "")
mcols(tmp) <- S4Vectors::DataFrame(feature_by = names(tmp), featuretype_id = feature_id, featuretype = paste0(tx_type[i], "_red"))
names(tmp) <- seq_along(tmp)
featuresGRl <- c(featuresGRl, GRangesList(tmp = tmp))
names(featuresGRl)[length(featuresGRl)] <- paste0(tx_type[i], "_red")
} else {
mcols(tmp) <- S4Vectors::DataFrame(feature_by = mcols(tmp)$gene_id, featuretype_id = mcols(tmp)$tx_name, featuretype = tx_type[i])
featuresGRl <- c(tmp = featuresGRl, GRangesList(tmp = tmp))
names(tmp) <- seq_along(tmp)
names(featuresGRl)[length(featuresGRl)] <- tx_type[i]
}
}
if (verbose == TRUE) cat("Created feature ranges:", paste(tx_type, collapse = ", "), "\n")
}
}
## Create promoter ranges reduced by gene
if ("promoter" %in% featuretype) {
mycheck <- suppressWarnings(promoters(GenomicFeatures::transcriptsBy(txdb, "gene"), upstream, downstream))
if (length(mycheck) == 0) { # If range set is empty, append 'gr_empty'.
featuresGRl <- c(featuresGRl, GRangesList("promoter" = gr_empty))
} else {
if (reduce_ranges == TRUE) {
mypromoters <- suppressWarnings(unlist(reduce(promoters(GenomicFeatures::transcriptsBy(txdb, "gene"), upstream, downstream))))
mypromoters <- trim(mypromoters)
feature_id <- paste0(names(mypromoters), ":P_red")
mcols(mypromoters) <- S4Vectors::DataFrame(feature_by = names(mypromoters), featuretype_id = feature_id, featuretype = "promoter_red")
names(mypromoters) <- seq_along(mypromoters)
featuresGRl <- c(featuresGRl, GRangesList("promoter_red" = mypromoters))
} else {
mypromoters <- suppressWarnings(promoters(txdb, upstream, downstream, columns = c("tx_name", "gene_id", "tx_type")))
mypromoters <- trim(mypromoters)
mcols(mypromoters) <- S4Vectors::DataFrame(feature_by = as(mcols(mypromoters)$gene_id, "CharacterList"), featuretype_id = mcols(mypromoters)$tx_name, featuretype = "promoter")
featuresGRl <- c(featuresGRl, GRangesList("promoter" = mypromoters))
}
if (verbose == TRUE) cat("Created feature ranges: promoter", "\n")
}
}
## Create intron ranges reduced by gene
if ("intron" %in% featuretype) {
## Introns by transcript
myintrons <- GenomicFeatures::intronsByTranscript(txdb)
if (length(myintrons) == 0) { # If range set is empty, append 'gr_empty'.
featuresGRl <- c(featuresGRl, GRangesList("intron" = gr_empty))
} else {
## Convert to introns by genes
intron_count <- sapply(width(myintrons), length)
names(intron_count) <- ge_id[names(intron_count)]
intron_count <- intron_count[intron_count != 0]
ge_id_intron <- rep(names(intron_count), intron_count)
myintrons <- unlist(myintrons)
myintrons <- split(myintrons, ge_id_intron) # Introns by gene
if (reduce_ranges == TRUE) {
## Reduce introns by gene, use gene ID for naming and append counter
myintrons <- reduce(myintrons)
intron_red_count <- sapply(width(myintrons), length)
ge_id_red_intron <- rep(names(intron_red_count), intron_red_count)
feature_id <- paste(ge_id_red_intron, sprintf("I%03d_red", unlist(sapply(as.integer(intron_red_count), function(x) seq(from = 1, to = x)))), sep = ":")
myintrons <- unlist(myintrons)
mcols(myintrons) <- S4Vectors::DataFrame(feature_by = ge_id_red_intron, featuretype_id = feature_id, featuretype = "intron_red")
names(myintrons) <- seq_along(myintrons)
featuresGRl <- c(featuresGRl, GRangesList("intron_red" = myintrons))
} else {
intron_red_count <- sapply(width(myintrons), length)
ge_id_red_intron <- rep(names(intron_red_count), intron_red_count)
feature_id <- paste(ge_id_red_intron, sprintf("I%03d", unlist(sapply(as.integer(intron_red_count), function(x) seq(from = 1, to = x)))), sep = ":")
myintrons <- unlist(myintrons)
mcols(myintrons) <- S4Vectors::DataFrame(feature_by = as(ge_id_red_intron, "CharacterList"), featuretype_id = feature_id, featuretype = "intron")
names(myintrons) <- seq_along(myintrons)
featuresGRl <- c(featuresGRl, GRangesList("intron" = myintrons))
}
if (verbose == TRUE) cat("Created feature ranges: intron", "\n")
}
}
## Create exon ranges reduced by gene
if ("exon" %in% featuretype) {
## exons by gene
myexons <- GenomicFeatures::exonsBy(txdb, "gene")
if (length(myexons) == 0) { # If range set is empty, append 'gr_empty'.
featuresGRl <- c(featuresGRl, GRangesList("exon" = gr_empty))
} else {
if (reduce_ranges == TRUE) {
myexons <- reduce(myexons)
exon_red_count <- sapply(width(myexons), length)
ge_id_red_exon <- rep(names(exon_red_count), exon_red_count)
feature_id <- paste(ge_id_red_exon, sprintf("E%03d_red", unlist(sapply(as.integer(exon_red_count), function(x) seq(from = 1, to = x)))), sep = ":")
myexons <- unlist(myexons)
mcols(myexons) <- S4Vectors::DataFrame(feature_by = ge_id_red_exon, featuretype_id = feature_id, featuretype = "exon_red")
names(myexons) <- seq_along(myexons)
featuresGRl <- c(featuresGRl, GRangesList("exon_red" = myexons))
} else {
exon_red_count <- sapply(width(myexons), length)
ge_id_red_exon <- rep(names(exon_red_count), exon_red_count)
feature_id <- paste(ge_id_red_exon, sprintf("E%03d", unlist(sapply(as.integer(exon_red_count), function(x) seq(from = 1, to = x)))), sep = ":")
myexons <- unlist(myexons)
mcols(myexons) <- S4Vectors::DataFrame(feature_by = as(ge_id_red_exon, "CharacterList"), featuretype_id = feature_id, featuretype = "exon")
names(myexons) <- seq_along(myexons)
featuresGRl <- c(featuresGRl, GRangesList("exon" = myexons))
}
if (verbose == TRUE) cat("Created feature ranges: exon", "\n")
}
}
## Create CDS ranges reduced by gene
if ("cds" %in% featuretype) {
## CDS by gene
mycds <- GenomicFeatures::cdsBy(txdb, "gene")
if (length(mycds) == 0) { # If range set is empty, append 'gr_empty'.
featuresGRl <- c(featuresGRl, GRangesList("cds" = gr_empty))
} else {
if (reduce_ranges == TRUE) {
mycds <- reduce(mycds)
cds_red_count <- sapply(width(mycds), length)
ge_id_red_cds <- rep(names(cds_red_count), cds_red_count)
feature_id <- paste(ge_id_red_cds, sprintf("CDS%03d_red", unlist(sapply(as.integer(cds_red_count), function(x) seq(from = 1, to = x)))), sep = ":")
mycds <- unlist(mycds)
mcols(mycds) <- S4Vectors::DataFrame(feature_by = ge_id_red_cds, featuretype_id = feature_id, featuretype = "cds_red")
names(mycds) <- seq_along(mycds)
featuresGRl <- c(featuresGRl, GRangesList("cds_red" = mycds))
} else {
cds_red_count <- sapply(width(mycds), length)
ge_id_red_cds <- rep(names(cds_red_count), cds_red_count)
feature_id <- paste(ge_id_red_cds, sprintf("CDS%03d", unlist(sapply(as.integer(cds_red_count), function(x) seq(from = 1, to = x)))), sep = ":")
mycds <- unlist(mycds)
mcols(mycds) <- S4Vectors::DataFrame(feature_by = as(ge_id_red_cds, "CharacterList"), featuretype_id = feature_id, featuretype = "cds")
names(mycds) <- seq_along(mycds)
featuresGRl <- c(featuresGRl, GRangesList("cds" = mycds))
}
if (verbose == TRUE) cat("Created feature ranges: cds", "\n")
}
}
## Create 5'UTR ranges reduced by gene
if ("fiveUTR" %in% featuretype) {
## 5'UTRs by transcript
myfiveutr <- GenomicFeatures::fiveUTRsByTranscript(txdb)
if (length(myfiveutr) == 0) { # If range set is empty, append 'gr_empty'.
featuresGRl <- c(featuresGRl, GRangesList("fiveUTR" = gr_empty))
} else {
## Convert to 5'UTRs by genes
utr_count <- sapply(width(myfiveutr), length)
names(utr_count) <- ge_id[names(utr_count)]
utr_count <- utr_count[utr_count != 0]
ge_id_utr <- rep(names(utr_count), utr_count)
myfiveutr <- unlist(myfiveutr)
myfiveutr <- split(myfiveutr, ge_id_utr) # UTRs by gene
if (reduce_ranges == TRUE) {
## Reduce introns by gene, use gene ID for naming and append counter
myfiveutr <- reduce(myfiveutr)
utr_red_count <- sapply(width(myfiveutr), length)
ge_id_red_utr <- rep(names(utr_red_count), utr_red_count)
feature_id <- paste(ge_id_red_utr, sprintf("fiveUTR%03d_red", unlist(sapply(as.integer(utr_red_count), function(x) seq(from = 1, to = x)))), sep = ":")
myfiveutr <- unlist(myfiveutr)
mcols(myfiveutr) <- S4Vectors::DataFrame(feature_by = ge_id_red_utr, featuretype_id = feature_id, featuretype = "fiveUTR_red")
names(myfiveutr) <- seq_along(myfiveutr)
featuresGRl <- c(featuresGRl, GRangesList("fiveUTR_red" = myfiveutr))
} else {
utr_red_count <- sapply(width(myfiveutr), length)
ge_id_red_utr <- rep(names(utr_red_count), utr_red_count)
feature_id <- paste(ge_id_red_utr, sprintf("fiveUTR%03d", unlist(sapply(as.integer(utr_red_count), function(x) seq(from = 1, to = x)))), sep = ":")
myfiveutr <- unlist(myfiveutr)
mcols(myfiveutr) <- S4Vectors::DataFrame(feature_by = as(ge_id_red_utr, "CharacterList"), featuretype_id = feature_id, featuretype = "fiveUTR")
names(myfiveutr) <- seq_along(myfiveutr)
featuresGRl <- c(featuresGRl, GRangesList("fiveUTR" = myfiveutr))
}
if (verbose == TRUE) cat("Created feature ranges: fiveUTR", "\n")
}
}
## Create 3'UTR ranges reduced by gene
if ("threeUTR" %in% featuretype) {
## 3'UTRs by transcript
mythreeutr <- GenomicFeatures::threeUTRsByTranscript(txdb)
if (length(mythreeutr) == 0) { # If range set is empty, append 'gr_empty'.
featuresGRl <- c(featuresGRl, GRangesList("threeUTR" = gr_empty))
} else {
## Convert to 3'UTRs by genes
utr_count <- sapply(width(mythreeutr), length)
names(utr_count) <- ge_id[names(utr_count)]
utr_count <- utr_count[utr_count != 0]
ge_id_utr <- rep(names(utr_count), utr_count)
mythreeutr <- unlist(mythreeutr)
mythreeutr <- split(mythreeutr, ge_id_utr) # UTRs by gene
if (reduce_ranges == TRUE) {
## Reduce introns by gene, use gene ID for naming and append counter
mythreeutr <- reduce(mythreeutr)
utr_red_count <- sapply(width(mythreeutr), length)
ge_id_red_utr <- rep(names(utr_red_count), utr_red_count)
feature_id <- paste(ge_id_red_utr, sprintf("threeUTR%03d_red", unlist(sapply(as.integer(utr_red_count), function(x) seq(from = 1, to = x)))), sep = ":")
mythreeutr <- unlist(mythreeutr)
mcols(mythreeutr) <- S4Vectors::DataFrame(feature_by = ge_id_red_utr, featuretype_id = feature_id, featuretype = "threeUTR_red")
names(mythreeutr) <- seq_along(mythreeutr)
featuresGRl <- c(featuresGRl, GRangesList("threeUTR_red" = mythreeutr))
} else {
utr_red_count <- sapply(width(mythreeutr), length)
ge_id_red_utr <- rep(names(utr_red_count), utr_red_count)
feature_id <- paste(ge_id_red_utr, sprintf("threeUTR%03d", unlist(sapply(as.integer(utr_red_count), function(x) seq(from = 1, to = x)))), sep = ":")
mythreeutr <- unlist(mythreeutr)
mcols(mythreeutr) <- S4Vectors::DataFrame(feature_by = as(ge_id_red_utr, "CharacterList"), featuretype_id = feature_id, featuretype = "threeUTR")
names(mythreeutr) <- seq_along(mythreeutr)
featuresGRl <- c(featuresGRl, GRangesList("threeUTR" = mythreeutr))
}
if (verbose == TRUE) cat("Created feature ranges: threeUTR", "\n")
}
}
## Create intergenic ranges
if ("intergenic" %in% featuretype) {
# if(verbose==TRUE & any(is.na(seqlengths(txdb)))) warning("seqlengths missing for: ", paste(head(names(seqlengths(txdb))[is.na(seqlengths(txdb))]), collapse=", "), ". Thus, corresponding chromosome end ranges will be missing in intergenic results.")
ge <- GenomicFeatures::genes(txdb)
if (length(ge) == 0) { # If range set is empty, append 'gr_empty'.
featuresGRl <- c(featuresGRl, GRangesList("intergenic" = gr_empty))
} else {
myseqinfo <- GenomeInfoDb::seqinfo(ge)
GenomeInfoDb::seqlengths(ge) <- NA # Note: the following ignores chromosome end ranges on the right end
mynames <- names(ge)
BiocGenerics::strand(ge) <- "*"
ge <- reduce(ge, with.revmap = TRUE)
myids <- character(length = length(ge))
revmaplist <- as.list(mcols(ge)$revmap)
for (i in seq(along = ge)) myids[i] <- paste(mynames[revmaplist[[i]]], collapse = "_")
myintergenics <- gaps(ge)
myintergenics <- myintergenics[BiocGenerics::strand(myintergenics) == "*"] # Removes chromosome ranges created by gaps when seqlengths available
# index <- GenomicRanges::findOverlaps(ge, myintergenics, minoverlap=0L)
index <- GenomicRanges::findOverlaps(ge, myintergenics, maxgap = 0L, minoverlap = 0L) # change necessary since default in IRanges changed from maxgap=0L to maxgap=-1L
myids <- tapply(myids[as.matrix(index)[, 1]], as.matrix(index)[, 2], paste, collapse = "__")
if (reduce_ranges == TRUE) {
mcols(myintergenics) <- S4Vectors::DataFrame(feature_by = sprintf("INTER%08d", seq_along(myids)), featuretype_id = as.character(myids), featuretype = "intergenic")
} else {
mcols(myintergenics) <- S4Vectors::DataFrame(feature_by = as(sprintf("INTER%08d", seq_along(myids)), "CharacterList"), featuretype_id = as.character(myids), featuretype = "intergenic")
}
names(myintergenics) <- seq_along(myintergenics)
GenomeInfoDb::seqinfo(myintergenics) <- myseqinfo
featuresGRl <- c(featuresGRl, GRangesList("intergenic" = myintergenics))
if (verbose == TRUE) cat("Created feature ranges: intergenic", "\n")
}
}
## Return results
return(featuresGRl)
}
## Usage:
# file <- system.file("extdata/annotation", "tair10.gff", package="systemPipeRdata")
# txdb <- makeTxDbFromGFF(file=file, format="gff3", organism="Arabidopsis")
# feat <- genFeatures(txdb, featuretype="all", reduce_ranges=TRUE, upstream=1000, downstream=0, verbose=TRUE)
#############################################################
## Compute and Plot Read Distribution Across Feature Types ##
#############################################################
## Compute distribution of reads across feature types
featuretypeCounts <- function(bfl, grl, singleEnd = TRUE, readlength = NULL, type = "data.frame") {
pkg <- c("GenomicAlignments", "IRanges")
checkPkg(pkg, quietly = FALSE)
## Check for valid inputs
if (!(is.null(readlength[1]) | is.integer(readlength))) stop("readlength needs to be assigned NULL or integer vector")
if (length(type) != 1) stop("Argument needs to be character vector of length 1.")
if (!any(c("list", "data.frame") %in% type)) stop("Argument 'type' needs to be assigned one of 'list' or 'data.frame'")
## List container for storing results
resultlist <- sapply(names(bfl), function(x) list(NULL))
## Vector to store counts of total aligned reads
totalcounts <- numeric(length(bfl))
names(totalcounts) <- names(bfl)
## Total length (in bps) of each reduced feature type
feattypelength <- sapply(names(grl), function(x) sum(width(reduce(grl[[x]]))))
## Read count for every feature given in gr (coming from gff)
for (i in names(grl)) BiocGenerics::strand(grl[[i]][BiocGenerics::strand(grl[[i]]) == "*"]) <- "+" # Sets "*" to "+" to avoid double-counting of reads in this strand-specific method
feature <- names(grl)
featurewithtotal <- c(names(grl), "N_total_aligned")
for (j in seq(along = bfl)) {
if (is.null(readlength[1])) {
myMA <- matrix(rep(0, length(featurewithtotal) * 1), nrow = length(featurewithtotal), ncol = 1, dimnames = list(featurewithtotal, "anyreadlength"))
} else {
myMA <- matrix(rep(0, length(featurewithtotal) * length(readlength)), nrow = length(featurewithtotal), ncol = length(readlength), dimnames = list(featurewithtotal, readlength))
}
myMAsense <- myMA
bf <- open(bfl[[j]])
while (isIncomplete(bf)) {
if (singleEnd == TRUE) {
aligns <- GenomicAlignments::readGAlignments(bf, use.names = FALSE)
} else if (singleEnd == FALSE) {
aligns <- GenomicAlignments::readGAlignmentPairs(bf, use.names = FALSE)
} else {
stop("singleEnd needs to be TRUE or FALSE")
}
for (i in feature) {
gr <- grl[[i]]
counter <- which(feature == i)
## Non-strand-specific counts
alignssub <- IRanges::subsetByOverlaps(aligns, gr, ignore.strand = TRUE)
if (singleEnd == TRUE) {
if (is.null(readlength[1])) {
counts <- length(alignssub)
if (counter == 1) totalcounts <- length(aligns)
} else {
counts <- table(GenomicAlignments::qwidth(alignssub))[colnames(myMA)]
if (counter == 1) totalcounts <- table(GenomicAlignments::qwidth(aligns))[colnames(myMA)]
}
} else { # The following reports for PE reads the mean read length for each pair
if (is.null(readlength[1])) {
counts <- length(alignssub)
if (counter == 1) totalcounts <- length(aligns)
} else {
mywidth <- round((GenomicAlignments::qwidth(GenomicAlignments::last(alignssub)) + GenomicAlignments::qwidth(S4Vectors::first(alignssub))) / 2)
counts <- table(mywidth)[colnames(myMA)]
if (counter == 1) {
totalmywidth <- round((GenomicAlignments::qwidth(GenomicAlignments::last(aligns)) + GenomicAlignments::qwidth(S4Vectors::first(aligns))) / 2)
totalcounts <- table(totalmywidth)[colnames(myMA)]
}
}
}
counts[is.na(counts)] <- 0
myMA[i, ] <- myMA[i, ] + counts
if (counter == 1) {
totalcounts[is.na(totalcounts)] <- 0
myMA["N_total_aligned", ] <- myMA["N_total_aligned", ] + totalcounts
}
## Strand-specific counts
alignssub <- IRanges::subsetByOverlaps(aligns, gr, ignore.strand = FALSE)
if (singleEnd == TRUE) {
if (is.null(readlength[1])) {
sensecounts <- length(alignssub)
if (counter == 1) totalsensecounts <- length(aligns[BiocGenerics::strand(aligns) == "+"])
} else {
sensecounts <- table(GenomicAlignments::qwidth(alignssub))[colnames(myMAsense)]
if (counter == 1) totalsensecounts <- table(GenomicAlignments::qwidth(aligns[BiocGenerics::strand(aligns) == "+"]))[colnames(myMAsense)]
}
} else { # The following reports for PE reads the mean read length for each pair
if (is.null(readlength[1])) {
sensecounts <- length(alignssub)
if (counter == 1) totalsensecounts <- length(aligns[BiocGenerics::strand(aligns) == "+"])
} else {
mywidth <- round((GenomicAlignments::qwidth(GenomicAlignments::last(alignssub)) + GenomicAlignments::qwidth(S4Vectors::first(alignssub))) / 2)
sensecounts <- table(mywidth)[colnames(myMA)]
if (counter == 1) {
totalmywidth <- round((GenomicAlignments::qwidth(GenomicAlignments::last(aligns)) + GenomicAlignments::qwidth(S4Vectors::first(aligns))) / 2)
totalsensecounts <- table(totalmywidth)[colnames(myMA)]
}
}
}
sensecounts[is.na(sensecounts)] <- 0
myMAsense[i, ] <- myMAsense[i, ] + sensecounts
if (counter == 1) {
totalsensecounts[is.na(totalsensecounts)] <- 0
myMAsense["N_total_aligned", ] <- myMAsense["N_total_aligned", ] + totalsensecounts
}
}
}
close(bf)
myMAantisense <- myMA - myMAsense
myMAantisense[myMAantisense < 0] <- 0
myMAsense[myMAsense <- 0] <- 0
myMAsense <- data.frame(SampleName = names(bfl)[j], Strand = "Sense", Featuretype = row.names(myMAsense), Featuretypelength = as.numeric(feattypelength[row.names(myMAsense)]), myMAsense, check.names = FALSE)
myMAantisense <- data.frame(SampleName = names(bfl)[j], Strand = "Antisense", Featuretype = row.names(myMAantisense), Featuretypelength = as.numeric(feattypelength[row.names(myMAantisense)]), myMAantisense, check.names = FALSE)
resultlist[[j]] <- list(Sense = myMAsense, Antisense = myMAantisense)
cat(paste0("Processed sample ", j, ": ", names(bfl)[j]), "\n")
}
## Object types to return
if (type == "list") {
return(resultlist)
} else if (type == "data.frame") {
resultdf <- do.call("rbind", unlist(resultlist, recursive = FALSE))
row.names(resultdf) <- NULL
return(resultdf)
} else {
stop("Argument 'type' needs to be assigned one of 'list' or 'data.frame'")
}
}
## Usage:
# outpaths <- subsetWF(args , slot="output", subset=1, index=1)
# file.exists(outpaths)
# featureCounts <- featuretypeCounts(bfl=BamFileList(outpaths, yieldSize=50000), grl=feat, singleEnd=TRUE, readlength=c(74:76,99:102), type="data.frame")
# write.table(featureCounts, "results/featureCounts.xls", quote=FALSE, row.names=FALSE, sep="\t")
# featureCounts2 <- featuretypeCounts(bfl=BamFileList(outpaths), grl=feat, singleEnd=TRUE, readlength=NULL, type="data.frame")
# write.table(featureCounts2, "results/featureCounts2.xls", quote=FALSE, row.names=FALSE, sep="\t")
######################################################
## Plot distribution of reads across feature types ##
######################################################
plotfeaturetypeCounts <- function(x, graphicsfile, graphicsformat = "pdf", scales = "fixed", anyreadlength = FALSE, drop_N_total_aligned = TRUE, scale_count_val = 10^6, scale_length_val = NULL) {
checkPkg("grid", quietly = FALSE)
## Input validity checks
if (class(x) != "data.frame") stop("x needs to be object of class 'data.frame'")
if (any(colnames(x)[1:3] != c("SampleName", "Strand", "Featuretype"))) stop("First three column names need to be: 'SampleName', 'Strand', 'Featuretype'")
if (!is.numeric(as.matrix(x[, -c(1:3)]))) stop("Following columns (after 3rd one) need to be numeric.")
## global functions or variables
Strand <- Feature <- Counts <- Length <- NULL
## Store Featuretypelength and remove it from x
featuretypelength <- tapply(x$Featuretypelength, x$Featuretype, unique)
featuretypelength <- featuretypelength[!is.na(featuretypelength)]
x <- x[, !colnames(x) %in% "Featuretypelength"]
## Get numbers of total aligned reads per sample
N_total_aligned_DF <- x[x$Featuretype == "N_total_aligned", ]
if (colnames(N_total_aligned_DF)[4] != "anyreadlength") N_total_aligned_DF <- data.frame(N_total_aligned_DF[, 1:3], anyreadlength = rowSums(N_total_aligned_DF[, -c(1:3)]))
N_total_aligned <- tapply(as.numeric(as.vector(N_total_aligned_DF[, 4])), N_total_aligned_DF$SampleName, sum)
## Eliminate total read counts to not show them in plots
if (drop_N_total_aligned == TRUE) x <- x[x$Featuretype != "N_total_aligned", ]
## Sum up read length counts if 'anyreadlength=TRUE'
if (colnames(x)[4] != "anyreadlength" & anyreadlength == TRUE) x <- data.frame(x[, 1:3], anyreadlength = rowSums(x[, -c(1:3)]))
## Scale per x reads (e.g. per million reads)
if (is.numeric(scale_count_val[1])) {
if (is.numeric(scale_length_val[1])) {
correctv <- scale_length_val / featuretypelength[as.character(x$Featuretype)]
for (i in seq_along(colnames(x))[-c(1:3)]) x[, i] <- as.numeric(as.vector(x[, i])) * (scale_count_val / N_total_aligned[x$SampleName]) * correctv
} else if (length(scale_length_val) == 0) {
for (i in seq_along(colnames(x))[-c(1:3)]) x[, i] <- as.numeric(as.vector(x[, i])) * (scale_count_val / N_total_aligned[x$SampleName])
} else {
stop("'scale_length_val' needs to be assinged NULL or numeric value.")
}
} else if (is.null(scale_count_val[1])) {
if (is.numeric(scale_length_val[1])) {
correctv <- scale_length_val / featuretypelength[as.character(x$Featuretype)]
for (i in seq_along(colnames(x))[-c(1:3)]) x[, i] <- as.numeric(as.vector(x[, i])) * correctv
} else if (length(scale_length_val) == 0) {
x <- x
} else {
stop("'scale_length_val' needs to be assinged NULL or numeric value.")
}
} else {
stop("'scale_count_val' needs to be assinged NULL or numeric value.")
}
## Define plotting order of samples
x[, 1] <- factor(x[, 1], levels = unique(x[, 1]), ordered = TRUE)
## Label for count axis
if (length(scale_count_val) == 0 & length(scale_length_val) == 0) axis_label <- "Raw counts"
if (length(scale_count_val) > 0 & length(scale_length_val) > 0) axis_label <- paste("Counts normalized per", scale_count_val, "reads", "and", scale_length_val, "bp feature length")
if (length(scale_count_val) > 0 & length(scale_length_val) == 0) axis_label <- paste("Counts normalized per", scale_count_val, "reads")
if (length(scale_count_val) == 0 & length(scale_length_val) > 0) axis_label <- paste("Counts normalized per", scale_length_val, "bp feature length")
## Plot without read length resolution
if (colnames(x)[4] == "anyreadlength") {
x[, 3] <- factor(x[, 3], levels = sort(as.character(unique(x[, 3])), decreasing = TRUE), ordered = TRUE) # Defines plotting order of bars!!!
colnames(x)[3:4] <- c("Feature", "Counts") # Assign nicer names for plot
myplot <- ggplot2::ggplot(x, ggplot2::aes(x = Feature, y = Counts)) +
ggplot2::geom_bar(ggplot2::aes(fill = Strand), position = "stack", stat = "identity") +
ggplot2::facet_wrap(~SampleName) +
ggplot2::scale_y_continuous(axis_label) +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = -90, hjust = 0, vjust = 0.4)) +
ggplot2::coord_flip() +
ggplot2::scale_x_discrete(limits = unique(x$Feature))
get(graphicsformat)(graphicsfile)
print(myplot)
grDevices::dev.off()
cat("Generated graphics file", graphicsfile, "\n")
return(myplot)
## Plot with read length resolution
} else {
## Split data on samples (SampleName column) and convert to format expected by ggplot2
.convert2DFlist <- function(x) {
featureCountslist <- split(x, x$SampleName, drop = TRUE)
featureCountslist <- featureCountslist[unique(as.character(x$SampleName))]
for (j in seq_along(featureCountslist)) {
df <- featureCountslist[[j]]
tmpDF <- data.frame()
for (i in seq_along(df[, 4:length(df[1, ])])) {
tmpDF <- rbind(tmpDF, data.frame(df[, 1:3], Counts = df[, i + 3], Length = colnames(df[, i + 3, drop = FALSE])))
}
featureCountslist[[j]] <- tmpDF
}
return(featureCountslist)
}
featureCountslist <- .convert2DFlist(x)
## Get maximum read counts in data set after summing up strand spec counts
tmp <- data.frame(paste(x[, 1], x[, 3], sep = "_"), x[, -c(1:3)])
mymax <- max(t(sapply(unique(tmp[, 1]), function(x) colSums(tmp[tmp[, 1] == x, -1]))))
## Generate plotting instructions
myplotlist <- sapply(names(featureCountslist), function(x) NULL, simplify = FALSE)
for (i in seq(along = featureCountslist)) {
myplot <- ggplot2::ggplot(featureCountslist[[i]], ggplot2::aes(x = Length, y = Counts)) +
ggplot2::geom_bar(ggplot2::aes(fill = Strand), position = "stack", stat = "identity") +
ggplot2::facet_wrap(~Featuretype, ncol = 1, scales = scales) +
ggplot2::theme(legend.position = "bottom") +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = -90, hjust = 0, vjust = 0.4)) +
ggplot2::ggtitle(featureCountslist[[i]][1, 1]) +
ggplot2::scale_x_discrete(limits = unique(featureCountslist[[i]]$Length))
## Assure same scale for all panels if scales="fixed"
if (scales == "fixed") {
myplot <- myplot + ggplot2::scale_y_continuous(axis_label, limits = c(0, mymax))
} else {
myplot <- myplot + ggplot2::scale_y_continuous(axis_label)
}
myplotlist[[i]] <- myplot
}
## Generate graphics and write to file in pdf, png or jpeg format.
if (tolower(graphicsformat) == "pdf") mydim <- c(height = 20, width = 6 * length(myplotlist))
if (tolower(graphicsformat) %in% c("png", "jpeg")) mydim <- c(height = 20 * 96, width = 6 * length(myplotlist) * 96)
get(graphicsformat)(graphicsfile, width = mydim["width"], height = mydim["height"])
grid::grid.newpage() # Open a new page on grid device
grid::pushViewport(grid::viewport(layout = grid::grid.layout(1, length(myplotlist)))) # Assign to device viewport with 1 by 2 grid layout
for (i in seq(along = myplotlist)) print(myplotlist[[i]], vp = grid::viewport(layout.pos.row = 1, layout.pos.col = i))
grDevices::dev.off()
cat("Generated graphics file", graphicsfile, "\n")
return(myplotlist)
}
}
## Usage:
# library(ggplot2); library(grid)
# featureCounts <- read.delim("results/featureCounts.xls", check.names=FALSE)
# myplots <- plotfeaturetypeCounts(x=featureCounts, graphicsfile="results/featureCounts.pdf", graphicsformat="pdf", scales="fixed", anyreadlength=FALSE, drop_N_total_aligned=TRUE, scale_count_val=10^6, scale_length_val=10^3)
# featureCounts2 <- read.delim("results/featureCounts2.xls", check.names=FALSE)
# myplots <- plotfeaturetypeCounts(x=featureCounts2, graphicsfile="results/featureCounts2.pdf", graphicsformat="pdf", scales="fixed", anyreadlength=TRUE, drop_N_total_aligned=TRUE, scale_count_val=10^6, scale_length_val=10^3)
##############################################
## Compute and plot coverage along features ##
##############################################
## Computes coverage along single and multi component features, such as exons/cds by transcripts.
featureCoverage <- function(bfl, grl, resizereads = NULL, readlengthrange = NULL, Nbins = 20, method = mean, fixedmatrix, resizefeatures, upstream, downstream, outfile, overwrite = FALSE) {
pkg <- c("GenomicAlignments", "IRanges", "GenomeInfoDb")
checkPkg(pkg, quietly = FALSE)
## Input validity checks
if (!is.null(outfile)) {
if (file.exists(outfile) & overwrite == FALSE) stop("File ", outfile, " exists. Delete it or set 'overwrite=TRUE'")
if (file.exists(outfile) & overwrite == TRUE) unlink(outfile)
}
if (class(bfl) != "BamFileList") stop("'bfl' needs to be of class 'BamFileList'")
if (!inherits(grl, "GRangesList")) stop("'grl' needs to be a GRangesList object.")
if (!is.null(readlengthrange)) {
if (is.numeric(readlengthrange) & length(readlengthrange) != 2) stop("'readlengthrange' needs to be assigned NULL or numeric vector of length 2")
if (!is.numeric(readlengthrange)) stop("'readlengthrange' needs to be assigned NULL or numeric vector of length 2")
if ((is.numeric(readlengthrange) & length(readlengthrange) == 2) & readlengthrange[1] > readlengthrange[2]) stop("Second value in read length range needs to be equal or larger then first one.")
}
if (fixedmatrix == TRUE & resizefeatures == FALSE) stop("If 'fixedmatrix=TRUE' then 'resizefeatures' needs to be set to 'TRUE' too.")
## If Nbins is not NULL, then remove ranges with width < Nbins
if (!is.null(Nbins)) grl <- grl[sum(width(reduce(grl))) >= Nbins]
## Resize features by upstream and downstream value
if (resizefeatures == TRUE) {
grl <- .resizeFeature(grl = grl, upstream = upstream, downstream = upstream, component_resort = TRUE)
}
## Following codes operates mostly on GRanges object
gr <- unlist(grl)
## Define empty buckets for collcecting results generated in master loop
binDFresult <- data.frame()
fixedMAresult <- data.frame()
fixedbinMAresult <- data.frame()
rle_result <- list()
for (mysample in seq_along(bfl)) {
## Compute coverage for ranges in stored in gr from BAM file. Note, coverage() works
## directly on BAM files but for resizing read lengths and strand specific analysis
## one needs to import the corresponding alignment sections first.
## Import alignment chunk
aligns <- GenomicAlignments::readGAlignments(bfl[[mysample]], param = ScanBamParam(what = Rsamtools::scanBamWhat(), which = gr))
## Remove duplicated mappings generated by readGAlignment for nearby/overlapping ranges; reduce() doesn't fix that
aligns <- aligns[!duplicated(paste(as.character(GenomeInfoDb::seqnames(aligns)), start(aligns), end(aligns), as.character(BiocGenerics::strand(aligns)), mcols(aligns)$qname, sep = "_"))]
## Subset to specific read length range specified under readlengthrange
if (is.numeric(readlengthrange)) {
aligns <- aligns[GenomicAlignments::qwidth(aligns) >= readlengthrange[1] & GenomicAlignments::qwidth(aligns) <= readlengthrange[2]]
}
## Compute coverage
if (is.numeric(resizereads[1])) {
cov <- coverage(resize(granges(aligns), resizereads[1]))
cov_pos <- coverage(resize(granges(IRanges::subsetByOverlaps(aligns, gr, ignore.strand = FALSE)), resizereads[1]))
} else if (is.null(resizereads)) {
cov <- coverage(aligns)
cov_pos <- coverage(IRanges::subsetByOverlaps(aligns, gr, ignore.strand = FALSE))
} else {
stop("'resizereads' needs to be assigned NULL or positive integer of length 1.")
}
cov_neg <- cov - cov_pos
## Extract coverage for gr components.
## Sense coverage
# cov_posreg <- suppressWarnings(Biostrings::Views(cov_pos, as(gr, "IntegerRangesList"))) # delete
cov_posreg <- suppressWarnings(Biostrings::Views(cov_pos[names(as(gr, "IntegerRangesList"))], as(gr, "IntegerRangesList"))) # Update 22-Nov-15: cov_pos needs to be subsetted by seqnames in IntegerRangesList. This is relevant if txdb/grl was created from gff with scaffolds not containing any genes
cov_posreg <- cov_posreg[sapply(cov_posreg, length) > 0] # Removes empty components (chr)
## Antisense coverage
# cov_negreg <- suppressWarnings(Biostrings::Views(cov_neg, as(gr, "IntegerRangesList"))) # delete
cov_negreg <- suppressWarnings(Biostrings::Views(cov_neg[names(as(gr, "IntegerRangesList"))], as(gr, "IntegerRangesList"))) # Update 22-Nov-15: cov_neg needs to be subsetted by seqnames in IntegerRangesList. This is relevant if txdb/grl was created from gff with scaffolds not containing any genes
cov_negreg <- cov_negreg[sapply(cov_negreg, length) > 0] # Removes empty components (chr)
## Collapse (splice) multicomponent coverage ranges, e.g. cds exons to full cds
## Sense coverage
mystrand <- BiocGenerics::strand(grl)
mystrand <- sapply(names(mystrand), function(x) as.character(mystrand[[x]])[1])
cov_posList <- as.list(mystrand)
## Features on "+" strand
for (i in seq_along(cov_posreg)) {
for (j in names(mystrand[mystrand == "+"])) {
myview <- cov_posreg[[i]][names(cov_posreg[[i]]) %in% j]
if (length(myview) != 0) {
cov_posList[[j]] <- S4Vectors::Rle(as.numeric(na.omit(as.vector(t(as.matrix(myview))))))
}
}
}
## Features on "-" strand: note coverage of component features needs to be reversed here prior to collapsing to single feature!
for (i in seq_along(cov_posreg)) {
for (j in names(mystrand[mystrand == "-"])) {
myview <- cov_posreg[[i]][names(cov_posreg[[i]]) %in% j]
if (length(myview) != 0) {
cov_posList[[j]] <- S4Vectors::Rle(as.numeric(na.omit(as.vector(t(as.matrix(reverse(myview)))))))
}
}
}
## Antisense coverage
cov_negList <- as.list(mystrand)
## Features on "+" strand
for (i in seq_along(cov_negreg)) {
for (j in names(mystrand[mystrand == "+"])) {
myview <- cov_negreg[[i]][names(cov_negreg[[i]]) %in% j]
if (length(myview) != 0) {
cov_negList[[j]] <- S4Vectors::Rle(as.numeric(na.omit(as.vector(t(as.matrix(myview))))))
}
}
}
## Features on "-" strand: note coverage of component features needs to be reversed here prior to collapsing to single feature!
for (i in seq_along(cov_negreg)) {
for (j in names(mystrand[mystrand == "-"])) {
myview <- cov_negreg[[i]][names(cov_negreg[[i]]) %in% j]
if (length(myview) != 0) {
cov_negList[[j]] <- S4Vectors::Rle(as.numeric(na.omit(as.vector(t(as.matrix(reverse(myview)))))))
}
}
}
## Get total number of aligned reads from BAM file and add to output for normalization
param <- ScanBamParam(flag = scanBamFlag(isSecondaryAlignment = FALSE, isUnmappedQuery = FALSE))
aligned_reads <- Rsamtools::countBam(bfl[mysample], param = param)
## If Nbins is integer, compute relative bin coverage for fixed number of bins (intervals) of each feature
if (!is.null(Nbins)) {
## Function to bin coverage
.interComp <- function(x, slice, method) {
interval <- cut(1:length(x), slice, labels = FALSE)
resultvec <- tapply(x, as.factor(interval), method)
return(resultvec)
}
## Sense coverage
myMApos <- t(sapply(cov_posList, function(y) .interComp(x = y, slice = Nbins, method = method)))
## Antisense coverage
myMAneg <- t(sapply(cov_negList, function(y) .interComp(x = y, slice = Nbins, method = method)))
## Combine results
myMAneg <- myMAneg[rownames(myMApos), , drop = FALSE] # Assure proper sorting
myDFpos <- data.frame(SampleName = names(bfl[mysample]), N_total_aligned = aligned_reads$records, IDs = rownames(myMApos), Strand = "Sense", myMApos, check.names = FALSE)
myDFneg <- data.frame(SampleName = names(bfl[mysample]), N_total_aligned = aligned_reads$records, IDs = rownames(myMAneg), Strand = "Antisense", myMAneg, check.names = FALSE)
myDFbins <- rbind(myDFpos, myDFneg, make.row.names = FALSE)
row.names(myDFbins) <- NULL
binDFresult <- rbind(binDFresult, myDFbins)
exportDF <- myDFbins
}
## Generate fixed matrix output
if (fixedmatrix == TRUE) {
mycolnames <- c(-upstream:-1, 0, 1:downstream)
fixedMAstartpos <- t(sapply(names(cov_posList), function(x) as.numeric(cov_posList[[x]])[1:(upstream + downstream + 1)]))
colnames(fixedMAstartpos) <- mycolnames
fixedMAendpos <- t(sapply(names(cov_posList), function(x) as.numeric(cov_posList[[x]])[(length(as.numeric(cov_posList[[x]])) - (upstream + downstream)):length(as.numeric(cov_posList[[x]]))]))
colnames(fixedMAendpos) <- mycolnames
fixedMAstartneg <- t(sapply(names(cov_negList), function(x) as.numeric(cov_negList[[x]])[1:(upstream + downstream + 1)]))
colnames(fixedMAstartneg) <- mycolnames
fixedMAendneg <- t(sapply(names(cov_negList), function(x) as.numeric(cov_negList[[x]])[(length(as.numeric(cov_negList[[x]])) - (upstream + downstream)):length(as.numeric(cov_negList[[x]]))]))
colnames(fixedMAendneg) <- mycolnames
fixedMA <- rbind(
data.frame(SampleName = names(bfl[mysample]), N_total_aligned = aligned_reads$records, IDs = rownames(fixedMAstartpos), Strand = "Sense", fixedMAstartpos, check.names = FALSE),
data.frame(SampleName = names(bfl[mysample]), N_total_aligned = aligned_reads$records, IDs = rownames(fixedMAstartneg), Strand = "Antisense", fixedMAstartneg, check.names = FALSE)
)
tmpMA <- rbind(
data.frame(fixedMAendpos[rownames(fixedMAstartpos), , drop = FALSE], check.names = FALSE),
data.frame(fixedMAendneg[row.names(fixedMAstartneg), , drop = FALSE], check.names = FALSE)
)
fixedMAout <- cbind(fixedMA, "<S|E>" = "|", tmpMA)
row.names(fixedMAout) <- NULL
fixedMAresult <- rbind(fixedMAresult, fixedMAout)
exportDF <- fixedMAout
}
## Combine fixed and binned coverage matrix
if (!is.null(Nbins) & fixedmatrix == TRUE) {
fixedbinMAout <- data.frame(fixedMA, "<S|B>" = "|", myDFbins[, -c(1:4), drop = FALSE], "<B|E>" = "|", tmpMA, check.names = FALSE)
row.names(fixedbinMAout) <- NULL
fixedbinMAresult <- rbind(fixedbinMAresult, fixedbinMAout)
exportDF <- fixedbinMAout
}
## Organize rle results in list
if (fixedmatrix == FALSE & is.null(Nbins)) {
rle_out <- list(list(N_total_aligned = aligned_reads$records, sense = as(cov_posList, "RleList"), antisense = as(cov_negList, "RleList")))
names(rle_out) <- names(bfl[mysample])
rle_result <- c(rle_result, rle_out)
}
## Write tabular data to file in append mode
if (!is.null(outfile) & (fixedmatrix == TRUE | !is.null(Nbins))) {
if (!file.exists(outfile)) {
write.table(exportDF, outfile, append = FALSE, quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
cat("Processed sample:", names(bfl[mysample]), "and wrote results to file", outfile, "\n")
} else {
write.table(exportDF, outfile, append = TRUE, quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
cat("Processed sample:", names(bfl[mysample]), "and appended results to file", outfile, "\n")
}
} else {
cat("Processed sample:", names(bfl[mysample]), "\n")
}
}
## Return proper final result
if (!is.null(Nbins) & fixedmatrix == FALSE) {
return(binDFresult)
} else if (is.null(Nbins) & fixedmatrix == TRUE) {
return(fixedMAresult)
} else if (!is.null(Nbins) & fixedmatrix == TRUE) {
return(fixedbinMAresult)
} else {
return(rle_result)
}
}
## Usage:
# grl <- GenomicFeatures::cdsBy(txdb, "tx", use.names=TRUE)
# fcov <- featureCoverage(bfl=BamFileList(outpaths[1]), grl=grl[1:4], resizereads=NULL, readlengthrange=NULL, Nbins=NULL, method=mean, fixedmatrix=TRUE, resizefeatures=TRUE, upstream=20, downstream=20, outfile="results/zzz.xls", overwrite=TRUE)
## Helper function to extend single and multi component ranges
## Extends featureBy GRangeList objects containing component features
## such as exons in CDSs or transcripts so that only the first and last components
## get extended. Single component features will be extended the same way.
.resizeFeature <- function(grl, upstream, downstream, component_resort = TRUE) {
checkPkg("GenomeInfoDb", quietly = FALSE)
if (!inherits(grl, "GRangesList")) stop("'grl' needs to be a GRangesList object.")
gr <- unlist(grl)
if (!all(names(grl) %in% unique(names(gr)))) stop("None or not all components in grl are named.")
## Add sort_index
mcols(gr) <- S4Vectors::DataFrame(mcols(gr), sort_index = seq_along(gr))
## Ranges on + strand
gr_pos <- gr[BiocGenerics::strand(gr) == "+" | BiocGenerics::strand(gr) == "*"] # Note: treats unstranded * the same as pos +
if (component_resort == TRUE) {
gr_pos <- gr_pos[order(as.character(GenomeInfoDb::seqnames(gr_pos)), names(gr_pos), start(gr_pos))] # Assures proper sorting
}
index_pos_first <- !duplicated(names(gr_pos), fromLast = FALSE)
suppressWarnings(start(gr_pos[index_pos_first]) <- start(gr_pos[index_pos_first]) - upstream)
gr_pos <- trim(gr_pos)
index_pos_last <- !duplicated(names(gr_pos), fromLast = TRUE)
suppressWarnings(end(gr_pos[index_pos_last]) <- end(gr_pos[index_pos_last]) + downstream)
gr_pos <- trim(gr_pos)
## Ranges on - strand
gr_neg <- gr[BiocGenerics::strand(gr) == "-"]
if (component_resort == TRUE) {
gr_neg <- gr_neg[order(as.character(GenomeInfoDb::seqnames(gr_neg)), names(gr_neg), -start(gr_neg))]
}
index_neg_first <- !duplicated(names(gr_neg), fromLast = FALSE)
suppressWarnings(end(gr_neg[index_neg_first]) <- end(gr_neg[index_neg_first]) + upstream)
gr_neg <- trim(gr_neg)
index_neg_last <- !duplicated(names(gr_neg), fromLast = TRUE)
suppressWarnings(start(gr_neg[index_neg_last]) <- start(gr_neg[index_neg_last]) - downstream)
gr_neg <- trim(gr_neg)
## Return output in same format as input
gr_mod <- c(gr_pos, gr_neg)
gr_mod <- gr_mod[order(mcols(gr_mod)$sort_index)]
gr_mod <- gr_mod[, seq_along(colnames(mcols(grl[[1]])))]
myfactor <- names(gr_mod)
if (is.null(names(grl[[1]]))) names(gr_mod) <- NULL
grl_mod <- split(gr_mod, myfactor)
grl_mod <- grl_mod[names(grl)]
return(grl_mod)
}
## Usage:
# grl_mod <- .resizeFeature(grl=grl, upstream=20, downstream=20, component_resort=TRUE)
###########################
## Plot feature coverage ##
###########################
## Plots tabular coverage data generated by featureCoverage()
plotfeatureCoverage <- function(covMA, method = mean, scales = "fixed", extendylim = 2, scale_count_val = 10^6) {
checkPkg("grid", quietly = FALSE)
## Some input validity checks
if (class(covMA) != "data.frame") stop("'covMA' needs to be assigned an object of class 'data.frame'.")
expectedcol <- c("SampleName", "N_total_aligned", "IDs", "Strand")
if (any(!colnames(covMA)[1:4] %in% expectedcol)) stop("The first 4 columns in 'covMA' need to be named: ", paste(expectedcol, collapse = ", "))
## global functions or variables
Coverage <- Strand <- NULL
## Determine split type required for provided input
splitloc <- which(grepl("(<S\\|B>)|(<B\\|E>)|(<S\\|E>)", colnames(covMA)))
## Get numbers of total aligned reads per sample
N_total_aligned <- covMA$N_total_aligned
names(N_total_aligned) <- covMA$SampleName
N_total_aligned <- N_total_aligned[!duplicated(names(N_total_aligned))]
## Define function to convert tabular input into ggplot2 friendly format
.convertDFlist <- function(malist) {
for (j in seq_along(malist)[-1]) {
df <- malist[[j]]
tmpDF <- data.frame()
for (i in seq_along(colnames(df))[-c(1, 2)]) {
tmpDF <- rbind(tmpDF, data.frame(df[, 1:2], Position = colnames(df)[i], Coverage = df[, i]))
}
malist[[j]] <- tmpDF
}
return(malist)
}
## Construct informative title: print ID if only one or their count if more than 1
ti <- as.character(unique(covMA[, "IDs"]))
if (length(ti) > 1) ti <- paste(length(ti), "features")
## Split and compute summary stats in format expected by ggplot2
## For binned matrix
if (length(splitloc) == 0) {
malist <- sapply(c("title", "bin"), function(x) NULL)
malist[["title"]] <- c(titlebin = paste("CDS of", ti), x = "Bins", y = "Coverage")
bin <- covMA
malist[["bin"]] <- aggregate(bin[, -c(1:4)], by = list(SampleName = bin$SampleName, Strand = bin$Strand), FUN = method)
malist <- .convertDFlist(malist)
}
## For fixed matrix around start and stop codons
if (length(splitloc) == 1) {
malist <- sapply(c("title", "start", "stop"), function(x) NULL)
malist[["title"]] <- c(titlestart = paste("Start of", ti), titlestop = paste("Stop of", ti), x = "Bins", y = "Coverage")
start <- covMA[, 1:(splitloc[1] - 1)]
malist[["start"]] <- aggregate(start[, -c(1:4)], by = list(SampleName = start$SampleName, Strand = start$Strand), FUN = method)
stop <- covMA[, c(1:4, (splitloc[1] + 1):length(colnames(covMA)))]
malist[["stop"]] <- aggregate(stop[, -c(1:4)], by = list(SampleName = stop$SampleName, Strand = stop$Strand), FUN = method)
malist <- .convertDFlist(malist)
}
## For fixed matrix and binned matrix
if (length(splitloc) == 2) {
malist <- sapply(c("title", "start", "bin", "stop"), function(x) NULL)
malist[["title"]] <- c(titlestart = paste("Start of", ti), titlebin = paste("CDS of", ti), titlestop = paste("Stop of", ti), x = "Bins", y = "Coverage")
start <- covMA[, 1:(splitloc[1] - 1)]
malist[["start"]] <- aggregate(start[, -c(1:4)], by = list(SampleName = start$SampleName, Strand = start$Strand), FUN = method)
bin <- covMA[, c(1:4, (splitloc[1] + 1):(splitloc[2] - 1))]
malist[["bin"]] <- aggregate(bin[, -c(1:4)], by = list(SampleName = bin$SampleName, Strand = bin$Strand), FUN = method)
stop <- covMA[, c(1:4, (splitloc[2] + 1):length(colnames(covMA)))]
malist[["stop"]] <- aggregate(stop[, -c(1:4)], by = list(SampleName = stop$SampleName, Strand = stop$Strand), FUN = method)
malist <- .convertDFlist(malist)
}
## Scale per x reads (e.g. per million reads)
if (is.numeric(scale_count_val[1])) {
for (i in seq_along(malist)[-1]) malist[[i]][, "Coverage"] <- malist[[i]][, "Coverage"] * (scale_count_val / N_total_aligned[malist[[i]][, "SampleName"]])
} else if (length(scale_count_val) == 0) {
malist <- malist
} else {
stop("'scale_count_val' needs to be assinged NULL or numeric value.")
}
## Label for coverage axis
if (length(scale_count_val) == 0) axis_label <- "Raw Coverage"
if (length(scale_count_val) > 0) axis_label <- paste("Coverage normalized per", scale_count_val, "reads")
## Get maximum coverage in data sets after summing up strand spec counts
mymax <- numeric()
for (i in seq_along(malist)[-1]) {
myfactor <- paste(malist[[i]][, 1], malist[[i]][, 2], malist[[i]][, 3], sep = "_")
mymax <- max(c(mymax, max(tapply(malist[[i]][, "Coverage"], myfactor, sum))))
}
mymax <- mymax * extendylim
## Generate plotting instructions
myplotlist <- sapply(names(malist[-1]), function(x) NULL, simplify = FALSE)
for (i in seq_along(myplotlist)) {
myplot <- ggplot2::ggplot(malist[[names(myplotlist)[i]]], ggplot2::aes(x = Position, y = Coverage)) +
ggplot2::geom_bar(ggplot2::aes(fill = Strand), position = "stack", stat = "identity") +
ggplot2::facet_wrap(~SampleName, ncol = 1, scales = scales) +
ggplot2::theme(legend.position = "bottom") +
ggplot2::ggtitle(malist[["title"]][i]) +
ggplot2::scale_x_discrete(limits = unique(malist[[names(myplotlist)[i]]]$Position))
## Assure same scale for all panels if scales="fixed"
if (scales == "fixed") {
myplot <- myplot + ggplot2::scale_y_continuous(axis_label, limits = c(0, mymax))
} else {
myplot <- myplot + ggplot2::scale_y_continuous(axis_label)
}
myplotlist[[i]] <- myplot
}
## Generate graphics
grid::grid.newpage() # Open a new page on grid device
grid::pushViewport(grid::viewport(layout = grid::grid.layout(1, length(myplotlist)))) # Assign to device viewport with 1 by 2 grid layout
for (i in seq(along = myplotlist)) print(myplotlist[[i]], vp = grid::viewport(layout.pos.row = 1, layout.pos.col = i))
}
## Usage:
# grl <- GenomicFeatures::cdsBy(txdb, "tx", use.names=TRUE)
# fcov <- featureCoverage(bfl=BamFileList(outpaths[1:2]), grl=grl[1:4], resizereads=NULL, readlengthrange=NULL, Nbins=20, method=mean, fixedmatrix=TRUE, resizefeatures=TRUE, upstream=20, downstream=20, outfile="results/zzz.xls", overwrite=TRUE)
# plotfeatureCoverage(covMA=fcov, method=mean, scales="fixed", extendylim=2, scale_count_val=10^6)
##################
## Predict ORFs ##
##################
## Function to predict ORFs in DNA sequences provided as DNAString/DNAStringSet objects
predORF <- function(x, n = 1, type = "grl", mode = "orf", strand = "sense", longest_disjoint = FALSE, startcodon = "ATG", stopcodon = c("TAA", "TAG", "TGA")) {
pkg <- c("IRanges", "GenomeInfoDb")
checkPkg(pkg, quietly = FALSE)
## Check input validity
if (any(nchar(c(startcodon, stopcodon)) != 3)) stop("startcodon and stopcodons can only contain 3-letter strings.")
if (!toupper(mode) %in% c("ORF", "CDS")) stop("'mode' can only be assigned one of: 'orf' or 'cds'")
if (length(names(x)) == 0 | any(duplicated(names(x)))) stop("Sequence name slot of x need be populated with unique names.")
## Remove sequence with less than 6 nucleotides in length
x <- x[width(x) >= 6]
## Function for predicting ORFs/CDSs on single sequence
.predORF <- function(x, n, mode, strand, ...) {
## start/stop codon assignment
if (tolower(strand) == "sense") {
mystrand <- 1 # for +
startcodon <- startcodon
stopcodon <- stopcodon
} else if (tolower(strand) == "antisense") {
## Reverse and complement start/stop plus swap their assignment
mystrand <- 2 # for -
stopcodon_temp <- as.character(Biostrings::reverseComplement(Biostrings::DNAStringSet(stopcodon)))
startcodon_temp <- as.character(Biostrings::reverseComplement(Biostrings::DNAStringSet(startcodon)))
stopcodon <- startcodon_temp
startcodon <- stopcodon_temp
} else {
stop("strand can only be assigned 'sense', 'antisense' or 'both'")
}
## Sequences containing N are not processed
if (Biostrings::alphabetFrequency(x)["N"] > 0) {
orfRanges <- cbind(subject_id = numeric(), start = numeric(), end = numeric(), width = numeric(), strand = numeric(), inframe2end = numeric())
warning("Skipped sequence containing Ns.")
return(orfRanges)
}
## Tripletize x for each frame
c1 <- as.character(suppressWarnings(Biostrings::codons(x)))
c2 <- as.character(suppressWarnings(Biostrings::codons(x[2:length(x)])))
c3 <- as.character(suppressWarnings(Biostrings::codons(x[3:length(x)])))
## Identify position of start/stop in tripletized x
startpos1 <- which(c1 %in% startcodon)
stoppos1 <- which(c1 %in% stopcodon)
startpos2 <- which(c2 %in% startcodon)
stoppos2 <- which(c2 %in% stopcodon)
startpos3 <- which(c3 %in% startcodon)
stoppos3 <- which(c3 %in% stopcodon)
## Make sure subsequent code also finds coding sequence frames (CDS) rather than just strict open reading frames (ORFs)
if (mode == "cds") {
stoppos1 <- unique(c(0, stoppos1, length(c1) + 1))
stoppos2 <- unique(c(0, stoppos2, length(c2) + 1))
stoppos3 <- unique(c(0, stoppos3, length(c3) + 1))
startpos1 <- stoppos1
startpos2 <- stoppos2
startpos3 <- stoppos3
}
## Map tripletized matches back to sequence of x
if (tolower(strand) == "sense") {
orfpos1 <- t(sapply(seq(along = startpos1), function(x) c((startpos1[x] * 3) - 2, stoppos1[stoppos1 > startpos1[x]][1] * 3)))
if (length(orfpos1) == 0) orfpos1 <- matrix(nrow = 0, ncol = 2)
orfpos2 <- t(sapply(seq(along = startpos2), function(x) c((startpos2[x] * 3) - 1, (stoppos2[stoppos2 > startpos2[x]][1] * 3) + 1)))
if (length(orfpos2) == 0) orfpos2 <- matrix(nrow = 0, ncol = 2)
orfpos3 <- t(sapply(seq(along = startpos3), function(x) c((startpos3[x] * 3) - 0, (stoppos3[stoppos3 > startpos3[x]][1] * 3) + 2)))
if (length(orfpos3) == 0) orfpos3 <- matrix(nrow = 0, ncol = 2)
}
if (tolower(strand) == "antisense") { # Note - for antisense predictions startpos contain stop codons and stoppos contain start codons
startpos1 <- sort(startpos1, decreasing = TRUE)
startpos2 <- sort(startpos2, decreasing = TRUE)
startpos3 <- sort(startpos3, decreasing = TRUE)
stoppos1 <- sort(stoppos1, decreasing = TRUE)
stoppos2 <- sort(stoppos2, decreasing = TRUE)
stoppos3 <- sort(stoppos3, decreasing = TRUE)
orfpos1 <- t(sapply(seq(along = stoppos1), function(x) c((startpos1[startpos1 < stoppos1[x]][1] * 3) - 2, stoppos1[x] * 3)))
if (length(orfpos1) == 0) orfpos1 <- matrix(nrow = 0, ncol = 2)
orfpos2 <- t(sapply(seq(along = stoppos2), function(x) c((startpos2[startpos2 < stoppos2[x]][1] * 3) - 1, (stoppos2[x] * 3) + 1)))
if (length(orfpos2) == 0) orfpos2 <- matrix(nrow = 0, ncol = 2)
orfpos3 <- t(sapply(seq(along = stoppos3), function(x) c((startpos3[startpos3 < stoppos3[x]][1] * 3) - 0, (stoppos3[x] * 3) + 2)))
if (length(orfpos3) == 0) orfpos3 <- matrix(nrow = 0, ncol = 2)
}
orfRanges <- rbind(orfpos1, orfpos2, orfpos3)
orfRanges <- na.omit(orfRanges)
if (mode == "cds") { # Modifications required for mode=="cds"
orfRanges[, 1] <- orfRanges[, 1] + 3
orfRanges[orfRanges[, 2] > length(x), 2] <- length(x)
}
orfRanges <- IRanges::IRanges(start = orfRanges[, 1], end = orfRanges[, 2])
orfRanges <- orfRanges[rev(order(width(orfRanges)))]
## Organize results in data.frame and also add info about frame of predicted ORF to downstream ORF e.g. prediction is uORF of 5'-UTR
orfRanges <- as.data.frame(orfRanges)
inframe <- (length(x) - orfRanges$end) / 3
inframe2 <- inframe
inframe2[abs((inframe - as.integer(inframe))) == 0] <- 1
inframe2[abs(round((inframe - as.integer(inframe)), 2)) == round(1 / 3, 2)] <- 2
inframe2[abs(round((inframe - as.integer(inframe)), 2)) == round(2 / 3, 2)] <- 3
if (nrow(orfRanges) > 0) {
orfRanges <- cbind(subject_id = 1:length(orfRanges[, 1]), orfRanges, strand = mystrand, inframe2end = inframe2)
} else {
orfRanges <- cbind(subject_id = numeric(), start = numeric(), end = numeric(), width = numeric(), strand = numeric(), inframe2end = numeric())
}
## Return all ORFs
if (n == "all") {
## Subset to non-overlapping ORF set containing longest ORF
if (nrow(orfRanges) > 0 & longest_disjoint == TRUE) {
tmpgr <- GRanges(seqnames = "dummy", IRanges::IRanges(orfRanges[, 2], orfRanges[, 3]), strand = "+")
orfRanges <- orfRanges[disjointBins(tmpgr) == 1, , drop = FALSE]
orfRanges[, 1] <- 1:nrow(orfRanges)
}
return(orfRanges)
## Return only as many ORFs as specified under n sorted decreasingly by length
} else if (is.numeric(n)) {
## Make sure subsetting does not exceed number of records in orfRanges
if (nrow(orfRanges) < n & nrow(orfRanges) != 0) {
upperlimit <- length(orfRanges[, 1])
} else if (nrow(orfRanges) > 0) {
upperlimit <- n
} else {
upperlimit <- NULL
}
if (is.null(upperlimit)) {
return(orfRanges[upperlimit, , drop = FALSE])
} else {
return(orfRanges[1:upperlimit, , drop = FALSE])
}
} else {
stop("n needs to be assigned positive integer or 'all'")
}
}
## Run .predORF
if (class(x) == "DNAString") {
if (tolower(strand) == "sense" | tolower(strand) == "antisense") {
tmp <- .predORF(x, n, mode, strand, longest_disjoint, startcodon, stopcodon)
tmp[, "strand"] <- ifelse(as.numeric(tmp[, "strand"]) == 1, "+", "-")
} else if (tolower(strand) == "both") {
tmp_pos <- .predORF(x, n, mode, strand = "sense", longest_disjoint, startcodon, stopcodon)
tmp_neg <- .predORF(x, n, mode, strand = "antisense", longest_disjoint, startcodon, stopcodon)
tmp <- rbind(tmp_pos, tmp_neg)
tmp[, "strand"] <- ifelse(as.numeric(tmp[, "strand"]) == 1, "+", "-")
} else {
stop("strand can only be assigned 'sense', 'antisense' or 'both'")
}
if (type == "df") {
return(tmp)
}
if (type == "gr") {
return(makeGRangesFromDataFrame(data.frame(seqnames = "unknown", tmp), keep.extra.columns = TRUE))
}
}
if (class(x) == "DNAStringSet") {
if (tolower(strand) == "sense" | tolower(strand) == "antisense") {
tmp <- lapply(x, function(y) .predORF(y, n = n, mode = mode, strand, longest_disjoint, startcodon, stopcodon))
names(tmp) <- names(x)
tmpdf <- do.call("rbind", tmp)
rownames(tmpdf) <- NULL
tmpdf <- data.frame(seqnames = rep(names(tmp), sapply(tmp, nrow)), tmpdf)
tmpdf[, "strand"] <- ifelse(as.numeric(tmpdf[, "strand"]) == 1, "+", "-")
} else if (tolower(strand) == "both") {
## for sense strand
tmp_pos <- lapply(x, function(y) .predORF(y, n = n, mode = mode, strand = "sense", longest_disjoint, startcodon, stopcodon))
names(tmp_pos) <- names(x)
tmpdf_pos <- do.call("rbind", tmp_pos)
rownames(tmpdf_pos) <- NULL
## for antisense strand
tmpdf_pos <- data.frame(seqnames = rep(names(tmp_pos), sapply(tmp_pos, nrow)), tmpdf_pos)
tmp_neg <- lapply(x, function(y) .predORF(y, n = n, mode = mode, strand = "antisense", longest_disjoint, startcodon, stopcodon))
names(tmp_neg) <- names(x)
tmpdf_neg <- do.call("rbind", tmp_neg)
rownames(tmpdf_neg) <- NULL
tmpdf_neg <- data.frame(seqnames = rep(names(tmp_neg), sapply(tmp_neg, nrow)), tmpdf_neg)
## combine and sort results
tmpdf <- rbind(tmpdf_pos, tmpdf_neg)
tmpdf[, "strand"] <- ifelse(as.numeric(tmpdf[, "strand"]) == 1, "+", "-")
tmpdf <- tmpdf[order(tmpdf$seqnames, tmpdf$subject_id), ]
} else {
stop("strand can only be assigned 'sense', 'antisense' or 'both'")
}
## Return results in format defined by type
if (tolower(type) == "df") {
rownames(tmpdf) <- NULL
return(tmpdf)
} else if (tolower(type) == "gr") {
tmpdf <- makeGRangesFromDataFrame(tmpdf, keep.extra.columns = TRUE)
return(tmpdf)
} else if (tolower(type) == "grl") {
tmpdf <- makeGRangesFromDataFrame(tmpdf, keep.extra.columns = TRUE)
tmpdf <- split(tmpdf, as.character(GenomeInfoDb::seqnames(tmpdf)))
return(tmpdf)
} else {
stop("type can only be assigned 'df' of 'gr'")
}
}
}
# library(Biostrings)
# file <- system.file("extdata", "someORF.fa", package="Biostrings")
# dna <- readDNAStringSet(file)
# orf <- predORF(dna[1:4], n=1, type="df", mode="orf", strand="antisense", startcodon="ATG", stopcodon=c("TAA", "TAG", "TGA"))
######################################################
## Scale feature level mappings to genome positions ##
######################################################
## Function to scale ranges within genomic features (query)
## to corresponding genome positions (subject). The function
## also accounts for intron insertions of compound features
## (e.g. between exons of transcribed regions) that are absent in
## the query ranges, but present in the corresponding subject ranges.
scaleRanges <- function(subject, query, type = "custom", verbose = TRUE) {
pkg <- c("IRanges", "GenomeInfoDb")
checkPkg(pkg, quietly = FALSE)
## Both input objects need to be of class GRangesList
if (!inherits(subject, "GRangesList") | !inherits(query, "GRangesList")) stop("Both subject and query need to be GRangesList objects.")
## All names(query) need to be present in names(subject)
if (any(!names(query) %in% names(subject))) stop("All 'names(query)' need to be present in 'names(subject)'.")
## Perform scaling on single subject/query pair each containing on entry
.scaleRanges <- function(subject, query, returntype = "df") {
## Check for validity of query
if (length(query) > 1) warning("Only the first range in 'query' will be used.")
query <- query[1]
if (sum(width(subject)) < width(query)) stop("Sum of width of subject ranges cannot be smaller than width of query range.")
## Check for validity of subject
subjectstrand <- unique(as.character(BiocGenerics::strand(subject)))
if (length(subjectstrand) != 1) stop("More than one orientation detected. There can only be one.")
querystrand <- unique(as.character(BiocGenerics::strand(query)))
## Check for validity of seqnames
myseqname <- unique(as.character(GenomeInfoDb::seqnames(subject)))
if (length(myseqname) != 1) stop("More than one seqname detected. There can only be one.")
## Scale query range to range of subject ranges (of genomic feature) using interval trees from IRanges
subject <- subject[order(start(subject))]
rangev <- paste("c(", paste(paste(start(subject), end(subject), sep = ":"), collapse = ", "), ")", sep = "")
rangev <- eval(parse(text = rangev))
names(rangev) <- seq_along(rangev)
if (subjectstrand == "+") rangev <- rangev[start(query):end(query)]
if (subjectstrand == "-") rangev <- rev(rangev)[start(query):end(query)]
ir <- reduce(IRanges::IRanges(rangev, rangev))
## Set orientation properly
if (querystrand == "-" & subjectstrand == "-") mystrand <- "+"
if (querystrand == "+" & subjectstrand == "+") mystrand <- "+"
if (querystrand == "+" & subjectstrand == "-") mystrand <- "-"
if (querystrand == "-" & subjectstrand == "+") mystrand <- "-"
## Organize result as GRanges
if (mystrand == "+") { # Proper exon ranking for exons on +/- strand
gr <- GRanges(myseqname, ir[order(start(ir), decreasing = FALSE)], mystrand)
} else {
gr <- GRanges(myseqname, ir[order(start(ir), decreasing = TRUE)], mystrand)
}
mcols(gr) <- data.frame(type = type)
## Return results as data.frame or GRanges
if (returntype == "gr") {
return(gr)
}
if (returntype == "df") {
return(as.data.frame(gr))
}
}
## Run .scaleRanges on two GRangesLists as input
subject <- subject[names(query)] # Subset subject to entries in query
mygrl <- unlist(query)
myids1 <- rep(names(query), sapply(start(query), length))
myids2 <- paste(myids1, ":", start(mygrl), "_", end(mygrl), sep = "")
mcols(mygrl) <- data.frame(type = type)
mygrl <- split(mygrl, seq_along(mygrl))
names(mygrl) <- myids2
mylist <- as.list(character(length = length(mygrl)))
names(mylist) <- names(mygrl)
for (i in seq_along(mygrl)) {
myentryid <- myids1[i]
suppressWarnings(mylist[[i]] <- .scaleRanges(subject[[myentryid]], mygrl[[i]]))
if (verbose == TRUE) {
cat(paste0("Scaled range ", i, " of ", length(mygrl), " ranges: ", names(mygrl[i])), "\n")
}
}
## If loop gets interrupted, output only completed results!
mylist <- mylist[1:i]
gr <- GenomicRanges::makeGRangesFromDataFrame(do.call("rbind", mylist), keep.extra.columns = FALSE)
mcols(gr) <- S4Vectors::DataFrame(feature_by = as(rep(names(mylist), sapply(mylist, nrow)), "CharacterList"), featuretype_id = names(gr), featuretype = type)
names(gr) <- NULL
grl <- split(gr, as.character(mcols(gr)$feature_by))
return(grl)
}
## Usage for simple example
# subject <- GRanges(seqnames="Chr1", IRanges(c(5,15,30),c(10,25,40)), strand="+")
# query <- GRanges(seqnames="myseq", IRanges(1, 9), strand="+")
# scaleRanges(GRangesList(myid1=subject), GRangesList(myid1=query), type="test")
## Usage for more complex example
# library(txdbmaker)
# gff <- system.file("extdata/annotation", "tair10.gff", package="systemPipeRdata")
# txdb <- makeTxDbFromGFF(file=gff, format="gff3", organism="Arabidopsis")
# futr <- fiveUTRsByTranscript(txdb, use.names=TRUE)
# genome <- system.file("extdata/annotation", "tair10.fasta", package="systemPipeRdata")
# dna <- extractTranscriptSeqs(FaFile(genome), futr)
# uorf <- predORF(dna, n="all", mode="orf", longest_disjoint=TRUE, strand="sense")
# grl_scaled <- scaleRanges(subject=futr, query=uorf, type="uORF", verbose=TRUE)
# rtracklayer::export.gff3(unlist(grl_scaled), "uorf.gff")
#######################################
## Translational Efficiency Analysis ##
#######################################
## Currently, this step is covered entire by the code given in the Ribo-Seq
## workflow vignette using DESeq2 out of the box. The approach ended up being
## very similar to some of time course analysis steps given in the DESeq2
## vignette as well as some community discussions:
## https://support.bioconductor.org/p/61509/ also here
## https://support.bioconductor.org/p/67455/
## To-dos: add convenience function similar to run_DESeq2 (or extend it)
## to handle this analysis step for many experiment sets in a more convenient manner.
######################
## Adaptor Trimming ##
######################
## Trims adaptors hierarchically from longest to shortest match from right end of read.
## If 'internalmatch=TRUE' then internal matches will trigger the same behavior. The
## argument minpatternlength defines shortest adaptor match to consider for reads
## containing only partial adaptors at the right end.
trimbatch <- function(fq, pattern, internalmatch = FALSE, minpatternlength = 8,
Nnumber = 1, polyhomo = 100, minreadlength = 18, maxreadlength) {
mylength <- max(width(fq))
if (internalmatch == TRUE) {
regpart <- ".*"
} else {
regpart <- "$"
}
processedreads <- ShortRead::ShortReadQ()
## Iterative trimming based on regular expression
for (j in seq(along = minpatternlength:(nchar(pattern))) - 1) {
mypattern <- paste(substring(pattern, 1, nchar(pattern) - j), regpart, sep = "")
d <- Biostrings::DNAStringSet(gsub(mypattern, "", sread(fq)))
q <- Biostrings::BStringSet(Biostrings::quality(Biostrings::quality(fq)), 1, width(d))
fq <- ShortRead::ShortReadQ(sread = d, quality = q, id = id(fq))
processedreads <- BiocGenerics::append(processedreads, fq[width(fq) < mylength])
fq <- fq[width(fq) >= mylength]
}
processedreads <- BiocGenerics::append(processedreads, fq)
processedreads <- processedreads[width(processedreads) >= minreadlength & width(processedreads) <= maxreadlength] # Read length filter
filter1 <- ShortRead::nFilter(threshold = Nnumber) # keep only reads with <= x Ns
filter2 <- ShortRead::polynFilter(threshold = polyhomo, nuc = c("A", "C", "T", "G")) # remove reads with x or more of the same nucleotide
filter <- ShortRead::compose(filter1, filter2) # Combine filters into one
processedreads[filter(processedreads)]
}
## Usage:
# iterTrim <- trimbatch(fq, pattern="ACACGTCT", minpatternlength=6, Nnumber=1, polyhomo=20, minreadlength=16, maxreadlength = 101)
# preprocessReads(args=args, Fct=iterTrim, batchsize=100000, overwrite=TRUE, compress=TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.