Nothing
#' Convert BED file into data.frame of count windows
#'
#' @param filename filename of the BED file with absolute or relative path
#' (structure of BED file without header: chromosome, exon start, exon end,
#' exon name)
#' @param chr indicates whether naming contains chr prefix
#' @return a data.frame with the contents of the BED file with an additional
#' gene name and exon name column
#' @examples
#' bed <- list.files(system.file("extdata", package = "panelcn.mops"),
#' pattern = ".bed$", full.names = TRUE)
#' countWindows <- getWindows(bed)
#' @importFrom utils read.csv
#' @export
getWindows <- function(filename, chr = FALSE) {
data <- read.csv(filename, sep="\t", header = FALSE)
if (ncol(data) < 4 || is.na(data[1,2])) {
stop("BED file needs to have gene name in 4th column and no header.")
}
if (!is.numeric(data$V2) || !is.numeric(data$V3)) {
stop(paste0("2nd and 3rd column of BED file need to be numeric. ",
"Make sure all columns are separated by tabs."))
}
data <- data[,1:4]
data$V4 <- as.character(data$V4)
data$V1 <- as.character(data$V1)
data <- data[!duplicated(paste(data$V1, data$V2, data$V3, sep="_")),]
firstColumnAsList <- strsplit(data[,4], "[.]")
firstColumnAsVector <- sapply(firstColumnAsList, '[', 1)
data[,5] <- firstColumnAsVector
secondColumnAsVector <- sapply(firstColumnAsList, '[', 2)
if (length(grep("E", secondColumnAsVector))) {
data[,6] <- secondColumnAsVector
} else {
data[,6] <- NA
}
names(data) <- c("chromosome", "start", "end", "name", "gene", "exon")
if (!chr && any(grepl("chr", data$chromosome))) {
data[,1] <- sub("chr", "", data[,1])
message(paste0("naming without chr prefix chosen, but BED contains ",
"chr -> removing chr"))
} else if (chr && !any(grepl("chr", data$chromosome))) {
data[,1] <- paste0("chr", data[,1])
message(paste0("naming with chr prefix chosen, but BED does not",
" contain chr -> adding chr"))
}
return(data)
}
#' Get read counts for a list of BAM files and given count windows
#'
#' @param bam.files list with absolute or relative paths to BAM files
#' @param countWindows data.frame with contents of a BED file as returned by
#' getWindows
#' @param read.width read.width parameter for countBamInGRanges or FALSE if
#' actual read width should be extracted from BAM file
#' @param ... additional parameters
#' @return a GRanges object over the countWindows with read counts for each
#' sample as elementMetadata
#' @export
#' @import GenomicRanges
#' @import IRanges
#' @importFrom Rsamtools scanBamHeader
#' @importFrom Rsamtools ScanBamParam
#' @importFrom Rsamtools scanBamFlag
#' @importFrom Rsamtools scanBam
#' @examples
#' bed <- system.file("extdata/Genes_part.bed", package = "panelcn.mops")
#' countWindows <- getWindows(bed)
#'\dontrun{
#' testbam <- "SAMPLE1.bam"
#' test <- countBamListInGRanges(countWindows = countWindows,
#' bam.files = testbam, read.width = 150)
#' }
countBamListInGRanges <- function(bam.files, countWindows, read.width = 150,
...){
RCs <- list()
GR <- GRanges(countWindows[,1], IRanges(countWindows[,2],
countWindows[,3]))
for (i in seq_along(bam.files)) {
message(paste("Processing", basename(bam.files[i]), "...", i, "/",
length(bam.files)))
indexed <- (file.exists(paste(bam.files[i],".bai",sep="")) |
file.exists(gsub(".bam",".bai", bam.files[i])))
if (indexed){
if (read.width == FALSE) {
message("get.width")
RCs[[i]] <- .countBamInGRanges(bam.file=bam.files[i], GR ,
min.mapq=1, get.width = TRUE,
...)
} else {
RCs[[i]] <- .countBamInGRanges(bam.file=bam.files[i], GR ,
min.mapq=1,
read.width = read.width, ...)
}
} else {
message(paste0("No index file (.bai) found for BAM file",
bam.files[i], ". RCs cannot be calculated"))
return()
}
}
message("finished processing samples")
M <- matrix(unlist(RCs), nrow=nrow(countWindows))
if (all(M == 0)) {
message(paste0("All read counts are 0. Please make sure ",
"that you have the right BAM file and BED file."))
}
# colnames(M) <- basename(bam.files)
# print(colnames(M))
RC <- GR
RC@elementMetadata <- DataFrame(M)
colnames(RC@elementMetadata) <- basename(bam.files)
return(RC)
}
#' countBamInGRanges from package exomeCopy
.countBamInGRanges <- function(bam.file, granges, min.mapq = 1, read.width = 1,
stranded.start = FALSE,
get.width = FALSE, remove.dup = FALSE) {
rds.counts <- integer(length(granges))
seq.names <- unique(as.character(GenomicRanges::seqnames(granges)))
seq.names.in.bam <- names(Rsamtools::scanBamHeader(bam.file)[[1]]$targets)
if ((sum(grepl(pattern = '^chr', seq.names)) > 0) &&
(sum(grepl(pattern = '^chr', seq.names.in.bam)) == 0)) {
warning('countWindows contain chr prefix to the chromosome name but
BAM does not -> removing chr.')
seq.names <- substr(seq.names, 4, nchar(seq.names))
GenomeInfoDb::seqlevels(granges) <-
substr(GenomeInfoDb::seqlevels(granges), 4,
nchar(GenomeInfoDb::seqlevels(granges)))
} else if ((sum(grepl(pattern = '^chr', seq.names.in.bam)) > 0) &&
(sum(grepl(pattern = '^chr', seq.names)) == 0)) {
warning('BAM contains chr prefix to the chromosome name but
countWindows do not -> adding chr.')
seq.names <- paste0("chr", seq.names)
GenomeInfoDb::seqlevels(granges) <-
paste0("chr", GenomeInfoDb::seqlevels(granges))
}
for (seq.name in seq.names) {
if (seq.name %in% seq.names.in.bam) {
granges.subset <-
granges[GenomicRanges::seqnames(granges) == seq.name]
strand(granges.subset) <- "*"
scan.what <- c("pos", "mapq")
if (stranded.start | get.width) {
scan.what <- c(scan.what, "qwidth")
}
if (stranded.start | remove.dup) {
scan.what <- c(scan.what, "strand")
}
rds <-
Rsamtools::scanBam(bam.file,
param = Rsamtools::ScanBamParam(what = scan.what,
which = range(granges.subset)))
if (min.mapq > 0) {
mapq.test <- rds[[1]]$mapq >= min.mapq & !is.na(rds[[1]]$mapq)
}
else {
mapq.test <- rep(TRUE, length(rds[[1]]$mapq))
}
if (remove.dup) {
if (get.width) {
mapq.test <- mapq.test &
!duplicated(rds[[1]]$pos +
as.numeric(rds[[1]]$strand)/10 +
rds[[1]]$qwidth/1e+05)
}
else {
mapq.test <- mapq.test & !duplicated(rds[[1]]$pos +
as.numeric(rds[[1]]$strand)/10)
}
}
if (sum(mapq.test) > 0) {
if (stranded.start) {
rds.ranges <-
GenomicRanges::GRanges(seq.name,
IRanges::IRanges(start =
ifelse(rds[[1]]$strand[mapq.test] %in%
c("+", "*"), rds[[1]]$pos[mapq.test],
rds[[1]]$pos[mapq.test] +
rds[[1]]$qwidth[mapq.test] - read.width),
end = ifelse(rds[[1]]$strand[mapq.test] %in%
c("+", "*"), rds[[1]]$pos[mapq.test] +
read.width - 1, rds[[1]]$pos[mapq.test] +
rds[[1]]$qwidth[mapq.test] - 1)))
}
else if (get.width) {
rds.ranges <-
GenomicRanges::GRanges(seq.name,
IRanges::IRanges(start = rds[[1]]$pos[mapq.test],
width = rds[[1]]$qwidth[mapq.test]))
}
else {
rds.ranges <- GenomicRanges::GRanges(seq.name,
IRanges::IRanges(start = rds[[1]]$pos[mapq.test],
width = read.width))
}
rds.counts.seq.name <-
GenomicRanges::countOverlaps(granges.subset, rds.ranges)
rds.counts[as.logical(GenomicRanges::seqnames(granges) ==
seq.name)] <- rds.counts.seq.name
}
else {
rds.counts[as.logical(GenomicRanges::seqnames(granges) ==
seq.name)] <- 0
}
}
else {
rds.counts[as.logical(GenomicRanges::seqnames(granges) ==
seq.name)] <- 0
}
}
if (sum(rds.counts) == 0) {
message("No reads found with minimum mapping quality")
}
rds.counts
}
#' Split (larger) ROIs into multiple smaller (overlapping) bins and create
#' new BED file
#'
#' @param oldBedFile filename of the BED file with absolute or relative path
#' (structure of BED file without header: chromosome, exon start, exon end,
#' exon name)
#' @param newBedFile filename of the new BED file that should be created
#' @param limit ROIs larger than limit will be split
#' @param bin size of bins (in bp) the ROIs will be split into
#' @param shift no. of bp between start positions of adjacent bins
#' @param chr indicates whether naming contains chr prefix
#' @return generates a new BED file with (larger) ROIs split into smaller bins
#' @importFrom utils write.table
#' @export
#' @examples
#' bed <- list.files(system.file("extdata", package = "panelcn.mops"),
#' pattern = ".bed$", full.names = TRUE)
#' splitROIs(bed, "newBed.bed")
splitROIs <- function(oldBedFile, newBedFile, limit = 0, bin = 100, shift = 50,
chr = FALSE) {
if (!(is.numeric(limit) & limit >= 0 & length(limit)==1)) {
stop("\"limit\" must be numeric, larger or equal 0 and of length 1.")
}
if (!(is.numeric(bin) & bin > 0 & length(bin)==1)) {
stop("\"bin\" must be numeric, larger than 0 and of length 1.")
}
if (!(is.numeric(shift) & shift > 0 & length(shift)==1)) {
stop("\"shift\" must be numeric, larger than 0 and of length 1.")
}
message(paste("reading from bedFile", basename(oldBedFile)))
windows <- getWindows(oldBedFile, chr)
message(paste("No. of original ROIs:", nrow(windows)))
starts <- c()
ends <- c()
chrom <- c()
exon <- c()
geneName <- c()
for (i in seq_len(nrow(windows))) {
cw_row <- windows[i,]
if ((cw_row$end - cw_row$start) > limit &
(cw_row$end - cw_row$start) > bin) {
starts <- c(starts, seq(cw_row$start, (cw_row$end - bin + 1), shift))
ends <- c(ends, seq((cw_row$start + bin - 1), cw_row$end, shift))
diff <- ends[length(ends)] - cw_row$end
if (diff != 0) {
stopifnot(diff < 0)
ends[length(ends)] <- cw_row$end
}
nrows <- length(ends) - length(chrom)
chrom <- c(chrom, rep(cw_row$chrom, nrows))
geneName <- c(geneName, rep(cw_row$gene, nrows))
exon <- c(exon, rep(cw_row$exon, nrows))
} else {
starts <- c(starts, cw_row$start)
ends <- c(ends, cw_row$end)
chrom <- c(chrom, cw_row$chrom)
geneName <- c(geneName, cw_row$gene)
exon <- c(exon, cw_row$exon)
}
}
if (sum(grepl(pattern = '^chr', chrom)) == 0) {
chromn <- paste0("chr", chrom)
} else {
chromn <- chrom
}
geneName <- paste(geneName, exon, chromn, starts, ends, sep = ".")
message(paste("Created", length(chrom), "ROIs"))
windows <- data.frame(chrom = chrom, start = starts, end = ends,
name = geneName, stringsAsFactors = FALSE)
write.table(windows, file = newBedFile, sep = "\t", quote = FALSE,
col.names = FALSE, row.names = FALSE)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.