R/read.bedix.R

Defines functions read.bedix

Documented in read.bedix

##' Using \code{Rsamtools} functions to manipulate tabix file, the indexed BED file is read
##' and the features present in the selected region are retrieved. 
##' @title Retrieve a subset of an indexed BED file 
##' @param file a BED file containing the data to be read.
##' It should be compressed with \code{bgzip} and indexed by tabix algorithm.
##' @param subset.reg a data.frame or GRanges object with the regions to subset from.
##' If a data.frame, it must have columns named 'chr', 'start' and 'end'.
##' @param header should the first line of \code{file} be used as header. Default is \code{TRUE}.
##' @return a data.frame with the retrieved BED information.
##' @author Jean Monlong, Diego Garrido-Martín
##' @export
##' @import data.table
read.bedix <- function(file, subset.reg = NULL, header = TRUE) 
{
    if(!is.character(file)){
        file <- as.character(file)
    }
    if(!file.exists(file)){
        file <- paste0(file, ".bgz")
    }
    if(!file.exists(file)){
        stop(file, "Input file not found (with and without .bgz extension).")
    }
    if(!file.exists(paste0(file, ".tbi"))){
        stop(paste0(file, ".tbi"), "Index file not found.")
    }
    if(is.null(subset.reg)){
        return(utils::read.table(file, as.is = TRUE, header = header))
    }
    if (is.data.frame(subset.reg)) {
        if(!all(c("chr", "start", "end") %in% colnames(subset.reg))){
            stop("Missing column in 'subset.reg'. 'chr', 'start' and 'end' are required.")
        }
        subset.reg <- with(subset.reg, 
                           GenomicRanges::GRanges(chr, IRanges::IRanges(start, end)))
    } else if (class(subset.reg) != "GRanges") {
        stop("'subset.reg' must be a data.frame or a GRanges object.")
    }
    subset.reg <- subset.reg[order(as.character(GenomicRanges::seqnames(subset.reg)), 
                                   GenomicRanges::start(subset.reg))]
    read.chunk <- function(gr){
        bed <- tryCatch(unlist(Rsamtools::scanTabix(file, param = GenomicRanges::reduce(gr)), 
                               use.names = FALSE, recursive = FALSE), error = function(e) c())
        if (length(bed) == 0) {
            return(NULL)
        }
        ncol <- length(strsplit(bed[1], "\t")[[1]])
        bed <- matrix(unlist(strsplit(bed, "\t", fixed = TRUE), use.names=FALSE, recursive = FALSE), 
                      length(bed), ncol, byrow = TRUE)
        bed <- data.table::data.table(bed)
        if (header) {
            data.table::setnames(bed, 
                                 as.character(utils::read.table(file, nrows = 1, as.is = TRUE)))
        }
        bed <- bed[, lapply(.SD, function(ee)utils::type.convert(as.character(ee), as.is = TRUE))]
        bed <- as.data.frame(bed)
        return(bed)
    }
    if (length(subset.reg) > 10000) {
        chunks <- cut(1:length(subset.reg), ceiling(length(subset.reg)/10000))
        bed.df <- plyr::ldply(levels(chunks), function(ch.id){
            read.chunk(subset.reg[which(chunks == ch.id)])
        })
    } else {
        bed.df <- read.chunk(subset.reg)
    }
    if(!is.null(bed.df)){
        bed.df <- bed.df[order(bed.df$chr, bed.df$start), ]
    }
    return(bed.df)
} 
dgarrimar/sQTLseekeR2 documentation built on Nov. 22, 2021, 3:23 a.m.