R/IO.R

Defines functions ReadBED WriteBED WriteCSV

Documented in ReadBED WriteBED WriteCSV

#' Read BED-formatted file.
#'
#' Read BED-formatted file. See 'Details'.
#'
#' The function opens and reads in a BED6-formatted file, and 
#' stores the annotation features in a \code{GRanges} object.
#' Chromosome names must follow either UCSC (chr1, ..., chrM, chrX)
#' or Ensembl naming conventions (1, ..., MT, X).
#'
#' @param file A character string; specifies the input BED file.
#' @param collapseRange A logical scalar; should \code{TRUE} loci
#' spanning more than one nucleotide be collapsed to a single
#' nucleotide locus corresponding to the midpoint of the range;
#' default is \code{TRUE}.
#'
#' @return A \code{GRanges} object. See 'Details'.
#' 
#' @examples
#' bedFile <- system.file("extdata",
#'                        "miCLIP_m6A_Linder2015_hg38.bed",
#'                        package = "RNAModR")
#' sites <- ReadBED(bedFile)
#'
#' @author Maurits Evers, \email{maurits.evers@@anu.edu.au}
#'
#' @import GenomicRanges IRanges
#' @importFrom utils read.table
#' 
#' @export
ReadBED <- function(file, collapseRange = TRUE) {

    # Sanity checks
    if (!file.exists(file)) {
        stop(sprintf("File %s doesn't exist.", file))
    }
    bed <- read.table(file, header = FALSE, stringsAsFactors = FALSE)
    if (ncol(bed) != 6) {
        stop("[ERROR] File must be a 6 column BED file.")
    }
    colnames(bed) <- c("chr", "start", "end", "id", "score", "strand")
    
    # Use UCSC chromosome names
    if (all(!grepl("chr", bed$chr, ignore.case = TRUE))) {
        bed$chr <- sprintf("chr%s", bed$chr)
    }
    bed$chr <- gsub("chrMT", "chrM", bed$chr)
    
    # Order BED file by chromosome then starting point
    bed <- bed[order(bed$chr, bed$start), ]
    
    # Replace strand = "." with strand = "*"
    if (length(grep("\\.", bed$strand)) > 0) {
        bed$strand <- gsub("\\.", "*", bed$strand)
        warning(sprintf(
            "BED file %s contains entries without strand information.",
            file))
    }
    
    # Collapse ranges
    if (collapseRange == TRUE) {
        bed$start <- floor(0.5 * (bed$start + bed$end))
        bed$end <- bed$start + 1
    }
    
    # Convert to GRanges
    GRanges(
        seqnames = bed$chr,
        ranges = IRanges(bed$start + 1, bed$end),
        strand = bed$strand,
        score = as.numeric(bed$score),
        id = bed$id)
    
}


#' Write \code{txLoc} object to a BED file.
#'
#' Write \code{txLoc} object to a BED file. See 'Details'.
#'
#' The function writes entries from a \code{txLoc} object to a 6-column BED 
#' file (BED6). Note that this process is not "splice-aware", i.e. if an entry 
#' spans an intron the BED entry gives the left and right-most genomic 
#' coordinate of the feature. If \code{file = NULL}, entries will be written to  
#' \code{sites.bed}.
#' 
#' @param txLoc A \code{txLoc} object.
#' @param file A character string; specifies the filename of the output 
#' BED file. If \code{NULL}, then \code{file = "sites.bed"}; default is
#' \code{NULL}.
#'
#' @author Maurits Evers, \email{maurits.evers@@anu.edu.au}
#'
#' @examples
#' \dontrun{
#' bedFile <- system.file("extdata",
#'                        "miCLIP_m6A_Linder2015_hg38.bed",
#'                        package = "RNAModR")
#' sites <- ReadBED(bedFile)
#' txSites <- SmartMap(sites, id = "m6A", refGenome = "hg38")
#' WriteBED(txSites)
#' }
#' 
#' @importFrom utils write.table
#' 
#' @export
WriteBED <- function(txLoc, file = NULL) {

    # Sanity checks
    CheckClass(txLoc, "txLoc")
    
    # Quieten R CMD check concerns regarding "no visible binding for global
    # variable ..."
    locus_in_genome <- id <- tx_region <- tx_refseq <- NULL

    # Get loci
    loci <- GetLoci(txLoc)
    
    # Extract BED6 entries from txLoc object and row-bind into `data.frame`
    lst <- lapply(loci, function(locus) 
        with(locus, data.frame(
            seqnames = seqnames(locus_in_genome),
            start = start(locus_in_genome) - 1L,
            end = end(locus_in_genome),
            id = paste(GetId(txLoc), id, tx_region, tx_refseq, sep = "|"),
            score = score,
            strand = strand(locus_in_genome))))
    df <- do.call(rbind, lst)
    
    # Sort entries
    df <- df[order(df[, 1], df[, 2]), ]
        
    # Output file name
    if (is.null(file)) {
        if (nchar(GetId(txLoc)) > 0) {
            file <- sprintf("sites_%s.bed", GetId(txLoc))
        } else {
            file <- "sites.bed"
        }
    }

    # Write to file
    write.table(
        df, 
        file = file,
        sep = "\t", 
        quote = FALSE,
        col.names = FALSE, 
        row.names = FALSE)
    cat(sprintf("Output in file %s.\n", file))

}


#' Write txLoc object to CSV file.
#'
#' Write a \code{txLoc} object to a comma-separated-values file.
#' 
#' @param txLoc A \code{txLoc} object.
#' @param file Filename of output CSV file. If NULL then file = "sites.csv".
#' @param withSeq If TRUE then include full sequence. Default is FALSE.
#' @param withGC If TRUE then include GC content. Default is FALSE. Unused.
#'
#' @author Maurits Evers, \email{maurits.evers@@anu.edu.au}
#'
#' @importFrom utils write.csv
#' 
#' @export
WriteCSV <- function(txLoc,
                     file = NULL, 
                     withSeq = FALSE, 
                     withGC = FALSE) {

    # Sanity checks
    CheckClass(txLoc, "txLoc")

    # Get loci
    loci <- GetLoci(txLoc)

    # Convert entries in columns to `data.frame`
    lst <- lapply(loci, function(locus) {
        if (!withSeq) 
            as.data.frame(locus[, -match("tx_region_sequence", names(locus))])
        else
            as.data.frame(locus)
    })
    df <- do.call(rbind, lst)    
   
    # Output file name
    if (is.null(file)) {
        if (nchar(GetId(txLoc)) > 0) {
            file <- sprintf("sites_%s.csv", GetId(txLoc))
        } else {
            file <- "sites.csv"
        }
    }
    
    # Write to file
    write.csv(
        df, 
        file = file,
        row.names = FALSE, 
        quote = FALSE)
    cat(sprintf("Output in file %s.\n", file))
    
}
mevers/RNAModR documentation built on Nov. 17, 2019, 9:11 a.m.