R/file_format_io.R

Defines functions read_additional_gene_info read_annotation_from_rdata read_annotation_from_saf read_annotation_from_gff read_tx2gene_from_genemap read_saf read_regions write_narrowPeak read_narrowPeak write_motifmap read_motifmap read_idr_table read_table_general

Documented in read_motifmap read_narrowPeak read_regions read_saf read_table_general read_tx2gene_from_genemap write_motifmap write_narrowPeak

## Read a table from a R data file, csv, or xlsx file. Returns a data
## frame or throws an error.

#' General function for reading a table from any table-like file
#'
#' @export
read_table_general <- function(filename, read.table.args = NULL, read.xlsx.args = NULL,
                               dataframe.class = "data.frame") {
    req_ns("future", "openxlsx")
    suppressWarnings({
        read.table.args %<>% as.list
        read.table.args$file <- filename
        read.table.args$header <- TRUE
        read.xlsx.args %<>% as.list
        read.xlsx.args$xlsxFile <- filename
        lazy_results <- make_futures(
            .future.args = list(evaluator = future::sequential, lazy = TRUE),
            rdata = read_RDS_or_RDA(filename, dataframe.class),
            table = do.call(read.table, read.table.args),
            csv = do.call(read.csv, read.table.args),
            xlsx = do.call(openxlsx::read.xlsx, read.xlsx.args))
        for (lzresult in lazy_results) {
            result <- tryCatch({
                x <- as(future::value(lzresult), dataframe.class)
                assert_that(is(x, dataframe.class))
                x
            }, error = function(...) NULL)
            if (!is.null(result)) {
                return(result)
            }
        }
        stop(glue("Could not read a data frame from {deparse{filename}} as R data, csv, or xlsx"))
    })
}

#' @export
read_idr_table <- function(file) {
    idrcols <- c("chr", "start", "end", "name", "score", "strand",
                 "LocalIDR", "GlobalIDR", "startA", "endA", "scoreA", "startB", "endB", "scoreB")
    read.table(file, header = FALSE, sep = "\t", col.names = idrcols) %>%
        mutate_(LocalIDR = ~10^-LocalIDR, GlobalIDR = ~10^-GlobalIDR)
}

#' Read MotifMap-provided BED file into a GRanges object.
#'
#' Even though this function reads a BED file,
#' [rtracklayer::import.bed()] is not suitable because it chokes on
#' spaces in fields, which MotifMap contains. Hence the need for a
#' separate function.
#'
#' @param file The file name to read from.
#' @param parse_name If TRUE, separate the "name" column into
#'     "motif_ID" and "TF_name" fields.
#' @return a GRanges object with an mcol named "score". If
#'     `parse_name` is FALSE, it will also have an mcol named "name".
#'     If `parse_name` is TRUE, it will have mcols named "motif_ID",
#'     and "TF_name".
#'
#' @export
read_motifmap <- function(file, parse_name = TRUE) {
    req_ns("tidyr", "GenomicRanges", "readr")
    tab <- readr::read_tsv(file, col_names = c("chr", "start", "end", "name", "score", "strand"),
                    col_types = "ciicdc", progress = FALSE)
    if (parse_name) {
        tab %<>% tidyr::separate_(~name, into = c("motif_ID", "TF_name"), sep = " = ")
    }
    gr <- GenomicRanges::makeGRangesFromDataFrame(
        tab, starts.in.df.are.0based = TRUE,
        keep.extra.columns = TRUE)
    gr
}

#' Write a MotifMap-type GRanges into a BED file.
#'
#' This takes a GRanges object with the appropriate mcols (see
#' [read_motifmap()]) and saves it as a BED file.
#'
#' @param x A GRanges object with appropriate metadata columns.
#' @param file The file name to save.
#'
#' @export
write_motifmap <- function(x, file) {
    req_ns("S4Vectors", "rtracklayer", "tidyr")
    assert_that(is(x, "GRanges"))
    if (! "name" %in% names(S4Vectors::mcols(x))) {
        assert_that(all(c("motif_ID", "TF_name") %in% names(S4Vectors::mcols(x))))
        S4Vectors::mcols(x) %<>% as.data.frame %>%
            tidyr::unite_(~name, ~c(motif_ID, TF_name), sep = " = ") %>%
            as("DataFrame")
    }
    rtracklayer::export(x, file, format = "BED")
}

#' Read a narrowPeak format BED file.
#'
#' NarrowPeak is a sub-format of the BED file format used as the
#' output format of some peak callers. [rtracklayer::import.bed()]
#' often throws an error when reading these files, so a separate
#' function is needed.
#'
#' @param file The file name to read from.
#' @param ... Additional arguments are passed to `read.table()`.
#' @return A GRanges object containing the ranges from `file`.
#'
#' @export
read_narrowPeak <- function(file, ...) {
    peaks.df <- read.table(file, sep = "\t", row.names = NULL, ...)
    names(peaks.df) <- c("chr", "start", "end", "name", "score", "strand", "signalValue", "pValue", "qValue", "summit")
    peaks.df$name <- as.character(peaks.df$name)
    GenomicRanges::makeGRangesFromDataFrame(
        peaks.df, starts.in.df.are.0based = TRUE,
        keep.extra.columns = TRUE)
}

#' Write a narrowPeak format BED file.
#'
#' This function takes a GRanges of the kind returned by
#' [read_narrowPeak()] and writes it back to a file.
#'
#' @param x A GRanges or data frame containing the all appropriate
#'     columns for a narrowPeak file. In addition to the normal BED
#'     columns, it must contain columns named "signalValue", "pValue",
#'     "qValue", and "summit".
#' @param file The file name to write to.
#' @param ... Additional arguments are passed to `write.table()`.
#'
#' If some of the required columns are not available (e.g. from a peak
#' caller that does not report a summit), you can fill in the
#' unavailable columns with NA.
#'
#' @export
write_narrowPeak <- function(x, file, ...) {
    x <- as(x, "data.frame")
    if("seqnames" %in% names(x))
        names(x)[names(x) == "seqnames"] <- "chr"
    x <- x[c("chr", "start", "end", "name", "score", "strand", "signalValue", "pValue", "qValue", "summit")]
    write.table(x, file, sep = "\t", row.names = FALSE, col.names = FALSE, ...)
}

#' Read a GRanges from a variety of possible file types.
#'
#' This function attempts to read in a set of genomic regions as a
#' GRanges object. It does its best to auto-detect the input file
#' type. If it cannot do so, it throws an error.
#'
#' @param filename The file to read regions from.
#' @return a GRanges object.
#'
#' Possible input file types include: RDA or RDS file (containing a
#' GRanges or data frame), narrowPeak, BED, GFF, and SAF.
#'
#' @export
read_regions <- function(filename) {
    req_ns("rtracklayer", "future")
    suppressWarnings({
        lazy_results <- make_futures(
            .future.args = list(evaluator = future::sequential, lazy = TRUE),
            rdata = read_RDS_or_RDA(filename),
            narrowPeak = read_narrowPeak(filename),
            bed = rtracklayer::import(filename, format = "bed"),
            gff = rtracklayer::import(filename, format = "gff"),
            saf = read_saf(filename),
            table = read_table_general(filename))
        for (lzresult in lazy_results) {
            result <- tryCatch({
                x <- future::value(lzresult)
                if (is(x, "List")) {
                    x <- unlist(x)
                }
                x <- as(x, "GRanges")
                x
            }, error = function(...) NULL)
            if (!is.null(result)) {
                return(result)
            }
        }
        stop(glue("Could not read genomic regions from {deparse(filename)} as R data, narrowPeak, bed, gff, SAF, or csv"))
    })
}

## TODO: Make import.saf and export.saf functions?

#' Read a SAF file into a GRangesList.
#'
#' The SAF format is described in [Rsubread::featureCounts()].
#'
#' @param filename The file name to read
#' @param ... Additional arguments are passed to
#'     [read_table_general()].
#' @return a GRangesList with one element for each GeneID, containing
#'     the ranges for that gene.
#'
#' @export
read_saf <- function(filename, ...) {
    saf <- read_table_general(filename, ...)
    assert_that("GeneID" %in% names(saf))
    gr <- as(saf, "GRanges")
    grl <- split(gr, gr$GeneID) %>%
        promote_common_mcols(delete_from_source = FALSE)
    return(grl)
}

## TODO: Give the below functions better names

#' Read a Salmon "geneMap" file into a data frame
#'
#' This function reads a file in the Salmon "geneMap" format into a
#' data frame. The format of this file is described in salmon's help
#' text for the "--geneMap" command-line option.
#'
#' @param fname The file to read from. This should generally be a file
#'     named "genemap.txt" in a Salmon index directory
#' @return A data frame with 2 columns named "TXNAME" and "GENEID".
#'
#' Since this uses [read_table_general()] and allows additional
#' columns to be present beyond the first two, it is actually quite
#' lenient about the format of the table.
#'
#' @seealso The "--geneMap" option to salmon, whose help text
#'     describes the format of the file.
#'
#' @export
read_tx2gene_from_genemap <- function(fname) {
    df <- read_table_general(
        fname,
        read.xlsx.args = list(colNames = FALSE),
        read.table.args = list(header = FALSE))
    df %<>% .[1:2]
    df[] %<>% lapply(as.character)
    names(df) <- c("TXNAME", "GENEID")
    df
}

#' @export
read_annotation_from_gff <- function(filename, format = "GFF3", ...) {
    req_ns("rtracklayer")
    gff <- NULL
    ## Allow the file to be an RDS file containing the GRanges
    ## resulting from import()
    gff <- tryCatch({
        read_RDS_or_RDA(filename, "GRanges")
    }, error = function(...) {
        req_ns("rtracklayer")
        rtracklayer::import(filename, format = format)
    })
    assert_that(is(gff, "GRanges"))
    grl <- gff_to_grl(gff, ...)
    return(grl)
}

#' @export
read_annotation_from_saf <- function(filename, ...) {
    saf <- read_table_general(filename, ...)
    assert_that("GeneID" %in% names(saf))
    gr <- as(saf, "GRanges")
    grl <- split(gr, gr$GeneID) %>% promote_common_mcols
    return(grl)
}

#' @export
read_annotation_from_rdata <- function(filename) {
    read_RDS_or_RDA(filename, "GRangesList")
}

#' @export
read_additional_gene_info <- function(filename, gff_format = "GFF3",
                                      geneFeatureType = "gene", ...) {
    req_ns("S4Vectors", "rtracklayer")
    df <- tryCatch({
        gff <- tryCatch({
            read_RDS_or_RDA(filename, "GRanges")
        }, error = function(...) {
            req_ns("rtracklayer")
            rtracklayer::import(filename, format = gff_format)
        })
        assert_that(is(gff, "GRanges"))
        if (!is.null(geneFeatureType)) {
            gff %<>% .[.$type %in% geneFeatureType]
        }
        gff %<>% .[!is.na(.$ID) & !duplicated(.$ID)]
        gff %>% S4Vectors::mcols %>% cleanup_mcols(mcols_df = .)
    }, error = function(...) {
        tab <- read_table_general(filename, ..., dataframe.class = "DataFrame")
        ## Nonexistent or automatic row names
        if (.row_names_info(tab) <= 0) {
            row.names(tab) <- tab[[1]]
        }
        tab
    })
    df %<>% S4Vectors::DataFrame
    assert_that(is(df, "DataFrame"))
    return(df)
}
DarwinAwardWinner/rctutils documentation built on July 22, 2022, 5:19 a.m.