R/read_bed.r

Defines functions read_gtf read_bigwig read_broadpeak read_narrowpeak read_bedgraph read_bed12 read_bed sniff_fields

Documented in read_bed read_bed12 read_bedgraph read_bigwig read_broadpeak read_gtf read_narrowpeak

#' sniff number of fields in a bed file
#' @noRd
sniff_fields <- function(filename) {
  ncol(
    readr::read_tsv(
      filename,
      n_max = 10, comment = "#",
      show_col_types = FALSE,
      name_repair = "minimal"
    )
  )
}

#' @title Read BED and related files.
#'
#' @description read functions for BED and related formats. Filenames can be
#'   local file or URLs. The read functions load data into tbls with consistent
#'   `chrom`, `start` and `end` colnames.
#'
#' @param filename file or URL
#' @param col_types column type spec for [readr::read_tsv()]
#' @param sort sort the tbl by chrom and start
#' @param ... options to pass to [readr::read_tsv()]
#' @param n_fields `r lifecycle::badge("deprecated")`
#'
#' @return [ivl_df]
#'
#' @family read functions
#'
#' @details \url{https://genome.ucsc.edu/FAQ/FAQformat.html#format1}
#'
#' @examples
#' # read_bed assumes 3 field BED format.
#' read_bed(valr_example("3fields.bed.gz"))
#'
#' # result is sorted by chrom and start unless `sort = FALSE`
#' read_bed(valr_example("3fields.bed.gz"), sort = FALSE)
#'
#' @export
read_bed <- function(filename, col_types = bed12_coltypes,
                     sort = TRUE, ..., n_fields = NULL) {
  check_required(filename)

  if (!is.null(n_fields)) {
    lifecycle::deprecate_warn(
      "0.6.9",
      "read_bed(n_fields)",
      details = "fields are now determined automatically from the file"
    )
  }

  n_fields <- sniff_fields(filename)

  coltypes <- col_types[1:n_fields]
  colnames <- names(coltypes)

  out <- readr::read_tsv(
    filename,
    col_names = colnames,
    col_types = coltypes, ...
  )

  if (sort) out <- bed_sort(out)

  out
}

#' @rdname read_bed
#'
#' @details \url{https://genome.ucsc.edu/FAQ/FAQformat.html#format1}
#'
#' @examples
#'
#' read_bed12(valr_example("mm9.refGene.bed.gz"))
#'
#' @export
read_bed12 <- function(filename, ...) {
  check_required(filename)
  n_fields <- sniff_fields(filename)
  if (n_fields != 12) {
    cli::cli_abort("expected 12 columns in bed12")
  }
  bed12_tbl <- read_bed(filename)
  bed12_tbl
}

#' @rdname read_bed
#'
#' @details \url{https://genome.ucsc.edu/goldenPath/help/bedgraph.html}
#'
#' @examples
#'
#' read_bedgraph(valr_example("test.bg.gz"))
#'
#' @export
read_bedgraph <- function(filename, ...) {
  # load as bed4, rename `value` column and covert to double
  check_required(filename)
  n_fields <- sniff_fields(filename)
  if (n_fields != 4) {
    cli::cli_abort("expected 4 columns in bedgraph")
  }
  out <- read_bed(filename, sort = FALSE)
  out <- select(out, everything(), value = name)
  out <- mutate(out, value = as.double(value))
  out
}

#' @rdname read_bed
#'
#' @details \url{https://genome.ucsc.edu/FAQ/FAQformat.html#format12}
#'
#' @examples
#'
#' read_narrowpeak(valr_example("sample.narrowPeak.gz"))
#'
#' @export
read_narrowpeak <- function(filename, ...) {
  check_required(filename)
  n_fields <- sniff_fields(filename)
  colnames <- names(peak_coltypes)
  out <- readr::read_tsv(
    filename,
    col_types = peak_coltypes,
    col_names = colnames
  )
  out
}

#' @rdname read_bed
#'
#' @details \url{https://genome.ucsc.edu/FAQ/FAQformat.html#format13}
#'
#' @examples
#'
#' read_broadpeak(valr_example("sample.broadPeak.gz"))
#'
#' @export
read_broadpeak <- function(filename, ...) {
  check_required(filename)
  n_fields <- sniff_fields(filename)
  coltypes <- peak_coltypes[1:length(peak_coltypes) - 1]
  colnames <- names(coltypes)
  out <- readr::read_tsv(filename, col_names = colnames, col_types = coltypes)
  out
}

peak_coltypes <- list(
  chrom = readr::col_character(),
  start = readr::col_integer(),
  end = readr::col_integer(),
  name = readr::col_character(),
  score = readr::col_integer(),
  strand = readr::col_character(),
  signal = readr::col_double(),
  pvalue = readr::col_double(),
  qvalue = readr::col_double(),
  peak = readr::col_integer()
)

bed12_coltypes <- list(
  chrom = readr::col_character(),
  start = readr::col_integer(),
  end = readr::col_integer(),
  name = readr::col_character(),
  score = readr::col_character(),
  strand = readr::col_character(),
  cds_start = readr::col_integer(),
  cds_end = readr::col_integer(),
  item_rgb = readr::col_character(),
  exon_count = readr::col_integer(),
  exon_sizes = readr::col_character(),
  exon_starts = readr::col_character()
)


#' Import and convert a bigwig file into a valr compatible tbl
#'
#' @description This function will output a 5 column tibble with
#' zero-based chrom, start, end, score, and strand columns.
#'
#' @param path path to bigWig file
#' @param set_strand strand to add to output (defaults to "+")
#'
#' @note This functions uses \code{rtracklayer} to import bigwigs which
#' has unstable support for the windows platform and therefore may error
#' for windows users (particularly for 32 bit window users).
#'
#' @examples
#' \dontrun{
#' if (.Platform$OS.type != "windows") {
#'   bw <- read_bigwig(valr_example("hg19.dnase1.bw"))
#'   head(bw)
#' }
#' }
#' @importFrom rtracklayer import
#' @export
read_bigwig <- function(path, set_strand = "+") {
  check_required(path)
  # note that rtracklayer will produce a one-based GRanges object
  res <- rtracklayer::import(path)
  res <- dplyr::as_tibble(res)
  res <- dplyr::mutate(res,
    chrom = as.character(seqnames),
    start = start - 1L,
    strand = set_strand
  )
  dplyr::select(res, chrom, start, end, score, strand)
}

#' Import and convert a GTF/GFF file into a valr compatible bed tbl format
#'
#' @description This function will output a tibble with the
#' required chrom, start, and end columns, as well as other columns depending
#' on content in GTF/GFF file.
#'
#' @param path path to gtf or gff file
#' @param zero_based if TRUE, convert to zero based
#'
#' @examples
#'
#' gtf <- read_gtf(valr_example("hg19.gencode.gtf.gz"))
#' head(gtf)
#'
#' @importFrom rtracklayer import
#' @export
read_gtf <- function(path, zero_based = TRUE) {
  gtf <- rtracklayer::import(path)
  gtf <- as.data.frame(gtf)
  gtf <- dplyr::mutate_if(gtf, is.factor, as.character)
  res <- dplyr::rename(gtf, chrom = seqnames)

  if (zero_based) {
    res <- dplyr::mutate(res, start = start - 1L)
  }

  tibble::as_tibble(res)
}
jayhesselberth/valr documentation built on April 8, 2024, 12:32 p.m.