R/utils_imports.R
In ORFik: Open Reading Frames in Genomics

Documented in fimport findFa fread.bed import.bedo import.bedoc import.ofst readBam readWig

#' Load bed file as GRanges
#'
#' Wraps around \code{\link{import.bed}} and
#' tries to speed up loading with the
#' use of data.table. Supports gzip, gz, bgz and bed formats.
#' Also safer chromosome naming with the argument chrStyle
#' @param filePath The location of the bed file
#' @inheritParams matchSeqStyle
#' @importFrom data.table fread setDF
#' @importFrom tools file_ext
#' @importFrom rtracklayer import.bed
#' @return a \code{\link{GRanges}} object
#' @export
#' @family utils
#' @examples
#' # path to example CageSeq data from hg19 heart sample
#' cageData <- system.file("extdata", "cage-seq-heart.bed.bgz",
#'                         package = "ORFik")
#' fread.bed(cageData)
#'
fread.bed <- function(filePath, chrStyle = NULL) {

  if (.Platform$OS.type == "unix") {
    if (file.exists(filePath)) {
      if (any(file_ext(filePath) %in% c("gzip", "gz", "bgz"))) {
        bed <- bedToGR(setDF(
          fread(cmd = paste("gunzip -c", filePath), sep = "\t")))
      } else if (file_ext(filePath) == "bed"){
        bed <- bedToGR(setDF(fread(filePath, sep = "\t")))
      } else {
        bed <- import.bed(con =  filePath)
      }
    } else {stop("Filepath specified does not name existing file.")}
  } else {
    ## NB: Windows user will have slower loading
    bed <- import.bed(con =  filePath)
  }

  return(matchSeqStyle(bed, chrStyle))
}

#' Custom bam reader
#'
#' Read in Bam file from either single end or paired end.
#' Safer combined version of \code{\link{readGAlignments}} and
#' readGAlignmentPairs that takes care of some common errors.\cr
#' If QNAMES of the aligned reads are from collapsed fasta files
#' (if the names are formated from collapsing in either
#' (ORFik, ribotoolkit or fastx)), the
#' bam file will contain a meta column called collapsed with the counts
#' of duplicates per read.\cr
#'
#' In the future will use a faster .bam loader for big .bam files in R.
#' @param path a character / data.table with path to .bam file. There
#' are 3 input file possibilities.
#' \itemize{
#'  \item{single end : }{a character path (length 1)}
#'  \item{paired end (1 file) : }{Either a character path (length of 2), where
#'  path[2] is "paired-end", or a data.table
#'   with 2 columns, forward = path & reverse = "paired-end"}
#'  \item{paired end (2 files) : }{Either a character path (length of 2), where
#'  path[2] is path to R2, or a data.table
#'   with 2 columns, forward = path to R1 & reverse = path to R2.
#'   (This one is not used often)}
#' }
#' @inheritParams matchSeqStyle
#' @inheritParams GenomicAlignments::readGAlignments
#' @param strandMode numeric, default 0. Only used for paired end bam files.
#' One of (0: strand = *, 1: first read of pair is +, 2: first read of pair is -).
#' See ?strandMode. Note: Sets default to 0 instead of 1, as readGAlignmentPairs uses 1.
#' This is to guarantee hits, but will also make mismatches of overlapping
#' transcripts in opposite directions.
#' @return a \code{\link{GAlignments}} or \code{\link{GAlignmentPairs}} object of bam file
#' @importFrom Rsamtools scanBam BamFile ScanBamParam
#' @export
#' @family utils
#' @examples
#' bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
#' readBam(bam_file, "UCSC")
readBam <- function(path, chrStyle = NULL, param = NULL, strandMode = 0) {
  if (!(length(path) %in% c(1,2))) stop("readBam must have 1 or 2 bam files!")
  if (is(path, "factor")) path <- as.character(path)
  # If data.table path
  if (is(path, "data.table")) {
    if (path$reverse == "paired-end") {
      message("ORFik reads this paired end bam as readGAlignmentPairs")
      message(paste("strandMode =", strandMode, ". Update it if is wrong!"))
      bam <- matchSeqStyle(readGAlignmentPairs(path$forward, param = param,
                                               strandMode = strandMode), chrStyle)
      if (length(bam) == 0)
        stop(paste("File", path$forward,
                   "was read as paired-end file, but had 0 paired reads!"))
      return(bam)
    } else {
      message("ORFik reads these split paired end bams as readGAlignments combination")
      return(matchSeqStyle(c(readGAlignments(path$forward, param = param),
                             readGAlignments(path$reverse, param = param)), chrStyle))
    }
  }
  # If character path
  if (is(path, "character") & length(path) == 2) {
    if (path[2] == "paired-end"){
      message("ORFik reads paired end bam in as readGAlignmentPairs")
      message(paste("strandMode =", strandMode, ". Update it if is wrong!"))
      bam <- matchSeqStyle(readGAlignmentPairs(path[1], param = param,
                                               strandMode = strandMode), chrStyle)
      if (length(bam) == 0)
        stop(paste("File", path[1],
                   "was read as paired-end file, but had 0 paired reads!"))
      return(bam)
    } else {
      message("ORFik reads these split paired end bams as readGAlignments combination")
      return(matchSeqStyle(c(readGAlignments(path[1], param = param),
                             readGAlignments(path[2], param = param)), chrStyle))
    }
  }
  # else single end bam file

  # Check if it is a collapsed reads format bam file
  # Get qnames with the data (check first 2 rows)
  headers <- unlist(scanBam(BamFile(path, yieldSize=2),
                            param = ScanBamParam(what = "qname")),
                    use.names = FALSE)
  bam.is.collapsed <- all(seq_along(headers) %in%grep("^(>|>seq)\\d+(-|_x)\\d+$", headers))
  if (bam.is.collapsed) {

    headers <- unlist(scanBam(path, param = ScanBamParam(what = "qname")),
                      use.names = FALSE)
    format.header <- ifelse(all(seq_along(headers) %in% grep("^>seq\\d+_x\\d+$", headers)),
                            "ribotoolkit",
                            "fastx")
    if (format == "ribotoolkit") {
      scores <- as.integer(gsub(".*_x", "", headers))
    } else {
      scores <- as.integer(gsub(".*-", "", headers))
    }
    bam <- matchSeqStyle(readGAlignments(path, param = param), chrStyle)
    mcols(bam) <- DataFrame(collapsed = scores)
    return(bam)
  } else return(matchSeqStyle(readGAlignments(path, param = param), chrStyle))
}

#' Custom wig reader
#'
#' Given 2 wig files, first is forward second is reverse.
#' Merge them and return as GRanges object.
#' If they contain name reverse and forward, first and second order
#' does not matter, it will search for forward and reverse.
#'
#' @param path a character path to two .wig files, or a data.table
#' with 2 columns, (forward, filepath) and reverse, only 1 row.
#' @inheritParams matchSeqStyle
#' @importFrom rtracklayer import.wig
#' @return a \code{\link{GRanges}} object of the file/s
#' @family utils
#'
readWig <- function(path, chrStyle = NULL) {
  if (is(path, "character")) {
    if (length(path) != 2) stop("readWig must have 2 wig files,
                              one forward strand and one reverse!")

    forwardPath <- grep("forward|fwd", path)
    reversePath <- grep("reverse|rev", path)
    if (length(forwardPath) == 1 & length(reversePath) == 1){
      forwardIndex <- forwardPath
      reverseIndex <- reversePath
    }

    forward <- import.wig(path[forwardIndex])
    reverse <- import.wig(path[reverseIndex])
  } else if (is(path, "data.table")) {
    if (!is.null(path$forward)) {
      forward <- import.wig(path$forward)
    } else forward <- import.wig(path$filepath)
    reverse <- import.wig(path$reverse)
  }
  strand(forward) <- "+"
  strand(reverse) <- "-"
  return(matchSeqStyle(c(forward, reverse), chrStyle))
}

#' Load GRanges object from .bedo
#'
#' .bedo is .bed ORFik, an optimized bed format for coverage reads with read lengths
#' .bedo is a text based format with columns (6 maximum):\cr
#' 1. chromosome\cr 2. start\cr 3. end\cr 4. strand\cr
#' 5. ref width (cigar # M's, match/mismatch total)\cr
#' 6. duplicates of that read\cr
#'
#' Positions are 1-based, not 0-based as .bed.
#' export with export.bedo
#' @param path a character, location on disc (full path)
#' @return GRanges object
#' @importFrom tools file_ext
#' @export
import.bedo <- function(path) {
  if (file_ext(path) != "bedo") stop("import.bedo can only load .bedo files!")
  dt <- fread(input = path)
  if (is.null(dt$end)) dt$end <- dt$start
  return(makeGRangesFromDataFrame(dt, keep.extra.columns = TRUE))
}

#' Load GAlignments object from .bedoc
#'
#' A much faster way to store, load and use bam files.\cr
#' .bedoc is .bed ORFik, an optimized bed format for coverage reads with
#' cigar and replicate number.\cr
#' .bedoc is a text based format with columns (5 maximum):\cr
#' 1. chromosome\cr 2. cigar: (cigar # M's, match/mismatch total)
#' \cr 3. start (left most position) \cr 4. strand (+, -, *)\cr
#' 5. score: duplicates of that read\cr
#'
#' Positions are 1-based, not 0-based as .bed.
#' export with export.bedo
#' @param path a character, location on disc (full path)
#' @return GAlignments object
#' @importFrom tools file_ext
#' @export
import.bedoc <- function(path) {
  if (file_ext(path) != "bedoc")
    stop("import.bedoc can only load .bedoc files!")
  return(getGAlignments(fread(input = path, stringsAsFactors = TRUE)))
}

#' Load GRanges / GAlignments object from .ofst
#'
#' A much faster way to store, load and use bam files.\cr
#' .ofst is ORFik fast serialized object,
#' an optimized format for coverage reads with
#' cigar and replicate number. It uses the fst format as back-end:
#' \code{\link{fst-package}}.\cr A .ofst ribo seq file can compress the
#' information in a bam file from 5GB down to a few MB. This new files has
#' super fast reading time, only a few seconds, instead of minutes. It also has
#' random index access possibility of the file. \cr
#' .ofst is represented as a data.frane format with minimum 4 columns:\cr
#' 1. chromosome\cr  2. start (left most position) \cr 3. strand (+, -, *)\cr
#' 4. width (not added if cigar exists)\cr
#' 5. cigar (not needed if width exists):
#'  (cigar # M's, match/mismatch total) \cr
#' 5. score: duplicates of that read\cr
#' 6. size: qwidth according to reference of read\cr\cr
#' If file is from \code{\link{GAlignmentPairs}},
#' it will contain a cigar1, cigar2 instead
#' of cigar and start1 and start2 instead of start
#'
#' Other columns can be named whatever you want and added to meta columns.
#' Positions are 1-based, not 0-based as .bed.
#' Import with import.ofst
#' @param file a path to a .ofst file
#' @inheritParams readBam
#' @return a GAlignment, GAlignmentPairs or GRanges object,
#' dependent of if cigar/cigar1 is defined in .ofst file.
#' @importFrom fst read_fst
#' @export
#' @examples
#' ## GRanges
#' gr <- GRanges("1:1-3:-")
#' tmp <- file.path(tempdir(), "path.ofst")
#' # export.ofst(gr, file = tmp)
#' # import.ofst(tmp)
#' ## GAlignment
#' # Make input data.frame
#' df <- data.frame(seqnames = "1", cigar = "3M", start = 1L, strand = "+")
#' ga <- ORFik:::getGAlignments(df)
#' # export.ofst(ga, file = tmp)
#' # import.ofst(tmp)
import.ofst <- function(file, strandMode = 0) {
  df <- read_fst(file)
  if ("cigar" %in% colnames(df)) {
    getGAlignments(df)
  } else if ("cigar1" %in% colnames(df)) {
    getGAlignmentsPairs(df, strandMode)
  } else getGRanges(df)
}

#' Load any type of sequencing reads
#'
#' Wraps around rtracklayer::import and tries to speed up loading with the
#' use of data.table. Supports gzip, gz, bgz compression formats.
#' Also safer chromosome naming with the argument chrStyle
#'
#' NOTE: For wig you can send in 2 files, so that it automaticly merges
#' forward and reverse stranded objects. You can also just send 1 wig file,
#' it will then have "*" as strand.
#' @param path a character path to file (1 or 2 files),
#'  or data.table with 2 colums(forward&reverse)
#'  or a GRanges/Galignment/GAlignmentPairs object etc.
#'  If it is ranged object it will presume to be
#'  already loaded, so will return the object as it is,
#'  updating the seqlevelsStyle if given.
#' @inheritParams readBam
#' @importFrom tools file_ext
#' @importFrom tools file_path_sans_ext
#' @importFrom rtracklayer import
#' @return a \code{\link{GAlignments}}/\code{\link{GRanges}} object,
#'  depending on input.
#' @export
#' @family utils
#' @examples
#' bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
#' fimport(bam_file)
#' # Certain chromosome naming
#' fimport(bam_file, "NCBI")
#' # Paired end bam strandMode 1:
#' fimport(bam_file, strandMode = 1)
#' # (will have no effect in this case, since it is not paired end)
#'
fimport <- function(path, chrStyle = NULL, param = NULL, strandMode = 0) {
  if (is(path, "data.table")) {
    if (ncol(path) == 2 & colnames(path) == c("forward", "reverse")) {
      path <- c(path$forward, path$reverse)
    } else stop("When path is data.table,",
                "it must have 2 columns (forward&reverse)")
  } else if (is(path, "list")) {
    if (!(all(lengths(path) %in% c(1,2))) | length(path) != 1) {
      stop("When path is list,",
           "it must have 1 or 2 elements (forward, reverse)")
    }
  }
  if (is(path, "list") | is(path, "data.table") | is(path, "character")) {
    path <- unlist(path)
    path <- path[path != ""]
    pairedEndBam <- FALSE
    if (length(path) == 2) {
      if (path[2] == "paired-end") {
        pairedEndBam <- TRUE
        path <- path[1]
      }
    }
  }

  if (is.character(path)) {
    if (all(file.exists(path))) {
      fext <- file_ext(path)
      compressions <- c("gzip", "gz", "bgz", "zip")
      areCompressed <- fext %in% compressions
      fext[areCompressed] <- file_ext(file_path_sans_ext(path[areCompressed],
                                                         compression = FALSE))
      if (length(path) == 2) { # Multiple file paths
        if (all(fext %in% c("wig"))) {
          return(readWig(path, chrStyle))
        } else if (all(fext %in% c("bam"))) {
          return(readBam(path, chrStyle, param, strandMode))
        } else stop("only wig and valid bam format allowed for 2 files input!")
      } else if (length(path) == 1) { # Only 1 file path given
        if (fext == "bam") {
          if (pairedEndBam) path <- c(path, "paired-end")
          return(readBam(path, chrStyle, param, strandMode))
        } else if (fext == "bed" |
                   file_ext(file_path_sans_ext(path,
                                               compression = TRUE)) == "bed" |
                   file_ext(file_path_sans_ext(path, compression = FALSE))
                   == "bed") {
          return(fread.bed(path, chrStyle))
        } else if (fext == "bedo") {
          return(matchSeqStyle(import.bedo(path), chrStyle))
        } else if (fext == "bedoc") {
          return(matchSeqStyle(import.bedoc(path), chrStyle))
        } else if (fext == "ofst") {
          return(matchSeqStyle(import.ofst(path, strandMode), chrStyle))
        }else return(matchSeqStyle(import(path), chrStyle))
      } else stop("fimport takes either 1 or 2 files!")
    } else stop(paste(path, "does not exist as File/Files!"))
  } else if (is.gr_or_grl(path) | is(path, "GAlignments") |
             is(path, "GAlignmentPairs")) {
    return(matchSeqStyle(path, chrStyle))
  } else {
    stop("path must be either a valid character",
         " filepath or ranged object.")
  }
}

#' Convenience wrapper for Rsamtools FaFile
#'
#' Get fasta file object, to find sequences in file.\cr
#' Will load and import file if necessarry.
#' @param faFile \code{\link{FaFile}}, BSgenome, fasta/index file path or an
#' ORFik \code{\link{experiment}}. This file is usually used to find the
#' transcript sequences from some GRangesList.
#' @importFrom Rsamtools FaFile
#' @importFrom methods is
#' @return a \code{\link{FaFile}} or BSgenome
#' @family utils
#' @export
#' @examples
#' # Some fasta genome with existing fasta index in same folder
#' path <- system.file("extdata", "genome.fasta", package = "ORFik")
#' findFa(path)
#'
findFa <- function(faFile) {
  if (is.character(faFile)) {
    if (file.exists(faFile)) {
      return(FaFile(faFile))
    } else {
      stop("faFile does not name a valid fasta/index file")
    }
  } else if (is(faFile, "FaFile") || is(faFile, "BSgenome")) {
    return(faFile)
  } else if (is(faFile, "experiment")) {
    return(FaFile(faFile@fafile))
  }
  stop("faFile must be FaFile, BSgenome, valid filePath, or ORFik experiment")
}
Any scripts or data that you put into this service are public.
ORFik documentation built on March 27, 2021, 6 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ORFik
Open Reading Frames in Genomics

R/utils_imports.R
In ORFik: Open Reading Frames in Genomics

Defines functions findFa fimport import.ofst import.bedoc import.bedo readWig readBam fread.bed

Documented in fimport findFa fread.bed import.bedo import.bedoc import.ofst readBam readWig

Try the ORFik package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ORFik Open Reading Frames in Genomics

R/utils_imports.R In ORFik: Open Reading Frames in Genomics

Defines functions findFa fimport import.ofst import.bedoc import.bedo readWig readBam fread.bed

Documented in fimport findFa fread.bed import.bedo import.bedoc import.ofst readBam readWig

Try the ORFik package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ORFik
Open Reading Frames in Genomics

R/utils_imports.R
In ORFik: Open Reading Frames in Genomics