R/utils_exports.R

Defines functions export.bedoc export.bedo export.covlist export.cov save.fstwig export.fstwig export.bigWig export.wiggle export.bed12

Documented in export.bed12 export.bedo export.bedoc export.bigWig export.fstwig export.wiggle

#' Export as bed12 format
#'
#' bed format for multiple exons per group, as transcripts.
#' Can be use as alternative as a sparse .gff format for ORFs.
#' Can be direct input for ucsc browser or IGV
#'
#' If grl has no names, groups will be named 1,2,3,4..
#' @param grl A GRangesList
#' @param file a character path to valid output file name
#' @param rgb integer vector, default (0), either single integer or
#' vector of same size as grl to specify groups. It is adviced to not
#' use more than 8 different groups
#' @return NULL (File is saved as .bed)
#' @importFrom data.table fwrite
#' @export
#' @family utils
#' @examples
#' grl <- GRangesList(GRanges("1", c(1,3,5), "+"))
#' # export.bed12(grl, "output/path/orfs.bed")
export.bed12 <- function(grl, file, rgb = 0) {
  if (!is.grl(class(grl))) stop("grl, must be of class GRangesList")
  if (!is.character(file)) stop("file must be of class character")
  if (length(rgb) != 1 & length(rgb) != length(grl))
    stop("rgb must be integer of size 1 or length(grl)")
  if (is.null(names(grl))) names(grl) <- seq.int(length(grl))
  grl <- sortPerGroup(grl, ignore.strand = TRUE) # <- sort bed way!

  dt.grl <- data.table(seqnames = seqnamesPerGroup(grl, FALSE))
  dt.grl$start <- as.integer(firstStartPerGroup(grl,keep.names = FALSE) -1)
  dt.grl$end <- lastExonEndPerGroup(grl, keep.names = FALSE)#non inclusive end
  dt.grl$name <- names(grl)
  dt.grl$score <- widthPerGroup(grl, keep.names = FALSE)
  dt.grl$strand <- strandPerGroup(grl, FALSE)
  dt.grl$thickStart <- dt.grl$start
  dt.grl$thickEnd <- dt.grl$end
  dt.grl$rgb <- if(length(rgb) > 1) rgb else rep(rgb, length(grl))
  dt.grl$blockCount <- numExonsPerGroup(grl)
  blockSizes <- paste(width(grl), collapse = ",")
  names(blockSizes) <- NULL
  dt.grl$blockSizes <- blockSizes
  relativeStarts <- (start(grl) -1) - dt.grl$start
  blockStarts <- paste(relativeStarts, collapse = ",")
  names(blockStarts) <- NULL
  dt.grl$blockStarts <- blockStarts

  # Write without colnames
  data.table::fwrite(x = dt.grl, file = file,
                     sep = "\t", col.names = FALSE, row.names = FALSE,
                     quote = FALSE)
  return(invisible(NULL))
}

#' Export as wiggle format
#'
#' Will create 2 files, 1 for + strand (*_forward.wig)
#' and 1 for - strand (*_reverse.wig). If all
#' ranges are * stranded, will output 1 file.
#' Can be direct input for ucsc browser or IGV
#'
#' @references https://genome.ucsc.edu/goldenPath/help/wiggle.html
#' @param x A GRangesList, GAlignment GAlignmentPairs with score column.
#' Will be converted to 5' end position of original range. If score column
#' does not exist, will group ranges and give replicates as score column.
#' @param file a character path to valid output file name
#' @return invisible(NULL) (File is saved as 2 .wig files)
#' @importFrom rtracklayer export.wig
#' @export
#' @family utils
#' @examples
#' x <- c(GRanges("1", c(1,3,5), "-"), GRanges("1", c(1,3,5), "+"))
#' # export.wiggle(x, "output/path/rna.wig")
export.wiggle <- function(x, file) {
  if (!(is(x, "GRanges") | is(x, "GAlignmentPairs") | is(x, "GAlignments")))
     stop("x must be GRanges, GAlignments or GAlignmentPairs")
  if (!is(x, "GRanges")) x <- GRanges(x)

  x <- resize(x, width = 1, fix = "start")
  if (!("score" %in% colnames(mcols(x)))) {
    x <- convertToOneBasedRanges(x, method = "None",
                                 addScoreColumn = TRUE,
                                 addSizeColumn = FALSE)
  } else { # merge reads by sum of existing scores
    x <- collapse.by.scores(x)
  }
  strands <- as.character(strand(x))
  if (all(strands == "*")) {
    file <- gsub("\\.wig", "", file)
    file <- paste0(file, ".wig")
    export.wig(x, file)
  } else {
    file <- gsub("\\.wig", "", file)
    forward_file <- paste0(file, "_forward.wig")
    reverse_file <- paste0(file, "_reverse.wig")
    export.wig(x[strandBool(x)], forward_file)
    export.wig(x[!strandBool(x)], reverse_file)
  }

  return(invisible(NULL))
}

#' Export as bigWig format
#'
#' Will create 2 files, 1 for + strand (*_forward.bigWig)
#' and 1 for - strand (*_reverse.bigWig). If all
#' ranges are * stranded, will output 1 file.
#' Can be direct input for ucsc browser or IGV
#'
#' @references https://genome.ucsc.edu/goldenPath/help/bigWig.html
#' @param x A GRangesList, GAlignment GAlignmentPairs with score column.
#'  Will be converted to 5' end position of original range. If score column
#'  does not exist, will group ranges and give replicates as score column.
#'  Since bigWig needs a score column to represent counts!
#' @param file a character path to valid output file name
#' @param is_pre_collapsed logical, default FALSE. Have you already
#'  collapsed reads with collapse.by.scores,
#'  so each positions is only in 1 GRanges object with
#'  a score column per readlength?
#'  Set to TRUE, only if you are sure, will give a speedup.
#' @param split.by.strand logical, default TRUE. Split bigWig into 2 files,
#'  one for each strand.
#' @param seq_info a Seqinfo object, default seqinfo(x).
#'  Must have non NA seqlengths defined!
#' @return invisible(NULL) (File is saved as 2 .bigWig files)
#' @importFrom rtracklayer export.bw
#' @export
#' @family utils
#' @examples
#' x <- c(GRanges("1", c(1,3,5), "-"), GRanges("1", c(1,3,5), "+"))
#' seqlengths(x) <- 10
#' file <- file.path(tempdir(), "rna.bigWig")
#' # export.bigWig(x, file)
#' # export.bigWig(covRleFromGR(x), file)
export.bigWig <- function(x, file, split.by.strand = TRUE,
                          is_pre_collapsed = FALSE, seq_info = seqinfo(x)) {
  if(anyNA(seqlengths(seq_info))) stop("seqinfo of x must be defined and have defined seqlengths!")
  if (!(is(x, "GRanges") | is(x, "GAlignmentPairs") | is(x, "GAlignments") | is(x, "covRle")))
    stop("x must be GRanges, GAlignments, GAlignmentPairs or covRLE")

  if (!is(x, "covRle")) {
    if (!is(x, "GRanges")) { x <- GRanges(x, seqinfo = seq_info)
    } else {seqlevels(x) <- seqlevels(seq_info); seqinfo(x) <- seq_info}

    if (!all(width(x) == 1)) x <- resize(x, width = 1, fix = "start")
    if (!("score" %in% colnames(mcols(x)))) {
      x <- convertToOneBasedRanges(x, method = "None",
                                   addScoreColumn = TRUE,
                                   addSizeColumn = FALSE)
    } else { # merge reads by sum of existing scores
      if (!is_pre_collapsed) x <- collapse.by.scores(x)
    }
    strands <- as.character(strand(x))
    can_split_by_strand <- !all(strands == "*")
    make_single_file <- !can_split_by_strand | !split.by.strand
    x <- covRleFromGR(x, ignore.strand = make_single_file)
  } else make_single_file <- !strandMode(x) | !split.by.strand


  if (make_single_file) {
    file <- gsub("\\.bigWig", "", file, ignore.case = TRUE)
    file <- paste0(file, ".bigWig")
    if (length(r(x)) > 0) {
      x@forward <- f(x) + r(x)
    }
    export.bw(GRanges(f(x)), file)
  } else {
    file <- gsub("\\.bigWig", "", file, ignore.case = TRUE)
    forward_file <- paste0(file, "_forward.bigWig")
    reverse_file <- paste0(file, "_reverse.bigWig")
    export.bw(GRanges(f(x)), forward_file)
    export.bw(GRanges(r(x)), reverse_file)
  }
  return(invisible(NULL))
}


#' Export as fstwig (fastwig) format
#'
#' Will create 2 files, 1 for + strand (*_forward.fstwig)
#' and 1 for - strand (*_reverse.fstwig). If all
#' ranges are * stranded, will output 1 file.
#'
#' @references "TODO"
#' @param x A GRangesList, GAlignment GAlignmentPairs with score column
#'  or coverage RLElist
#' Will be converted to 5' end position of original range. If score column
#' does not exist, will group ranges and give replicates as score column.
#' @inheritParams fst::write_fst
#' @param file a character path to valid output file name
#' @param by.readlength logical, default TRUE
#' @param by.chromosome logical, default TRUE
#' @return invisible(NULL) (File is saved as 2 .fstwig files)
#' @export
#' @family utils
#' @examples
#' x <- c(GRanges("1", c(1,3,5), "-"), GRanges("1", c(1,3,5), "+"))
#' x$size <- rep(c(28, 29), length.out = length(x))
#' x$score <- c(5,1,2,5,1,6)
#' seqlengths(x) <- 5
#' # export.fstwig(x, "~/Desktop/ribo")
export.fstwig <- function(x, file, by.readlength = TRUE,
                          by.chromosome = TRUE, compress = 50) {
  # TODO: Implement split.by.strand argument!
  if (length(x) == 0) stop("length of x is 0, no values to export!")
  if (anyNA(seqlengths(x))) stop("Define seqlength to make sure tails are correct")

  if ((is(x, "GRanges") | is(x, "GAlignmentPairs") | is(x, "GAlignments"))) {
    if (!is(x, "GRanges")) x <- GRanges(x)
    message("-- Creating covRle object from GRanges")
    strands <- as.character(strand(x))
    strandBool <- strandBool(x)
    strand_mode <- ifelse(all(strands == "*"), "single", "double")
    # Get coverage
    if (by.readlength) {
      readlengths <- readWidths(x)
    } else readlengths <- rep.int(1, length(x))
    unique_lengths <- sort(unique(readlengths))
    x_b <- x_p <- x_m <- list()
    message("- Read length: ", appendLF = FALSE)
    for (length in unique_lengths) {
      message(", ", length, appendLF = FALSE)
      if (strand_mode == "single") { # b = both strands
        x_b <- c(x_b, coverage(x[readlengths == length], weight = "score"))
      } else { # p = pluss strand, m = minus strand
        hits <- which(readlengths == length)
        x_p <- c(x_p, coverage(x[hits][strandBool[hits]], weight = "score"))
        x_m <- c(x_m, coverage(x[hits][!strandBool[hits]], weight = "score"))
      }
    }

    if (strand_mode != "single") { # Add check for readlength
      names(x_p) <- names(x_m) <- unique_lengths
    } else {
      names(x_b) <-  unique_lengths
    }

  } else if (is(x, "covRle")) {
    if (strandMode(x)) {
      x_p <- f(x)
      x_m <- r(x)
      strand_mode <- "double"
    }
  } else if (is(x, "RleList")) {
    x_b <- x
    strand_mode <- "single"
  } else stop("x must be GRanges, GAlignments, GAlignmentPairs, RleList or covRle")
  chrs <- seqlevels(x)
  if (!dir.exists(dirname(file))) dir.create(dirname(file), recursive = TRUE, showWarnings = FALSE)
  if (by.readlength) {
    message("- Output fstwig for chr:")
    for (chr in chrs) {
      message(chr)
      if (strand_mode == "single") {
        stop("Not implemented")
      } else {
        if (by.readlength) {
          lengths_to_use <- as.character(sort(unique_lengths))
          dtt_p <- data.table()
          dtt_m <- data.table()
          message("- Inserting column for read length:", appendLF = FALSE)
          for(length in lengths_to_use) {
            message(", ", length, appendLF = FALSE)
            dtt_p <-cbind(dtt_p, data.table(unlist(IntegerList(x_p[[length]][chr]))))
            dtt_m <-cbind(dtt_m, data.table(unlist(IntegerList(x_m[[length]][chr]))))
          }
          colnames(dtt_p) <- colnames(dtt_m) <- lengths_to_use
          dt <- list(dtt_p, dtt_m)
        } else {
          dt <- list(data.table(unlist(IntegerList(x_p[chr]))), data.table(unlist(IntegerList(x_m[chr]))))
        }

      }
      file_chr <- paste0(file, "_", chr)
      save.fstwig(dt, file_chr, compress = compress)
      rm(dt)
    }
  } else {
    stop("Not implemented")
  }

  return(invisible(NULL))
}

save.fstwig <- function(x, file, compress = 50) {
  mode <- ifelse(is(x, "data.table"), "single", "double")
  file <- gsub("\\.fstwig", "", file, ignore.case = TRUE)
  if (mode == "single") {
    file <- paste0(file, ".fstwig")
    fst::write_fst(x, file, compress = compress)
  } else {
    file <- gsub("\\.fstwig", "", file, ignore.case = TRUE)
    forward_file <- paste0(file, "_forward.fstwig")
    reverse_file <- paste0(file, "_reverse.fstwig")

    fst::write_fst(x[[1]], path = forward_file, compress = compress)
    fst::write_fst(x[[2]], path = reverse_file, compress = compress)
  }
  return(invisible(NULL))
}

export.cov <- function(x, file, seqinfo, split.by.strand = TRUE,
                       weight = "score") {
  stopifnot(is(seqinfo, "Seqinfo"))
  if (!is(x, "covRle") & !is(x, "RleList")) {
    seqlevels(x) <- seqlevels(seqinfo)
    seqinfo(x) <- seqinfo
    x <- covRleFromGR(x, weight = weight, ignore.strand = !split.by.strand)
  }
  seqinfo(x) <- seqinfo
  file <- paste0(gsub("\\.covrds", "", file, ignore.case = TRUE), ".covrds")
  saveRDS(x, file = file)
}

export.covlist <- function(x, file, seqinfo, split.by.strand = TRUE,
                       weight = "score", verbose = TRUE) {
  stopifnot(is(seqinfo, "Seqinfo"))
  if (!is(x, "covRleList")) {
    seqlevels(x) <- seqlevels(seqinfo)
    seqinfo(x) <- seqinfo

    all_readl_lengths <- readWidths(x)
    read_lengths <- sort(unique(all_readl_lengths))
    if (verbose) message("Readlength:", appendLF = FALSE)
    list <- list()
    for (i in read_lengths) {
      if (verbose) message(", ", i, appendLF = FALSE)
      list <- c(list, covRleFromGR(x[all_readl_lengths == i],
                                   weight = weight,
                                   ignore.strand = !split.by.strand))
    }
    x <- covRleList(list, fraction = read_lengths)
  }
  file <- paste0(gsub("\\.covrds", "", file, ignore.case = TRUE), ".covrds")
  saveRDS(x, file = file)
}

#' Store GRanges object as .bedo
#'
#' .bedo is .bed ORFik, an optimized bed format for coverage reads with
#' read lengths .bedo is a text based format with columns (6 maximum):\cr
#' 1. chromosome\cr 2. start\cr 3. end\cr 4. strand\cr
#' 5. ref width (cigar # M's, match/mismatch total)\cr
#' 6. duplicates of that read\cr
#'
#' Positions are 1-based, not 0-based as .bed.
#' End will be removed if all ends equals all starts.
#' Import with import.bedo
#' @param object a GRanges object
#' @param out a character, location on disc (full path)
#' @return NULL, object saved to disc
#' @importFrom data.table fwrite
#'
export.bedo <- function(object, out) {
  if (!is(object, "GRanges")) stop("object must be GRanges")
  dt <- setDT(as.data.frame(object))
  uwidths <- unique(dt$width)
  if (all(uwidths == 1)) dt$end <- NULL
  dt$width <- NULL
  fwrite(dt, file = out)
}

#' Store GAlignments object as .bedoc
#'
#' A fast way to store, load and use bam files.
#' (we now recommend using \code{link{export.ofst}} instead!)\cr
#' .bedoc is .bed ORFik, an optimized bed format for coverage reads with
#' cigar and replicate number.\cr
#' .bedoc is a text based format with columns (5 maximum):\cr
#' 1. chromosome\cr 2. cigar: (cigar # M's, match/mismatch total)
#' \cr 3. start (left most position) \cr 4. strand (+, -, *)\cr
#' 5. score: duplicates of that read\cr
#'
#' Positions are 1-based, not 0-based as .bed.
#' Import with import.bedoc
#' @param object a GAlignments object
#' @param out a character, location on disc (full path)
#' @return NULL, object saved to disc
#' @importFrom data.table fwrite
#' @export
#'
export.bedoc <- function(object, out) {
  if (!is(object, "GAlignments"))
    stop("object must be of class GAlignments!")
  dt <- data.table(seqnames = as.character(seqnames(object)),
                   start = start(ranges(object)),
                   cigar = cigar(object),
                   strand = as.character(strand(object)))
  if (!is.null(mcols(object)$score)) dt$score = mcols(object)$score
  fwrite(dt, file = out)
}

#' Store GRanges / GAlignments object as .ofst
#'
#' A much faster way to store, load and use bam files.\cr
#' .ofst is ORFik fast serialized object,
#' an optimized format for coverage reads with
#' cigar and replicate number. It uses the fst format as back-end:
#' \code{\link{fst-package}}.\cr A .ofst ribo seq file can compress the
#' information in a bam file from 5GB down to a few MB. This new files has
#' super fast reading time, only a few seconds, instead of minutes. It also has
#' random index access possibility of the file. \cr
#' .ofst is represented as a data.frane format with minimum 4 columns:\cr
#' 1. chromosome\cr  2. start (left most position) \cr 3. strand (+, -, *)\cr
#' 4. width (not added if cigar exists)\cr
#' 5. cigar (not needed if width exists):
#'  (cigar # M's, match/mismatch total) \cr
#' 5. score: duplicates of that read\cr
#' 6. size: qwidth according to reference of read\cr\cr
#' If file is from GAlignmentPairs, it will contain a cigar1, cigar2 instead
#' of cigar and start1 and start2 instead of start
#'
#' Other columns can be named whatever you want and added to meta columns.
#' Positions are 1-based, not 0-based as .bed.
#' Import with import.ofst
#' @param x a GRanges, GAlignments or GAlignmentPairs object
#' @param ... additional arguments for write_fst
#' @return NULL, object saved to disc
#' @importFrom fst write_fst
#' @export
#' @examples
#' ## GRanges
#' gr <- GRanges("1:1-3:-")
#' # export.ofst(gr, file = "path.ofst")
#' ## GAlignment
#' # Make input data.frame
#' df <- data.frame(seqnames = "1", cigar = "3M", start = 1L, strand = "+")
#' ga <- ORFik:::getGAlignments(df)
#' # export.ofst(ga, file = "path.ofst")
setGeneric("export.ofst", function(x, ...) standardGeneric("export.ofst"))

#' @inherit export.ofst
#' @param file a character, location on disc (full path)
setMethod("export.ofst", "GRanges",
          function(x, file, ...) {
            df <- data.frame(seqnames = x@seqnames,
                             start = x@ranges@start,
                             width = x@ranges@width,
                             strand = x@strand)
            if (!is.null(x@ranges@NAMES)) df$NAMES <- x@ranges@NAMES
            if (ncol(x@elementMetadata) > 0)
              df <- as.data.frame(cbind(df, x@elementMetadata))
            write_fst(df, file,...)
          })

#' @inherit export.ofst
#' @param file a character, location on disc (full path)
setMethod("export.ofst", "GAlignments",
          function(x, file, ...) {
            df <- data.frame(seqnames = x@seqnames,
                             start = x@start,
                             cigar = factor(x@cigar, levels = unique(x@cigar)),
                             strand = x@strand)
            if (!is.null(x@NAMES)) df$NAMES <- x@ranges@NAMES
            if (ncol(x@elementMetadata) > 0)
              df <- as.data.frame(cbind(df, x@elementMetadata))
            write_fst(df, file,...)
          })

#' @inherit export.ofst
#' @param file a character, location on disc (full path)
setMethod("export.ofst", "GAlignmentPairs",
          function(x, file, ...) {
            # There is always equal seqname in a pair,
            # strand is always reverse of the other
            df <- data.frame(seqnames = x@first@seqnames,
                             start1 = x@first@start,
                             start2 = x@last@start,
                             cigar1 = factor(x@first@cigar,
                                             levels = unique(x@first@cigar)),
                             cigar2 = factor(x@last@cigar,
                                             levels = unique(x@last@cigar)),
                             strand = x@first@strand)
            if (!is.null(x@NAMES)) df$NAMES <- x@NAMES # Check that this is correct
            if (ncol(x@elementMetadata) > 0)
              df <- as.data.frame(cbind(df, x@elementMetadata))
            write_fst(df, file,...)
          })
Roleren/ORFik documentation built on April 25, 2024, 8:41 p.m.