R/Utility_functions.R

Defines functions merge_GRanges flip_strand_info write_grl_as_bed12 save_GRanges_as_bedGraph

Documented in flip_strand_info merge_GRanges save_GRanges_as_bedGraph write_grl_as_bed12

#' Save stranded GRanges object as bedGraph file with strand info encoded in the sign of values
#'
#' @param gr \code{GRanges} object with a numeric column in mcols.
#' @param filepath Valid path to the output file.
#' @param colname Character (must match the name of the numeric column).
#' @section Details:
#' This function saves a gzipped bedGraph file which contains both positive and negative values
#' (to visualize the coverage of forward and reverse strands in a genome browser).
#' @export
save_GRanges_as_bedGraph <- function(gr, filepath, colname = "score") {
  stopifnot(class(gr) == "GRanges")
  stopifnot(is.character(filepath) && length(filepath) == 1)
  stopifnot(is.character(colname) && length(colname) == 1 && !is.null(S4Vectors::mcols(gr)[[colname]]))
  S4Vectors::mcols(gr)[[colname]] <- ifelse(BiocGenerics::strand(gr)=="-", -S4Vectors::mcols(gr)[[colname]], S4Vectors::mcols(gr)[[colname]])
  BiocGenerics::strand(gr) <- "*"
  gr <- BiocGenerics::sort(gr)
  if (!stringr::str_detect(filepath, "gz$")) {
    filepath <- stringr::str_c(filepath, ".gz")
  }
  con <- gzfile(filepath, "w")
  writeLines("track type=bedGraph color=0,0,0 altColor=128,128,128", con)
  rtracklayer::export.bedGraph(gr, con)
  close(con)
}

# ------------------------------------------------------------------------------------------------------------------------

#' Save GRangesList object as BED12 file
#'
#' @param grl \code{GRangesList} object.
#' @param filename Valid path to the output file.
#' @param name_in_mcols Logical.
#' @section Details:
#' This function does the same work as \code{rtracklayer::export(format = "BED")},
#' with the added benefit that the \code{S4Vectors::mcols(grl)$score} is properly exported.\cr
#' If \code{name_in_mcols == TRUE}, then the \code{S4Vectors::mcols(grl)$name} is exported as range names (otherwise the \code{names(grl)} is used).\cr
#' Observe that the \code{default_score = "."} and \code{default_itemRgb = "."} are OK for IGV browser only.
#' The UCSC browser wants numeric values instead, e.g. \code{default_score = "0"} and \code{default_itemRgb = "0,0,0"}
#' @export
write_grl_as_bed12 <- function(grl, filename, name_in_mcols = FALSE) {
  stopifnot(BiocGenerics::grepl("GRangesList", class(grl)))
  stopifnot(is.character(filename) && length(filename) == 1)
  stopifnot(is.logical(name_in_mcols) && length(name_in_mcols) == 1)
  gr <- range(grl) %>% BiocGenerics::unlist(use.names = FALSE)
  grl_unl <- BiocGenerics::unlist(grl, use.names = FALSE)
  idx_fw <- BiocGenerics::lapply(S4Vectors::elementNROWS(grl), function(x) { seq(1, x) }) %>% BiocGenerics::unlist() %>% unname()
  idx_rev <- BiocGenerics::lapply(S4Vectors::elementNROWS(grl), function(x) { seq(x, 1) }) %>% BiocGenerics::unlist() %>% unname()
  idx <- ifelse(BiocGenerics::strand(grl_unl) == "+", idx_fw, idx_rev)
  mc <- grl_unl[idx == 1] %>% S4Vectors::mcols()
  chrom <- GenomeInfoDb::seqnames(gr) %>% as.character()
  chromStart <- BiocGenerics::start(gr)
  chromEnd <- BiocGenerics::end(gr)
  if (isTRUE(name_in_mcols)) {
    if (is.null(mc$name)) {
      name <- "."
    } else {
      name <- mc$name
    }
  } else {
    if (is.null(names(grl))) {
      name <- "."
    } else {
      name <- names(grl)
    }
  }
  if (is.null(mc$score)) {
    score <- "."
  } else {
    score <- mc$score
  }
  strand <- BiocGenerics::strand(gr) %>% as.character()
  if (is.null(mc$thick)) {
    thickStart <- chromStart
    thickEnd <- chromEnd
  } else {
    thickStart <- mc$thick %>% BiocGenerics::start()
    thickEnd <- mc$thick %>% BiocGenerics::end()
  }
  if (is.null(mc$itemRgb)) {
    itemRgb <- "."
  } else {
    itemRgb <- mc$itemRgb
  }
  blockCount <- S4Vectors::elementNROWS(grl)
  blockSizes <- BiocGenerics::width(grl) %>% BiocGenerics::lapply(stringr::str_c, collapse = ",") %>% BiocGenerics::unlist()
  blockStarts <- BiocGenerics::start(grl) %>% `-`(BiocGenerics::start(gr)) %>% BiocGenerics::lapply(stringr::str_c, collapse = ",") %>% BiocGenerics::unlist()
  # Ensure zero offset (bugfix 2023-09-21):
  chromStart <- chromStart - 1
  thickStart <- thickStart - 1
  tbl <- dplyr::tibble(chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts)
  tbl <- dplyr::arrange(tbl, chrom, chromStart, chromEnd)
  readr::write_tsv(tbl, filename, col_names = FALSE)
}

# ----------------------------------------------------------------------------------------------------------------

#' Flip strand orientation of a GenomicRanges object
#'
#' @param gr \code{GRanges} or \code{GRangesList} object
#' @return Object of the same class as input with positive and negative strands flipped
#' @export
flip_strand_info <- function(gr) {
  stopifnot(class(gr) == "GRanges" || BiocGenerics::grepl("GRangesList", class(gr)))
  if (class(gr) == "GRanges") {
    BiocGenerics::strand(gr) <- ifelse(BiocGenerics::strand(gr) == "+", "-", "+")
  } else {
    unl <- BiocGenerics::unlist(gr, use.names = FALSE)
    BiocGenerics::strand(unl) <- ifelse(BiocGenerics::strand(unl) == "+", "-", "+")
    gr <- BiocGenerics::relist(unl, gr)
  }
  return(gr)
}

# -----------------------------------------------------------------------------------------------------------------

#' Merge coverage of GRanges objects
#'
#' @param gr_list List of \code{GRanges} objects with \code{score} metadata column
#' @return \code{GRanges} object
#' @export
merge_GRanges <- function(gr_list) {
  stopifnot(class(gr_list) == "list" || BiocGenerics::grepl("GRangesList", class(gr_list)))
  stopifnot(length(gr_list) > 0)
  stopifnot(all(unlist(lapply(gr_list, function(x) { !is.null(BiocGenerics::score(x)) }))))
  cov_fw <- gr_list %>% BiocGenerics::lapply(function(x) { GenomicRanges::coverage(x[BiocGenerics::strand(x) == "+"], weight = "score") }) %>% BiocGenerics::Reduce(`+`, .) %>% GenomicRanges::bindAsGRanges(score = .)
  cov_rev <- gr_list %>% BiocGenerics::lapply(function(x) { GenomicRanges::coverage(x[BiocGenerics::strand(x) == "-"], weight = "score") }) %>% BiocGenerics::Reduce(`+`, .) %>% GenomicRanges::bindAsGRanges(score = .)
  BiocGenerics::strand(cov_fw) <- "+"
  BiocGenerics::strand(cov_rev) <- "-"
  out <- c(cov_fw, cov_rev)
  out <- out[BiocGenerics::score(out) > 0]
  return(BiocGenerics::sort(out))
}

# -----------------------------------------------------------------------------------------------------------------------

convert_GRanges_to_coverage <- function(gr) {
  # Input: GRanges without score column (e.g. overlapping reads);
  # Output: GRanges with non-overlapping ranges and score;
  cov_fw <- GenomicRanges::coverage(gr[BiocGenerics::strand(gr) == "+"]) %>% GenomicRanges::bindAsGRanges(score = .)
  cov_rev <- GenomicRanges::coverage(gr[BiocGenerics::strand(gr) == "-"]) %>% GenomicRanges::bindAsGRanges(score = .)
  strand(cov_fw) <- "+"
  strand(cov_rev) <- "-"
  out <- c(cov_fw, cov_rev)
  out <- out[BiocGenerics::score(out) > 0] %>% GenomeInfoDb::sortSeqlevels() %>% BiocGenerics::sort()
  return(out)
}

# ----------------------------------------------------------------------------------------------------------------------

find_uniq_max <- function(x) {
  out <- rep(FALSE, times = length(x))
  idx <- BiocGenerics::which.max(x)
  out[[idx]] <- TRUE
  return(out)
}

# ------------------------------------------------------------------------------------------------------------------------

find_uniq_min <- function(x) {
  out <- rep(FALSE, times = length(x))
  idx <- BiocGenerics::which.min(x)
  out[[idx]] <- TRUE
  return(out)
}

# ------------------------------------------------------------------------------------------------------------------------

poverlaps_gr <- function(gr1, gr2, maxgap = 0L, minoverlap = 1L, type = c("any", "start", "end", "within", "equal"), ignore.strand = FALSE) {
  stopifnot(length(gr1) == length(gr2))
  goodSeqnames <- as.character(GenomeInfoDb::seqnames(gr1)) == as.character(GenomeInfoDb::seqnames(gr2))
  if (isTRUE(ignore.strand)) {
    goodStrands <- TRUE
  } else {
    goodStrands <- GenomicRanges:::compatibleStrand(BiocGenerics::strand(gr1), BiocGenerics::strand(gr2))
  }
  goodRanges <- IRanges::poverlaps(IRanges::ranges(gr1), IRanges::ranges(gr2), maxgap, minoverlap, type)
  return(goodSeqnames & goodStrands & goodRanges)
}

# -------------------------------------------------------------------------------------------------------------------------

sort_grl <- function(grl) {
  orig_names <- names(grl)
  names(grl) <- 1:length(grl)
  nms <- grl %>% range() %>% BiocGenerics::unlist() %>% BiocGenerics::sort() %>% names()
  idx <- BiocGenerics::match(nms, names(grl))
  out <- grl[idx]
  names(out) <- orig_names[idx]
  return(out)
}

# -------------------------------------------------------------------------------------------------------------------------

deduplicate_grl <- function(grl, colname = "score") {
  uniq_ranges <- grl %>% BiocGenerics::unlist(use.names = FALSE) %>% BiocGenerics::unique()
  gr <- BiocGenerics::unlist(grl, use.names = FALSE)
  idx <- GenomicRanges::findOverlaps(gr, uniq_ranges, type = "equal", select = "first")
  group <- rep(1:length(grl), times = S4Vectors::elementNROWS(grl))
  id <- S4Vectors::split(idx, group) %>% BiocGenerics::lapply(stringr::str_c, collapse = ",") %>% as.character() %>% unname()
  dupl <- BiocGenerics::duplicated(id)
  out <- grl[!dupl]
  out_unl <- BiocGenerics::unlist(out, use.names = FALSE)
  tbl_1 <- dplyr::tibble(id = unique(id))
  tbl_2 <- suppressMessages(dplyr::tibble(id = id) %>% dplyr::group_by(id) %>% dplyr::summarize(n = n()))
  tbl <- dplyr::left_join(tbl_1, tbl_2, by = "id")
  S4Vectors::mcols(out_unl) <- NULL
  S4Vectors::mcols(out_unl)[[colname]] <- rep(tbl$n, times = S4Vectors::elementNROWS(out))
  out <- BiocGenerics::relist(out_unl, out)
  return(out)
}

# ----------------------------------------------------------------------------------------------------------------

check_gr_up_down <- function(g1, g2, offset = 1) {
  # Returns character vector of "up", "down" or "over"
  # "up": g2 range starts upstream from g1 range
  # "over": g2 range starts in the "grey zone" (g1 start + offset)
  # "down": g2 range starts downstream from g1 range
  stopifnot(class(g1) == "GRanges")
  stopifnot(class(g2) == "GRanges")
  stopifnot(length(g1) == length(g2))
  grey <- GenomicRanges::resize(g1, offset, "start")
  S4Vectors::mcols(g2)$idx <- 1:length(g2)
  g2 <- GenomicRanges::resize(g2, 1, "start")
  #over <- g2 %over% grey
  over <- poverlaps_gr(g2, grey) ### bugfixed on 2023-09-26
  out_1 <- g2[over]
  if (length(out_1) > 0) {
    S4Vectors::mcols(out_1)$decision <- "over"
  }
  grey <- grey[!over]
  g2 <- g2[!over]
  gap <- GenomicRanges::pgap(grey, g2) %>% GenomicRanges::resize(width(.) + 1, "start")
  S4Vectors::mcols(g2)$decision <- ifelse(gap %over% g2, "down", "up")
  out <- c(out_1, g2) %>% `[`(BiocGenerics::order(S4Vectors::mcols(.)$idx)) %>% S4Vectors::mcols() %>% .$decision
  return(out)
}

# --------------------------------------------------------------------------------------------------------------------

cluster_indexes <- function(mat, show_progress = FALSE) {
  stopifnot(all(mat[, 1] < mat[, 2]))
  if (any(BiocGenerics::duplicated(mat))) {
    mat <- BiocGenerics::unique(mat)
  }
  col1 <- as.character(mat[, 1])
  col2 <- as.character(mat[, 2])
  cln <- 1
  d <- collections::dict()
  d$set(col1[[1]], cln)
  d$set(col2[[1]], cln)
  if (isTRUE(show_progress)) {
    pb <- utils::txtProgressBar(min = 2, max = nrow(mat), style = 3)
  }
  if (nrow(mat) >= 2) {
    for (i in 2:nrow(mat)) {
      if (isTRUE(show_progress)) {
        utils::setTxtProgressBar(pb, i)
      }
      key1 <- col1[[i]]
      key2 <- col2[[i]]
      lgl1 <- d$has(key1)
      lgl2 <- d$has(key2)
      if (!lgl1 && !lgl2) {
        cln <- cln + 1
        d$set(key1, cln)
        d$set(key2, cln)
      } else if (lgl1 && !lgl2) {
        d$set(key2, d$get(key1))
      } else if (!lgl1 && lgl2) {
        d$set(key1, d$get(key2))
      } else {
        val1 <- d$get(key1)
        val2 <- d$get(key2)
        if (val1 != val2) {
          d$set(key1, val1)
          d$set(key2, val1)
          bad <- BiocGenerics::which(BiocGenerics::unlist(d$values()) == val2)
          bad_keys <- BiocGenerics::unlist(d$keys())[bad]
          for (j in seq_along(bad_keys)) {
            d$set(bad_keys[[j]], val1)
          }
        }
      }
    }
  }
  if (isTRUE(show_progress)) {
    close(pb)
  }
  lst <- d$as_list()
  indexes <- as.integer(names(lst))
  clusters <- as.integer(lst)
  out <- S4Vectors::split(indexes, clusters)
  out <- BiocGenerics::lapply(out, sort)
  return(unname(out))
}

# -----------------------------------------------------------------------------------------------------------------

# Decisions are interpreted from the perspective of gr2:
# "up" = gr2 overlaps 5' end of gr1;
# "down" = gr2 overlaps 3' end of gr1;
# "inside" = gr2 is within gr1;
# "contains" = gr2 includes gr1;
# "exact" = gr2 and gr1 are equal;
# "no_up" = no overlap, gr2 is upstream of gr1;
# "no_down" = no overlap, gr2 is downstream of gr1;

parallel_overlap_type <- function(gr1, gr2) {
  stopifnot(length(gr1) == length(gr2)) # gr1 and gr2 are expected to be parallel
  stopifnot(all(as.character(BiocGenerics::strand(gr1)) %in% c("+", "-"))) # All intervals in gr1 are expected to have strand info. Strandedness of gr2 is not taken into account.
  out <- vector("character", length(gr1))
  a <- BiocGenerics::start(gr2) <= BiocGenerics::start(gr1) & BiocGenerics::end(gr2) < BiocGenerics::end(gr1) & BiocGenerics::end(gr2) >= BiocGenerics::start(gr1) # gr2 overlaps the beginning of gr1
  b <- BiocGenerics::start(gr2) >= BiocGenerics::start(gr1) & BiocGenerics::end(gr2) <= BiocGenerics::end(gr1) # gr2 is within (inside of) gr1
  c <- BiocGenerics::start(gr2) > BiocGenerics::start(gr1) & BiocGenerics::end(gr2) >= BiocGenerics::end(gr1) & BiocGenerics::start(gr2) <= BiocGenerics::end(gr1) # gr2 overlaps the end of gr1
  d <- BiocGenerics::start(gr2) <= BiocGenerics::start(gr1) & BiocGenerics::end(gr2) >= BiocGenerics::end(gr1) # gr2 includes (contains) gr1
  e <- BiocGenerics::start(gr2) == BiocGenerics::start(gr1) & BiocGenerics::end(gr2) == BiocGenerics::end(gr1)
  b[e] <- FALSE
  d[e] <- FALSE
  f <- BiocGenerics::end(gr2) < BiocGenerics::start(gr1) # no overlap
  g <- BiocGenerics::start(gr2) > BiocGenerics::end(gr1)
  no_up <- ifelse(BiocGenerics::strand(gr1) == "+", f, g)
  no_down <- ifelse(BiocGenerics::strand(gr1) == "+", g, f)
  out[no_up] <- "no_up"
  out[no_down] <- "no_down"
  up <- ifelse(BiocGenerics::strand(gr1) == "+", a, c)
  down <- ifelse(BiocGenerics::strand(gr1) == "+", c, a)
  out[up] <- "up"
  out[down] <- "down"
  out[b] <- "inside"
  out[d] <- "contains"
  out[e] <- "exact"
  return(as.factor(out))
}

# ------------------------------------------------------------------------------------------------------------------

trim_by_down_or_upstream_features <- function(windows, features, mode, offset = 1, ignore.strand = FALSE) {
  stopifnot(mode %in% c("down", "up"))
  old_names <- names(windows)
  S4Vectors::mcols(windows)$orig_order <- 1:length(windows) # enumerate windows
  if (offset > 0) {
    if (mode == "down") {
      features <- suppressWarnings(GenomicRanges::trim(GenomicRanges::resize(features, width = width(features) + offset, fix = "end"))) # extend features upstream by the offset value
    } else if (mode == "up") {
      features <- suppressWarnings(GenomicRanges::trim(GenomicRanges::resize(features, width = width(features) + offset, fix = "start")))
    }
  }
  over_any <- IRanges::overlapsAny(windows, features, ignore.strand = ignore.strand)
  out1 <- windows[!over_any] # exclude and save windows which do not overlap any feature
  win_rem <- windows[over_any]
  if (length(win_rem) == 0) {
    return(windows)
  }
  hits <- GenomicRanges::findOverlaps(win_rem, features, ignore.strand = ignore.strand)
  if (length(hits) == 0) {
    return(windows)
  }
  win_par <- win_rem[S4Vectors::queryHits(hits)]
  feat_par <- features[S4Vectors::subjectHits(hits)]
  over_type <- parallel_overlap_type(win_par, feat_par) # detect the type of overlap
  if (mode == "down") {
    bad <- as.logical(BiocGenerics::tapply(over_type, list(S4Vectors::mcols(win_par)$orig_order), function(x) { any(x %in% c("up", "contains", "exact", "no_up", "no_down")) }))
  } else if (mode == "up") {
    bad <- as.logical(BiocGenerics::tapply(over_type, list(S4Vectors::mcols(win_par)$orig_order), function(x) { any(x %in% c("down", "contains", "exact", "no_up", "no_down")) }))
  }
  out2 <- win_rem[bad] # exclude and save windows which overlap any feature in undesired orientation
  out2 <- GenomicRanges::resize(out2, width = 0) # trim such windows to zero width
  out3 <- win_rem[!bad] # continue with windows which overlap at least one downstream or upstream feature
  if (length(out3) == 0) {
    out <- c(out1, out2)
    out <- out[BiocGenerics::order(S4Vectors::mcols(out)$orig_order)]
    S4Vectors::mcols(out)$orig_order <- NULL
    return(out)
  }
  if (mode == "down") {
    out3_zw <- GenomicRanges::resize(out3, width = 0, fix = "start") # resize both windows and features to 1 bp width around start
    feat_zw <- GenomicRanges::resize(features, width = 0, fix = "start")
    if (isTRUE(ignore.strand)) {
      BiocGenerics::strand(feat_zw) <- "*"
    }
    feat_near <- features[GenomicRanges::precede(out3_zw, feat_zw)] # extract the nearest downstream feature start
  } else {
    out3_zw <- GenomicRanges::resize(out3, width = 0, fix = "end")
    feat_zw <- GenomicRanges::resize(features, width = 0, fix = "end")
    if (isTRUE(ignore.strand)) {
      BiocGenerics::strand(feat_zw) <- "*"
    }
    feat_near <- features[GenomicRanges::follow(out3_zw, feat_zw)] # extract the nearest downstream feature start
  }
  out3_trimmed <- GenomicRanges::pgap(out3_zw, feat_near, ignore.strand = ignore.strand) # compute truncated windows as gap between the starts
  S4Vectors::mcols(out3_trimmed) <- S4Vectors::mcols(out3) # restore mcols which were removed in the pgap operation
  out <- c(out1, out2, out3_trimmed) # combine all processed windows
  out <- out[BiocGenerics::order(S4Vectors::mcols(out)$orig_order)] # restore the original order
  S4Vectors::mcols(out)$orig_order <- NULL
  names(out) <- old_names
  return(out)
}


# ------------------------------------------------------------------------------------------------------

pintersect_punion_on_grl <- function(grl_a, grl_b, mode) {
  stopifnot(mode %in% c("pintersect", "punion"))
  stopifnot(BiocGenerics::grepl("GRangesList", class(grl_a)))
  stopifnot(BiocGenerics::grepl("GRangesList", class(grl_b)))
  stopifnot(length(grl_a) == length(grl_b))
  gr_a <- BiocGenerics::unlist(grl_a, use.names = FALSE)
  S4Vectors::mcols(gr_a)$group <- rep(1:length(grl_a), times = S4Vectors::elementNROWS(grl_a))
  gr_b <- BiocGenerics::unlist(grl_b, use.names = FALSE)
  S4Vectors::mcols(gr_b)$group <- rep(1:length(grl_b), times = S4Vectors::elementNROWS(grl_b))
  hits <- GenomicRanges::findOverlaps(gr_a, gr_b)
  par1 <- gr_a[S4Vectors::queryHits(hits)]
  par2 <- gr_b[S4Vectors::subjectHits(hits)]
  same_group <- S4Vectors::mcols(par1)$group == S4Vectors::mcols(par2)$group
  par1 <- par1[same_group]
  par2 <- par2[same_group]
  hits <- hits[same_group]
  if (mode == "pintersect") {
    gr <- GenomicRanges::pintersect(par1, par2)
  } else {
    gr <- GenomicRanges::punion(par1, par2)
  }
  out <- S4Vectors::split(gr, S4Vectors::mcols(par1)$group) %>% GenomicRanges::reduce()
  return(out)
}

# ----------------------------------------------------------------------------------------------------------------

quantile_trim <- function(x, q, value = "idx") {
  stopifnot(value %in% c("idx", "lgl"))
  idx <- BiocGenerics::order(x, decreasing = TRUE)
  x2 <- x[idx]
  last_good <- BiocGenerics::which(cumsum(x2) / sum(x2) >= (1 - q))[[1]]
  if (last_good == length(x)) {
    out <- 1:length(x)
  } else {
    out <- BiocGenerics::which(x >= x2[last_good])
  }
  if (value == "idx") {
    return(out)
  } else {
    lgl <- rep(FALSE, times = length(x))
    lgl[out] <- TRUE
    return(lgl)
  }
}
Maxim-Ivanov/TranscriptomeReconstructoR documentation built on Oct. 3, 2023, 11:19 p.m.