R/density.matrix.R

Defines functions density.matrix

Documented in density.matrix

#' @title Density matrix builder
#'
#' @description A function (completely in R) that generates a matrix given a list of regions (.bed files) and signals (.bigWig files) alternative (even though more time consuming) to \link{computeMatrix.deeptools}. The output can be passed as it is to the functions \link{plot.density.profile}, \link{plot.density.summary} and, \link{plot.density.differences}.
#'
#' @param mode A string indicating the method for the matrix computation:
#' \itemize{
#'   \item \code{scale-regions} all regions in the BED file are stretched or shrunken to the length (in bases) indicated by the user (\code{body.length});
#'   \item \code{reference-point} the matrix will be performed on the range -upstream+downstream from the indicated reference point (center, TSS, TES).
#'  }
#' @param regions.list A string vector with a list of full paths to bed files or list of data.frames in at least BED3 format (eg. generated by \link{build.bed}).
#' @param samples.list A string vector with a list of full paths to bigWig files.
#' @param region.names A string vector with the names of the regions. If \code{NULL} or of length lower than the number of regions the names will be assigned using the basename of the file if a path is provided otherwise "region_<order number>". By default \code{NULL}.
#' @param sample.names A string vector with the names of the samples. If \code{NULL} or of length lower than the number of samples the names will be assigned using the basename of the file. By default \code{NULL}.
#' @param sort.regions.coordinates Logical value to define whether the output matrix should contain the regions sorted by genomic location for each region group (sorted by \link{sort.bed}). By default \code{FALSE}.
#' @param reference.point The reference point for the matrix generation could be either the region start (\code{"TSS"}), the region end (\code{"TES"}) or the \code{"center"} of the region. By default \code{"center"}.
#' @param reference.point.label A single string with the label for the reference point that could be used for the plots.
#' @param upstream Distance, in bases (bp), upstream of the reference-point, in "reference-point" mode, or the region start, in "scale-regions" mode. By default \code{500}.
#' @param downstream Distance, in bases (bp), downstream of the reference-point, in "reference-point" mode, or the region start, in "scale-regions" mode. By default \code{500}.
#' @param body.length Distance, in bases (bp), to which all regions will be fit. By default: \code{1000}.
#' @param missing.data.as.zero A logical value to define whether missing data (NAs) should be treated as zeros. By default \code{FALSE}.
#' @param bin.size Length, in bases (bp), of the non-overlapping bins for averaging the score over the regions length. By default \code{10}.
#' @param binning.operation A single string to define the type of statistic that should be used over the bin size range. The options are: "mean", "median", "sum". By default \code{"mean"}.
#' @param stranded Logical value to indicate whether the strand of the region should be taken into account. When \code{TRUE}, the order of the bigWig score for the given region will be reversed. Default \code{FALSE}.
#'
#' @return The function returns a named list containing:
#' \itemize{
#'   \item \code{metadata} data.frame with the parameters used to build the matrix;
#'   \item \code{matrix.data} data.frame with the computed scores;
#'   \item \code{original.file.path} with the string: "Matrix generated by Rseb::density.matrix()".
#'  }
#'  This list can be passed as it is to the functions \link{plot.density.profile}, \link{plot.density.summary} and, \link{plot.density.differences}.
#'
#' @export density.matrix

density.matrix = function(mode,
                          regions.list,
                          samples.list,
                          region.names = NULL,
                          sample.names = NULL,
                          sort.regions.coordinates = FALSE,
                          reference.point = "center",
                          reference.point.label = NULL,
                          upstream = 500,
                          downstream = 500,
                          body.length = 1000,
                          missing.data.as.zero = FALSE,
                          bin.size = 10,
                          binning.operation = "mean",
                          stranded = FALSE) { # BEGIN FUNCTION

  ##########################################################################################
  # Check if Rseb is up-to-date #
  Rseb::actualize(update = F, verbose = F)

  ##########################################################################################
  # CHECK PARAMETERS
  ##########################################################################################

  # Regions names
  if ((length(region.names) < length(regions.list)) | is.null(region.names)) {
    message("The number of region.names is NULL or lower than the number of provided regions files. The region.names will automatically redifined.")

    if (class(regions.list) == "character") {
      region.names =
        sapply(regions.list, function(x){
          if (class(x) == "character") {return(gsub(x = basename(x), pattern = "[.].[A-z]*$", replacement = ""))} else {
            return(paste("region", which(sapply(regions.list, function(y) {y == x}), sapply(regions.list, function(y) {y == x}) == TRUE), sep="_"))
          }
        })
    } else {
      if ("data.frame" %in% class(regions.list)) {regions.list = list(regions.list)}
      region.names = paste0("region_", c(1:length(regions.list)))
    }
  }


  # Sample names
  if (length(sample.names) < length(samples.list) | is.null(sample.names)) {
    message("The number of sample.names is NULL or lower than the number of provided regions files. The sample.names will automatically redifined.")
    sample.names = sapply(samples.list, function(x){return(gsub(x = basename(x), pattern = "[.].[A-z]*$", replacement = ""))})
  }

  # Reference point
  if (!(tolower(mode) %in% c("reference-point", "scale-regions"))) {
    return(warning("The mode must be one among 'reference-point' and 'scale-regions'."))
  }

  # Check bin.size compatibility
  if (mode == "reference-point") {
    if ((upstream+downstream) %% bin.size != 0) {return(warning("The region size (upstream + downstream) must be multiple of the bin.size"))}
  } else {
    if (upstream %% bin.size != 0) {return(warning("The upstream region size must be multiple of the bin.size"))}
    if (downstream %% bin.size != 0) {return(warning("The downstream region size must be multiple of the bin.size"))}
    if (body.length %% bin.size != 0) {return(warning("The body.length region size must be multiple of the bin.size"))}
  }

  # Reference-point
  if (!(tolower(reference.point) %in% c("tss", "tes", "center")) | length(reference.point) > 1) {
    return(warning("The reference.point must be one among 'TSS', 'TES', 'center'. Default: 'center'."))
  }

  # Reference-point-labels
  if (is.null(reference.point.label)) {
    reference.point.label = reference.point
  } else if (length(reference.point.label) > 1) {
    return(warning("The reference point label must be a single string."))
  }

  # Set ranges as positive values
  upstream = abs(upstream)
  downstream = abs(downstream)
  bin.size = abs(bin.size)
  body.length = ifelse(test = mode == "reference-point",
                       yes = 0,
                       no = abs(body.length))

  # Binning.operation check
  if (!(tolower(binning.operation) %in% c("mean", "median", "sum"))) {
    return(warning("The reference.point must be one among 'mean', 'median', 'sum'. Default: 'mean'."))
  }


  ##########################################################################################
  # INTERNAL FUNCTIONS DEFINITION
  ##########################################################################################

  ###############
  import_bed = function(bed, mode, upstream, downstream, reference.point, sort.regions.coordinates) {

    # Required libraries
    require(GenomicRanges)
    require(dplyr)

    # importing bed file/regions
    if (class(bed) == "character") {regions = read.delim(file = bed, sep = "\t", header = F)
    } else if (class(bed) %in% c("data.frame", "tibble")) {regions = as.data.frame(bed)}

    if (sort.regions.coordinates == TRUE) {
      bed = Rseb::sort.bed(bed = bed, return.bed = T)
    }

    # Normalize bed file to a BED6
    if (class(regions) == "data.frame") {
      if (ncol(regions) >= 6) {regions = regions[,1:6]
      } else if (ncol(regions) >= 3 & ncol(regions) < 6) {
        if (ncol(regions) == 3) {
          regions =
            regions %>%
            dplyr::mutate(name = paste("region", 1:nrow(regions), sep="_"),
                          score = 0,
                          strand = ".")
        } else if (length(regions) == 4) {
          regions =
            regions %>%
            dplyr::mutate(score = 0,
                          strand = ".")
        } else if (length(regions) == 5) {
          regions = regions %>% dplyr::mutate(strand = ".")
        }
      }
    }

    colnames(regions) = c("chr", "start", "end", "name", "score", "strand")
    regions$strand = sapply(regions$strand, function(x){ifelse(test = x %in% c("+", "-", "*"),
                                                               yes = x,
                                                               no = "*")})


    # Redefine regions depending on mode
    if (mode == "reference-point") {
      if (tolower(reference.point) == "center") {
        reference.position = round((regions$start + regions$end) / 2)

      } else if (toupper(reference.point) == "TSS") {
        reference.position = ifelse(test = regions$strand == "-",
                                    yes = regions$end,
                                    no = regions$start)

      } else if (toupper(reference.point) == "TES") {
        reference.position = ifelse(test = regions$strand == "-",
                                    yes = regions$start,
                                    no = regions$end)
      }

      start = ifelse(test = regions$strand == "-",
                     yes = reference.position - downstream + 1,
                     no =  reference.position - upstream)

      end = ifelse(test = regions$strand == "-",
                   yes = reference.position + upstream,
                   no =  reference.position + downstream - 1)


    } else if (mode == "scale-regions") {
      start = ifelse(test = regions$strand == "-",
                     yes = regions$start - downstream,
                     no = regions$start - upstream)

      end = ifelse(test = regions$strand == "-",
                   yes = regions$end + upstream,
                   no =  regions$end + downstream)
    }


    # Convert regions to GRanges object
    return(unique(GRanges(seqnames = regions$chr,
                          ranges = IRanges(start = start,
                                           end = end,
                                           names = regions$name),
                          strand = regions$strand,
                          score = regions$score)))
  } # end import.bed


  ###########
  get.single.base.scores = function(bigWig, regions, missing.data.as.zero) {
    require(rtracklayer)

    score_list = list()

    for (i in 1:length(regions)) {
      # Chek if the bigWig chrom names are the same than in the bed regions
      if (suppressWarnings(inherits(try(import(BigWigFile(bigWig), selection = regions[i], as = 'NumericList')[[1]],
                       silent = TRUE),
                   "try-error"))) { # it is TRUE when it does not work due to chromosome names in the bigWig != bed
        regions[i] = diffloop::rmchr(regions[i])
      }

      score_list[[i]] = import(BigWigFile(bigWig), selection = regions[i], as = 'NumericList')[[1]]
      if ((strand(regions[i])@values[1] == "-") & stranded == TRUE) {score_list[[i]] = rev(score_list[[i]])}
    }

    if (missing.data.as.zero == TRUE) {purrr::pmap(.l = score_list,
                                                   .f = function(score_list){
                                                     score_list[is.na(score_list)] = 0
                                                     return(score_list)})}

    if (missing.data.as.zero == TRUE) {
      lapply(score_list,
             function(x){
               x[is.na(x)] = 0
               return(x)})
    }

    return(score_list)

  } # end get.single.base.scores


  ###########
  compute.bins = function(score_list,
                          mode,
                          bin.size,
                          binning.operation,
                          upstream,
                          downstream,
                          body.length) {
    ### reference point
    if (mode == "reference-point") {
      # Build a matrix
      matrix = do.call(rbind, score_list)

      # Compute binning (when bin.size > 1)
      if (binning.operation == "mean" & bin.size != 1) {
        bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+downstream), by = bin.size),
                                               bin.right = seq(bin.size, (upstream+downstream), by = bin.size)),
                                     .f = function(bin.left, bin.right) {rowMeans(matrix[,bin.left:bin.right], na.rm = F)})
        return(do.call(cbind, bin.score.list))

      } else if (binning.operation == "sum" & bin.size != 1) {
        bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+downstream), by = bin.size),
                                               bin.right = seq(bin.size, (upstream+downstream), by = bin.size)),
                                     .f = function(bin.left, bin.right) {rowSums(matrix[,bin.left:bin.right], na.rm = F)})
        return(do.call(cbind, bin.score.list))

      } else if (binning.operation == "median" & bin.size != 1) {
        bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+downstream), by = bin.size),
                                               bin.right = seq(bin.size, (upstream+downstream), by = bin.size)),
                                     .f = function(bin.left, bin.right) {robustbase::rowMedians(matrix[,bin.left:bin.right], na.rm = F)})
        return(do.call(cbind, bin.score.list))
      } else {return(matrix)}

    ### scale-regions
    } else if (mode == "scale-regions") {

      ### Re-scaling of the body regions
      # Extract different parts values
      upstream.scores = lapply(score_list, function(x){x[1:upstream]})
      body.scores = lapply(score_list, function(x){x[-c(1:upstream, (length(x) - downstream + 1):length(x))]})
      downstream.scores = lapply(score_list, function(x){x[c((length(x) - downstream + 1):length(x))]})

      # Get at least..body.length-size vectors
      resized.body.scores = lapply(body.scores, function(x){if (length(x) < body.length){return(rep(x, each = ceiling(body.length/length(x))))} else {return(x)}})

      # Split in body.length chunks and calculate the mean for each chunk
      rescaled.scores = lapply(resized.body.scores, function(x){
        scaled = sapply(split(x, ceiling(seq_along(x)/(length(x)/body.length))), mean, USE.NAMES = F)
        if (length(scaled) > body.length) {scaled = scaled[body.length] = mean(scaled[c(body.length:length(scaled))])}
        return(scaled[1:body.length])
        })


      # Merge (upstream + body.length + downstream) matrix
      matrix = do.call(rbind, purrr::pmap(.l = list(up = upstream.scores, body = rescaled.scores, down = downstream.scores),
                                          .f = function(up, body, down) {return(c(up, body, down))}))

      # Compute binning
      if (binning.operation == "mean" & bin.size != 1) {
        bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+body.length+downstream), by = bin.size),
                                               bin.right = seq(bin.size, (upstream+body.length+downstream), by = bin.size)),
                                     .f = function(bin.left, bin.right) {rowMeans(matrix[,bin.left:bin.right], na.rm = F)})
        return(do.call(cbind, bin.score.list))

      } else if (binning.operation == "sum" & bin.size != 1) {
        bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+body.length+downstream), by = bin.size),
                                               bin.right = seq(bin.size, (upstream+body.length+downstream), by = bin.size)),
                                     .f = function(bin.left, bin.right) {rowSums(matrix[,bin.left:bin.right], na.rm = F)})
        return(do.call(cbind, bin.score.list))

      } else if (binning.operation == "median" & bin.size != 1) {
        bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+body.length+downstream), by = bin.size),
                                               bin.right = seq(bin.size, (upstream+body.length+downstream), by = bin.size)),
                                     .f = function(bin.left, bin.right) {robustbase::rowMedians(matrix[,bin.left:bin.right], na.rm = F)})
        return(do.call(cbind, bin.score.list))
      } else {return(matrix)}
    } # end scale-regions
  } # end compute.bins


  ##########################################################################################
  # COMPUTE WHOLE MATRIX
  ##########################################################################################
  regions.boundaries = 0
  group.matrices = list()

  for (r in 1:length(regions.list)) {
    current.bed = import_bed(bed = regions.list[[r]],
                             mode = mode,
                             upstream = upstream,
                             downstream = downstream,
                             reference.point = reference.point,
                             sort.regions.coordinates = sort.regions.coordinates)
    regions.boundaries = c(regions.boundaries, length(current.bed) + regions.boundaries[length(regions.boundaries)])


    all.samples.matrces = list()
    for (s in 1:length(samples.list)) {
      all.samples.matrces[[s]] = compute.bins(score_list = get.single.base.scores(bigWig = samples.list[s],
                                                                                  regions = current.bed,
                                                                                  missing.data.as.zero = missing.data.as.zero),
                                              mode = mode,
                                              bin.size = bin.size,
                                              binning.operation = binning.operation,
                                              upstream = upstream,
                                              downstream = downstream,
                                              body.length = body.length)
    }

    group.matrices[[r]] = cbind(as.data.frame(current.bed, row.names = c()),
                                as.data.frame(do.call(cbind, all.samples.matrces)))
  } # end matrices list generation

  whole.matrix = do.call(rbind, group.matrices)

  # Define sample boundaries
  samples.boundaries = 0
  length_samples = ifelse(test = mode == "reference-point",
                          yes = (upstream + downstream)/bin.size,
                          no = (upstream + body.length + downstream)/bin.size)
  for (i in 1:length(samples.list)) {samples.boundaries = c(samples.boundaries, length_samples + samples.boundaries[length(samples.boundaries)])}


  ##########################################################################################
  # METADATA TABLE
  ##########################################################################################

  metadata = data.frame(upstream = paste(rep(upstream, length(samples.list)), collapse=","),
                        downstream = paste(rep(upstream, length(samples.list)), collapse=","),
                        body = ifelse(test = mode == "scale-regions",
                                      yes = paste(rep(body.length, length(samples.list)), collapse=","),
                                      no = paste(rep(0, length(samples.list)), collapse=",")),
                        bin_size = paste(rep(bin.size, length(samples.list)), collapse=","),
                        ref_point = paste(rep(reference.point.label, length(samples.list)), collapse=","),
                        verbose = "false",
                        bin_avg_type = binning.operation,
                        missing_data_as_zero = tolower(as.character(missing.data.as.zero)),
                        min_threshold = "null",
                        max_threshold = "null",
                        scale = 1,
                        skip_zeros = "false",
                        nan_after_end = "false",
                        proc_number = parallel::detectCores(),
                        sort_regions = ifelse(test = sort.regions.coordinates == TRUE, yes = "sorted", no = "keep"),
                        sort_using = ifelse(test = sort.regions.coordinates == TRUE, yes = "coordinates", no = "null"),
                        unscaled_5_prime = "null",
                        unscaled_3_prime = "null",
                        group_labels = paste(region.names, collapse = ","),
                        group_boundaries = paste(regions.boundaries, collapse = ","),
                        sample_labels = paste(sample.names, collapse = ","),
                        sample_boundaries = paste(samples.boundaries, collapse = ",")
  )


  ##########################################################################################
  # OUTPUT
  ##########################################################################################

  return(list(metadata = data.frame(parameters = colnames(metadata),
                                    values = t(metadata)[,1],
                                    row.names = c()),
              matrix.data = whole.matrix,
              original.file.path = "Matrix generated by Rseb::density.matrix()"))

} # END FUNCTION
sebastian-gregoricchio/Rseb documentation built on Sept. 4, 2024, 1:59 p.m.