R/readHiC.R

Defines functions readHiC

Documented in readHiC

#' Reads Hi-C Contact Data from File
#' 
#' @param file The pathname of a normalize Hi-C contact matrix file
#' stored as a whitespace-delimited file.  See below for details.
#' Also a gzip-compressed file can be used.
#'
#' @param chr,binSize If the file contains a count matrix without bin
#' annotation, the latter is created from these parameters.
#'
#' @param \ldots Arguments passed to [utils::read.table()] as-is.
#'
#' @param debug If `TRUE`, debug output is produced.
#' 
#' @return A list with elements \code{bins} (an N-by-4 data.frame) and
#' \code{counts} (N-by-N matrix).
#'
#' @section Format of HiC contact-matrix file:
#' The contact-matrix file should be a whitespace-delimited text file with
#' neither row names nor column names.  The content should be a N-by-(3+N)
#' table where the first three columns correspond to `chr` (string),
#' `from.coord` (integer position), and `to.coord` (integer position).
#' These column defines the genomic location of the N Hi-C bins (in order).
#' The last N columns should contain normalized contact counts (float) such
#' that element (r,3+c) in this table corresponds to count (r,c) in the
#' normalized contact matrix.
#'
#' If an N-by-(4+N) table, then the first column is assumed to contain an
#' `id` (integer), and everything else as above.
#'
#' Example:
#' \preformatted{
#'   chr10       0   40000  0 0 0 0 ...
#'   chr10   40000   80000  0 0 0 0 ...
#'   chr10   80000  120000  0 0 0 0 ...
#'   chr10  120000  160000  0 0 0 0 ...
#'   ...
#' }
#'
#' @example incl/readHiC.R
#' 
#' @seealso [TopDom].
#'
#' @importFrom utils file_test read.table
#' @export
readHiC <- function(file, chr = NULL, binSize = NULL, ..., debug = getOption("TopDom.debug", FALSE)) {
  stopifnot(file_test("-f", file))
  stopifnot(is.logical(debug), length(debug) == 1L, !is.na(debug))

  if (debug) {
    mcat("#########################################################################")
    mcat("Step 0 : File Read")
    mcat("#########################################################################")
  }
  
  if (!is.null(chr)) {
    chr <- as.character(chr)
    stopifnot(is.character(chr), length(chr) == 1, !is.na(chr))
    binSize <- as.integer(binSize)
    stopifnot(is.integer(binSize), length(binSize) == 1, !is.na(binSize), binSize >= 1)

    args <- list(..., comment.char = "", na.strings = "", quote = "",
                      stringsAsFactors = FALSE)
    bins <- args$bins                        
    args$bins <- NULL
    args <- args[unique(names(args))]
    argsT <- c(list(file, header = FALSE, nrows = 1L), args)
    first <- do.call(read.table, args = argsT)
    if (debug) mcat("  -- reading ", length(first), "-by-", length(first), " count matrix")
    ## Assert that it's a count matrix
    is.numeric <- unlist(lapply(first, FUN = is.numeric), use.names = FALSE)
    stopifnot(all(is.numeric))
    
    if (!is.null(bins)) {
      if (any(bins < 1)) {
        stop("Argument 'bins' specifies non-positive bin indices")
      } else if (any(bins > length(first))) {
        stop(sprintf("Argument 'bins' specifies bin indices out of range [1,%d]", length(first)))
      }
      colClasses <- rep("NULL", times = length(first))
      colClasses[bins] <- "numeric"
    } else {
      colClasses <- rep("numeric", times = length(first))
    }
    argsT <- c(list(file, colClasses = colClasses, header = FALSE), args)
    matrix.data <- do.call(read.table, args = argsT)
    colnames(matrix.data) <- NULL

    if (!is.null(bins)) {
      matrix.data <- matrix.data[bins, , drop = FALSE]
    }

    ## N-by-N count matrix (from file content)
    matrix.data <- as.matrix(matrix.data)
    dimnames(matrix.data) <- NULL
    stopifnot(nrow(matrix.data) == ncol(matrix.data))
    n_bins <- length(first)

    from.coord <- seq(from = 0, by = binSize, length.out = n_bins)
    to.coord   <- seq(from = binSize, by = binSize, length.out = n_bins)
    if (is.null(bins)) {
      stopifnot(n_bins == nrow(matrix.data))
      id <- seq_len(n_bins)
    } else {
      n_bins <- length(bins)
      id <- bins
      from.coord <- from.coord[bins]
      to.coord   <- to.coord[bins]
    }
    
    ## Bin annotation from (chr, binSize)
    bins <- data.frame(
      id               = id,
      chr              = chr,
      from.coord       = from.coord,
      to.coord         = to.coord,
      stringsAsFactors = FALSE
    )
  } else {
    args <- list(..., stringsAsFactors = FALSE)
    args <- args[unique(names(args))]
    argsT <- c(list(file, header = FALSE), args)
    matdf <- do.call(read.table, args = argsT)
    n_bins <- nrow(matdf)
    if (ncol(matdf) - n_bins == 3) {
      colnames(matdf) <- c("chr", "from.coord", "to.coord")
    } else if (ncol(matdf) - n_bins == 4) {
      colnames(matdf) <- c("id", "chr", "from.coord", "to.coord")
    } else {
      stop("Unknown format of count-matrix file: ", sQuote(file))
    }

    ## Bin annotation (from file content)
    bins <- data.frame(
      id         = seq_len(n_bins),
      chr        = matdf[["chr"]],
      from.coord = matdf[["from.coord"]],
      to.coord   = matdf[["to.coord"]],
      stringsAsFactors = FALSE
    )

    ## N-by-N count matrix (from file content)
    matdf <- matdf[, (ncol(matdf) - n_bins + 1):ncol(matdf)]
    matrix.data <- as.matrix(matdf)
    rm(list = "matdf")
  }

  stopifnot(is.numeric(matrix.data),
            is.matrix(matrix.data),
            nrow(matrix.data) == ncol(matrix.data),
            nrow(matrix.data) == n_bins)

  if (debug) mcat("-- Done!")
  if (debug) mcat("Step 0 : Done!")

  structure(list(bins = bins, counts = matrix.data), class = "TopDomData")
}
HenrikBengtsson/TopDom documentation built on April 9, 2023, 2:11 a.m.