#' Reads Hi-C Contact Data from File
#'
#' @param file The pathname of a normalize Hi-C contact matrix file
#' stored as a whitespace-delimited file. See below for details.
#' Also a gzip-compressed file can be used.
#'
#' @param chr,binSize If the file contains a count matrix without bin
#' annotation, the latter is created from these parameters.
#'
#' @param \ldots Arguments passed to [utils::read.table()] as-is.
#'
#' @param debug If `TRUE`, debug output is produced.
#'
#' @return A list with elements \code{bins} (an N-by-4 data.frame) and
#' \code{counts} (N-by-N matrix).
#'
#' @section Format of HiC contact-matrix file:
#' The contact-matrix file should be a whitespace-delimited text file with
#' neither row names nor column names. The content should be a N-by-(3+N)
#' table where the first three columns correspond to `chr` (string),
#' `from.coord` (integer position), and `to.coord` (integer position).
#' These column defines the genomic location of the N Hi-C bins (in order).
#' The last N columns should contain normalized contact counts (float) such
#' that element (r,3+c) in this table corresponds to count (r,c) in the
#' normalized contact matrix.
#'
#' If an N-by-(4+N) table, then the first column is assumed to contain an
#' `id` (integer), and everything else as above.
#'
#' Example:
#' \preformatted{
#' chr10 0 40000 0 0 0 0 ...
#' chr10 40000 80000 0 0 0 0 ...
#' chr10 80000 120000 0 0 0 0 ...
#' chr10 120000 160000 0 0 0 0 ...
#' ...
#' }
#'
#' @example incl/readHiC.R
#'
#' @seealso [TopDom].
#'
#' @importFrom utils file_test read.table
#' @export
readHiC <- function(file, chr = NULL, binSize = NULL, ..., debug = getOption("TopDom.debug", FALSE)) {
stopifnot(file_test("-f", file))
stopifnot(is.logical(debug), length(debug) == 1L, !is.na(debug))
if (debug) {
mcat("#########################################################################")
mcat("Step 0 : File Read")
mcat("#########################################################################")
}
if (!is.null(chr)) {
chr <- as.character(chr)
stopifnot(is.character(chr), length(chr) == 1, !is.na(chr))
binSize <- as.integer(binSize)
stopifnot(is.integer(binSize), length(binSize) == 1, !is.na(binSize), binSize >= 1)
args <- list(..., comment.char = "", na.strings = "", quote = "",
stringsAsFactors = FALSE)
bins <- args$bins
args$bins <- NULL
args <- args[unique(names(args))]
argsT <- c(list(file, header = FALSE, nrows = 1L), args)
first <- do.call(read.table, args = argsT)
if (debug) mcat(" -- reading ", length(first), "-by-", length(first), " count matrix")
## Assert that it's a count matrix
is.numeric <- unlist(lapply(first, FUN = is.numeric), use.names = FALSE)
stopifnot(all(is.numeric))
if (!is.null(bins)) {
if (any(bins < 1)) {
stop("Argument 'bins' specifies non-positive bin indices")
} else if (any(bins > length(first))) {
stop(sprintf("Argument 'bins' specifies bin indices out of range [1,%d]", length(first)))
}
colClasses <- rep("NULL", times = length(first))
colClasses[bins] <- "numeric"
} else {
colClasses <- rep("numeric", times = length(first))
}
argsT <- c(list(file, colClasses = colClasses, header = FALSE), args)
matrix.data <- do.call(read.table, args = argsT)
colnames(matrix.data) <- NULL
if (!is.null(bins)) {
matrix.data <- matrix.data[bins, , drop = FALSE]
}
## N-by-N count matrix (from file content)
matrix.data <- as.matrix(matrix.data)
dimnames(matrix.data) <- NULL
stopifnot(nrow(matrix.data) == ncol(matrix.data))
n_bins <- length(first)
from.coord <- seq(from = 0, by = binSize, length.out = n_bins)
to.coord <- seq(from = binSize, by = binSize, length.out = n_bins)
if (is.null(bins)) {
stopifnot(n_bins == nrow(matrix.data))
id <- seq_len(n_bins)
} else {
n_bins <- length(bins)
id <- bins
from.coord <- from.coord[bins]
to.coord <- to.coord[bins]
}
## Bin annotation from (chr, binSize)
bins <- data.frame(
id = id,
chr = chr,
from.coord = from.coord,
to.coord = to.coord,
stringsAsFactors = FALSE
)
} else {
args <- list(..., stringsAsFactors = FALSE)
args <- args[unique(names(args))]
argsT <- c(list(file, header = FALSE), args)
matdf <- do.call(read.table, args = argsT)
n_bins <- nrow(matdf)
if (ncol(matdf) - n_bins == 3) {
colnames(matdf) <- c("chr", "from.coord", "to.coord")
} else if (ncol(matdf) - n_bins == 4) {
colnames(matdf) <- c("id", "chr", "from.coord", "to.coord")
} else {
stop("Unknown format of count-matrix file: ", sQuote(file))
}
## Bin annotation (from file content)
bins <- data.frame(
id = seq_len(n_bins),
chr = matdf[["chr"]],
from.coord = matdf[["from.coord"]],
to.coord = matdf[["to.coord"]],
stringsAsFactors = FALSE
)
## N-by-N count matrix (from file content)
matdf <- matdf[, (ncol(matdf) - n_bins + 1):ncol(matdf)]
matrix.data <- as.matrix(matdf)
rm(list = "matdf")
}
stopifnot(is.numeric(matrix.data),
is.matrix(matrix.data),
nrow(matrix.data) == ncol(matrix.data),
nrow(matrix.data) == n_bins)
if (debug) mcat("-- Done!")
if (debug) mcat("Step 0 : Done!")
structure(list(bins = bins, counts = matrix.data), class = "TopDomData")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.