R/compartmentalize.R
In diffHic: Differential Analyis of Hi-C Data

Documented in compartmentalize

compartmentalize <- function(data, centers=2, dist.correct=TRUE,
		cov.correct=TRUE, robust.cov=5, ...)
# Computes compartments for every chromosome, using the intra-chromosomal
# contact maps that have been corrected for distance effects.
#
# written by Aaron Lun
# created 26 May 2015
# last modified 8 December 2015
{
    .check_StrictGI(data)

	is.intra <- intrachr(data)
	data <- data[is.intra,]
	if (dist.correct) {
		trended <- filterTrended(data)
		contacts <- trended$abundance - trended$threshold
		dist2trend <- approxfun(x=trended$log.distance, y=trended$threshold, rule=2)
	} else {
		contacts <- scaledAverage(data)
		dist2trend <- function(x) { 0 } # Trend correction function does nothing if no distance correction is requested.
	}

	# Going chromosome-by-chromosome.
    chrs <- seqlevelsInUse(regions(data))
	stored <- vector('list', length(chrs))
    names(stored) <- chrs
	for (chr in chrs) {
		mat <- inflate(data, rows=chr, columns=chr, fill=contacts)
		stored[[chr]] <- .compartChr(mat, data, dist2trend, robust.cov, cov.correct, centers, ...)
	}

# Not sensible to use the entire thing, as clustering will probably be dominated by chromosome, not compartment.
#	} else {
#		# Using the entirety of the interaction space.
#		mat <- as.matrix(data, fill=contacts)
#		stored <- .compartChr(mat, data, dist2trend, robust.cov, cov.correct, centers, ...)
#	}

	return(stored)
}

.compartChr <- function(cm, data, dist2trend, robust.cov, cov.correct, centers, ...) {
	all.a <- anchors(cm, type="row", id=TRUE)
	mat <- as.matrix(cm)
	colnames(mat) <- rownames(mat) <- all.a

	# Filling NA's (i.e., zero's). Using mid-distance, interpolating to get the trend.
	lost.ind <- Matrix::which(is.na(mat))
    lost <- arrayInd(lost.ind, dim(mat))
	seq.id <- as.integer(seqnames(regions(data)))
	mid.pts <- mid(ranges(regions(data)))
	lost.dist <- abs(mid.pts[lost[,1]] - mid.pts[lost[,2]])
	lost.dist <- log10(lost.dist + metadata(data)$width)
	mat[lost.ind] <- .makeEmpty(data) - dist2trend(lost.dist)

	# Correcting for coverage biases, by subtracting half the average coverage from both rows
	# and columns. This is equivalent to dividing by square root of coverage, which works pretty
	# well in place of a more rigorous iterative approach (check out Rao's supplementaries).
	rwm <- log2(Matrix::rowMeans(2^mat))
	if (cov.correct) {
		mat <- mat - rwm/2
		mat <- t(t(mat) - rwm/2)
	}
	
	# Robustifying.
	if (!is.na(robust.cov)) {
 	    rwm.med <- median(rwm)
		rwm.mad <- mad(rwm, center=rwm.med)	
		keep <- (rwm <= rwm.med + robust.cov*rwm.mad) & (rwm >= rwm.med - robust.cov*rwm.mad)
		temp.mat <- mat
		mat <- mat[keep,keep,drop=FALSE]
	}

	# K-means.
	if (nrow(mat) > centers) {
		out <- kmeans(mat, centers=centers, ...)
		comp <- out$cluster
	} else {
		comp <- seq_len(nrow(mat))
	}

	# Filling the robustified bins back in.
	if (!is.na(robust.cov)) {
		mat <- temp.mat
		temp <- integer(length(keep))
		temp[keep] <- comp
		temp[!keep] <- NA
		comp <- temp
	}

	names(comp) <- all.a
    as.matrix(cm) <- mat
	return(list(compartment=comp, matrix=cm))
}