R/get_ccA_atanh.R

Defines functions get_ccA_atanh

	get_ccA_atanh = function(contact_mat_file, compress_size, bin_size)
	{
		contact_mat_raw = fread(contact_mat_file)
		contact_mat_raw = subset(contact_mat_raw, !is.na(V3))
		contact_mat_raw[,1] = contact_mat_raw[,1]/bin_size
		contact_mat_raw[,2] = contact_mat_raw[,2]/bin_size
		contact_mat = contact_mat_raw

		colnames(contact_mat) = c('pos_1', 'pos_2', 'val')	
		if(!all(contact_mat[[2]] >= contact_mat[[1]])) stop('\nYour provided matrix does not represent an upper triangular matrix!\n\n')

		n_bins_whole = max(max(contact_mat[[1]]), max(contact_mat[[2]])) + 1 ## should +1 because contact_mat index starts from 0 (bin 0 represents: 0-10E3, checked by looking at the juicebox map, 2018-11-19)
		contact_mat_sparse = Matrix(0, nrow=n_bins_whole, ncol=n_bins_whole)
		contact_mat_sparse[cbind(contact_mat[[1]]+1, contact_mat[[2]]+1)] <- contact_mat[[3]]

		rownames(contact_mat_sparse) = colnames(contact_mat_sparse) = as.character( 1:nrow(contact_mat_sparse) )
		
		contact_mat_sparse = forceSymmetric(contact_mat_sparse, uplo='U')
		# if(bin_size!=bin_size_initial) contact_mat_sparse = mat_10to40kb( contact_mat_sparse, bin_size, bin_size_initial )
		A = remove_blank_cols(contact_mat_sparse, sparse=TRUE, ratio=zero_ratio) ## has the same rows/cols as A
		if(nrow(A) < 100) A = remove_blank_cols(contact_mat_sparse, sparse=TRUE, ratio=0) ## when all are dense
		while(min(apply(A, 1, sd))==0) ## sometimes after removing the cols / rows, the remained part will all be 0
		{
			A = remove_blank_cols(A, sparse=TRUE, ratio=1E-7) ## has the same rows/cols as A
			if(nrow(A) < 1) stop('ERROR IN GENERATING MEANINGFUL A at the data generating step')
		}

		# #########################################################

		n_bins_A = nrow(A) - nrow(A)%%compress_size
		A_2_compress = A[, 1:n_bins_A]

		bin_names = rownames(A)

		A_compressed = compress_mat_fast( as.matrix(A_2_compress), compress_size=compress_size )
		colnames(A_compressed) = bin_names
		rm(A_2_compress); gc()
		
		A_compressed_sparse = A_compressed
		A_compressed = as.matrix(A_compressed)
		A_compressed_log = log2(A_compressed + 1)
		
		# #########################################################
		cat('Compute correlation matrix\n')

		cA_compressed_log = fast_cor(A_compressed_log)
		ccA_compressed_log = fast_cor(cA_compressed_log)

		cat('Finished computing correlation matrix\n')

		# #########################################################
		# # ccA_compressed_atanh = atanh(ccA_compressed - 1E-7)
		ccA_atanh = atanh(ccA_compressed_log / (1+1E-7))
		A = as.matrix(A)
		rm(A_compressed, A_compressed_sparse, cA_compressed_log, A_compressed_log); gc()
		# #########################################################
		cat('Finished data generation\n')
		return(list(A=A, ccA_atanh=ccA_atanh))
	}
YuanlongLiu/CALDER documentation built on Sept. 11, 2020, 12:24 a.m.