R/HRG_MLE_each_compartment_fun.R

Defines functions HRG_MLE_each_compartment_fun

	## Yuanlong LIU
	## 12/04/2018
	## 23/05/2018
	## 29/05/2018
	
	
	## THINGS LEFT TO DO (MUST)
	## 1. CHANGE A_final / ws. A_final should be A_whole, in if(adjust_segmentss_topdom==TRUE) segmentss = segmentss_adjust_topdom(A_final, segmentss, n2one, ws=5:20)

	## A should be already named
	## A should be symmetric
	## Does not remove 0-rows/columns of A
	HRG_MLE_each_compartment_fun <- function(pA_sym, seg, allowed_max_nbins_seq=500:1000, max_nbins_fine=max(allowed_max_nbins_seq), ncores, res_dir, fast, adjust_segmentss_topdom=TRUE)
	{
		min_n_bins_outer=2
		min_n_bins_inner=2
		distr = 'lnorm'
		A = pA_sym[seg, seg]
		if( is.null(rownames(A)) | is.null(colnames(A))) stop('A should be named by the bin indices')
		arg_list = as.list(environment())
		res_folder = file.path(res_dir)
		dir.create(res_folder, recursive=TRUE, showWarnings = TRUE)
		
		total_execution_time_file = file.path(res_dir, 'total_execution.time')
		cat('n_core used:', ncores, '\n\n', file=total_execution_time_file, append=FALSE)
		time_begin = Sys.time()
		cat('Execution begins:', as.character(time_begin), '\n', file=total_execution_time_file, append=TRUE)
	
		## max_nbins is defined as the one from allowed_max_nbins_seq has the smallest residue, from which, the smallest is taken
		n_A = nrow(A)
		
		## IF THE SEGMENT IS ALREADY SMALL ENOUGH
		if( n_A <= max(allowed_max_nbins_seq) ) 
		{
			## This will be modified
			{
				dists = (2*min_n_bins_outer - 1):(nrow(A)-1)
				counts = sapply(dists, function(v) n_cells2compute( A, v, min_n_bins_outer ))
				split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
				chunks = c(dists[1]-1, dists[split_info$break_points])
				res_outer = HRG_core( A=A, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast )
			}
			
			## this part gets the hi_tree
			{
				hi_tree = get_tree_v0( res_outer )
				V(hi_tree)$left_rescaled = V(hi_tree)$left
				V(hi_tree)$right_rescaled = V(hi_tree)$right
				V(hi_tree)$width_rescaled = V(hi_tree)$right - V(hi_tree)$left + 1
				V(hi_tree)$name = paste('(',V(hi_tree)$left_rescaled, ',', V(hi_tree)$right_rescaled, ')', sep='')
			}
			
			full_tree = hi_tree
			res_info = list( arg_list=arg_list, trunk=NULL, segmentss_nadj=NULL, segmentss=NULL, res_outer=NULL, res_inner=list(res_outer) )
			res_info$full_tree = full_tree
			res_folder_final = file.path(res_dir, 'final')
			dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
			save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
	
			time_finish = Sys.time()
			cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
			cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)
			return(res_info)
		}
		
		## which one leads to the least residue
		max_nbins = allowed_max_nbins_seq[ min(which(n_A%%allowed_max_nbins_seq==min(n_A%%allowed_max_nbins_seq))) ]
		residue = n_A %% max_nbins
	
		residue_mat_info = get_least_residue_matrix(A, max_nbins=max_nbins, allowed_shifts=0)
		A_final = residue_mat_info$A_final
		n2one = residue_mat_info$n2one

		## A_low has no row/col name
		A_low = HighResolution2Low_k( A_final, k=max_nbins ) ## k is the dimension of A_final_low
		
		## Starting from here, the original row/col names are no longer used
		
		## this part run HRG_core on the outer part
		{
			dists = (2*min_n_bins_outer - 1):(nrow(A_low)-1)
			counts = sapply(dists, function(v) n_cells2compute( A_low, v, min_n_bins_outer ))
			split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
			chunks = c(dists[1]-1, dists[split_info$break_points])
			res_outer = HRG_core( A=A_low, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_outer, distr=distr, fast=fast )
		}

		## this part gets the hi_tree
		{
			hi_tree = get_tree_v0( res_outer )
			V(hi_tree)$left_rescaled = n2one*(V(hi_tree)$left - 1) + 1
			V(hi_tree)$right_rescaled = n2one*V(hi_tree)$right
			node2adjust = which(V(hi_tree)$right_rescaled==max(V(hi_tree)$right_rescaled)) ## append the residue bins here
			V(hi_tree)[node2adjust]$right_rescaled = n_A
			
			V(hi_tree)$width_rescaled = V(hi_tree)$right_rescaled - V(hi_tree)$left_rescaled + 1
			V(hi_tree)$name = paste('(',V(hi_tree)$left_rescaled, ',', V(hi_tree)$right_rescaled, ')', sep='')
		}

		# segmentss = get_segments(hi_tree, binsize_thresh=max_nbins_bp)	

		segmentss_nadj = get_segments(hi_tree, binsize_thresh=max_nbins_fine)
		
		## THIS PARAT ADJUST THE segmentss BASED ON TOPDOM BIN_SIGNAL
		## 14-05-2018
		## Also adjust the tree
		if(adjust_segmentss_topdom==TRUE)
		{
			shift = seg[1]-1
			segmentss = segmentss_adjust_topdom(pA_sym, segmentss_nadj+shift, n2one, ws=5:20) - shift
			for( i in 1:nrow(segmentss) )
			{
				index_left2adj = which(V(hi_tree)$left_rescaled == segmentss_nadj[i,1])
				index_right2adj = which(V(hi_tree)$right_rescaled == segmentss_nadj[i,2])
				V(hi_tree)[index_left2adj]$left_rescaled = segmentss[i,1]
				V(hi_tree)[index_right2adj]$right_rescaled = segmentss[i,2]
				V(hi_tree)$width_rescaled = V(hi_tree)$right_rescaled - V(hi_tree)$left_rescaled + 1
				V(hi_tree)$name = paste('(',V(hi_tree)$left_rescaled, ',', V(hi_tree)$right_rescaled, ')', sep='')
			}
		}
		
		trunk = hi_tree
		# save(segmentss, file=file.path(res_dir, 'outer', 'segmentss.Rdata'))
		
		res_inner = rep(list(list()), nrow(segmentss))
		for( i in 1:nrow(segmentss) )
		{
			cat(i,'\n')
			# if(i==2) while(as.numeric(substr(as.character(Sys.time()),12,13))!=20) {}
			index = segmentss[i,1]:segmentss[i,2]
			A_part = A[index, index]
			dists = (2*min_n_bins_inner - 1):(nrow(A_part)-1)
			counts = sapply(dists, function(v) n_cells2compute( A_part, v, min_n_bins_inner ))
			split_info = split2BanlancedChunks_min_max(vec=counts, K=min(length(counts)-1, ncores))
			chunks = c(dists[1]-1, dists[split_info$break_points])
			
			## A_part is named
			res_info_inner = HRG_core( A=A_part, chunks=chunks, res_folder=NULL, min_n_bins=min_n_bins_inner, distr=distr, fast=fast )
			res_inner[[i]] = res_info_inner      #minimum size filtering
			cat('finished', '\n')
		}
		
		res_info = list( arg_list=arg_list, trunk=trunk, segmentss_nadj=segmentss_nadj, segmentss=segmentss, res_outer=res_outer, res_inner=res_inner )
		res_folder_final = file.path(res_dir, 'final')
		dir.create(res_folder_final, recursive=TRUE, showWarnings = TRUE)
		save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
	
		time_finish = Sys.time()
		cat('Execution finishes:', as.character(time_finish), '\n\n', file=total_execution_time_file, append=TRUE)
		cat('Total execution time:', capture.output( time_finish - time_begin ), '\n\n', file=total_execution_time_file, append=TRUE)


		## xenocfraf:
		# res_inner = res_inner[sapply(res_inner, length) > 0]
		branches = lapply( res_inner, get_tree_v0 )
		for( i in 1:length(branches) ) branches[[i]] = update_branch_name(branches[[i]], root_start=segmentss[i,1])
		
		full_tree = xenocraft( trunk, branches )
		names_tmp = do.call(rbind, strsplit(V(full_tree)$name, ','))
		V(full_tree)$left = substring(names_tmp[,1], 2)
		V(full_tree)$right = substring(names_tmp[,2], 1, nchar(names_tmp[,2])-1)
		if(!is_binary_tree(full_tree)) stop("Trunk + branches do not produce a binary tree")
		
		res_info$full_tree = full_tree
		save(res_info, file=file.path(res_folder_final, 'res_info.Rdata'))
	
		return( res_info )
	}
	
YuanlongLiu/CALDER documentation built on Sept. 11, 2020, 12:24 a.m.