R/SMASH.R

Defines functions vis_GRID grid_ITH_optim ITH_optim gen_ITH_RD gen_subj_truth calc_maf poss_mult

Documented in gen_ITH_RD gen_subj_truth grid_ITH_optim ITH_optim vis_GRID

# Minor Functions
poss_mult = function(x,alloc){
	x2 = unique(c(1,x))
	output = x2[x2 > 0]
	if(alloc != 1){
		output = 1
	}
	
	return(output)
	
}
calc_maf = function(purity,vec_q,mult,tCN,vec_alloc){
	purity * mult * sum(vec_q * vec_alloc) /
		(tCN * purity + 2*(1-purity))
}

#' A collection of pre-defined subclone configurations.
#' 
#' A R list containing subclone configurations in matrix 
#'	form for 1 to 5 subclones. For each matrix, each column 
#'	corresponds to a subclone and each row corresponds to a 
#'	variant's allocation across all subclones. For example, 
#'	the first row of each matrix is a vector of 1's to 
#'	represent clonal variants, variants present in all subclones.
#'
"eS"

#' @title gen_subj_truth
#' @description Simulates copy number states, multiplicities, allocations
#' @param mat_eS A subclone configuration matrix pre-defined in R list \code{eS}
#' @param maxLOCI A positive integer number of simulated somatic variant calls
#' @param nCN A positive integer for the number of allelic copy number pairings 
#'	to sample from. If \code{NULL}, it will be randomly sampled between 1 and 5.
#' @return A list containing the following components:
#' \describe{
#' \item{\code{subj_truth}}{dataframe of each variant's simulated minor 
#'	(\code{CN_1}) and major (\code{CN_2}) copy number states, total copy 
#'	number (\code{tCN}), subclone allocation (\code{true_A}), multiplicity 
#'	(\code{true_M}), mutant allele frequency (\code{true_MAF}), and cellular 
#'	prevalence (\code{true_CP})}
#' \item{\code{purity}}{tumor purity}
#' \item{\code{eta}}{the product of tumor purity and subclone proportions}
#' \item{\code{q}}{vector of subclone proportions}
#' }
#' 
#' @export
gen_subj_truth = function(mat_eS,maxLOCI,nCN = NULL){

	# Enumerate possible CN states
	all_CN_states = c()
	for(tCN in seq(5)){
		CN_1 = seq(0,floor(tCN/2))
		CN_2 = tCN - CN_1
		all_CN_states = rbind(all_CN_states,
			smart_df(tCN = tCN,CN_1 = CN_1,CN_2 = CN_2))
		rm(CN_1,CN_2)
	}
	all_CN_states$prob = dgeom(abs(all_CN_states[,2] - 1) 
		+ abs(all_CN_states[,3] - 1),prob = 0.45)
	all_CN_states$prob = all_CN_states$prob / sum(all_CN_states$prob)
	tmp_all_CN_states = all_CN_states[which(all_CN_states$CN_1 == all_CN_states$CN_2),]
	tmp_all_CN_states$prob = tmp_all_CN_states$prob / sum(tmp_all_CN_states$prob)
  
	true_purity = runif(1,0.2,1)
	while( TRUE ){
		# Calculate eta and purity and q
		true_q = stats::runif(ncol(mat_eS),0,2)
		true_q = true_q / sum(true_q)
		true_eta = true_q * true_purity
		if( ncol(mat_eS) == 1 ) break
		min_CP_diff = min(diff(sort(mat_eS %*% true_q)))
		if( min(true_q) >= 0.025 && min_CP_diff >= 0.025 ) break
	}
  
	# Generate CN states
	if( !is.null(nCN) ){
		num_CN_states = nCN
	} else {
		num_CN_states = sample(seq(1,5),1)
	}
	if( num_CN_states == 1 ){
		# sample either tCN = 2 or 4 with allelic balance
		poss_CN = tmp_all_CN_states[sample(x = seq(nrow(tmp_all_CN_states)),
			size = 1,prob = tmp_all_CN_states$prob),c("CN_1","CN_2","prob")]
	} else if( num_CN_states > 1 ){
		poss_CN = all_CN_states[sample(x = seq(nrow(all_CN_states)),
			size = num_CN_states,replace = FALSE,
			prob = all_CN_states$prob),c("CN_1","CN_2","prob")]
	}
  poss_CN$prob = poss_CN$prob / sum(poss_CN$prob)
  
  # Generate multiplicity, allocation
  subj_truth = poss_CN[sample(x = seq(nrow(poss_CN)),
		size = maxLOCI,replace = TRUE,prob = poss_CN$prob),
		c("CN_1","CN_2")]
	rownames(subj_truth) = NULL
	subj_truth$tCN = rowSums(subj_truth)
  
	while(TRUE){
		subj_truth$true_A = sample(seq(nrow(mat_eS)),maxLOCI,replace = TRUE)
		tmp_tab = table(subj_truth$true_A)
		min_alloc = min(tmp_tab)
		if( min_alloc >= 2 && length(tmp_tab) == ncol(mat_eS) ) break
	}
  
	subj_truth$true_M = apply(subj_truth[,c("CN_1","CN_2","true_A")],1,
		function(x) sample(poss_mult(x = x[1:2],alloc = x[3]),1))
	subj_truth$true_MAF = apply(subj_truth[,c("true_M","tCN","true_A")],1,
		function(x) calc_maf(purity = true_purity,vec_q = true_q,
			mult = x[1],tCN = x[2],vec_alloc = mat_eS[x[3],]))
	subj_truth$true_CP = sapply(subj_truth$true_A,function(x) sum(mat_eS[x,] * true_q))
	uniq_data2 = unique(subj_truth[,c("CN_1","CN_2")])
  
  # Output
  return(list(subj_truth = subj_truth,
		purity = true_purity,eta = true_eta,
		q = true_q))
	
}

#' @title gen_ITH_RD
#' @description Simulates observed alternate and reference read counts
#' @param DATA The output data.frame from \code{gen_subj_truth} 
#' @param RD A positive integer for the mean read depth generated from 
#'	the negative binomial distribution
#' @return A matrix of simulated alternate and reference read counts.
#' 
#' @export
gen_ITH_RD = function(DATA,RD){
	
	BB = nrow(DATA)
	while(TRUE){
		DATA$tDP = rnbinom(n = BB,mu = RD,size = 2) + 30
		DATA$tAD = rbinom(n = BB,size = DATA$tDP,prob = DATA$true_MAF)
		if( mean(DATA$tAD == 0) < 0.1 ) break
	}
	DATA$tRD = DATA$tDP - DATA$tAD
	return(as.matrix(DATA[,c("tAD","tRD")]))
}

#' @title ITH_optim
#' @description Performs EM algorithm for a given configuration matrix
#' @param my_data A R dataframe containing the following columns:
#' \describe{
#' \item{\code{tAD}}{tumor alternate read counts}
#' \item{\code{tRD}}{tumor reference read counts}
#' \item{\code{CN_1}}{minor allele count}
#' \item{\code{CN_2}}{major allele count, where \code{CN_1 <= CN_2}}
#' \item{\code{tCN}}{\code{CN_1 + CN_2}}
#' }
#' @param my_purity A single numeric value of known/estimated purity
#' @param init_eS A subclone configuration matrix pre-defined in R 
#'	list \code{eS}
#' @param pi_eps0 A user-specified parameter denoting the proportion 
#'	of loci not explained by the combinations of purity, copy number, 
#'	multiplicity, and allocation. If \code{NULL}, it is initialized at 
#'	1e-3. If set to 0.0, the parameter is not estimated.
#' @param my_unc_q An optimal initial vector for the unconstrained 
#'	\code{q} vector, useful after running \code{grid_ITH_optim}. If 
#'	this variable is \code{NULL}, then the subclone proportions, 
#'	\code{q}, are randomly initialized. For instance, if 
#'	\code{my_unc_q = ( x1 , x2 )}, then \code{q = ( exp(x1) / (1 + exp(x1) + exp(x2)) , exp(x2) / (1 + exp(x1) + exp(x2)) , 1 / (1 + exp(x1) + exp(x2))}.
#' @param max_iter Positive integer, preferably 1000 or more, setting 
#'	the maximum number of iterations
#' @param my_epsilon Convergence criterion threshold for changes in 
#'	the log likelihood, preferably 1e-6 or smaller
#' @return If the EM algorithm converges, the output will be a list containing
#' \describe{
#' \item{\code{iter}}{number of iterations}
#' \item{\code{converge}}{convergence status}
#' \item{\code{unc_q0}}{initial unconstrained subclone proportions parameter}
#' \item{\code{unc_q}}{unconstrained estimate of \code{q}}
#' \item{\code{q}}{estimated subclone proportions among cancer cells}
#' \item{\code{CN_MA_pi}}{estimated mixture probabilities of multiplicities 
#'	and allocations given copy number states}
#' \item{\code{eta}}{estimated subclone proportion among tumor cells}
#' \item{\code{purity}}{user-inputted tumor purity}
#' \item{\code{entropy}}{estimated entropy}
#' \item{\code{infer}}{A R dataframe containing inferred variant allocations 
#'	(\code{infer_A}), multiplicities (\code{infer_M}), cellular prevalences 
#'	(\code{infer_CP}).}
#' \item{\code{ms}}{model size, number of parameters within parameter space}
#' \item{\code{LL}}{The observed log likelihood evaluated at maximum likelihood 
#'	estimates.}
#' \item{\code{AIC = 2 * LL - 2 * ms}}{Negative AIC, used for model selection}
#' \item{\code{BIC = 2 * LL - ms * log(LOCI)}}{Negative BIC, used for model selection}
#' \item{\code{LOCI}}{The number of inputted somatic variants.}
#' }
#'
#' @export
ITH_optim = function(my_data,my_purity,init_eS,
	pi_eps0 = NULL,my_unc_q = NULL,max_iter = 4e3,
	my_epsilon = 1e-6){

  if( !is.data.frame(my_data) ){
    stop("my_data is not a data.frame")
  }
  req_vars = c("tCN","CN_1","CN_2","tAD","tRD")
  miss_vars = req_vars[!req_vars %in% names(my_data)]
  if( length(miss_vars) > 0 ){
		stop(sprintf("The following columns are missing from my_data: %s",
			paste(miss_vars,collapse=", ")))
  }
  if( min(my_data$tAD) <= 0 ){
		stop("Input variants with positive integer alternate read counts")
	}
	if( !all(my_data$tAD == round(my_data$tAD)) || !all(my_data$tRD == round(my_data$tRD)) 
			|| !all(my_data$tCN == round(my_data$tCN)) || !all(my_data$CN_1 == round(my_data$CN_1))
			|| !all(my_data$CN_2 == round(my_data$CN_2)) ){
		stop("Alternate and reference read counts, total, minor, and major copy numbers should all be integers.")
	}
	if( my_purity <= 0 || my_purity > 1 ){
		stop("Tumor purity should be strictly > 0 and <= 1.")
	}
	
  # Calculate some fixed values
  my_data$tDP = rowSums(my_data[,c("tAD","tRD")])
  my_data$log_DP = log(my_data$tDP)
  my_data$LBC = apply(my_data[,c("tDP","tAD")],1,
		function(xx) lchoose(xx[1],xx[2]))
  
  # Initialize Params
  rownames(my_data) = NULL
  uniq_CN = unique(my_data[,c("tCN","CN_2","CN_1")])
  uniq_CN = uniq_CN[order(uniq_CN$tCN,uniq_CN$CN_2),]
  uniq_CN_MA = c()
  for(ii in seq(nrow(uniq_CN))){
    one_row = uniq_CN[ii,c("tCN","CN_2","CN_1")]
    for(kk in seq(nrow(init_eS))){
      tmp_mult = poss_mult(as.numeric(one_row[,c("CN_1","CN_2")]),
				alloc = kk)
      for(jj in seq(length(tmp_mult))){
        uniq_CN_MA = rbind(uniq_CN_MA,smart_df(one_row,
					M = tmp_mult[jj],A = kk))
      }
    }
  }
  rownames(uniq_CN_MA) = NULL
  init_pi_CN_MA = rep(NA,nrow(uniq_CN_MA))
  num_pi = 0
  for(cn in unique(uniq_CN$tCN)){
    for(CN_2 in uniq_CN$CN_2[which(uniq_CN$tCN == cn)]){
      tmp_index = which(uniq_CN_MA$tCN == cn & uniq_CN_MA$CN_2 == CN_2)
      tmp_len = length(tmp_index)
      init_pi_CN_MA[tmp_index] = rep(1/tmp_len,tmp_len)
      num_pi = num_pi + tmp_len - 1
    }
  }
  
  # Initialize vector q, cancer subclone proportions
  if( is.null(my_unc_q) ){
    rr = sample(c(0,1),1)
    if(rr == 0){
      init_unc_q = runif(ncol(init_eS) - 1,-3,1)
    } else if(rr == 1){
      init_unc_q = runif(ncol(init_eS) - 1,1,2)
    }
  } else {
    init_unc_q = my_unc_q
  }
  unc_q0 = init_unc_q
  
	if( is.null(pi_eps0) ){
		pi_eps0 = 1e-3
	}
  
	# Create B matrix: Stores 0s and 1s for what's feasible given copy number state
	num_LOCI = nrow(my_data)
	my_mat_B = matrix(0,num_LOCI,nrow(uniq_CN_MA))
	for(bb in seq(num_LOCI)){
		one_row = my_data[bb,]
		one_index = which(uniq_CN_MA$tCN == one_row$tCN 
			& uniq_CN_MA$CN_2 == one_row$CN_2)
		my_mat_B[bb,one_index] = 1
	}
  
  # Optimize
  if( ncol(init_eS) == 1 ){
		ith_out = Rcpp_ITH_opt(RD = as.matrix(my_data[,c("tAD","tRD")]),
			log_DP = my_data$log_DP,LBC = my_data$LBC,BB = my_mat_B,
			uniq_BB = unique(my_mat_B),uniq_CN_MA = as.matrix(uniq_CN_MA),
			eS = init_eS,purity = my_purity,tCN = my_data$tCN,
			pi_eps0 = pi_eps0,pi0 = init_pi_CN_MA,unc_qq0 = unc_q0,
			mstep = FALSE,max_iter = max_iter,eps = my_epsilon,show = FALSE)
  } else {
		ith_out = Rcpp_ITH_opt(RD = as.matrix(my_data[,c("tAD","tRD")]),
			log_DP = my_data$log_DP,LBC = my_data$LBC,BB = my_mat_B,
			uniq_BB = unique(my_mat_B),uniq_CN_MA = as.matrix(uniq_CN_MA),
			eS = init_eS,purity = my_purity,tCN = my_data$tCN,
			pi_eps0 = pi_eps0,pi0 = init_pi_CN_MA,unc_qq0 = unc_q0,
			mstep = TRUE,max_iter = max_iter,eps = my_epsilon,show = FALSE)
  }
  # ith_out[c("converge","iter","LL","ms","BIC","pi_eps","CN_MA_pi","qq")]
  
  # Output
  if( ith_out$converge == 1 ){
    uniq_CN_MA$infer_MAF = apply(uniq_CN_MA,1,function(x) 
      calc_maf(my_purity,ith_out$qq,x[4],x[1],init_eS[x[5],]))
    uniq_CN_MA$infer_CP = sapply(uniq_CN_MA$A,function(x) 
      sum(init_eS[x,] * ith_out$qq))
    noise_df = smart_df(t(rep(NA,ncol(uniq_CN_MA))))
    names(noise_df) = names(uniq_CN_MA)
    uniq_CN_MA_noise = rbind(noise_df,uniq_CN_MA)
    infer_out = uniq_CN_MA_noise[apply(ith_out$Z,1,which.max),
			c("M","A","infer_MAF","infer_CP")]
		rownames(infer_out) = NULL
		names(infer_out)[1:2] = paste0("infer_",names(infer_out)[1:2])
		list(converge = TRUE,unc_q0 = unc_q0,iter = ith_out$iter,
			q = ith_out$qq,CN_MA_pi = smart_df(uniq_CN_MA,pi = ith_out$CN_MA_pi),
			pi_eps = ith_out$pi_eps,eta = ith_out$qq * my_purity,
			purity = my_purity,entropy = round(ith_out$entropy,2),
			infer = infer_out,ms = ith_out$ms,LL = ith_out$LL,
			AIC = ith_out$AIC,BIC = ith_out$BIC,LOCI = num_LOCI)
  } else {
    list(converge = FALSE,BIC = -Inf)
  }
  
}

#' @title grid_ITH_optim
#' @description This function performs a grid search over enumerated 
#' 	configurations within the pre-defined list \code{eS}
#' @inheritParams ITH_optim
#' @param list_eS A nested list of subclone configuration matrices
#' @param trials Positive integer, number of random initializations 
#'	of subclone proportions
#' @return A R list containing two objects. \code{GRID} is a 
#'	dataframe where each row denotes a feasible subclone configuration 
#'	with corresponding subclone proportion estimates \code{q} and 
#'	somatic variant allocations \code{alloc}. \code{INFER} is a list 
#'	where \code{INFER[[i]]} corresponds to the \code{i}-th row or 
#'	model of \code{GRID}.
#' 
#' @export
grid_ITH_optim = function(my_data,my_purity,list_eS,pi_eps0 = NULL,
	trials = 20,max_iter = 4e3,my_epsilon = 1e-6){
	
	# Run EM on all provided subclone configurations
	all_infer = list() # new code
	all_ck = c()
	
	for(cc in seq(length(list_eS))){
		message(sprintf("cc = %s; ",cc),appendLF = FALSE)
		tmp_ck0 = c()
		tmp_infer0 = list() # new code
	
	for(kk in seq(length(list_eS[[cc]]))){
		message(sprintf("kk = %s ",kk),appendLF = FALSE)
		tmp_ck = c()
		tmp_infer = list() # new code
		
		for(count in seq(trials)){
			if(count %% 5 == 0) message("*",appendLF = FALSE)
			ith_out = list(converge = FALSE,BIC = -Inf)
			
			ith_out1 = ITH_optim(my_data = my_data,my_purity = my_purity,
				init_eS = list_eS[[cc]][[kk]],pi_eps0 = 0,my_unc_q = NULL, 
				max_iter = max_iter,my_epsilon = my_epsilon)
			if( ith_out1$converge ){
				ith_out = ith_out1
				rm(ith_out1)
			}
			
			ith_out2 = ITH_optim(my_data = my_data,my_purity = my_purity,
				init_eS = list_eS[[cc]][[kk]],pi_eps0 = pi_eps0,my_unc_q = NULL, 
				max_iter = max_iter,my_epsilon = my_epsilon)
			if( ith_out2$converge && ith_out2$BIC >= ith_out$BIC ){
				ith_out = ith_out2
			}
			
			if( ith_out$converge ){
				# Even though we have convergence, check all subclones have an allocation
				tab_A = table(ith_out$infer$infer_A)
				len_A = length(tab_A)
				if( len_A == cc && min(round(ith_out$q,2)) > 0 ){
					tmp_ck = rbind(tmp_ck,smart_df(cc = cc,kk = kk,
						ms = ith_out$ms,entropy = ith_out$entropy,
						LL = ith_out$LL,AIC = ith_out$AIC,BIC = ith_out$BIC,
						q = paste(round(ith_out$q,2),collapse = ","),
						unc_q0 = paste(round(ith_out$unc_q0,6),collapse = ","),
						alloc = paste(paste0(names(tab_A),"|",tab_A),collapse = ";"),
						pi_eps = ith_out$pi_eps))
					tmp_infer[[length(tmp_infer) + 1]] = ith_out$infer # new code
				}
			}
			
		}
		message(" ",appendLF = FALSE)
		
		if( !is.null(tmp_ck) && nrow(tmp_ck) > 0 ){
			keep_idx = !duplicated(tmp_ck[,c("cc","kk","ms","entropy","LL")]) # new code
			tmp_ck = tmp_ck[keep_idx,]
			tmp_ck0 = rbind(tmp_ck0,tmp_ck)
			
			# new code
			tmp_infer = tmp_infer[keep_idx]
			for(jk in seq(length(tmp_infer))){
				tmp_infer0[[length(tmp_infer0) + 1]] = tmp_infer[[jk]]
			}
			
		}
		
	}
    
    if( !is.null(tmp_ck0) && nrow(tmp_ck0) > 0 ){
      all_ck = rbind(all_ck,tmp_ck0)
			
			# new code
			for(jk in seq(length(tmp_infer0))){
				all_infer[[length(all_infer) + 1]] = tmp_infer0[[jk]]
			}
			
    }
    message("\n",appendLF = FALSE)
  }
	# all_ck = all_ck[order(all_ck$cc,all_ck$kk,all_ck$ms),]
	rownames(all_ck) = NULL
	# all_ck
  
	if(FALSE){ # Obtain details of inferred results
		message("Obtain inferred results: ",appendLF = FALSE)
		thres = 0; num_res = nrow(all_ck)
		infer_list = list()
		for(ii in seq(num_res)){
			
			if( ii / num_res * 100 >= thres ){
				message(paste0(thres,"% "),appendLF = FALSE)
				thres = thres + 5
			}
			
			if( all_ck$cc[ii] == 1 ){
				ith_out = ITH_optim(my_data = my_data,
					my_purity = my_purity,
					init_eS = list_eS[[all_ck$cc[ii]]][[all_ck$kk[ii]]],
					pi_eps0 = pi_eps0,my_unc_q = NULL,
					max_iter = max_iter,
					my_epsilon = my_epsilon)
			} else if( all_ck$cc[ii] > 1 ){
				ith_out = ITH_optim(my_data = my_data,
					my_purity = my_purity,
					init_eS = list_eS[[all_ck$cc[ii]]][[all_ck$kk[ii]]],
					pi_eps0 = pi_eps0,my_unc_q = as.numeric(strsplit(all_ck$unc_q0[ii],",")[[1]]),
					max_iter = max_iter,
					my_epsilon = my_epsilon)
				if( !ith_out$converge ){
					while(TRUE){
						ith_out = ITH_optim(my_data = my_data,
							my_purity = my_purity,
							init_eS = list_eS[[all_ck$cc[ii]]][[all_ck$kk[ii]]],
							pi_eps0 = pi_eps0,my_unc_q = NULL,
							max_iter = max_iter,
							my_epsilon = my_epsilon)
						if( ith_out$converge 
								&& round(ith_out$LL,2) == all_ck$LL[ii] 
								&& ith_out$ms == all_ck$ms[ii] ){
							break
						}
					}
				}
			}
			
			# After recovering inferred result, save to list
			infer_list[[ii]] = ith_out$infer
			rm(ith_out)
		}
		message("\n",appendLF = FALSE)
  }
	
  return(list(GRID = all_ck,INFER = all_infer))
	
}

#' @title vis_GRID
#' @description A simple visualization of SMASH's grid of solutions
#' @param GRID The \code{GRID} object output from \code{grid_ITH_optim}.
#' @return A ggplot object for data visualization
#'
#' @export
vis_GRID = function(GRID){
	
	GRID$solu = factor(seq(nrow(GRID)))
	GRID$pBIC = 0.5 * GRID$BIC
	GRID$pBIC = exp(GRID$pBIC - logSumExp(GRID$pBIC))
	GRID$pAIC = 0.5 * GRID$AIC
	GRID$pAIC = exp(GRID$pAIC - logSumExp(GRID$pAIC))

	tmp_df = melt(GRID,id.vars = "solu",c("pBIC","pAIC"))
	# tmp_df
	
	solu = value = variable = NULL
	gg = ggplot(data = tmp_df,mapping = aes(x = solu,y = value,
		group = variable,color = variable)) +
		geom_line(size = 1.3) + geom_point(size = 3) +
		scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
		xlab("Solution") + ylab("Posterior Probability") +
		labs(color = "Information Criterion") +
		theme(legend.position = "bottom",
			text = element_text(size = 20))
	
	return(gg)
}

#' @importFrom stats dgeom rbinom rnbinom runif
#' @importFrom smarter smart_df logSumExp
#' @importFrom reshape2 melt
#' @importFrom ggplot2 ggplot aes geom_line geom_point
#'	scale_x_discrete guide_axis xlab ylab labs theme
#'	element_text
#' @importFrom Rcpp sourceCpp
#' @useDynLib SMASH
NULL


###
Sun-lab/SMASH documentation built on March 5, 2025, 8 p.m.