# R/projector.R
#
# Defines functions: update_models_debatched, update_models, MAP,
# get_expected_noise_UMI_counts_alpha, get_expected_noise_UMI_counts_beta,
# getOneBatchCorrectedLikelihood, update_alpha_single_batch, getLikelihood,
# get_one_likelihood, get_total_likelihood

library(Matrix)
library(reshape2)

# get_total_likelihood
#
# Summarizes a cells x models log-likelihood matrix into a single score:
# the mean, over cells, of each cell's best (maximum) model log-likelihood.
get_total_likelihood=function(ll){
  best_per_cell=apply(ll,1,max)
  mean(best_per_cell)
}


# get_one_likelihood
#
# Log-likelihood (log2) of every cell under a single model.
#
# model_v - per-gene probability vector (one model)
# umitab  - genes x cells UMI count matrix
# reg     - regularization constant added to the probabilities before log2
#
# Returns one log-likelihood value per cell (column of umitab).
get_one_likelihood=function(model_v,umitab,reg){
  log_probs=log2(model_v+reg)
  colSums(umitab*log_probs)
}


# getLikelihood
#
# Log-likelihood (log2) of every cell under every model, normalized by each
# cell's total UMI count.
#
# umitab - genes x cells UMI count matrix (base or Matrix sparse matrix)
# models - genes x models probability matrix
# reg    - regularization constant added to the probabilities before log2
#
# Returns a dense cells x models matrix of mean per-UMI log-likelihoods.
getLikelihood=function(umitab,models,reg){
    log_models=log2(reg+models)
    ll=Matrix::t(umitab)%*%log_models
    # Dividing by the per-cell UMI totals recycles down the rows (cells).
    as.matrix(ll)/Matrix::colSums(umitab)
}


#update_alpha_single_batch=function(umitab,models,noise_model,reg,max_noise_fraction=.75,max_ncells=5000){
#  if (ncol(umitab)>max_ncells){
#    umitab=umitab[,sample(colnames(umitab),size = max_ncells)]
#  }
#
#  if (nrow(models)!=length(noise_model)){
#    stop("noise_models and models have different number of genes")
#  }
#  get_ll_a=function(alpha,models,noise_model,umitab,reg){
#    adjusted_models=t((1-alpha)*t(models)+alpha*matrix(noise_model,ncol(models),nrow(models),byrow=T))
#    ll_b=getLikelihood(umitab,adjusted_models,reg=reg)
#    return(ll_b)
#  }
#  func_to_opt=function(x){
#    message(round(x,digits=2))
#    tot_ll=get_total_likelihood(get_ll_a(x,models,noise_model,umitab,reg))
#    return(tot_ll)
#  }
#
#  optim_val=optimize(func_to_opt,c(1e-3,max_noise_fraction),maximum = T,tol=1e-3)
#
#  return(optim_val$maximum)
#}

# update_alpha_single_batch
#
# Estimates the noise fraction (alpha) for one batch by maximizing the mean
# log-likelihood of the cells under cluster models mixed with the batch's
# noise model.
#
# umitab          - genes x cells UMI count matrix for the batch
# models          - genes x clusters probability matrix
# noise_model     - per-gene noise probability vector (same genes as models)
# cell_to_cluster - optional cluster assignment per cell, aligned with the
#                   columns of umitab; when supplied, each cell contributes
#                   the likelihood at its assigned cluster instead of its
#                   best-scoring cluster
# reg             - regularization constant added before taking log2
# max_noise_fraction - upper bound of the search interval for alpha
# max_ncells      - cap on the number of cells used (random subsample)
#
# Returns the alpha in [0, max_noise_fraction] maximizing the objective.
update_alpha_single_batch=function(umitab,models,noise_model,cell_to_cluster=NULL,reg,max_noise_fraction=.2,max_ncells=5000){
  if (ncol(umitab)>max_ncells){
    # Subsample by column index so cell_to_cluster (aligned with umitab's
    # columns) can be subset consistently. Previously only umitab was
    # subsampled, leaving cell_to_cluster misaligned with (and longer than)
    # the rows of the likelihood matrix in the fixed-clusters objective.
    keep=sample(seq_len(ncol(umitab)),size = max_ncells)
    umitab=umitab[,keep]
    if (!is.null(cell_to_cluster)){
      cell_to_cluster=cell_to_cluster[keep]
    }
  }

  if (nrow(models)!=length(noise_model)){
    stop("noise_models and models have different number of genes")
  }

  # Mean log-likelihood at each cell's fixed cluster, with every cluster
  # model mixed with the noise model at fraction alpha.
  get_tot_likelihood_fixed_clusters=function(alpha){
    adjusted_models=t((1-alpha)*t(models)+alpha*matrix(noise_model,ncol(models),nrow(models),byrow=TRUE))
    ll=getLikelihood(umitab,adjusted_models,reg=reg)
    return(mean(ll[cbind(1:nrow(ll),match(cell_to_cluster,colnames(ll)))]))
  }

  # Same mixture, but each cell contributes its best-cluster log-likelihood.
  get_tot_likelihood=function(alpha){
    adjusted_models=t((1-alpha)*t(models)+alpha*matrix(noise_model,ncol(models),nrow(models),byrow=TRUE))
    return(get_total_likelihood(getLikelihood(umitab,adjusted_models,reg=reg)))
  }

  if (!is.null(cell_to_cluster)){
    optim_val=optimize(get_tot_likelihood_fixed_clusters,c(0,max_noise_fraction),maximum = TRUE,tol=1e-3)
  }
  else{
    optim_val=optimize(get_tot_likelihood,c(0,max_noise_fraction),maximum = TRUE,tol=1e-3)
  }
  return(optim_val$maximum)
}




# getOneBatchCorrectedLikelihood
#
# Per-cell log-likelihood against cluster models mixed with this batch's
# noise model at fraction alpha_noise, computed in chunks of cells to bound
# peak memory. Optionally also scores every cell against the pure noise
# model.
#
# Returns list(ll = cells x models likelihood matrix,
#              ll_noise = cells x 1 matrix or NULL when calc_ll_noise=F).
getOneBatchCorrectedLikelihood=function(umitab,models,noise_model,alpha_noise=NULL,reg,calc_ll_noise=F,ncells_per_iter=1000){

  ncells=ncol(umitab)
  ll=matrix(NA,ncells,ncol(models),dimnames=list(colnames(umitab),colnames(models)))
  if (calc_ll_noise){
    ll_noise=matrix(NA,ncells,1,dimnames=list(colnames(umitab),NULL))
  }
  else{
    ll_noise=NULL
  }

  # Mix each cluster model with the noise model at fraction alpha_noise.
  mixed_models=t((1-alpha_noise)*t(models)+alpha_noise*matrix(noise_model,ncol(models),nrow(models),byrow=T))

  # Partition the cells into consecutive chunks of at most ncells_per_iter
  # and fill the likelihood matrix one chunk at a time.
  chunk_ids=rep(seq(1,ceiling(ncells/ncells_per_iter)),each=ncells_per_iter)[1:ncells]
  cell_chunks=split(colnames(umitab),chunk_ids)
  for (chunk in cell_chunks){
    ll[chunk,colnames(mixed_models)]=getLikelihood(umitab[,chunk,drop=F],mixed_models,reg=reg)
  }
  rm(mixed_models)

  if (calc_ll_noise){
    ll_noise[,1]=getLikelihood(umitab,noise_model,reg=reg)[,1]
  }
  return(list(ll=ll,ll_noise=ll_noise))
}




# get_expected_noise_UMI_counts_beta
#
# Builds a 3D array (batch x gene x cluster) of expected noise UMI counts
# under the "beta" parameterization, where beta_noise is the inferred number
# of noise molecules per cell in each batch.
#
# umis         - unused here; presumably kept for signature symmetry with the
#                alpha variant — TODO confirm
# cluster      - cluster assignment per cell
# batch        - batch assignment per cell (aligned with cluster)
# noise_models - genes x batches matrix of per-batch noise profiles
# beta_noise   - per-batch noise molecules per cell, indexed by batch name
# clusters     - cluster names defining the third dimension of the output
get_expected_noise_UMI_counts_beta=function(umis,cluster,batch,noise_models,beta_noise,clusters){
 ngenes=nrow(noise_models)
  nmodels=length(clusters)
  nsamps=ncol(noise_models)
  # Count cells per (batch, cluster), zero-padding combinations that are
  # absent from the data.
  tmp_tab=table(batch,cluster)
  ncells=matrix(0,nsamps,nmodels,dimnames = list(colnames(noise_models),clusters))
  ncells[rownames(tmp_tab),colnames(tmp_tab)]=tmp_tab

  #beta_noise is the inferred number of noise molecules/cell.
  #tab contains the number of cells per (sample,cluster)
  # Total expected noise UMIs per (batch, cluster) = beta * number of cells.
  tot_noise_umis=matrix(beta_noise[colnames(noise_models)],nsamps,nmodels,dimnames = list(colnames(noise_models),clusters))*ncells
  # Expand both factors to batch x gene x cluster arrays (via aperm) so the
  # per-(batch, cluster) totals broadcast across genes.
  arr_tot_noise_umis=array(tot_noise_umis,dim=c(nsamps,nmodels,ngenes))
  arr_tot_noise_umis=aperm(arr_tot_noise_umis,c(1,3,2))
  arr_noise_models=array(noise_models[,rownames(ncells)],dim=c(ngenes,nsamps,nmodels))
  arr_noise_models=aperm(arr_noise_models,c(2,1,3))
  # Distribute each (batch, cluster) noise total across genes according to
  # that batch's noise profile.
  expected_noise_counts=arr_noise_models*arr_tot_noise_umis
  dimnames(expected_noise_counts)=list(colnames(noise_models),rownames(noise_models),clusters)
  return(expected_noise_counts)
}

# get_expected_noise_UMI_counts_alpha
#
# Builds a 3D array (batch x gene x cluster) of expected noise UMI counts
# under the "alpha" parameterization, where alpha_noise is the inferred
# fraction of noise UMIs out of the total UMIs.
#
# umis         - genes x cells UMI count matrix
# cluster      - cluster assignment per cell
# batch        - batch assignment per cell (aligned with cluster)
# noise_models - genes x batches matrix of per-batch noise profiles
# alpha_noise  - noise fraction(s) multiplying the batch-cluster UMI totals
# clusters     - cluster names defining the third dimension of the output
get_expected_noise_UMI_counts_alpha=function(umis,cluster,batch,noise_models,alpha_noise,clusters){
  ngenes=nrow(noise_models)
  nmodels=length(clusters)
  nsamps=ncol(noise_models)

  # raw_counts is used below only for its column (cluster) names.
  raw_counts=Matrix::t(aggregate.Matrix(Matrix::t(umis),cluster))
  # NOTE(review): ag and numis_per_batch are computed but never used — dead
  # code; kept to preserve behavior, consider removing.
  ag=aggregate(Matrix::colSums(umis),by=list(batch,cluster),sum)
  numis_per_batch=sapply(split(Matrix::colSums(umis),batch),sum)[colnames(noise_models)]
  # Total observed UMIs per (batch, cluster), zero-padded for combinations
  # absent from the data.
  numis_per_batch_cluster=matrix(0,nsamps,nmodels,dimnames = list(colnames(noise_models),clusters))
  tmp_numis_per_batch_cluster=invisible(acast(data.frame(batch=batch,cluster=cluster,numis=Matrix::colSums(umis)),batch~cluster,fun.aggregate=sum,value.var = "numis")[colnames(noise_models),colnames(raw_counts)])
  numis_per_batch_cluster[rownames(tmp_numis_per_batch_cluster),colnames(tmp_numis_per_batch_cluster)]=tmp_numis_per_batch_cluster
  # Expected noise UMIs per (batch, cluster) = alpha * observed UMI totals.
  tot_noise_umis=matrix(numis_per_batch_cluster*alpha_noise,nsamps,nmodels,dimnames = list(colnames(noise_models),clusters))
  # Expand both factors to batch x gene x cluster arrays (via aperm) so the
  # per-(batch, cluster) totals broadcast across genes.
  arr_tot_noise_umis=array(tot_noise_umis,dim=c(nsamps,nmodels,ngenes))
  arr_tot_noise_umis=aperm(arr_tot_noise_umis,c(1,3,2))
  arr_noise_models=array(noise_models,dim=c(ngenes,nsamps,nmodels))
  arr_noise_models=aperm(arr_noise_models,c(2,1,3))
  # Distribute each (batch, cluster) noise total across genes according to
  # that batch's noise profile.
  expected_noise_counts=arr_noise_models*arr_tot_noise_umis
  dimnames(expected_noise_counts)=list(colnames(noise_models),rownames(noise_models),clusters)
  return(expected_noise_counts)
}



# MAP
#
# Maximum a posteriori assignment: for each row (cell) of the likelihood
# matrix, returns the name of the column (model) with the highest value.
# The result is a character vector named by the rows of the input.
MAP=function(likelihood){
  best_idx=apply(likelihood,1,which.max)
  assignment=colnames(likelihood)[best_idx]
  names(assignment)=rownames(likelihood)
  assignment
}

# update_models
#
# Re-estimates cluster models from raw UMI counts: sums the counts of each
# cluster's cells per gene and normalizes each cluster's counts to a
# probability vector. Returns a dense genes x clusters matrix.
update_models=function(umis,cluster){
  cluster_counts=aggregate.Matrix(Matrix::t(umis),cluster)
  normalized=cluster_counts/Matrix::rowSums(cluster_counts)
  as.matrix(Matrix::t(normalized))
}

# update_models_debatched
#
# Re-estimates cluster models after subtracting the expected ambient-noise
# UMI counts per (batch, cluster) from the raw per-cluster counts.
#
# umis            - genes x cells UMI count matrix
# cell_to_cluster - cluster assignment per cell (aligned with umis columns)
# batch           - batch assignment per cell
# noise_models    - genes x batches matrix of per-batch noise profiles
# alpha_noise     - noise fraction(s) applied to batch-cluster UMI totals
# make_plots      - if TRUE, writes one PNG per cluster comparing expected
#                   noise UMIs against raw UMIs
# figure_prefix   - filename prefix for the diagnostic PNGs
#
# Returns a genes x clusters matrix of noise-corrected model probabilities
# (each column sums to 1).
update_models_debatched=function(umis,cell_to_cluster,batch,noise_models,alpha_noise,make_plots=F,figure_prefix=""){
  # Clusters ordered by their numeric value (cluster labels are numeric-like
  # strings here — TODO confirm for non-numeric labels).
  clusters=unique(cell_to_cluster)
  clusters=as.character(clusters[order(as.numeric(clusters))])
  # Raw per-cluster gene counts, zero-padded so every cluster has a column.
  raw_counts=Matrix(0,nrow(umis),length(clusters),dimnames = list(rownames(umis),clusters))
  tmp_raw_counts=Matrix::t(aggregate.Matrix(Matrix::t(umis),cell_to_cluster))
  raw_counts[match(rownames(tmp_raw_counts),rownames(raw_counts)),match(colnames(tmp_raw_counts),colnames(raw_counts))]=tmp_raw_counts
  rm(tmp_raw_counts)
  gc()

  # Total observed UMIs per (batch, cluster), zero-padded for combinations
  # absent from the data.
  numis_per_batch_cluster=matrix(0,ncol(noise_models),length(clusters),dimnames = list(colnames(noise_models),clusters))
  tmp_numis_per_batch_cluster=invisible(acast(data.frame(batch=batch,cluster=cell_to_cluster,numis=Matrix::colSums(umis)),batch~cell_to_cluster,fun.aggregate=sum,value.var = "numis")[colnames(noise_models),colnames(raw_counts)])
  numis_per_batch_cluster[rownames(tmp_numis_per_batch_cluster),colnames(tmp_numis_per_batch_cluster)]=tmp_numis_per_batch_cluster
  rm(tmp_numis_per_batch_cluster)
  gc()
  # Substracting the expected noise UMI counts from the raw UMI counts:
  # expected noise per (gene, cluster) = sum over batches of
  # noise profile * alpha * batch-cluster UMI total.
  expected_noise_counts=noise_models%*%(numis_per_batch_cluster*alpha_noise)
  # Clamp at zero so no gene gets a negative corrected count.
  adj_counts=pmax(as.matrix(raw_counts-expected_noise_counts),0)
  if (make_plots){
    for (i in 1:ncol(raw_counts)){
      if (!all(expected_noise_counts==0)){
        png(paste(figure_prefix,"_",i,".png",sep=""))
        plot(expected_noise_counts[,i],raw_counts[,i],log="xy",main=i,xlab="expected noise UMIs",ylab="raw UMIs");abline(0,1)
        dev.off()
      }
    }
  }
  #identify(raw_counts[,i],expected_noise_counts[,i],labels = rownames(raw_counts))
  #browser()
  rm(expected_noise_counts)
  gc()
  # Normalize each cluster's corrected counts to a probability vector.
  models=t(t(adj_counts)/colSums(adj_counts))
  return(models)
}
# effiken/scDissector documentation built on July 28, 2023, 6:08 a.m.