R/scHiC_assess.R

Defines functions scHiC_assess

Documented in scHiC_assess

#' 
#' This function analyzes both simulated and real datasets, depending on the inputs of the functions.
#'
#' @param scHiC The observed data. It can take three types of formats. The preferred format is a single-cell 
#' matrix with each column being a vector of the upper triangular matrix without including the diagonal entries 
#' of the 2D matrix of a single-cell. Another types of formats are a list with each element being a 2D s
#' ingle-cell contact matrix, or a 3D (\eqn{n\times n\times k}) array that has k matrices of dimension \eqn{n\times n}. 
#' HiCImpute automatically transforms these two types of input into a matrix with each column being the vector of 
#' upper triangular matrix of a single-cell. 
#' @param expected Underline true counts of the simulated data. For real data analysis, just set it as NULL.
#' @param result Output of MCMCImpute.
#' @param imputed The imputed data that has the same dimension as the observed data. This is needed for real 
#' data analysis. For simulated data, set it as NULL.
#' @param cell_index Indicates which cell is used to draw heatmaps and scatterplot.
#' @param n Dimension of 2D contact matrix.
#' @param cell_type A vector of underlying true cluster.
#' @param dims The dimension of 2D matrix.
#' @param perplexity numeric; Perplexity parameter (should not be bigger than \eqn{3\times perplexity<nrow(X)-1}).
#' @param seed Random seed for generating t-SNE data.
#' @param kmeans Logical, whether apply K-means clustering on the t-SNE data.
#' @param ncenters Number of centers in K-means clustering analysis.
#'
#' @return A list of accuracy measurements and plots.
#' @export
#'
#' @import gridExtra
#'
#' @examples
#' data("K562_1_true")
#' options(digits = 2)
#' scHiC_assess(scHiC=K562_T1_7k, expected=K562_1_true, imputed=result$Impute_SZ=T1_7k_imp)

scHiC_assess <- function(scHiC, expected=NULL, result=NULL, imputed=NULL, cell_index=1, n, cell_type, dims = 2,perplexity=10, seed=1000, 
                         kmeans = TRUE, ncenters = 2){
  ##readin data of different types
  if(is.list(scHiC)){
    observed=NULL
    for (c in 1:length(scHiC)) {
      observed=cbind(observed, mattovec(scHiC[[c]]))
    }
  }else if(is.array(scHiC) & length(dim(scHiC))>2){
    observed=NULL
    for (c in 1:(dim(scHiC)[3])) {
      observed=cbind(observed, mattovec(scHiC[,,c]))
    }
  }else if(is.matrix(scHiC)){
    observed=scHiC
  }else{
    message("The input type must be a matrix, 3D array or a list of 2D matrix.")
    exit()
  }

  if(is.null(expected)){##### for real data analysis
    
    ############# correlation
    corr=NULL
    for (i in 1:ncol(observed)) {
      ind=observed[,i]!=0
      corr[i]=cor(observed[,i][ind],imputed[,i][ind])
    }
    corr=as.data.frame(corr)
    p1=ggplot(corr, aes(x="corr", y=corr))+geom_boxplot()+ggtitle("Boxplot of correlations") + theme(axis.title.x=element_blank(),
                                                                 axis.text.x=element_blank(),
                                                                 axis.ticks.x=element_blank()) 
    
    ############# scatterplot
    p2=SOVI(obsvec=observed[,cell_index], impvec = imputed[,cell_index])
    
    p3=scHiC_tSNE(observed, cell_type=cell_type, 
                  dims = dims, perplexity=perplexity, seed=seed, title="Observed", 
                  kmeans = kmeans, ncenters = ncenters)
    
    p4=scHiC_tSNE(imputed, cell_type=cell_type, 
                  dims = dims, perplexity=perplexity, seed=seed, title="HiCImpute", 
                  kmeans = kmeans, ncenters = ncenters)
     
    p=ggarrange(p1, p2, p3, p4,
              labels = c("A", "B", "C", "D"),
              ncol = 2, nrow = 2)
    
    
    cluster1 = scHiC_Kmeans(observed, centers=2, nstart=100, iter.max=1000, seed=1)
    dd1 = as.data.frame(cbind(cluster=cluster1$cluster,type=cell_type))
    table1 = table(dd1$cluster,dd1$type)
    
    cluster2 = scHiC_Kmeans(imputed, centers=2, nstart=100, iter.max=1000, seed=1)
    dd2 = as.data.frame(cbind(cluster=cluster2$cluster,type=cell_type))
    table2 = table(dd2$cluster,dd2$type)
    
    mylist=list(p, table1, table2)
    names(mylist)=c("plots", "obs_cluster", "imp_cluster")
    return(mylist)
  }else{ ######### for simulated data
    
    PTSZ=list()
    PTDO=list()
    #MSE=list()
    CIEZ=list()
    CIEA=list()
    AEOZ=data.frame()
    AEOA=data.frame()
    
    for(j in 1:ncol(observed)){
      index0 <- observed[,j]==0
      index1 <- observed[,j]==1
      index2 <- observed[,j]>=2
      
      ## calibration
      m1 <- lm(result$Impute_SZ[,j][observed[,j]>0]~observed[,j][observed[,j]>0])
      impu0 <- result$Impute_SZ[,j][result$Impute_SZ[,j]>0 & observed[,j]==0]
      obs2 <- (impu0-m1$coefficients[1])/(m1$coefficients[2])
      result$Impute_SZ[,j][result$Impute_SZ[,j]>0 & observed[,j]==0] <- obs2
      
      indexnotrue0=(expected[,j]!=0)
      
      predictv=result$Impute_SZ[,j][index0]
      Truevalue=expected[,j][index0]
      
      PTSZ[[j]]=sum(Truevalue==0 & predictv==0)/sum(Truevalue==0)
      PTDO[[j]]=sum(Truevalue>0 & predictv>0)/sum(Truevalue>0)
      
      #MSE[[j]]=mean((result$Impute_SZ[,j]-expected[,j])^2)
      
      AEOZ=rbind(AEOZ, data.frame(AE=(abs(predictv-Truevalue)),Sample=j))
      AEOA=rbind(AEOA, data.frame(AE=abs(result$Impute_SZ[,j]-expected[,j]),Sample=j))
      
      CIEZ[[j]]=cor(Truevalue,predictv)
      CIEA[[j]]=cor(result$Impute_SZ[,j],observed[,j])
    }
    
    PTSZ=unlist(PTSZ)
    PTDO=unlist(PTDO)
    #MSE=unlist(MSE)
    CIEZ=unlist(CIEZ)
    CIEA=unlist(CIEA)
    
    summa_mean=data.frame(PTSZ=mean(PTSZ),PTDO=mean(PTDO),
                          AEOZ=mean(AEOZ$AE),AEOA=mean(AEOA$AE),
                          CIEZ=mean(CIEZ),CIEA=mean(CIEA))
    summa_se=data.frame(PTSZ=sd(PTSZ),PTDO=sd(PTDO),
                        AEOZ=sd(AEOZ$AE),AEOA=sd(AEOA$AE),
                        CIEZ=sd(CIEZ),CIEA=sd(CIEA))
    rownames(summa_mean)=NULL
    rownames(summa_se)=NULL
    
    
    ############# fix PTSZ=0.95
    PTSZ_95 = PTSZ95(observed=observed, expected=expected, result=result)
    
    par(mar = c(3,3,3,5))
    par(mfrow=c(2,3))
    scHiC_hm(observed[,cell_index], n, title="Observed")
    fig_label("A", cex=2) 
    scHiC_hm(expected[,cell_index], n, title="Expected")
    fig_label("B", cex=2) 
    scHiC_hm(result$Impute_SZ[,cell_index], n, title="HiCImpute")
    fig_label("C", cex=2) 
    ############# ROC curve
    scHiC_ROC(observed=observed, expected=expected, result=result)
    fig_label("D", cex=2) 
    SEVI(obsvec=observed[,cell_index], expvec=expected[,cell_index], impvec=result$Impute_All[,cell_index] )
    fig_label("E", cex=2) 

    mylist <- list(summa_mean, summa_se, PTSZ_95)
    names(mylist) <- c("summary_mean", "summary_se", "PTSZ_95")
    return(mylist)
  }
}
Queen0044/HiCImpute documentation built on Oct. 9, 2022, 9:30 a.m.