R/function.R

Defines functions TagDist IndelPlot PlotTree BuildTree TagProcess IndelIdents IndelForm change_form_stat FindIndel align_to_range ReadCutsite ReadFasta

Documented in align_to_range BuildTree change_form_stat FindIndel IndelForm IndelIdents IndelPlot PlotTree ReadCutsite ReadFasta TagDist TagProcess

#' Title
#' @title ReadFasta
#' @description  Function to read fasta file to DNAString object
#' @param filename The input fasta file name
#'
#' @return A DNAString object
#' @export
#' @importFrom Biostrings DNAString
#'
#' @examples
#' fafile=paste0(system.file("extdata",package = 'LinTInd'),"/V3.fasta")
#' ReadFasta(fafile)
#'
ReadFasta = function(filename){
  sv = read.table(filename)
  scarfull = DNAString(sv[2,1])
  return(c("scarfull" = scarfull))
}

#' Title
#' @title ReadCutsite
#' @description Function to create a reference dataframe include each position and its' group
#' @usage ReadCutsite(segref,reftype=NULL)
#' @param segref The cutsite file
#' @param reftype Choose the reference type you want, if reftype="Accurate" (default), there will only the target sites be generated; if reftype="All", each site will be generated
#'
#' @return reference dataframe
#' @export
#'
#' @examples
#' data("example_data",package = "LinTInd")
#' ReadCutsite(cutsite)
#' ReadCutsite(cutsite,reftype="All")
#'
ReadCutsite = function(segref,reftype=NULL){
  colnames(segref) = c("indx","start","end")
  scar = NULL
  type = NULL
  if(any(is.null(reftype) | reftype=="Accurate")){
    for (i in 2:nrow(segref)) {
      scar = c(scar,segref[i,]$start:segref[i,]$end)
      type = c(type,rep(segref[i,]$indx,(segref[i,]$end-segref[i,]$start)+1))
    }
    scarseg = data.frame("scar" = scar,"type" = as.character(type))
  }else{
    endsite<-NA
    for (i in 2:nrow(segref)) {
      endsite<-c(endsite,segref[["end"]][i]+segref[["start"]][1])
    }
    endsite[nrow(segref)]<-segref[["end"]][1]
    scarseg = data.frame("scar" = c(1:endsite[nrow(segref)]),"type" = NA)
    #endsite<-is.na(endsite)
    for (i in rev(endsite)) {
      if(!is.na(i)){
        scarseg$type[1:i]<-which(endsite==i)-1
      }else{
        break
      }
    }
  }
  return(scarseg)
}

#' Title
#' @title align_to_range
#'
#' @param p A base sequence in character format
#' @param s A base sequence in character format
#' @param cut The distance between the starting sites of two fragments
#'
#' @return A list include two IRanges instances (deletion and insertion)
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors mcols mcols<-
#' @importFrom Biostrings DNAString pairwiseAlignment nucleotideSubstitutionMatrix
#' @export
#' @examples
#' align_to_range(p="AAGG---AAATTTCGGAATAAGGAATTT",s="AAGGCCCCAAATTT-CGGAATAAGGAATTT",cut=0)
#'
#'
align_to_range = function(p,s,cut){
  pn <- strsplit(p,"")[[1]]
  sn <- strsplit(s,"")[[1]]
  lenp <- length(pn)
  index <- 1
  i <- 0
  del_flag <- FALSE
  in_flag <- FALSE
  del_start<-c()
  del_end<-c()
  ins_start<-c()
  ins_width<-c()
  ins_editseq<-c()
  while(i < lenp){
    i <- i + 1
    if(sn[[i]] == '-'){
      if(!del_flag){
        del_flag <- TRUE
        del_start<-c(del_start,index)
        width <- 1
      }
      else{
        width <- width + 1
      }
    }
    else{
      if(del_flag){
        del_flag <- FALSE
        del_end<-c(del_end,index)
        #print(paste("del stop width", width))
      }
    }
    if(pn[[i]] == '-'){
      if(!in_flag){
        in_flag <- TRUE
        ins_start<-c(ins_start,index)
        width <- 1

        editseq<-sn[[i]]
      }
      else{
        width <- width + 1

        editseq<-paste0(editseq,sn[[i]])
      }
    }
    else{
      if(in_flag){
        in_flag <- FALSE
        ins_width<-c(ins_width,width)
        ins_editseq<-c(ins_editseq,editseq)
        #print(paste("in stop width", width))
      }
    }
    if(pn[[i]] != '-')
      index <- index + 1
  }
  if(del_flag){
    del_flag <- FALSE
    del_end<-c(del_end,index)
  }
  if(in_flag){
    in_flag <- FALSE
    ins_width<-c(ins_width,width)
    ins_editseq<-c(ins_editseq,editseq)
    #print(paste("in stop width", width))
  }

  ins_start = ins_start-cut
  ins_end = ins_start + ins_width
  del_start = del_start-cut
  del_end = del_end-cut

  ins<-IRanges(start = ins_start,end = ins_end)
  mcols(ins)$seq <- ins_editseq
  del<-IRanges(start = del_start,end = del_end)


  return(list("del" = del,"ins" = ins))

}

#' Title
#' @title FindIndel
#' @description This function can ident indels for each reads in input data, and create IRanges instances for deletion and insertion
#' @param data data frame, include cell barcode, UMI and reads.
#' @param scarfull DNAString of reference sequence
#' @param scar The cutsite data frame
#' @param align_score The minimum alignment score that matched sequence should get, default in this parameter is the score that the reads which all of the target set were cutted got
#' @param type Group name for this data ("None" in default)
#' @param indel.coverage Choose indels selected scope: "Accurate" (default) means only the indles happenned in target site will be idented; "All" means each indel will be detected even they locate on the anchors
#' @param cln The number of threads
#'
#' @return list include IRanges instances (deletion and insertion), a data frame of reads' informations, reference sequenc, dataframe of cut sites
#' @export
#' @importFrom Biostrings DNAString pairwiseAlignment nucleotideSubstitutionMatrix subseq matchPattern pattern alignedPattern alignedSubject
#' @importFrom IRanges IRanges subject
#' @importFrom parallel makeCluster clusterEvalQ parLapply stopCluster clusterExport
#' @importFrom BiocGenerics score
#' @examples
#' data("example_data",package = "LinTInd")
#' scarinfo<-FindIndel(data=data,scarfull=ref,scar=cutsite,indel.coverage="All",type="test",cln=1)
#'
#'
FindIndel = function(data,scarfull,scar,align_score=NULL,type=NULL,indel.coverage=NULL,cln){
  scarref<-ReadCutsite(scar,reftype=indel.coverage)
  mat  =  Biostrings::nucleotideSubstitutionMatrix(match = 1, mismatch = -3)
  testFunction  =  function (data_in) {
    return(tryCatch(data_in, error=function(e) "unknown"))
  }
  if(is.null(align_score)){
    align_score = length(scarfull)- 2*(scar["end"][1,] - scar["start"][1,])-6
  }else{
    align_score = align_score
  }
  if(is.null(type)){
    type="None"
  }else{
    type=type
  }
  #type="none"
  find_barcode<-function(data){
    #tycpe="none"
    s3<-Biostrings::DNAString(as.character(data))
    alig<-Biostrings::pairwiseAlignment(scarfull,s3,substitutionMatrix = mat,type="global-local",gapOpening = 6, gapExtension= 1)
    if(any(is.null(indel.coverage),indel.coverage=="Accurate")){
      scarshort = Biostrings::subseq(as.character(scarfull[[1]]),scar["start"][1,],scar["end"][1,])
    }else{
      scarshort = as.character(scarfull[[1]])
    }
    if(score(alig)<=align_score){
      r_read<-"unknown"
      r_scar<-"unknown"
      del=NA
      ins=NA
    }else{
      scar_pos = Biostrings::matchPattern(scarshort,as.character(Biostrings::pattern(alig)),with.indels = TRUE,max.mismatch = 10)
      r_read = as.character(subject(alig))
      if(length(scar_pos)!= 0){
        r_scar = testFunction(subseq(as.character(subject(alig)),start=start(scar_pos),end=end(scar_pos)))
      }else{
        r_scar = "unknown"
      }
      stopifnot(is(alig, "PairwiseAlignments"), length(data) == 1L)

      p <- as.character(Biostrings::alignedPattern(alig)[[1L]])
      s <- as.character(Biostrings::alignedSubject(alig)[[1L]])
      if(any(is.null(indel.coverage),indel.coverage=="Accurate")){
        delins = align_to_range(p,s,scar["start"][1,])
      }else{
        delins = align_to_range(p,s,0)
      }
      del = delins$del
      ins = delins$ins
      if(TRUE %in% (del@start<0) | TRUE %in% (ins@start<0)){
        r_scar = "unknown"
      }
    }
    fin_dat<-data.frame(new.reads=r_read, scar.BC=r_scar,type=type)
    return(list("del" = del,"ins" = ins,fin_dat))
  }

  cl = makeCluster(cln,setup_strategy = "sequential")
  #environment(align_score) <- .GlobalEnv
  #environment(scarfull) <- .GlobalEnv
  #environment(scar) <- .GlobalEnv
  #environment(data) <- .GlobalEnv
  #environment(mat) <- .GlobalEnv
  #environment(type) <- .GlobalEnv
  #environment(find_barcode) <- .GlobalEnv
  #environment(testFunction) <- .GlobalEnv
  #environment(indel.coverage) <- .GlobalEnv
  clusterExport(cl,c('align_score','scarfull','scar','data','mat','indel.coverage','find_barcode','testFunction',"align_to_range","type"),envir = environment())
  scar_BC = parLapply(cl,data$Read.Seq,find_barcode)
  stopCluster(cl)

  #output
  data_2<-do.call("rbind",sapply(scar_BC,function(x){return(x[3])}))
  data<-cbind(data,data_2)
  scar_BC<-scar_BC[data$scar.BC!="unknown"]
  data<-data[data$scar.BC!="unknown",]
  saveRDS(list(scar_BC,data),"reads_metadata.rds")
  write.table(data,"all_UMI_reads_scar_full.txt",quote=FALSE,sep="\t",row.names=FALSE)
  data$scar_f<-gsub("[-]", "",as.character(data$scar.BC))
  data_v1<-data
  data_v1<-data_v1[data_v1$scar.BC!="unknown",]
  write.table(data_v1,"UMI_reads_scar_full.txt",quote=FALSE,sep="\t",row.names=FALSE)
  saveRDS(scar_BC,"indel.rds")
  return(list("INDEL" = scar_BC,"Scar" = data_v1,"indel.coverage"=indel.coverage,"cutsite"=scar,"ref"=scarfull,"scarref"=scarref))
}


#' Title
#' @title change_form_stat
#' @param indel List include two IRanges instances, contain start and end site of deletions and inserstions
#'
#' @return A scar form string
#'
#' @export
#' @importFrom IRanges IRanges
#' @examples
#' data("example_data",package = "LinTInd")
#' change_form_stat(cellsinfo$indel[[1]])
#'
#'
change_form_stat<-function(indel){
  indel<-indel[c(1,2)]
  indel<-unlist(indel)
  if(length(indel)==0){
    return("unknown")
  }else{
    ins<-data.frame(indel[2])
    #ins$seq=as.character(indel[[2]]@elementMetadata$seq)
    site_ins<-apply(ins,1,function(x){c(x[1]:x[2])})
    if(dim(ins)[1]==1){
      site_ins<-list(as.numeric(site_ins))
    }
    cutsite_ins<-lapply(site_ins,function(x){unique(scarref$type[scarref$scar %in% x])})
    tag_ins<-apply(ins,1,function(x){paste0(x[3],"I+",x[1],x[4])})
    tag_ins<-lapply(tag_ins,function(x){rep(x,length(cutsite_ins[[which(tag_ins==x)]]))})
    tag_ins<-unlist(tag_ins)
    cutsite_ins<-unlist(cutsite_ins)
    del<-data.frame(indel[1])
    site_del<-apply(del,1,function(x){c(x[1]:x[2])})
    if(dim(del)[1]==1){
      site_del<-list(as.numeric( site_del))
    }
    cutsite_del<-lapply(site_del,function(x){unique(scarref$type[scarref$scar %in% x])})
    tag_del<-apply(del,1,function(x){paste0((x[3]-1),"D+",x[1])})
    tag_del<-lapply(tag_del,function(x){rep(x,length(cutsite_del[[which(tag_del==x)]]))})
    tag_del<-unlist(tag_del)
    cutsite_del<-unlist(cutsite_del)
    tag<-c(tag_del,tag_ins)
    cutsite<-c(cutsite_del,cutsite_ins)
    tag_all<-rep("NONE",length(unique(scarref$type)))
    if(length(tag)==0){
      return(paste(tag_all,collapse = "_"))
    }else{
      for(x in c(1:length(tag))){
        if(tag_all[as.numeric(cutsite[x])]=="NONE"){
          tag_all[as.numeric(cutsite[x])]<-tag[x]
        }else{
          tag_all[as.numeric(cutsite[x])]<-paste(tag_all[as.numeric(cutsite[x])],tag[x],sep="&")
        }
      }
    }
    return(paste(tag_all,collapse = "_"))
  }
}
#' Title
#' @title IndelForm
#' @description Generate scar form strings from scarinfo list for each reads
#' @param scarinfo List generate from FindIndel, for more see \code{\link[LinTInd]{FindIndel}}
#' @param cln The number of threads
#'
#' @return A new list of scarinfo, the scarform of each reads will add in the data frame of reads' informations
#' @export
#' @importFrom IRanges IRanges
#' @importFrom parallel makeCluster clusterEvalQ parLapply stopCluster clusterExport
#' @examples
#' data("example_data",package = "LinTInd")
#' IndelForm(scarinfo,cln=1)
#'
IndelForm = function(scarinfo,cln){
  scarref<-scarinfo$scarref
  cl = makeCluster(cln,setup_strategy = "sequential")
  clusterExport(cl,c('scarinfo','change_form_stat',"scarref"), envir = environment())
  scar_form_p<-parLapply(cl,scarinfo$INDEL,change_form_stat)
  stopCluster(cl)
  scar_form<-unlist(scar_form_p)
  scar_form<-gsub(" ","",scar_form)
  data<-scarinfo$Scar
  data$scar_form<-scar_form
  write.csv(data,"indel_pattern.csv",quote=FALSE,row.names = FALSE)
  return(list("INDEL" = scarinfo$INDEL,"Scar" = data,"indel.coverage"=scarinfo$indel.coverage,"cutsite"=scarinfo$cutsite,"ref"=scarinfo$ref,"scarref"=scarinfo$scarref))
}


#' Title
#' @title IndelIdents
#' @description Function to define a scarform for each cell(single cell) or each reads(bulk seq, generate 'cell barcode' for each reads)
#' @param scarinfo List generate from IndelForm, for more see \code{\link[LinTInd]{IndelForm}}
#' @param method.use Select how to determine a scar form string for each cell:
#'                   "reads.num" (default):find the scar with the most reads in the cell;
#'                   "umi.num":find the scar with the most UMIs in the cell;
#'                   "consensus":find the consistent sequences in each cell, and then generate scar form strings from the new reads
#'
#' @param cln The number of threads
#'
#' @return The list generate from FindIndel, but in 'Scar' element a new column contain scar form strings
#' @export
#' @importFrom stringdist stringdistmatrix
#' @importFrom Biostrings consensusString pairwiseAlignment DNAString nucleotideSubstitutionMatrix alignedPattern alignedSubject
#' @importFrom IRanges IRanges
#'
#' @examples
#' data("example_data",package = "LinTInd")
#' IndelIdents(scarinfo,method.use="umi.num",cln=1)
#'
#'
IndelIdents = function(scarinfo,method.use=NULL,cln){
  scarfull<-scarinfo$ref
  scar<-scarinfo$cutsite
  data<-scarinfo$Scar
  scarref<-scarinfo$scarref
  indel.coverage<-scarinfo$indel.coverage
  Cell.BC<-data.frame(table(data$Cell.BC))
  Cell.BC<-Cell.BC[Cell.BC$Freq>1,]
  data_1<-data[data$Cell.BC %in% Cell.BC$Var1,]
  if(is.null(method.use)){
    method.use<-"reads.num"
  }
  if(all("Cell.BC" %in% names(data),"UMI" %in% names(data))){
    data_1<-data_1[,c("Cell.BC","UMI","scar_f","scar_form")]
  }else if(any("Cell.BC" %in% names(data),"UMI" %in% names(data))){
    names(data_1)[which(names(data_1) %in% c("Cell.BC","UMI"))]<-"Cell.BC"
    data_1<-data_1[,c("Cell.BC","scar_f","scar_form")]
  }else{
    data_1<-data_1[,c("scar_f","scar_form")]
  }
  if(length(names(data_1))==2){
    pattern<-data_1$scar_form
    data_con = data.frame("Cell.BC" = c(1:length(pattern)),
                          "pattern" = pattern,
                          "reads_num" = 1,
                          "reads_pro" = 1,
                          "umi_num" = 1,
                          "umi_pro" = 1,
                          stringsAsFactors = FALSE)
    INDEL_ranges <- scarinfo$INDEL
    INDEL_ranges <- INDEL_ranges[data_con$pattern!="unkown"]
    #INDEL_ranges <-INDEL_ranges[data$Cell.BC %in% Cell.BC$Var1]
    INDEL_ranges_man<-list()
    for(index in seq(length(INDEL_ranges))){
      INDEL_ranges_man<-c(INDEL_ranges_man,list(INDEL_ranges[[index]][c(1,2)]))
    }

    data_con<-data_con[data_con$pattern!="unkown",]
    write.csv(data_con,"final_scarform.csv",quote=FALSE,row.names = FALSE)
  }else{
  mat  =  nucleotideSubstitutionMatrix(match = 1, mismatch = -3)
  # x = as.character(Cell.BC$Var1)[2]c
  # dat =data_1
  max_reads_stat = function(x,dat,method.use=NULL){
    temreads = dat[dat$Cell.BC==x,]
    read_data = data.frame(table(as.character(temreads$scar_f)))
    read_data = read_data[order(-read_data$Freq),]
    scar_data = data.frame(table(as.character(temreads$scar_form)))
    scar_data = scar_data[order(-scar_data$Freq),]
    if(scar_data$Freq[1]==1){
      #can't define
      pattern="unkown"
      reads_num=0
      reads_pro=0
      umi_num=0
      umi_pro=0
      #UMI=0
      del=NA
      ins=NA
    }else{
      if(method.use=="consensus"){
        scarstrdist = stringdist::stringdistmatrix(as.character(read_data$Var1),as.character(read_data$Var1))
        scarindex = which(apply(scarstrdist,1,function(x){sum(read_data$Freq[which(x<9)])>(sum(read_data$Freq)/3)}))
        if(length(scarindex) > 0){
          fin_read = Biostrings::consensusString(temreads$scar_f[temreads$scar_f %in% as.character(read_data$Var1)[scarindex]])
          reads_pro = round(length(which(temreads$scar_f %in%as.character(read_data$Var1)[scarindex]))/sum(read_data$Freq),4)
          reads_num = length(which(temreads$scar_f %in%as.character(read_data$Var1)[scarindex]))
          umi_num = length(unique(temreads$UMI[which(temreads$scar_f %in%as.character(read_data$Var1)[scarindex])]))
          umi_pro = round(umi_num/length(unique(temreads$UMI)),4)
          fin_read_cons = gsub("\\?","",fin_read)
          s1 = Biostrings::DNAString(fin_read_cons)
          aligc = Biostrings::pairwiseAlignment(scarfull,s1,substitutionMatrix = mat,type="global-local",gapOpening = 6, gapExtension = 1)
          stopifnot(is(aligc, "PairwiseAlignments"), length(x) == 1L)
          p <- as.character(Biostrings::alignedPattern(aligc)[[1L]])
          s.cons <- as.character(Biostrings::alignedSubject(aligc)[[1L]])
          if(any(is.null(indel.coverage),indel.coverage=="Accurate")){
            indel = align_to_range(p,s.cons,scar["start"][1,])
          }else{
            indel = align_to_range(p,s.cons,0)
          }
          pattern =  change_form_stat(indel)
        }else{
          pattern = "unknown"
          reads_num=0
          reads_pro=0
          umi_num=0
          umi_pro=0
          #UMI=0
          #del=NA
          #ins=NA
          indel=NULL
        }
        #print("consensus")
      }else if(method.use=="umi.num"){
        #method.use=="umi.num"
        read_data_umi = unique(temreads)
        read_data_umi = data.frame(table(read_data_umi$scar_form))
        read_data_umi = read_data_umi[order(-read_data_umi$Freq),]
        pattern = as.character(read_data_umi$Var1[1])
        umi_num = read_data_umi$Freq[1]
        umi_pro = round(umi_num/length(unique(temreads)$UMI),4)
        reads_num = scar_data[which(scar_data$Var1 == pattern),"Freq"]
        reads_pro = round(reads_num/sum(scar_data$Freq),4)
        #print("umi.num")
      }else{
        pattern = as.character(scar_data$Var1[1])
        reads_num = scar_data$Freq[1]
        reads_pro = round(reads_num/sum(scar_data$Freq),4)
        umi_num = length(unique(temreads$UMI[which(temreads$scar_form %in% pattern)]))
        umi_pro = round(umi_num/length(unique(temreads$UMI)),4)
        #print("reads.num")
      }
    }
    fin_line = data.frame("Cell.BC" = x,
                          "pattern" = pattern,
                          "reads_num" = reads_num,
                          "reads_pro" = reads_pro,
                          "umi_num" = umi_num,
                          "umi_pro" = umi_pro,
                          stringsAsFactors = FALSE)
    if(method.use=="consensus"){
      return(list(indel,fin_line))
    }else{
      return(fin_line)
    }
  }
  cl = makeCluster(cln,setup_strategy = "sequential")
  #environment(data_1) <- .GlobalEnv
  #environment(max_reads_stat) <- .GlobalEnv
  #environment(Cell.BC) <- .GlobalEnv
  #environment(scarref) <- .GlobalEnv
  #environment(align_to_range) <- .GlobalEnv
  #environment(change_form_stat) <- .GlobalEnv
  #environment(scarfull) <- .GlobalEnv
  #environment(method.use) <- .GlobalEnv
  #environment(indel.coverage) <- .GlobalEnv
  clusterExport(cl,c('data_1','indel.coverage','method.use','max_reads_stat','Cell.BC',"align_to_range","scarref","change_form_stat","scarfull","mat","scar"), envir = environment())
  data_con<-parLapply(cl,as.character(Cell.BC$Var1),function(x)tryCatch(max_reads_stat(x,dat=data_1,method.use=method.use),error=function(e) NULL))
  stopCluster(cl)
  data_con<-data_con[!sapply(data_con,is.null)]

  if(method.use=="consensus"){
    INDEL_ranges_man <-list()
    for(i in seq(length(data_con))){
      INDEL_ranges_man <- c(INDEL_ranges_man,list(data_con[[i]][[1]]))
    }
    data_con_sub<-data.frame(Cell.BC=NA,pattern=NA,reads_num=NA, reads_pro=NA, umi_num=NA, umi_pro=NA)
    data_con_sub<-data_con_sub[-1,]
    for(i in seq(length(data_con))){
      data_con_sub<-rbind(data_con_sub,data_con[[i]][[2]])
    }
    data_con<-data_con_sub
    INDEL_ranges_man<-INDEL_ranges_man[data_con$pattern!="unkown"]
    data_con<-data_con[data_con$pattern!="unkown",]
    write.csv(data_con,"final_scarform.csv",quote=FALSE,row.names = FALSE)
    saveRDS(INDEL_ranges_man,"indel_ident.rds")
  }else{
    data_con<-do.call("rbind",data_con)
    data_con<-data_con[data_con$pattern!="unkown",]
    write.csv(data_con,"final_scarform.csv",quote=FALSE,row.names = FALSE)

    INDEL_ranges <- scarinfo$INDEL
    #INDEL_ranges <-INDEL_ranges[data$Cell.BC %in% Cell.BC$Var1]
    INDEL_ranges_man<-list()
    for(scarform in data_con$pattern){
      index=which(data$scar_form==scarform)[1]
      INDEL_ranges_man<-c(INDEL_ranges_man,list(INDEL_ranges[[index]][c(1,2)]))
    }
    saveRDS(INDEL_ranges_man,"indel_ident.rds")
  }
  }
  return(list("indel"=INDEL_ranges_man,"info"=data_con,"indel.coverage"=indel.coverage,"cutsite"=scarinfo$cutsite,"ref"=scarinfo$ref,"scarref"=scarinfo$scarref))
}


#' Title
#' @title TagProcess
#' @description Split each indel from scar form strings and map indel information to cell barcodes
#' @param data List generate from IndelIdents, for more see \code{\link[LinTInd]{IndelIdents}}
#' @param Cells (optional) Dataframe of cells' annotation, with two columns: "Cell.BC" and "Cell.type"
#' @param prefix (optional) Indels' prefix
#'
#' @return List with two dataframes: Indels for each cell barcode and cells' annotation
#' @export
#' @importFrom S4Vectors na.omit
#'
#' @examples
#' data("example_data",package = "LinTInd")
#' TagProcess(cellsinfo$info,Cells=celltype)
#'
TagProcess = function(data,Cells=NULL,prefix=NULL){

  TagStat = function(x) {
    x = as.character(x)
    umi = x[5]
    CB = x[1]
    x = x[2]
    x = unlist(strsplit(x,"_|&"))
    x = x[!x%in%c("NONE")]
    x = unique(x)
    if(length(x) == 0){
      return(NA)
    }else{
      return(data.frame(Cell.BC = CB,Reads_num = umi,Tag = x))
    }
  }

  #tag = NULL
  #common.CB = NULL
  #if((dim(data)[1]>1)){
  # for (i in 1:(dim(data)[1]-1)) {
  #  common.CB = c(common.CB,intersect(data$Cell.BC[i], data$Cell.BC[i+1]))
  #}
  #}else{
  #common.CB = data$Cell.BC[1]
  #}

  #for (i in 1:dim(data)[1]) {
  if(!is.null(Cells)){
    data=data[data$Cell.BC %in% Cells$Cell.BC,]
    data$Cell.type<-Cells$Cell.type[match(data$Cell.BC,Cells$Cell.BC)]
  }
  #data[[i]]=data[[i]][data[[i]]$Cell.BC %in% common.CB,]
  tagi = apply(data,1,TagStat)
  tagi = do.call("rbind",tagi)
  tagi = na.omit(tagi)
  #tagi = data.frame(table(tagi$Tag)/length(as.character(unique(tagi$Cell.BC))))
  #black list filter
  if(!is.null(Cells)){
    tagi$Cell.type<-Cells$Cell.type[match(tagi$Cell.BC,Cells$Cell.BC)]
  }
  if(!is.null(prefix)){
    tagi$Tag = paste(prefix[i], tagi$Tag, sep = "")
  }else{
    tagi$Tag=tagi$Tag
  }
  #tag = rbind(tag,tagi)
  #}
  return(list("tag"=tagi,"celltype"=Cells))
}


#' Title
#' @title BuildTree
#' @description Generate an array generant tree of a data.tree data structure and save it
#' @param tag List generate from TagProcess, for more see \code{\link[LinTInd]{TagProcess}}
#'
#' @return list with two elements, a data.tree data structure and a dataframe of array information for each cell barcode
#' @export
#' @importFrom networkD3 saveNetwork diagonalNetwork
#' @importFrom data.tree ToListExplicit as.Node
#' @examples
#' data("example_data",package = "LinTInd")
#' treeinfo<-BuildTree(tag)
#'
BuildTree = function(tag){
  #tag stat
  Cells=tag$celltype
  tag=tag$tag
  Tag = data.frame(table(tag$Tag))
  tag$Cell.num = Tag$Freq[match(tag$Tag,Tag$Var1)]
  tag_tab = acast(tag,Cell.BC~Tag)
  tag_tab[!is.na(tag_tab)] = 1
  tag_tab[is.na(tag_tab)] = 0
  #tag integrate
  cell_tab = data.frame(table(tag$Cell.BC))
  cell_tab = cell_tab[order(-cell_tab$Freq),]
  tags_all = lapply(as.character(cell_tab$Var1),function(x){sort(as.character(tag$Tag[tag$Cell.BC == x]))})
  tags_paste = sapply(tags_all,function(x){paste(x,collapse = "_")})
  tags_tab = data.frame(table(tags_paste))
  tags_tab$num = unlist(lapply(as.character(tags_tab$tags_paste),function(x){length(strsplit(x,split = "_")[[1]])}))
  tags_tab = tags_tab[order(-tags_tab$num,-tags_tab$Freq),]
  tags_uni = lapply(as.character(tags_tab$tags_paste),function(x){strsplit(x,split = "_")[[1]]})
  Tag_1 = Tag[Tag$Var1 %in% unlist(tags_uni[tags_tab$num == 1]),]
  Tag_1 = Tag_1[order(-Tag_1$Freq),]
  tags_uni[tags_tab$num == 1] = as.list(as.character(Tag_1$Var1))
  tags_tab[tags_tab$num == 1,] = tags_tab[tags_tab$num == 1,][match(as.character(Tag_1$Var1),
                                                                    as.character(tags_tab$tags_paste[tags_tab$num==1])),]

  #node build
  cluster_stat = function(i){
    x = tags_uni[[i]]
    n = tags_tab$num[i]
    tags_belone = NA
    for(y_ind in which(tags_tab$num<n)){
      y=tags_uni[[y_ind]]
      if(length(intersect(x,y))>0 & length(setdiff(y,x))==0){
        tags_belone = y_ind
        break
      }else{
        next
      }
    }
    return(tags_belone)
  }

  belons = sapply(which(tags_tab$num > 1),cluster_stat)
  belons[tags_tab$num == 1]= NA

  Tags = as.list(which(tags_tab$num == 1))
  names(Tags) = which(tags_tab$num == 1)
  nodes = as.list(c(1:length(tags_tab$num)))

  for(i in c(1:length(tags_tab$num))){
    n = belons[[i]]
    while(!is.na(n)){
      nodes[[i]] = c(n,nodes[[i]])
      n = belons[[n]]
    }
  }
  nodes_len = sapply(nodes,length)

  for(i in c(1:length(tags_tab$num))){
    nodes[[i]] = as.character(tags_tab$tags_paste)[nodes[[i]]]
    if(length(nodes[[i]])<max(nodes_len)){
      nodes[[i]][(length(nodes[[i]])+1):max(nodes_len)] = NA
    }else{
      next
    }
  }
  nodes=data.frame(do.call("rbind",nodes))
  names(nodes) = paste("N",as.character(1:ncol(nodes)),sep = "")
  nodes$pathString = do.call(paste, c("N0",nodes, sep="/"))
  nodes$pathString = gsub("/NA","",nodes$pathString)

  #save tree figure and rds
  population = as.Node(nodes)
  saveNetwork(diagonalNetwork(ToListExplicit(population, unname = TRUE),
                              margin = 10,fontSize = 8,width=15*dim(nodes)[2] ,height = 30*dim(nodes)[1]),
              file = "tree.html")

  saveRDS(population,"tree.rds")

  #save celltype tab
  cell_tab$tags = tags_paste
  if(!is.null(Cells)){
    cell_tab$celltype = Cells$Cell.type[match(as.character(cell_tab$Var1),Cells$Cell.BC)]
  }
  write.csv(cell_tab,"cell_tab.csv",row.names = FALSE,quote = FALSE)
  return(list(tree=population,info=cell_tab))

}


#' Title
#' @title PlotTree
#' @description Function to visualise the array generate tree
#' @param treeinfo List generate from BuildTree, for more see \code{\link[LinTInd]{BuildTree}}
#' @param data.extract (optional) If "FALSE" (default), will not return the indel's information, if it's "TRUE", the opposite
#' @param annotation (optional) If "TRUE" (default), the annotation of each cell barcodes have to be provided before, and a heatmap of cells' distribution for each array will be return
#' @param prefix (optional) Indels' prefix
#'
#' @return A list include a ggplot object, a dataframe show the distribution of each array contained in each group of cells (optional), and a dataframe to create the ggplot object
#' @export
#' @importFrom data.tree ToDataFrameNetwork FromDataFrameNetwork ToNewick
#' @importFrom reshape2 dcast melt
#' @importFrom purrr map_chr %>%
#' @importFrom ape rotateConstr
#' @importFrom ggtree ggtree geom_tiplab xlim_expand geom_facet read.tree
#' @importFrom ggplot2 scale_fill_gradient
#' @importFrom ggplot2 theme aes ggplot geom_ribbon geom_line scale_x_continuous xlab ylab theme_bw geom_segment geom_tile
#' @importFrom cowplot plot_grid
#' @importFrom BiocGenerics start end width type
#' @importFrom ggnewscale new_scale_color
#' @importFrom stringr str_extract str_extract_all
#' @importFrom dplyr summarise group_by
#' @importFrom rlist list.insert list.order
#' @examples
#' data("example_data",package = "LinTInd")
#' plotinfo<-PlotTree(treeinfo = treeinfo,data.extract = "TRUE",annotation = "TRUE")
#' plotinfo<-PlotTree(treeinfo = treeinfo,data.extract = "TRUE",annotation = "FALSE")
#'
PlotTree = function(treeinfo,data.extract=NULL,annotation=NULL,prefix=NULL){
  tree=treeinfo$tree
  celltab=treeinfo$info

  #1.trans data.tree to newick with internal node
  td = ToDataFrameNetwork(tree)
  from_node = unique(td$from)
  td[which(td$to %in% from_node),"to"] = paste("node.",td[which(td$to %in% from_node),"to"],sep = "")
  td = rbind(data.frame("from" = from_node, "to" = from_node),td)
  td$from = paste("node.",td$from,sep = "")
  td = td[which(td$to != "N0"),]
  td_node = FromDataFrameNetwork(td)
  tree_nwk = ToNewick(td_node)
  write(tree_nwk,"tree.nwk")


  #2.data build
  #(1)tree_nwk for plotting tree
  tree_nwk = read.tree("tree.nwk")
  tl = tree_nwk$tip.label

  #(2)tree_cell_plot for plotting cell composition
  tree_cell = celltab %>% group_by(tags, celltype) %>% summarise(count = sum(Freq))
  tree_cell = dcast(tree_cell,tags~celltype,value.var = "count")
  tree_cell[is.na(tree_cell)] = 0
  #normlize
  tree_cell[,-1] = log(tree_cell[,-1]+1)
  tree_cell[,-1] = t(apply(tree_cell[,-1], 1, function(x) {x/sum(x)}))
  tree_cell_l = melt(tree_cell)
  #tree_cell_plot = tree_cell_l

  #option for cell composition bar plot
  # tree_cell_l = melt(tree_cell)
  # tree_cell_plot = left_join(data.frame(id=tl),tree_cell_l,by=c("id" = "tags"))

  #option for
  # tree_cell_plot$variable = as.character(tree_cell_plot$variable)
  # tree_cell_plot[which(tree_cell_plot$value>0),3] = tree_cell_plot[which(tree_cell_plot$value>0),2]
  # tree_cell_plot[which(tree_cell_plot$value==0),3] = NA

  #(3)indel for plotting pattern
  #library(stringr)

  indel = data.frame(id=NA,start=NA,end=NA,width=NA,type=NA)
  indel = indel[-1,]
  #build indel data frame
  for(i in c(1:length(tl))){
    x = tl[i]
    x_str = unlist(strsplit(x,split = "_"))
    del = NULL
    ins = NULL
    if(!is.null(prefix)){
      for (k in 1:length(prefix)) {
        x_pre = x_str[[1]][grepl(prefix[k], x_str[[1]])]
        if(length(x_pre)>0){
          x_pre_del = x_pre[grep("D",x_pre)]
          if(length(x_pre_del) > 0){
            x_pre_del = str_extract_all(x_pre_del, "[0-9]+")
            for (j in 1:length(x_pre_del)) {
              line = as.numeric(x_pre_del[[j]])
              del = rbind(del, data.frame(id=x,start=line[length(line)] + (k-1)*300,
                                          end=line[length(line)]+line[length(line)-1] + (k-1)*300,width=line[length(line)-1],
                                          type="deletion"))
            }

          }
          x_pre_ins = x_pre[grep("I",x_pre)]
          if(length(x_pre_ins) > 0){
            x_pre_ins = str_extract_all(x_pre_ins, "[0-9]+")
            for (j in 1:length(x_pre_ins)) {
              line = as.numeric(x_pre_ins[[j]])
              ins = rbind(ins, data.frame(id=x,start=line[length(line)] + (k-1)*300,
                                          end=line[length(line)]+line[length(line)-1] + (k-1)*300,width=line[length(line)-1],
                                          type="insertion"))
            }
          }

        }
      }
    }else{
      x_pre_del = x_str[grep("D",x_str)]
      if(length(x_pre_del) > 0){
        x_pre_del = str_extract_all(x_pre_del, "[0-9]+")
        for (j in 1:length(x_pre_del)) {
          line = as.numeric(x_pre_del[[j]])
          #del = rbind(del, data.frame(id=x,start=line[length(line)] + (k-1)*300,
                                      #end=line[length(line)]+line[length(line)-1] + (k-1)*300,width=line[length(line)-1],
                                      #type="deletion"))
          del = rbind(del, data.frame(id=x,start=line[length(line)],
                                      end=line[length(line)]+line[length(line)-1],width=line[length(line)-1],
                                      type="deletion"))
        }

      }
      x_pre_ins = x_str[grep("I",x_str)]
      if(length(x_pre_ins) > 0){
        x_pre_ins = str_extract_all(x_pre_ins, "[0-9]+")
        for (j in 1:length(x_pre_ins)) {
          line = as.numeric(x_pre_ins[[j]])
          #ins = rbind(ins, data.frame(id=x,start=line[length(line)] + (k-1)*300,
                                      #end=line[length(line)]+line[length(line)-1] + (k-1)*300,width=line[length(line)-1],
                                      #type="insertion"))
          ins = rbind(ins, data.frame(id=x,start=line[length(line)],
                                      end=line[length(line)]+line[length(line)-1],width=line[length(line)-1],
                                      type="insertion"))
        }
      }
    }
    indel = rbind(indel,del,ins)
  }


  #(4)reorder tree
  SortTree = function(label){

    #construct data structure
    taglst = list()
    for (i in 1:length(label)) {
      tchr = label[i]
      tline = strsplit(tchr,"_")
      element = list("labels" = tchr,"tags" = tline[[1]], "tag_number" = length(tline[[1]]))
      taglst[[i]] = element
    }
    taglst.sort = taglst[list.order(taglst,-tag_number)]
    for (i in 2:(length(taglst.sort) - 1)) {
      point = taglst.sort[[i]]
      score = 0
      for (j in (i+1):length(taglst.sort)) {
        queue = taglst.sort[[j]]
        qscore = length(intersect(point$tags,queue$tags))

        if(qscore > score){
          score = qscore
          taglst.sort[[j]] = NULL
          taglst.sort = list.insert(taglst.sort,i+1,queue)
        }
      }
    }


    #result
    label.sort = map_chr(taglst.sort, 1)
  }
  label.sort =  SortTree(tree_nwk$tip.label)
  tree_data_sort = ape::rotateConstr(tree_nwk, label.sort)

  #3.plot integrated tree
  #set expand can change width of the tree
  p = ggtree(tree_data_sort,size = 0.1, ladderize=FALSE) + geom_tiplab(size = 0.3) +
    xlim_expand(c(0, 200),panel = "Tree")

  if(annotation=="FALSE"){
    p1=p + geom_facet(panel = "indel pattern", data = indel,
                      geom = geom_segment,
                      mapping = aes(x = start, xend = end, y =y, yend = y,color=type),
                      size = 0.3)
  }else{
    p1 = p + geom_facet(panel = "cells",data = tree_cell_l,
                        geom =  geom_tile,
                        mapping = aes(x = as.numeric(as.factor(variable)),fill = value,color = variable)) +
      #    scale_fill_viridis_d(option="D", name="discrete\nvalue") +
      scale_fill_gradient(low = "white",high = "#440130")  +
      new_scale_color() +
      geom_facet(panel = "indel pattern", data = indel,
                 geom = geom_segment,
                 mapping = aes(x = start, xend = end, y =y, yend = y,color=type),
                 size = 0.3)
  }
  if(data.extract=="TRUE" & annotation=="TRUE"){
    return(list(p=p1,tagsinfo=tree_cell,indelinfo=indel))
  }else if(data.extract=="TRUE" & is.null(annotation)){
    return(list(p=p1,tagsinfo=tree_cell,indelinfo=indel))
  }else if(data.extract=="TRUE" & annotation=="FALSE"){
    return(list(p=p1,indelinfo=indel))
  }else{
    return(p1)
  }

  # p1 = facet_plot(p,panel = "cell composition",data = tree_cell_l,
  #                 geom = geom_barh,
  #                 mapping = aes(x = value,fill = variable),
  #                 stat="identity") %>%
  #   facet_plot(panel = "indel pattern", data = indel,
  #              geom = geom_segment,
  #              mapping = aes(x = start, xend = end, y =y, yend = y,color=type),
  #              size = 1)


  #change facet grid
  #gt = ggplot_gtable(ggplot_build(p1))
  #gt$widths[7] = 0.5*gt$widths[7] # in this case it was colmun 7 - reduce the width by a half
  #gt$widths[9] = 0.5*gt$widths[9] # in this case it was colmun 7 - reduce the width by a half
  #grid.draw(gt) # plot with grid draw

  #ggsave(gt,filename = paste(outname,"pdf",sep = "."),height = 100,width = 15,units = "cm")

}


#' Title
#' @title IndelPlot
#' @description Return 2 line charts, show the probability of insertion and deletion at each site
#' @param cellsinfo List generate from IndelIdents, for more see \code{\link[LinTInd]{IndelIdents}}
#'
#' @return 2 line charts
#' @export
#'
#' @importFrom IRanges IRanges
#' @importFrom ggplot2 theme aes ggplot geom_ribbon geom_line scale_x_continuous xlab ylab theme_bw
#' @importFrom cowplot plot_grid
#' @importFrom BiocGenerics start end width type
#'
#' @examples
#' data("example_data",package = "LinTInd")
#' IndelPlot(cellsinfo = cellsinfo)
#'

IndelPlot<-function(cellsinfo){
  scar=cellsinfo$cutsite
  indel.coverage<-cellsinfo$indel.coverage
  INDEL_ranges<-cellsinfo$indel
  del_ranges<-unlist(lapply(INDEL_ranges,function(x){x[1]}))
  ins_ranges<-unlist(lapply(INDEL_ranges,function(x){x[2]}))
  del_r<-del_ranges[[1]]
  for(i in 2:length(del_ranges)){
    del_r<-c(del_r,del_ranges[[i]])
  }
  ins_r<-ins_ranges[[1]]
  for(i in 2:length(ins_ranges)){
    ins_r<-c(ins_r,ins_ranges[[i]])
  }
  if(any(is.null(indel.coverage),indel.coverage=="Accurate")){
    scarref<-data.frame(scar=NA,type=NA)
    scarref<-scarref[-1,]
    for(i in c(2:dim(scar)[1])){
      scarref_sub<-data.frame(scar=c(scar$start[i]:(scar$end[i])),type=scar$indx[i])
      scarref<-rbind(scarref,scarref_sub)
    }
  }else{
    scarref<-data.frame(scar=NA,type=NA)
    scarref<-scarref[-1,]
    startP<-scar$start[1]
    for(i in c(2:dim(scar)[1])){
      scarref_sub<-data.frame(scar=c(scar$start[i]:(scar$end[i]))+startP,type=scar$indx[i])
      scarref<-rbind(scarref,scarref_sub)
    }
  }
  scarref$type<-as.character(scarref$type)
  all_site_stat<-function(i,dat){
    return(c(start(dat[i]):end(dat[i])))
  }
  #all_site_per_stat<-function(i,dat){
  #	return(length(which(dat==i))/length(INDEL_ranges))
  #}

  del_allsite<-unlist(lapply(seq(del_r),all_site_stat,dat=del_r))
  del_allsite_per<-data.frame(table(del_allsite))
  del_allsite_per$Freq<-del_allsite_per$Freq/length(INDEL_ranges)
  names(del_allsite_per)[1]<-"Site"
  del_allsite_per<-del_allsite_per[del_allsite_per$Site %in% c(1:scar$end[1]),]
  del_allsite_per$Site<-as.numeric(as.character(del_allsite_per$Site))
  if(length(setdiff(c(1:scar$end[1]),del_allsite_per$Site))>0){
    del_allsite_per<-rbind(del_allsite_per,data.frame(Site=setdiff(c(1:scar$end[1]),del_allsite_per$Site),Freq=0))
  }else{
    del_allsite_per<-del_allsite_per
  }
  if(indel.coverage=="Accurate"){
    del_allsite_per<-del_allsite_per[del_allsite_per$Site<=max(scarref$scar+1),]
  }
  p_del<-ggplot()+
    geom_ribbon(data=scarref,aes(x=scar,ymin=0,ymax=max(del_allsite_per$Freq),fill=type),alpha=0.1)+
    geom_line(data=del_allsite_per,aes(x=Site,y=Freq),size=1,colour="lightskyblue")+theme(legend.position="none")+
    scale_x_continuous(breaks=c())+xlab("")+ylab("Deletion Frequency")+theme_bw()

  ins_allsite<-unlist(lapply(seq(ins_r),all_site_stat,dat=ins_r))
  ins_allsite_per<-data.frame(table(ins_allsite))
  ins_allsite_per$Freq<-ins_allsite_per$Freq/length(INDEL_ranges)
  names(ins_allsite_per)[1]<-"Site"
  ins_allsite_per<-ins_allsite_per[ins_allsite_per$Site %in% c(1:scar$end[1]),]
  ins_allsite_per$Site<-as.numeric(as.character(ins_allsite_per$Site))
  if(length(setdiff(c(1:scar$end[1]),ins_allsite_per$Site))>0){
    ins_allsite_per<-rbind(ins_allsite_per,data.frame(Site=setdiff(c(1:scar$end[1]),ins_allsite_per$Site),Freq=0))
  }else{
    ins_allsite_per<-ins_allsite_per
  }
  if(indel.coverage=="Accurate"){
    ins_allsite_per<-ins_allsite_per[ins_allsite_per$Site<=max(scarref$scar+1),]
  }
  p_ins<-ggplot()+
    geom_ribbon(data=scarref,aes(x=scar,ymin=0,ymax=max(ins_allsite_per$Freq),fill=type),alpha=0.1)+
    geom_line(data=ins_allsite_per,aes(x=Site,y=Freq),size=1,colour="indianred1")+theme(legend.position="none")+
    scale_x_continuous(breaks=c())+xlab("")+ylab("Insertion Frequency")+theme_bw()
  return(plot_grid(p_del,p_ins,nrow = 2))
}


#' Title
#' @title TagDist
#' @description If the cell barcode and the anntation of each cell are provided, this function can calculate the relationship between each cell type in three way
#' @param tag List generate from TagProcess, for more see \code{\link[LinTInd]{TagProcess}}
#' @param method Denote which method to use:
#'
#' - "Jaccard"(default): calculate the weighted jaccard similarity of indels between each pair of groups;
#' - "P": right-tailed test, compare the Indels intersection level with the hypothetical  result generated from random editing, and the former is expected to be significantly higher than the latter;
#' - "spearman": Spearman correlation of indels between each pair of groups
#'
#' @return 2 figures are saved to show the distribution of INDEL and the relationship between groups respectively, the matrix of the relationship between groups is returned
#' @export
#'
#' @importFrom stats pnorm sd cor
#' @importFrom pheatmap pheatmap
#' @importFrom reshape2 acast
#'
#' @examples
#' tag_dist=TagDist(tag,method = "spearman")
#' tag_dist=TagDist(tag)
#' tag_dist=TagDist(tag,method = "Jaccard")
#' tag_dist=TagDist(tag,method = "P")
#' tag_dist=TagDist(tag,method = "spearman")
#'
TagDist<-function(tag,method=NULL){

  one_jac_stat2<-function(l,VBC1,clone_stat_data){
    VBC2<-as.numeric(clone_stat_data[,l])
    num <- sum(sapply(1:length(VBC1), function(x)(min(VBC1[x],VBC2[x]))))
    den <- sum(sapply(1:length(VBC1), function(x)(max(VBC1[x],VBC2[x]))))
    return(num/den)
  }
  one_jac_stat1<-function(i,clone_stat_data){
    VBC1<-as.numeric(clone_stat_data[,i])
    return(unlist(lapply(clu,one_jac_stat2,VBC1=VBC1,clone_stat_data=clone_stat_data)))
  }
  one_op_stat2<-function(y,x,jac,jac_sample){
    jac_real<-jac[x,y]
    jac_pred<-as.numeric(unlist(lapply(jac_sample,function(z){z[x,y]})))
    ob<-jac_real/mean(jac_pred)
    zscore<-(jac_real-mean(jac_pred))/sd(jac_pred)
    p<-pnorm(zscore,lower.tail = FALSE)
    return(c(ob,p))
  }
  one_op_stat1<-function(x,jac,jac_sample,clu){
    return(lapply(clu,one_op_stat2,x=x,jac=jac,jac_sample=jac_sample))
  }
  tag<-tag$tag
  clone_tab<-acast(tag,Tag~Cell.type)
  clu<-colnames(clone_tab)
  clone_tab<-clone_tab[apply(clone_tab,1,sum)>=2,]
  annotation_col = data.frame(Group=factor(unlist(str_extract_all(row.names(clone_tab), "[D|I]+"))))
  row.names(annotation_col)<-rownames(clone_tab)
  pheatmap(t(clone_tab),cluster_cols = FALSE,cluster_rows = FALSE,show_colnames = FALSE,annotation_col = annotation_col,scale = "row",filename = "tag_heatmap_scale.pdf",width = 5,height = 3)

  if(any(is.null(method),method=="Jaccard")){
    all_jac<-data.frame(matrix(unlist(lapply(clu,one_jac_stat1,clone_stat_data=clone_tab)),ncol=length(clu)))
    names(all_jac)<-clu
    row.names(all_jac)<-clu
    pheatmap(all_jac,filename="group_dist.pdf",width = 4.9,height = 4.5)
    return(all_jac)
  }else if(method=="P"){
    all_jac<-data.frame(matrix(unlist(lapply(clu,one_jac_stat1,clone_stat_data=clone_tab)),ncol=length(clu)))
    names(all_jac)<-clu
    row.names(all_jac)<-clu
    jac_sample<-list()
    for(t in c(1:500)){
      tag$tags_sample<-sample(tag$Tag,length(tag$Tag))
      clone_tab_sample_sub<-acast(tag,tags_sample~Cell.type)
      jac_sample_sub<-data.frame(matrix(unlist(lapply(clu,one_jac_stat1,clone_stat_data=clone_tab_sample_sub)),ncol=length(clu)))
      names(jac_sample_sub)<-clu
      row.names(jac_sample_sub)<-clu
      jac_sample<-c(jac_sample,list(jac_sample_sub))
    }

    #plot pvalue
    all_ob_p<-lapply(clu,one_op_stat1,jac=all_jac,jac_sample=jac_sample,clu=clu)
    all_ob<-do.call("rbind",lapply(all_ob_p,function(x){unlist(x)[c(1:length(unlist(x)))[c(1:length(unlist(x)))%%2==1]]}))
    all_ob<-data.frame(all_ob)
    all_p<-do.call("rbind",lapply(all_ob_p,function(x){unlist(x)[c(1:length(unlist(x)))[c(1:length(unlist(x)))%%2==0]]}))
    all_p<-data.frame(all_p)
    names(all_ob)<-clu
    row.names(all_ob)<-clu
    names(all_p)<-clu
    row.names(all_p)<-clu

    all_p[is.na(all_p)] = 0
    pheatmap(-log2(all_p+min(all_p[all_p!=0])),main = "log2(P.val)",filename="group_dist.pdf",width = 4.9,height = 4.7)
    return(all_p)
  }else if(method=="spearman"){
    clone_tab_new<-t(apply(clone_tab,1,function(x){x/sum(x)}))
    clone_tab_new<-apply(clone_tab_new,2,function(x){x/sum(x)})
    cor_s<-cor(clone_tab_new,method = "spearman")
    pheatmap(cor_s,filename="group_dist.pdf",width = 4.9,height = 4.5)
    return(cor_s)
  }
}
mana-W/LinTInd documentation built on Feb. 14, 2022, 10:13 a.m.