tmp/getCDSPvalue_cluster_v1.R

#' Run CNCDriver coding region (CDS) p-value calculation with mutation pre-filtering
#'
#' @param inputFileDir The path for prepared R object file [for example,reducedFunseqOutputCDS_GBM.Rd]
#' @param outputFileDir The path for output files
#' @param codingRegionBedFile The defintion of coding regions in bed file format
#' @param elementKeyWord Default is "Promoter", the keyword of promoter annotation in FunSeq2
#' @param proteinDomainFile protein domain information from pfam
#' @param proteinLengthFile protein length information from ensembl
#' @param minPoints default is 2 [ unit: samples ]  
#' @param dRadius default is 50 [ unit: bp ]
#' @param triNucleotideDistributionFile Cancer type specific mutation counts in 96 trinucleotide category  
#' @param filterOutBlacklistMutations TRUE or FALSE
#' @param mutationBlacklistFile The file for mutation blacklist regions
#' @param replicationTimingGenomeBinnedFile Replication timing value at 1MB bin
#' @param replicationTimingElementBinnedFile Replication timing value of each element within 1MB bin
#' @param tumorType Study name
#' @param mutationType User provided mutated gene list
#' @param cellType Cell type of the study, [ BJ, GM12878, HeLaS3, HepG2, IMR90, K562, MCF7, SKNSH or average ]
#' @param replicationTimingCutOff Default is 0.2, a numeric value ranging from 0 to 1
#' @param seedNum User provided random number seed, default is 42
#' @param reSampleIterations User provided re-sapling iteration numbers
#' @param reRunPvalueCutOff Default is 0.1, a numeric value ranging from 0 to 1
#' @param useCores Default is 1, number of cpu to use
#' @param taskNum Default is 0, 0 means to run all elements
#' @param unitSize  Number of elements to run per task
#' @param debugMode TRUE or FALSE
#'
#' @return results data frame
#'
#' @examples
#' #date<-getRunDates(latest=TRUE)
#' cancerType<-"KIRC"
#' selectedSampleId<-NA
#' #worDir<-getwd()
#' mutSig2CVthreshold<-0.1
#' rareMutationUpperLimit<-0.3
#' rareMutationLowerLimit<-0.1
#' rareMutationFreq<-0.02
#'
#' #runNetBox2(dataDir,cancerType,
#' #           mutationList,ampGeneList,delGeneList,epiSilencedList,
#' #           mutationFreq,ampGeneFreq,delGeneFreq,epiSilencedFreq,
#' #           pathwayCommonsDb,directed,
#' #           linkerPValThreshold,communityDetectionMethod,
#' #           keepIsolatedNodes,verbose=TRUE)
#'
#' @concept CNCDriver
#' @export
#' @import doRNG
#' @import IRanges
#' @import GenomicRanges
#' @importFrom plyr rbind.fill
#' @importFrom stats p.adjust
#' @importFrom utils write.table
#' @importFrom data.table data.table
#' @importFrom data.table fread
#' 
getCDSPvalueWithPreFilter<-function(inputFileDir,outputFileDir,
                            codingRegionBedFile,elementKeyWord="CDS",
                            proteinDomainFile,proteinLengthFile,
                            minPoints=2,dRadius=50,
                            triNucleotideDistributionFile,
                            filterOutBlacklistMutations,mutationBlacklistFile,
                            replicationTimingGenomeBinnedFile,replicationTimingElementBinnedFile,
                            tumorType,mutationType,cellType,replicationTimingCutOff,
                            seedNum=42,reSampleIterations,reRunPvalueCutOff=0.1,
                            useCores=1,taskNum=0,unitSize,debugMode=FALSE){


    #library(data.table)
    #library(plyr)
    ######
    
    #inputDir<-"~/work/Ekta_lab/Priyanka_project/fsigsnv_results"
    #tumorType<-"Prostate"
    
    
    replicationTimingCutOff<-as.numeric(replicationTimingCutOff)
    replicationTimingParm<-paste("RT_",replicationTimingCutOff,sep="")
    
    reSampleIterations<-as.numeric(reSampleIterations)
    reRunPvalueCutOff<-as.numeric(reRunPvalueCutOff)
    
    useCores<-as.numeric(useCores)
    taskNum<-as.numeric(taskNum)
    unitSize<-as.numeric(unitSize)
    
    workDir<-file.path(outputFileDir,"output_result_triMut_match",mutationType,tumorType,replicationTimingParm)
    
    #mutationType<-"CDS"
    #seedNum<-42
    #reSampleNum<-1000
    reSampleNum<-reSampleIterations
    set.seed(seedNum)
    
    if( !file.exists(paste(workDir,sep="/")) ){
      dir.create(paste(workDir,sep=""),recursive=TRUE)
    }
    
    cat(sprintf("Start processing %s - %s\n",tumorType,mutationType))
    
    
    ########
    # Load pre-processed Rd file
    ########
    filePath<-file.path(inputFileDir)
    fileName<-paste("reducedFunseqOutputCDS_",tumorType,".Rd",sep="")
    fileName<-file.path(filePath,fileName)
    cat(sprintf("Load pre-processed Rd file\n"))
    load(fileName)
    
    #####
    ## load CDS bed file
    #####
    
    ###
    ## not load pre-defined bed file. We used cds definitions from VAT.
    ## 
    ###
    if(FALSE){
    
    cat(sprintf("Load elementBedfile\n"))
    #filePath<-"~/work/Ekta_lab/FunSeq2_compositeDriver_SEP_2017/data_context_SEP_2017/gencode"
    #fileName<-"gencode.v19.promoter.bed"
    #fileName<-codingRegionBedFile
    
    elementBedfileName<-file.path(codingRegionBedFile)
    #reducedFunseqOutput<-reducedFunseqOutputNCDS
    
    
    elementBedfile<-read.table(elementBedfileName,sep="\t",header=FALSE,stringsAsFactors = FALSE)
    colnames(elementBedfile)<-c("chr","posStart","posEnd","name")
    #colnames(dat)<-c("chr","posStart","posEnd","name","score","strand","thickStart","thickEnd","itemRgb","blockCount","blockSizes","blockStarts")
    
    
    
    ## extract mutations overlap with pre-defined element regions
    reducedFunseqOutputCDS<-extractElementMutations(elementBedfile,reducedFunseqOutputCDS)
    
    }
    
    ######
    # load triNucleotideDistribution file
    ######
    cat(sprintf("Load triNucleotideDistribution file\n"))
    #filePath<-"~/work/Ekta_lab/compositeDriver_data/triNucleotidedistribution"
    #fileName<-"compositeDriver_mutationDistributionMatrix_all_kmer_3_counts_Mar_2018_v0.txt"
    #fileName<-file.path(filePath,fileName)
    
    fileName<-triNucleotideDistributionFile
    
    mutationDistMatrix<-read.table(fileName,sep="\t",header=TRUE,check.names = FALSE,stringsAsFactors = FALSE)
    
    categoryName<-rownames(mutationDistMatrix)
    categoryName<-gsub(" ","@",categoryName)
    rownames(mutationDistMatrix)<-categoryName
    
    ######
    # This section is for pan-cancer calculation
    # This is not required for single cancer type calculatin
    ######
    if(FALSE){
        filePath<-file.path("~/work/Ekta_lab/cncdriver_analysis_Mar_2018/sample_summary")
        fileName<-"Eric_sampleDist_Mar_2018.txt"
        fileName<-file.path(filePath,fileName)
        tumorDistTable<-read.table(fileName,header=TRUE,sep="\t",stringsAsFactors=FALSE)
        #tumorDistTable<-tumorDistTable[order(-tumorDistTable$totalMutCounts),]
        
        mutationBurden<-sort(colSums(mutationDistMatrix),decreasing = TRUE)
        mutationBurdenDF<-data.frame(names(mutationBurden),mutationBurden,stringsAsFactors = FALSE)
        colnames(mutationBurdenDF)<-c("histologyName","totalMutationCounts")
        
        tumorDistTable<-merge(tumorDistTable,mutationBurdenDF,by="histologyName")
        
        tumorDistTable$mutationBurdenAverage<-tumorDistTable$totalMutationCounts/tumorDistTable$histologySize
        
        tumorDistTable<-tumorDistTable[order(-tumorDistTable$mutationBurdenAverage),]
        
        
        ######
        
        #tumorDistTable<-tumorDistTable[tumorDistTable$histologyName %in% tumorType,]
        
        #sampleSizeFactor<-tumorDistTable$histologySize
        #names(sampleSizeFactor)<-tumorDistTable$histologyName
        
        #sampleSizeFactor<-tumorDistTable$totalMutationCounts
        #names(sampleSizeFactor)<-tumorDistTable$histologyName
        
        #sampleSizeFactor<-tumorDistTable$mutationBurdenAverage
        #names(sampleSizeFactor)<-tumorDistTable$histologyName
        
        #sampleSizeFactor<-rep(1,nrow(tumorDistTable))
        #names(sampleSizeFactor)<-tumorDistTable$histologyName
        
        sampleSizeFactor<-(log10(tumorDistTable$totalMutationCounts/min(tumorDistTable$totalMutationCounts)))+1
        names(sampleSizeFactor)<-tumorDistTable$histologyName
        
        #countsSizeFactor<-tumorDistTable$histologySize
        #names(countsSizeFactor)<-tumorDistTable$histologyName
        
        countsSizeFactor<-tumorDistTable$totalMutationCounts
        names(countsSizeFactor)<-tumorDistTable$histologyName
        
        #countsSizeFactor<-tumorDistTable$mutationBurdenAverage
        #names(countsSizeFactor)<-tumorDistTable$histologyName
    }
    ######
    
    
    
    ######
    
    # normalize tri-nucleotide context per cancer type
    if(TRUE){
      
      tmp_mm<-mutationDistMatrix
      
      #tmp_mm<-tmp_mm[,c(1:2)]
      tumorTypeSum<-colSums(tmp_mm)
      #overallSum<-sum(tumorTypeSum)
      
      # normalization on columns for each tumorType
      tmp_mm_t<-lapply(1:ncol(tmp_mm), function(x){
        tmp_mm[,x]/tumorTypeSum[colnames(tmp_mm)[x]]
      })
      
      tmp_mm_t<-do.call(rbind,tmp_mm_t)
      
      tmp_mm2<-t(tmp_mm_t)
      rownames(tmp_mm2)<-rownames(tmp_mm)
      colnames(tmp_mm2)<-colnames(tmp_mm)
      
      tmp_mm<-tmp_mm2
      
      tmp_mm_normalized<-tmp_mm
      
      mutationDistMatrix<-tmp_mm_normalized
      
    }
    
    cat(sprintf("Finish processing triNucleotideDistribution matrix\n"))
    
    ######
    
    parseFunseqGeneField<-function(field,keyword,useCores=1){
      geneSymbol<-{}
      #a1<-strsplit(aa,",")
      a1<-strsplit(field,"\\)")
      #a1<-unlist(a1)
      
      tmpStr<-mclapply(1:length(a1), function(x){
        #cat(sprintf("gene:%s\t",i))  
        idx <- which(grepl(keyword, a1[[x]]))
        a1[[x]]<-gsub("\\,","",a1[[x]])
        t3 <- strsplit(a1[[x]][idx], "\\(")
        #cat(sprintf("%s\n",t3[1][1]))
        tmpStr<-t3[[1]][1]
        
      },mc.cores=useCores)
      
      geneSymbol<-unlist(tmpStr)
      
      return(geneSymbol)  
    }
    
    #cat(sprintf("Start parsing GENE field annotations\n"))
    #reducedFunseqOutputNCDS$GENEparsed<-parseFunseqGeneField(field=reducedFunseqOutputNCDS$GENE,keyword=elementKeyWord,useCores=1)
    
    tmpString<-strsplit(as.character(reducedFunseqOutputCDS$CDSS),":",fixed=TRUE)
    tmpStringFrame<-data.frame(do.call("rbind",tmpString),stringsAsFactors=FALSE)
    reducedFunseqOutputCDS$score<-as.numeric(tmpStringFrame[,1])
    
    
    #######
    
    #######
    # filter out mutations on blacklist
    ######
    
    #filePath<-"~/work/Ekta_lab/compositeDriver_data/genome_blacklist"
    #fileName<-"wgEncodeDacMapabilityConsensusExcludable.bed"
    #fileName<-file.path(filePath,fileName)
    
    fileName<-mutationBlacklistFile
    
    testDT<-data.table(reducedFunseqOutputCDS)
    setkeyv(testDT,c("chr","posStart","posEnd"))
    
    dacDT<-fread(fileName)
    colnames(dacDT)<-c("chr","posStart","posEnd","annotation","score","strand")
    setkeyv(dacDT,c("chr","posStart","posEnd"))
    
    mutExcludable<-foverlaps(testDT,dacDT,nomatch=0)
    numOfBlacklistMutations<-nrow(mutExcludable)
    
    cat(sprintf("there are %s mutations on blacklist\n",numOfBlacklistMutations))
    
    removeIdx<-mutExcludable[,c("chr","i.posStart","i.posEnd"),with=FALSE]
    
    
    if(filterOutBlacklistMutations){
      woBlacklistDT<-testDT[ !list(removeIdx),]
      reducedFunseqOutputCDS<-setDF(woBlacklistDT)
      remove(woBlacklistDT)
      
    }
    
    remove(dacDT)
    remove(mutExcludable)
    remove(testDT)
    remove(removeIdx)
    
    
    #####
    # sanity check
    if(FALSE){
      
      reducedFunseqOutput<-reducedFunseqOutputCDS  
      
      reducedFunseqOutput$index<-paste(reducedFunseqOutput$sampleID,reducedFunseqOutput$chr,paste(reducedFunseqOutput$posStart,reducedFunseqOutput$posEnd,sep="-"),sep=":")
      
      duplicatedLine<-duplicated(reducedFunseqOutput$index)
      
      educedFunseqOutput<-reducedFunseqOutput[!duplicatedLine,]
      
      reducedFunseqOutput<-reducedFunseqOutput[,c(1:26)]
      
      numOfWhitelistMutations<-nrow(reducedFunseqOutput)
      cat(sprintf("there are %s mutations on the regions of element bed file\n",numOfWhitelistMutations))
      
    }
    
    #####
    # Add triNucleotideDistribution
    #####
    
    cat(sprintf("Add triNucleotideDistribution information\n"))
    
    triNucleotideDat<-addTriNucleotideDistribution(reducedFunseqOutputCDS)
    
    reducedFunseqOutputCDS<-cbind(reducedFunseqOutputCDS,triNucleotideDat)
    
    
    ####
    ## associate replication timing data with each variant
    ## note: there is no chromosome Y or chromosome M data in the replication timing
    ####
    #cellType<-"BJ"
    
    cat(sprintf("Add replication timing signal for cellType %s\n",cellType))
    
    filePath<-"~/work/Ekta_lab/compositeDriver_data/replication_timing/UW/processed"
    fileName<-"UW_RepliSeq_wavelet_1Mb_windows.txt"
    fileName<-file.path(filePath,fileName)
    
    fileName<-replicationTimingGenomeBinnedFile
    
    replicationTimingDF<-read.table(fileName,header=TRUE,sep="\t",stringsAsFactors=FALSE)
    
    selectedColumn<-which(colnames(replicationTimingDF) %in% cellType)
    replicationTimingDF<-replicationTimingDF[,c(1:4,selectedColumn)]
    colnames(replicationTimingDF)<-c("chr","start","end","name","signalValue")
    replicationTimingDF$index<-paste(replicationTimingDF$chr,paste(replicationTimingDF$start,replicationTimingDF$end,sep="-"),sep=":")
    
    reducedFunseqOutputCDS<-addReplicationTimingSignal(reducedFunseqOutputCDS,replicationTimingDF)
    reducedFunseqOutputCDS<-reducedFunseqOutputCDS[!is.na(reducedFunseqOutputCDS$signalValue),]
    
    #####
    # Set element-wise replication timing signal boundary
    #####
    cat(sprintf("Set element-wise replication timing signal boundary at %s similarity [range: 0-1]\n",replicationTimingCutOff))
    
    #filePath<-"~/work/Ekta_lab/compositeDriver_data/replication_timing/UW/processed"
    #fileName<-"UW_RepliSeq_wavelet_1Mb_windows_promoter.txt"
    
    fileName<-replicationTimingElementBinnedFile
    
    elementReplicationTimingDF<-fread(fileName,header=TRUE,sep="\t",stringsAsFactors=FALSE,data.table=FALSE)
    
    selectedColumn<-which(colnames(elementReplicationTimingDF) %in% cellType)
    elementReplicationTimingDF<-elementReplicationTimingDF[,c(1,selectedColumn)]
    colnames(elementReplicationTimingDF)<-c("elementName","signalValue")
    
    #replicationTimingCutOff<-0.3
    
    
    if(FALSE){
      elementMat<-elementReplicationTimingDF
      featureMat<-data.frame(elementMat[,2:ncol(elementMat)])
      colnames(featureMat)<-c("replicationTiming")
      rownames(featureMat)<-elementMat[,1]
      
      rangeMat<-getNearestElement(featureMat,replicationTimingCutOff,useCores=useCores)
      rangeMat$elementName<-rownames(rangeMat)
      
      elementReplicationTimingDF<-merge(elementReplicationTimingDF,rangeMat,by="elementName")
    }
    
    if(TRUE){
      #signalConst<-elementReplicationTimingDF$signalValue
      #names(signalConst)<-elementReplicationTimingDF$elementName
      
      # the original formula used in the cds calculation
      #maxValue<-max(replicationTimingDF$signalValue)
      #minValue<-min(replicationTimingDF$signalValue)
      
      # the formula used in promoter and other elements
      maxValue<-max(elementReplicationTimingDF$signalValue)
      minValue<-min(elementReplicationTimingDF$signalValue)
      
      
      interval<-0.5*(maxValue-minValue)*replicationTimingCutOff
      
      #signalUpper<-rep(maxValue,nrow(elementReplicationTimingDF))
      #signalLower<-rep(minValue,nrow(elementReplicationTimingDF))
      
      signalUpper<-elementReplicationTimingDF$signalValue+interval
      signalLower<-elementReplicationTimingDF$signalValue-interval
      
      rangeValue<-data.frame(elementReplicationTimingDF,signalLower,signalUpper,stringsAsFactors = FALSE)
      colnames(rangeValue)<-c("elementName","signalValue","signalLower","signalUpper")
      
      idx<-which(rangeValue$signalUpper>=maxValue)
      deltaUpper<-rangeValue[idx,]$signalUpper-maxValue
      rangeValue[idx,]$signalLower<-rangeValue[idx,]$signalLower-deltaUpper
      rangeValue[idx,]$signalUpper<-rep(maxValue,length(idx))
      
      idx<-which(rangeValue$signalLower<=minValue)
      deltaLower<-minValue-rangeValue[idx,]$signalLower
      rangeValue[idx,]$signalUpper<-rangeValue[idx,]$signalUpper+deltaLower
      rangeValue[idx,]$signalLower<-rep(minValue,length(idx))
      
      rangeValue<-rangeValue[,-which(colnames(rangeValue) %in% "signalValue")]
    }
    
    elementReplicationTimingDF<-merge(elementReplicationTimingDF,rangeValue,by="elementName")
    
    # debug for plotting
    if(FALSE){
      
      cc<-elementReplicationTimingDF
      
      dd<-cc[order(cc$signalValue),]
      dd$index<-seq(1,nrow(cc),by=1)
      
      #name<-"TERT"
      #name<-"HOXC5"
      name<-"POTEB2"  
      #name<-"PCDH15"
      
      ss<-dd[dd$elementName %in% name,]
      #nameList<-dd[dd$signalValue<=ss$dmax & dd$signalValue>=ss$dmin,]$elementName
      nameList<-dd[dd$signalValue<=ss$signalUpper & dd$signalValue>=ss$signalLower,]$elementName
      
      kk<-dd[dd$elementName %in% nameList,]
      
      plot(dd$index,dd$signalValue,col="black")
      points(kk$index,kk$signalValue,col="red")
      points(ss$index,ss$signalValue,col="green",pch=19)
      
      plot(geneDFunique$signalValue,geneDFunique$occurence)
      
      kk<-geneDFunique[geneDFunique$signalValue<=ss$signalUpper & geneDFunique$signalValue>=ss$signalLower,]
      points(kk$signalValue,kk$occurence,col="red")
      #points(ss$signalValue,ss$signalValue,col="green",pch=19)
      
      plot(geneDFunique$signalValue,geneDFunique$compositeScore)
      points(kk$signalValue,kk$compositeScore,col="red")
      
    }
    
    cat(sprintf("Finish processing replication Timing file \n"))
    
    
    ####
    #  CDS calculation specific section
    #  Add pfam protein domain information
    #  Add protein length information
    ####
    
    #filePath<-"~/work/Ekta_lab/compositeDriver_data/ensembl_hg37"
    #fileName<-"hg37_ensembl_txCDSwithProteinLength.txt"
    #fileName<-file.path(filePath,fileName)  
    fileName<-proteinLengthFile
    cdsExonLength<-read.table(fileName,sep="\t",header=TRUE,stringsAsFactors = FALSE)
    
    
    parseVATannotation<-function(vector,index){
      a2<-strsplit(vector,":")
      
      alteredType<-sapply(a2,"[",5)
      
      tmp<-sapply(a2,"[",6)
      tmp<-strsplit(tmp,"\\/")
      alteredRatio<-(as.numeric(sapply(tmp,"[",1))/as.numeric(sapply(tmp,"[",2)))
      
      tmp<-sapply(a2,"[",8)
      tmp<-strsplit(tmp,"\\.")
      alteredTranscriptId<-sapply(tmp,"[",1)
      
      tmp<-sapply(a2,"[",9)
      tmp<-strsplit(tmp,"\\_")
      alteredPos<-sapply(tmp,"[",3)
      
      tmp<-sapply(tmp,"[",4)
      tmp<-strsplit(tmp,"->")
      refNu<-sapply(tmp,"[",1)
      altNu<-sapply(tmp,"[",2)
      
      tmp<-data.frame(alteredType,alteredRatio,alteredTranscriptId,alteredPos,refNu,altNu,stringsAsFactors = FALSE)
      maxRatio<-max(tmp$alteredRatio)
      tmpVector<-tmp[tmp$alteredRatio==maxRatio,]
      
      if(nrow(tmpVector)>1){
        tmpVector<-tmpVector[1,]
      }
      
      tmpVector$index<-index
      return(tmpVector)
    }
      
      
    arrangeVATannotation<-function(field,useCores){
      tmpVector<-{}
      
      a1<-strsplit(field,",")
      
      tmpVector<-mclapply(1:length(a1), function(x){
        cat(sprintf("variant:%s /%s\n",x,length(a1)))
        
        if(length(a1[[x]])>1){
          tmpVector[[x]]<-parseVATannotation(a1[[x]],x)
        }else{
          
          if(!is.na(a1[[x]])){
            tmpVector[[x]]<-parseVATannotation(a1[[x]],x)
          }else{
            tmpStr<-rep(NA,6)
            tmpFrame<-data.frame(t(tmpStr),x,stringsAsFactors = FALSE)
            colnames(tmpFrame)<-c("alteredType","alteredRatio","alteredTranscriptId","alteredPos","refNu","altNu","index")
            tmpVector[[x]]<-tmpFrame
            
          }
        }
      },mc.cores=useCores)
      
      #alteredDf<-rbindlist(tmpVector)
      alteredDf<-rbind.fill(tmpVector)
      
      return(alteredDf)
        
    }
      
    #filePath<-file.path("~/work/Ekta_lab/cncdriver_analysis_Mar_2018","compositeDriver_input",mutationType)
    filePath<-file.path(inputFileDir,mutationType)
    fileName<-paste("vatDF_",tumorType,"_",mutationType,".Rd",sep="")
    fileName<-file.path(filePath,fileName)  
    
    if(file.exists(fileName)){
      load(fileName)
    }else{
      
      vatDF<-arrangeVATannotation(reducedFunseqOutputCDS$VA,useCores)
      
      if( !file.exists(paste(filePath,sep="/")) ){
        dir.create(paste(filePath,sep=""),recursive=TRUE)
      }
      
      save(vatDF,file=fileName)
      
    }
    ## debug check
    #tt<-cbind(vatDF,reducedFunseqOutputCDS)
    
    ####
    
    #filePath<-"~/work/Ekta_lab/compositeDriver_data/cds_data/domainData"
    #fileName<-"pfam28.9606.processed_w_eValue_cutOff.txt"
    #fileName<-file.path(filePath,fileName)
    fileName<-proteinDomainFile
    domainDF<-read.table(fileName,header=TRUE,sep="\t",stringsAsFactors = FALSE)
    
    domainDF<-split(domainDF,domainDF$geneSymbol)
    
    #####
    
    #geneName<-"CDKN2A"
    
    addDomainScore<-function(domainDF,geneDF,useCores=1){
      
      variantAnno<-{}
      
      nameVector<-unique(names(geneDF))
      
      #for(geneName in nameVector){
      variantAnno<-mclapply(1:length(nameVector), function(x){
        #variantAnno<-lapply(1:length(nameVector), function(x){
        
        geneName<-nameVector[x]
        #cat(sprintf("name: %s\n",geneName))
        domainDat<-domainDF[[geneName]]
        
        if(!is.null(domainDat)){
          domainIRanges<-IRanges(start=(domainDat$envelopeStart),
                                 end=(domainDat$envelopeEnd))
          mcols(domainIRanges)$names<-domainDat$hmmName
          
          variantDat<-geneDF[[geneName]]
          
          #splice site debug
          if(nrow(variantDat[is.na(variantDat$alteredPos),])>0){
            variantDat[is.na(variantDat$alteredPos),]$alteredPos<-0
          }
          
          variantIRanges<-IRanges(start=(as.numeric(variantDat$alteredPos)),
                                  end=(as.numeric(variantDat$alteredPos)))
          
          hit<-findOverlaps(variantIRanges,domainIRanges)
          
          variantDat$domain<-rep(NA,nrow(variantDat))
          variantDat$domain[queryHits(hit)]<-mcols(domainIRanges)$names[subjectHits(hit)]
          
          variantDat$newScore<-variantDat$score
          #withinDomainIndex<-which(!variantDat$domain %in% "NA" & !variantDat$alteredType %in% "synonymous_variant")
          withinDomainIndex<-which(!is.na(variantDat$domain) & !variantDat$alteredType %in% "synonymous_variant")
          if(length(withinDomainIndex)>0){
            # cds score add one when the variant is located within pfam protein domain 
            variantDat[withinDomainIndex,]$newScore<-variantDat[withinDomainIndex,]$newScore+1
          }
          
          variantAnno[[geneName]]<-variantDat
          
        }else{
          
          variantDat<-geneDF[[geneName]]
          variantDat$domain<-rep(NA,nrow(variantDat))
          variantDat$newScore<-variantDat$score
          variantAnno[[geneName]]<-variantDat
        }
        
        #}
        #})
      },mc.cores=useCores)
      
      names(variantAnno)<-nameVector  
      
      #variantAnno<-rbind.fill(variantAnno)
      
      return(variantAnno)
    }
    

    ####
    
    mergeDF<-reducedFunseqOutputCDS
    
    #fileName<-paste(tumorType,"_",mutationType,"_merge_variant_details.txt",sep="")
    #fileName<-file.path(workDir,fileName)
    #write.table(mergeDF,fileName,sep="\t",quote=FALSE,row.names = FALSE,col.names = TRUE)
    
    groupType<-c("allSamples")
    
    groupDF<-{}
    groupDF[[groupType[1]]]<-mergeDF
    
    #tmpDF<-reducedFunseqOutputNCDS
    
    for(i in 1:length(groupType)){
    
      tmpDF<-groupDF[[groupType[i]]]
      
      groupName<-groupType[i]
      cat(sprintf("Processing %s\n",groupName))
      
      posIndex<-paste(tmpDF$chr,paste(tmpDF$posStart,tmpDF$posEnd,sep="-"),sep=":")
      geneDF<-data.frame(tmpDF$sampleID,posIndex,tmpDF$chr,tmpDF$posStart,
                         tmpDF$posEnd,tmpDF$GENE,tmpDF$ref,tmpDF$alt,
                         tmpDF$score,tmpDF$RECUR,tmpDF$index,tmpDF$signalValue,
                         tmpDF$tumorType,stringsAsFactors = FALSE)
                         #tmpDF$tumorType,stringsAsFactors = FALSE)
      colnames(geneDF)<-c("sampleID","posIndex","chr","posStart",
                      "posEnd","geneSymbol","ref","alt",
                      "score","recur","triMutIndex","signalValue","tumorType")

      geneDF<-cbind(geneDF,vatDF)
      #kk<-geneDF[geneDF$geneSymbol=="EGFR",]
    
      #cdsExonLength$geneWithTranscript<-paste(cdsExonLength$external_gene_name,cdsExonLength$ensembl_transcript_id,sep=":")
      cdsExonLengthSimple<-cdsExonLength[,c(4,5,7)]
      colnames(cdsExonLengthSimple)<-c("alteredTranscriptId","cds_length","protein_length")
      #geneDF0<-merge(geneDF,cdsExonLengthSimple,by="alteredTranscriptId",all.x=TRUE)
      
      ## some SNV without VA annotation, we will remove them. It may be the problem of VA software
      geneDF<-merge(geneDF,cdsExonLengthSimple,by="alteredTranscriptId")
      
      ## remove splice_variant mutations
      geneDF<-geneDF[!(geneDF$alteredType %in% "splice_variant"),]
  
      # debug
      # checkScoreDF<-geneDF[is.na(geneDF$score),]
      # dim(checkScoreDF)
      
      if( nrow(geneDF[is.na(geneDF$score),])>0 ){
        geneDF[is.na(geneDF$score),]$score<-0
      }
      
      geneNameVector<-unique(geneDF$geneSymbol)
      geneDF<-split(geneDF,geneDF$geneSymbol)
      
      ## Add new score it variant located within pfam domain 
      geneDF<-addDomainScore(domainDF,geneDF)
  
      ########
      # additional filter
      ########
      
      #filePath<-file.path("~/work/Ekta_lab/cncdriver_analysis_Mar_2018/compositeDriver_input",mutationType)
      filePath<-file.path(inputFileDir,mutationType)
      fileName<-paste("geneDFcluster_",tumorType,"_",mutationType,".Rd",sep="")
      fileName<-file.path(filePath,fileName)  
      
      if(file.exists(fileName)){
        
        load(fileName)
        
      }else{
        
        #minPoints<-2
        #dRadius<-50
        
        cat(sprintf("dbscan cluster parameter: minPoints:%s, dRadius:%s\n",minPoints,dRadius))
        
        geneDFcluster<-{}
        
        cat(sprintf("Processing geneDFcluster object for calculation\n"))
        
        ## this number is 1 for single cancer type calculation
        ## A correction factor only used in pancancer calculation
        if(TRUE){
          sampleSizeFactor<-1
          names(sampleSizeFactor)<-tumorType
        }
        
        geneDFcluster<-mclapply(1:length(geneNameVector), function(x){
          #geneDFcluster<-lapply(1:length(geneNameVector), function(x){
          #geneDFunique<-mclapply(1:6, function(x){
          #for(x in 1:length(geneNameVector)){
          #cat(sprintf("%s\n",x))  
          geneName<-geneNameVector[x]
          
          tmp0DF<-geneDF[[geneName]]
          tmp0DF$oldScore<-tmp0DF$score
          tmp0DF$score<-tmp0DF$newScore
          
          ## when variant cross multiple seqments of replication timing bins, take mean value.
          tmp0DF$signalValue<-mean(tmp0DF$signalValue)
          
          tmpDFdbscan<-data.frame(tmp0DF$posStart,rep(1,nrow(tmp0DF)),stringsAsFactors = FALSE)
          colnames(tmpDFdbscan)<-c("posStart","dummy")
          
          #dRadius<-(max(tmpDFdbscan$posStart)-min(tmpDFdbscan$posStart))/nrow(tmpDFdbscan)
          dbscanOut<-dbscan(tmpDFdbscan,eps=dRadius,minPts = minPoints)
          
          tmpDFdbscan$cluster<-dbscanOut$cluster
          table(tmpDFdbscan$cluster)
          
          if(FALSE){
            plot(tmpDFdbscan$pos,tmpDFdbscan$cluster,col=tmpDFdbscan$cluster+1,pch=20)
            points(tmpDFdbscan[tmpDFdbscan$cluster==0,]$pos,tmpDFdbscan[tmpDFdbscan$cluster==0,]$cluster,col="grey",pch=20)
          }
          
          tmpDFdbscan<-tmpDFdbscan[order(tmpDFdbscan$posStart),]
          tmpDFdbscan<-tmpDFdbscan[!duplicated(tmpDFdbscan$posStart),]
          
          tmp0DF<-merge(tmp0DF,tmpDFdbscan,all.x=TRUE)
          
          tmp0DF<-tmp0DF[,c(2:5,1,6:25,27)]
          #tmp0DFoutlier<-tmp0DF[tmp0DF$cluster==0,]
          
          # cluster 0 as outlier
          #tmp0DF<-tmp0DF[tmp0DF$cluster!=0,]
          
          geneDFcluster[[geneName]]<-mergeVariantPositionClusterV2(tmp0DF,sampleSizeFactor,countsCutOff = 2)
          
          #})    
        },mc.cores=useCores)
        names(geneDFcluster)<-geneNameVector
        
        if( !file.exists(paste(filePath,sep="/")) ){
          dir.create(paste(filePath,sep=""),recursive=TRUE)
        }
        
        save(geneDFcluster,file=fileName)
      }
      
      
      
      geneDFcluster<-rbind.fill(geneDFcluster)
      geneNameVector<-unique(geneDFcluster$geneSymbol)
      
      geneDFcluster<-split(geneDFcluster,geneDFcluster$geneSymbol)
      
      #geneDFbackup<-geneDF
      #geneDF<-geneDFcluster
      #geneNameVector_backup<-geneNameVector
      #geneNameVector<-names(geneDF)
      
      
      ########
      
      geneDFpatient<-{}
      
      geneDFpatient<-mclapply(1:length(geneNameVector), function(x){
        #tmpDat<-geneDF[[geneName]][!(duplicated(geneDF[[geneName]]$posIndex)),]
        geneName<-geneNameVector[x]
        tmpDF<-geneDF[[geneName]]
        npat<-length(unique(tmpDF$sampleID))
        geneDFpatient[[geneName]]<-npat
      },mc.cores=useCores)
      
      geneDFpatient<-unlist(geneDFpatient)
      names(geneDFpatient)<-geneNameVector
      
      compositeScoreVector<-{}
      compositeScoreScaledVector<-{}
      uniqueVariantPos<-{}
      
      ####
      if(FALSE){
      
          #filePath<-file.path("~/work/Ekta_lab/cncdriver_analysis_Mar_2018/compositeDriver_input",mutationType)
          filePath<-file.path(inputFileDir,mutationType)
          fileName<-paste("geneDFunique_",tumorType,"_",mutationType,".Rd",sep="")
          fileName<-file.path(filePath,fileName)  
          
          if(file.exists(fileName)){
            
            cat(sprintf("Load %s\n",fileName))
            load(fileName)
            
          }else{  
            
            cat(sprintf("Processing geneDFunique object for calculation\n"))
            
            geneDFunique<-{}
            
            ## this number is 1 for single cancer type calculation
            ## A correction factor only used in pancancer calculation
            if(TRUE){
             sampleSizeFactor<-1
             names(sampleSizeFactor)<-tumorType
            }
            
            geneDFunique<-mclapply(1:length(geneNameVector), function(x){
              #geneDFunique<-mclapply(1:6, function(x){
              #for(x in 1:length(geneNameVector)){
              #cat(sprintf("%s\n",x))  
              geneName<-geneNameVector[x]
              
              tmp0DF<-geneDF[[geneName]]
              tmp0DF$oldScore<-tmp0DF$score
              tmp0DF$score<-tmp0DF$newScore
              
              ## when variant cross multiple seqments of replication timing bins, take mean value.
              tmp0DF$signalValue<-mean(tmp0DF$signalValue)
              
              
              geneDF[[geneName]]<-tmp0DF
              
              geneDFunique[[geneName]]<-geneDF[[geneName]][!(duplicated(geneDF[[geneName]]$posIndex)),]
              geneDFunique[[geneName]]<-geneDFunique[[geneName]][order(geneDFunique[[geneName]]$posIndex),]
              
              #recurrenceVector<-table(geneDF[[geneName]]$posIndex)
              collapsedDF<-mergeVariantPosition(geneDF[[geneName]],sampleSizeFactor,countsCutOff = 2)
              #geneDFunique[[geneName]]$occurence<-mergedDF[geneDFunique[[geneName]]$posIndex,]$occurence
              #geneDFunique[[geneName]]$compositeScore<-mergedDF[geneDFunique[[geneName]]$posIndex,]$compositeScore
              #geneDFunique[[geneName]]$tumorCounts<-mergedDF[geneDFunique[[geneName]]$posIndex,]$tumorCounts
              tmp2DF<-geneDFunique[[geneName]]
              #tmp3DF<-cbind(tmp2DF[,2:12],tmp2DF[,14:22])
              
              #tmp3DF<-cbind(tmp2DF[,c(1,3:13)],tmp2DF[,15:18])
              #tmp3DF$oldScore<-tmp2DF$oldScore
              #geneDFunique[[geneName]]<-cbind(tmp3DF,collapsedDF[,2:7])
              
              tmp3DF<-cbind(tmp2DF[,c(1,3:12)],tmp2DF[,15:24])
              tmp3DF$oldScore<-tmp2DF$oldScore
              geneDFunique[[geneName]]<-cbind(tmp3DF,collapsedDF[,2:6])
              
              
              #}
              
            },mc.cores=useCores)
            
            names(geneDFunique)<-geneNameVector
            
            if( !file.exists(paste(filePath,sep="/")) ){
              dir.create(paste(filePath,sep=""),recursive=TRUE)
            }
            
            save(geneDFunique,file=fileName)
          }
      
      }
      
      geneDFunique<-geneDFcluster
      
      #####
      
      compositeScoreVector<-mclapply(1:length(geneNameVector), function(x){ 
        geneName<-geneNameVector[x]
        compositeScoreVector[[geneName]]<-sum(geneDFunique[[geneName]]$compositeScore)
      },mc.cores=useCores)
      
      compositeScoreVector<-unlist(compositeScoreVector)
      names(compositeScoreVector)<-geneNameVector
      
      #######
      
      compositeScoreScaledVector<-mclapply(1:length(geneNameVector), function(x){ 
        geneName<-geneNameVector[x]
        compositeScoreScaledVector[[geneName]]<-sum(geneDFunique[[geneName]]$compositeScoreScaled)
      },mc.cores=useCores)
      
      compositeScoreScaledVector<-unlist(compositeScoreScaledVector)
      names(compositeScoreScaledVector)<-geneNameVector
      
      #######
      
      uniqueVariantPos<-mclapply(1:length(geneNameVector), function(x){
        geneName<-geneNameVector[x]
        uniqueVariantPos[[geneName]]<-nrow(geneDFunique[[geneName]])
      },mc.cores=useCores)
      
      uniqueVariantPos<-unlist(uniqueVariantPos)
      names(uniqueVariantPos)<-geneNameVector
      
      #######
      
      numOfAlteration<-mclapply(1:length(geneNameVector), function(x){
        geneName<-geneNameVector[x]
        numOfAlteration<-sum(geneDFunique[[geneName]]$occurence)
      },mc.cores=useCores)
      
      numOfAlteration<-unlist(numOfAlteration)
      names(numOfAlteration)<-geneNameVector
      
      #######
      #geneDF<-rbind.fill(geneDF)
      geneDFunique<-rbind.fill(geneDFunique)
      
      #####
      
      #filePath<-file.path("~/work/Ekta_lab/cncdriver_analysis_Mar_2018/compositeDriver_input",mutationType)
      filePath<-file.path(inputFileDir,mutationType)
      fileName<-paste("variantTriMutCategoryParsed_",tumorType,"_",mutationType,".Rd",sep="")
      fileName<-file.path(filePath,fileName)  
      
      if(!file.exists(fileName)){
        
        cat(sprintf("Processing variantTriMutCategoryParsed object for calculation\n"))
        
        variantTriMutCategoryParsed<-parsePosVariantByTriMutContextWithAnnotationCDS(geneDFunique,mutationDistMatrix,useCores=useCores)
        
        if( !file.exists(paste(filePath,sep="/")) ){
          dir.create(paste(filePath,sep=""),recursive=TRUE)
        }
        
        save(variantTriMutCategoryParsed,file=fileName)
        
      }else{
        
        cat(sprintf("Load %s\n",fileName))
        load(fileName)
      }
      
      ####
      
      compositeScoreDF<-data.frame(uniqueVariantPos,numOfAlteration,geneDFpatient,compositeScoreVector,compositeScoreScaledVector,stringsAsFactors = FALSE)
      rownames(compositeScoreDF)<-names(compositeScoreVector)
      colnames(compositeScoreDF)<-c("uniqueVariantPos","numOfAlteration","numOfPatient","compositeScore","compositeScoreScaled")
      
      cat(sprintf("Finish data preparation\n"))
      
    
    ######
    #set.seed(42)
    #mutationType<-"CDS"
    #reSampleNum<-1000000
    
      compositeScore<-{}
      compositeScoreResample<-{}
      numOfAlterationPos<-{}
      numOfAlteration<-{}
      numOfPatient<-{}
      numOfAboveCScore<-{}
      reSampleSize<-{}
      pValue<-{}
      
      outputDf<-{}
    
    #######
    
      cdsMutationCheckList<-unique(geneDFunique$geneSymbol)
      
      # remove some genes not have replication timing data
      cdsMutationCheckList<-intersect(cdsMutationCheckList,elementReplicationTimingDF$elementName)
      
      #ncdsMutationCheckList<-c("TBX5","TP53","NRXN1","ITGB6","GRIA2","LATS1","YES1","DPP4","PAX7","EXOC1","NOS3","DST","BRCA2","BRCA1","CDK12","RB1")
      #ncdsMutationCheckList<-c("ABCB10","AKT1","LAMA1","LRP2","PIK3CA","SF3B1","TP53","ZNF263","IARS","GATA3","COPB2","CFTR","ZAP70","ALK","NCOR1","PTEN","ARID1A","CBFB")
      #ncdsMutationCheckList<-c("TP53","SPOP","FOXA1","PTEN","RB1","ABCC4","CSMD3","BRINP2","UMODL1")
  
      
      numOfgeneCheck<-length(cdsMutationCheckList)
      
      cat(sprintf("%s elements have alterations\n",numOfgeneCheck))
      
      #numOfgeneCheck<-12
      
      #####
      # further split tasks into small units if large-scale parallelization calculations
      ####
      
      #unitSize<-12
      totalTaskNum<-ceiling(numOfgeneCheck/unitSize)
      #totalTaskNum
      
      loopVector<-{}
      
      for(i in 1:totalTaskNum){
        startNum<-1 + unitSize*(i-1)
        endNum<-startNum + unitSize -1
        if(endNum>numOfgeneCheck){
          endNum<-numOfgeneCheck
        }
        loopVector[[i]]<-c(startNum:endNum)
      }
      
      if(taskNum!=0){
        loopSequence<-loopVector[[taskNum]]
      }else{
        loopSequence<-c(1:numOfgeneCheck)
      }  
      
      
    #######  
    # main p-value calculation section
    #######
      
      detectCores()
      cl<-makeCluster(useCores)
      registerDoParallel(cl)
      
      
      time_start<-proc.time()
      cat(sprintf("Use %s cores\n",useCores))
      cat (sprintf ("Start calculating p-value for %s candidates\n", numOfgeneCheck) )
      
      filePath<-workDir
      fileName<-paste("log_",mutationType,"_triMut_distribution_RT_similarity_",replicationTimingCutOff,"_task_",taskNum,"_.txt",sep="")
      fileName<-file.path(filePath,fileName)
      
      cat(sprintf ("progress can be look up at \n"))
      cat(sprintf("%s\n",fileName))
      
      
      outputDf<-foreach(k=loopSequence, .options.RNG=seedNum, .packages=c("stringr","parallel","plyr","CNCDriver"))  %dorng% {
        #outputDf<-foreach(k=1:numOfgeneCheck, .options.RNG=seedNum, .packages=c("FNN","stringr","parallel","plyr"))  %dorng% {
        #for(k in 1:numOfgeneCheck){ 
        #filePath<-file.path("~/work/Ekta_lab/JasonWong_dataset/compositeFunSeq_result_triMut_match",tumorType)
        filePath<-workDir
        fileName<-paste("log_",mutationType,"_triMut_distribution_RT_similarity_",replicationTimingCutOff,"_task_",taskNum,"_.txt",sep="")
        fileName<-file.path(filePath,fileName)
        
        #cat(sprintf("%s/%s\t",k,numOfgeneCheck),file=fileName,append=TRUE)        
        cat(sprintf("%s/%s\ttype:%s\tgene:%s\n",k,numOfgeneCheck,mutationType,cdsMutationCheckList[k]),file=fileName,append=TRUE)
        
        #cat(sprintf("%s/%s\t",k,numOfgeneCheck))        
        #cat(sprintf("type:%s\tgene:%s\n",mutationType,ncdsMutationCheckList[k]))
        
        numOfAlterationPos<-compositeScoreDF[rownames(compositeScoreDF) %in% cdsMutationCheckList[k],]$uniqueVariantPos
        numOfAlteration<-sum(geneDFunique[geneDFunique$geneSymbol %in% cdsMutationCheckList[k],]$occurence)
        numOfPatient<-geneDFpatient[[cdsMutationCheckList[k]]]
        compositeScore<-compositeScoreDF[rownames(compositeScoreDF) %in% cdsMutationCheckList[k],]$compositeScoreScaled
        
        gname<-cdsMutationCheckList[k]
        #gname<-"TERT"
        #a1<-geneDFunique[geneDFunique$geneSymbol==gname,]
        a1<-variantTriMutCategoryParsed[variantTriMutCategoryParsed$geneSymbol==gname,]
        #reSamplePosSize<-nrow(a1)
        #compositeScore[k]<-sum(a1$compositeScore)
        
        
        #####
        #  still need to modifiy for panCancer triMutIndex complexity
        #####
        #variantTable<-{}
        #variantTable<-table(a1$triMutIndex)
        #variantTriMutCategory<-data.frame(names(variantTable),as.numeric(variantTable),stringsAsFactors = FALSE)
        
        #variantTriMutCategory<-makeTriMutCountsByPosition(a1$categoryCounts)
        #colnames(variantTriMutCategory)<-c("categoryName","variantCounts")
        
        #variantTriMutCategory<-parsePosVariantByTriMutContext(a1$categoryCounts)
        #variantTriMutCategory$compositeScore<-a1$compositeScore
        #variantTriMutCategory$posIndex<-a1$posIndex
        #variantTriMutCategory$geneSymbol<-a1$geneSymbol
        variantTriMutCategory<-a1
        
        #####
        
        #a2<-geneDFunique[geneDFunique$geneSymbol!=gname,]
        #a2<-a2[!is.na(a2$signalValue),]
        
        a2<-variantTriMutCategoryParsed[variantTriMutCategoryParsed$geneSymbol!=gname,]
        a2<-a2[!is.na(a2$signalValue),]
        
        #####
        featureDF<-elementReplicationTimingDF[elementReplicationTimingDF$elementName==gname,]
        #a3<-a2[a2$signalValue<=featureDF$signalUpper & a2$signalValue>=featureDF$signalLower,]
        #geneDFuniqueSelected<-a3
        
        backgroundPosVariant<-a2
        #####
        
        ## this step need reduce calculation to gain speedup
        #backgroundPosVariant<-parsePosVariantByTriMutContext(geneDFuniqueSelected$categoryCounts,useCores=1)
        
        #backgroundPosVariant$compositeScore<-geneDFuniqueSelected$compositeScore
        #backgroundPosVariant$posIndex<-geneDFuniqueSelected$posIndex
        #backgroundPosVariant$geneSymbol<-geneDFuniqueSelected$geneSymbol
        
        #backgroundPosVariant<-a3
        
        #####
        
        #filePath<-file.path("~/work/Ekta_lab/JasonWong_dataset/compositeFunSeq_result_triMut_match",tumorType)
        filePath<-workDir
        debugFileName<-paste("debug_",mutationType,"_triMut_distribution_RT_similarity_",replicationTimingCutOff,"_task_",taskNum,"_.txt",sep="")
        debugFileName<-file.path(filePath,debugFileName)
        
        ## plotting correlation for debug
        if(FALSE){ 
          filePath<-file.path("~/work/Ekta_lab/JasonWong_dataset/checkCorrelation",tumorType)
          fileName<-paste(tumorType,"_correlation_RT_",mutationType,"_",gname,".png",sep="")
          fileName<-file.path(filePath,fileName)
          png(fileName,width=800,height=600)
          plot(geneDFunique$signalValue,geneDFunique$compositeScore,pch=19,xlab="",ylab="",cex=0.6)
          title(main=paste(tumorType," (n=",sampleSize ,"), ",mutationType,sep=""),
                xlab="replication timing [ late to early ]",
                ylab="positional CompositeScore")
          mtext(paste(gname,"num pos:",numOfAlterationPos,sep=" "))
          points(geneDFuniqueSelected$signalValue,geneDFuniqueSelected$compositeScore,col="red",pch=19,cex=0.6)
          #points(a22$signalValue,a22$compositeScore,col="blue",pch=19,cex=0.6)
          points(a1$signalValue,a1$compositeScore,col="green",pch=19,cex=0.6)
          
          dev.off()
        }
        
        #replicationTimingCutOff<-0.1
        #system.time(
        result<-calculatePvalueWithMutCategoryDistributionMatched7(variantTriMutCategory,backgroundPosVariant,mutationDistMatrix,featureDF,reSampleNum,replaceFlag = FALSE,replicationTimingCutOff,debugFileName,debugMode)
        #)
        
        if(result$pValue<=reRunPvalueCutOff){
          
          reSampleNumExpanded<-reSampleNum*10
          result<-calculatePvalueWithMutCategoryDistributionMatched7(variantTriMutCategory,backgroundPosVariant,mutationDistMatrix,featureDF,reSampleNumExpanded,replaceFlag = FALSE,replicationTimingCutOff,debugFileName,debugMode)
          
        }
        
        
        pValue<-result$pValue 
        numOfAboveCScore<-result$numOfAboveCScore
        reSampleSize<-result$reSampleSize
        
        tmpResult<-data.frame(gname,numOfAlterationPos,numOfAlteration,numOfPatient,compositeScore,numOfAboveCScore,reSampleSize,pValue,stringsAsFactors = FALSE)
        #colnames(tmpResult)<-c("geneSymbol","numOfAlterationPos","numOfAlteration","numOfPatient","compositeDriverScore","numOfAboveCDscore","reSampleNum","pValue")
        colnames(tmpResult)<-c("elementPos","numOfAlterationPos","numOfAlteration","numOfPatient","compositeDriverScore","numOfAboveCDscore","reSampleNum","pValue")
        
        
        #filePath<-file.path("~/work/Ekta_lab/JasonWong_dataset/compositeFunSeq_result_triMut_match",tumorType)
        filePath<-workDir
        fileName<-paste("tmpOutput_",mutationType,"_triMut_distribution_RT_similarity_",replicationTimingCutOff,"_task_",taskNum,"_.txt",sep="")
        fileName<-file.path(filePath,fileName)
        
        write.table(tmpResult,file=fileName,sep="\t",quote=FALSE,row.names = FALSE,col.names = FALSE,append=TRUE)        
        
        
        
        return(tmpResult)
      }
          
      stopCluster(cl)
      time_elapesed<-proc.time()-time_start
      cat (sprintf ("Time for calculating p-value of %s candidates: %.2f sec\n", numOfgeneCheck, time_elapesed[3]) )
      
      outputDf<-rbind.fill(outputDf)
          
      
      #####
      
      #outputDf<-data.frame(ncdsMutationCheckList[1:numOfgeneCheck],numOfAlterationPos,numOfAlteration,numOfPatient,compositeScore,numOfAboveCScore,reSampleSize,pValue)
      outputDf<-outputDf[order(outputDf$pValue),]
      outputDf$qValue<-p.adjust(outputDf$pValue,method = "BH")
      #colnames(outputDf)<-c("geneSymbol","numOfAlterationPos","numOfAlteration","numOfPatient","compositeDriverScore","numOfAboveCDscore","reSampleNum","pValue","qValue")
      colnames(outputDf)<-c("elementPos","numOfAlterationPos","numOfAlteration","numOfPatient","compositeDriverScore","numOfAboveCDscore","reSampleNum","pValue","qValue")
      
      
      fileName<-paste(tumorType,"_outputDf_",mutationType,"_",groupName,"_with_RT_correction_similarity_",replicationTimingCutOff,"_triMut_match_distribution_task_",taskNum,"_iter_",reSampleNum,"_.txt",sep="")
      fileName<-file.path(workDir,fileName)  
      write.table(outputDf,file=fileName,sep="\t",quote=FALSE,row.names =FALSE,col.names = TRUE)
      
    
    }
    
    return(outputDf)

}
mil2041/CNCDriver documentation built on Dec. 13, 2020, 3:41 a.m.