R/exomepeak.R

exomepeak <- function(
  IP_BAM,INPUT_BAM,
  GENOME = NA,
  UCSC_TABLE_NAME = "knownGene",
  GENE_ANNO_GTF = NA,
  TXDB = NA,
  TREATED_IP_BAM=character(0),
  TREATED_INPUT_BAM=character(0),
  OUTPUT_DIR=NA,
  EXPERIMENT_NAME="exomePeak_output",
  WINDOW_WIDTH=200,
  SLIDING_STEP=30,
  FRAGMENT_LENGTH=100,
  READ_LENGTH=NA,
  MINIMAL_PEAK_LENGTH=FRAGMENT_LENGTH/2,
  PEAK_CUTOFF_PVALUE=NA,
  PEAK_CUTOFF_FDR=0.05,
  FOLD_ENRICHMENT=1,
  CONSISTENT_PEAK_CUTOFF_PVALUE = 0.05,
  CONSISTENT_PEAK_FOLD_ENRICHMENT=1,
  DIFF_PEAK_METHOD = "rhtest",
  DIFF_PEAK_CUTOFF_PVALUE=NA,
  DIFF_PEAK_CUTOFF_FDR=0.05,
  DIFF_PEAK_ABS_FOLD_CHANGE=1,
  DIFF_PEAK_CONSISTENT_CUTOFF_PVALUE=0.05,
  DIFF_PEAK_CONSISTENT_ABS_FOLD_CHANGE=1,
  MINIMAL_MAPQ=30,
  REMOVE_LOCAL_TAG_ANOMALITIES=TRUE,
  POISSON_MEAN_RATIO=1,
  TESTING_MODE=NA,
  SAVE_RESULT_ON_DISK=TRUE
  ){
  
  # Wrap parameters ##################################################
  PARAMETERS=list();
  PARAMETERS$GENE_ANNO_GTF=GENE_ANNO_GTF
  PARAMETERS$IP_BAM=IP_BAM
  PARAMETERS$INPUT_BAM=INPUT_BAM
  PARAMETERS$GENOME = GENOME
  PARAMETERS$UCSC_TABLE_NAME=UCSC_TABLE_NAME
  UCSC_TABLE_NAME = UCSC_TABLE_NAME
  PARAMETERS$TREATED_IP_BAM=TREATED_IP_BAM
  PARAMETERS$TREATED_INPUT_BAM=TREATED_INPUT_BAM
  PARAMETERS$FRAGMENT_LENGTH=FRAGMENT_LENGTH
  PARAMETERS$READ_LENGTH=READ_LENGTH
  PARAMETERS$WINDOW_WIDTH=WINDOW_WIDTH
  PARAMETERS$SLIDING_STEP=SLIDING_STEP
  PARAMETERS$MINIMAL_PEAK_LENGTH=MINIMAL_PEAK_LENGTH
  PARAMETERS$PEAK_CUTOFF_PVALUE=PEAK_CUTOFF_PVALUE
  PARAMETERS$PEAK_CUTOFF_FDR=PEAK_CUTOFF_FDR
  PARAMETERS$FOLD_ENRICHMENT=FOLD_ENRICHMENT
  PARAMETERS$CONSISTENT_PEAK_CUTOFF_PVALUE=CONSISTENT_PEAK_CUTOFF_PVALUE
  PARAMETERS$CONSISTENT_PEAK_FOLD_ENRICHMENT=CONSISTENT_PEAK_FOLD_ENRICHMENT
  PARAMETERS$MINIMAL_MAPQ=MINIMAL_MAPQ
  PARAMETERS$REMOVE_LOCAL_TAG_ANOMALITIES=REMOVE_LOCAL_TAG_ANOMALITIES
  PARAMETERS$POISSON_MEAN_RATIO=POISSON_MEAN_RATIO
  PARAMETERS$DIFF_PEAK_CUTOFF_PVALUE=DIFF_PEAK_CUTOFF_PVALUE
  PARAMETERS$DIFF_PEAK_CUTOFF_FDR=DIFF_PEAK_CUTOFF_FDR
  PARAMETERS$DIFF_PEAK_ABS_FOLD_CHANGE=DIFF_PEAK_ABS_FOLD_CHANGE
  PARAMETERS$DIFF_PEAK_CONSISTENT_CUTOFF_PVALUE=DIFF_PEAK_CONSISTENT_CUTOFF_PVALUE
  PARAMETERS$DIFF_PEAK_CONSISTENT_ABS_FOLD_CHANGE=DIFF_PEAK_CONSISTENT_ABS_FOLD_CHANGE
  PARAMETERS$OUTPUT_DIR=OUTPUT_DIR
  PARAMETERS$EXPERIMENT_NAME=EXPERIMENT_NAME
  PARAMETERS$TESTING_MODE=TESTING_MODE
  PARAMETERS$SAVE_RESULT_ON_DISK=SAVE_RESULT_ON_DISK
  PARAMETERS$DIFF_PEAK_METHOD=DIFF_PEAK_METHOD
  PARAMETERS$TXDB = TXDB
  
  # save parameter, used when debug
  save_parameters <- FALSE
  if (save_parameters) {save(PARAMETERS,file = "para.Rdata")}
  
  # check annotation
  if (is.na(PARAMETERS$GENOME) & is.na(PARAMETERS$GENE_ANNO_GTF) & is.na(PARAMETERS$TXDB)) { 
		stop("must specify the genome assembly or provide a gtf file/TxDb object for exomePeak to work!", 
		call. = TRUE, domain = NULL)}

  # dependent variables
  if (is.na(PARAMETERS$READ_LENGTH)) {PARAMETERS$READ_LENGTH=.get.bam.read.length(PARAMETERS$IP_BAM[1])}
  if (is.na(PARAMETERS$MINIMAL_PEAK_LENGTH)) {PARAMETERS$MINIMAL_PEAK_LENGTH=PARAMETERS$FRAGMENT_LENGTH/2}
  if (is.na(PARAMETERS$PEAK_CUTOFF_PVALUE)) {PARAMETERS$PEAK_CUTOFF_TYPE="FDR"} else  {PARAMETERS$PEAK_CUTOFF_TYPE="PVALUE"}
  if (is.na(PARAMETERS$DIFF_PEAK_CUTOFF_PVALUE)) {PARAMETERS$DIFF_PEAK_CUTOFF_TYPE="FDR"} else  {PARAMETERS$DIFF_PEAK_CUTOFF_TYPE="PVALUE"} 
  if (is.na(PARAMETERS$OUTPUT_DIR)) {PARAMETERS$OUTPUT_DIR=getwd()}
  
  # algrithm ##################################################
  
  # read gene annotation
  ANNOTATION = .read.gtf(PARAMETERS)
  ANNOTATION_BATCH_ID = .divide.anno.into.batches(ANNOTATION)
  
  # index bam files
  # get bam file
  bam=c(PARAMETERS$IP_BAM,PARAMETERS$INPUT_BAM,PARAMETERS$TREATED_IP_BAM,PARAMETERS$TREATED_INPUT_BAM)
  no_bam_files=length(bam)
  for (ibam in 1:no_bam_files) {file=bam[ibam]
                                if (! file.exists(paste(file,'.bai',sep=""))) {
                                  print(paste("Stage: index bam file", file))
                                  indexBam(file)
                                }}
  
  # get reads count report
  SAMPLE_ID = .get.sample.id(PARAMETERS)
  BAM_CHRS = .get.bam.chrs(PARAMETERS$IP_BAM[1])
  
  # get reads count
  print("Get Reads Count ...")
  print("This step may take a few hours ...")
  if (is.na(PARAMETERS$TESTING_MODE)) {no_batch_to_run=max(ANNOTATION_BATCH_ID)} else {no_batch_to_run=PARAMETERS$TESTING_MODE}
  noGroups=ceiling(no_batch_to_run/170);
  group_bar=round(seq(from = 1, to = no_batch_to_run +1,length.out=noGroups+1))
  # get space
  READS_COUNT=.get.reads.count(1,PARAMETERS,ANNOTATION,ANNOTATION_BATCH_ID,BAM_CHRS,no_bam_files,bam)
  READS_COUNT=READS_COUNT[integer(0),]
  READS_COUNT_FORMAT=READS_COUNT
  # groups
  for (igroup in 1:noGroups){
    print( paste(as.character(signif(igroup*100/noGroups, digits = 3)),"%") )
    temp=list()
    batch_batch=group_bar[igroup]:(group_bar[igroup+1]-1)
    reads_count_group=READS_COUNT_FORMAT
    no_batch=length(batch_batch)
    for (ibatch in 1:no_batch){
      temp[[ibatch]]=.get.reads.count(batch_batch[ibatch],PARAMETERS,ANNOTATION,ANNOTATION_BATCH_ID,BAM_CHRS,no_bam_files,bam)
    }
    for (ibatch in 1:no_batch){
      reads_count_group=rbind(reads_count_group,temp[[ibatch]])
    }
    READS_COUNT=rbind(READS_COUNT,reads_count_group)}
  READS_COUNT=data.frame(READS_COUNT)
  
  # peak calling
  PEAK = .peak.calling.module(READS_COUNT,SAMPLE_ID,PARAMETERS)
  
  # differential analysis
  if (length(SAMPLE_ID$treated_ip)*length(SAMPLE_ID$treated_input) > 0)  {
    print("Comparing two conditions ...") 
    DIFF=.merge.and.compare.two.different.conditions(PEAK,READS_COUNT,SAMPLE_ID, PARAMETERS) # differential analysis
    DIFF.consis=.diff.consistency.evaluate(PEAK,READS_COUNT,SAMPLE_ID,PARAMETERS) # diff consistency
  } else {DIFF=NA;
  DIFF.consis=NA}
  
  # decide peak calling mode or differential expression mode
  if (length(SAMPLE_ID$treated_ip)*length(SAMPLE_ID$treated_input) == 0) {respect_mode = "peak" } else {respect_mode = "diff" }
  if (PARAMETERS$SAVE_RESULT_ON_DISK==TRUE) {
    dir.create(paste(PARAMETERS$OUTPUT_DIR,PARAMETERS$EXPERIMENT_NAME,sep='/'),recursive =TRUE,showWarnings = FALSE)}
  dir=paste(PARAMETERS$OUTPUT_DIR,PARAMETERS$EXPERIMENT_NAME,sep='/')
  
  
  PEAK_RESULT=.report.peak.based.on.result(PEAK,ANNOTATION,READS_COUNT,SAMPLE_ID,PARAMETERS,ANNOTATION_BATCH_ID)
  
  # output result 2
  DIFF_PEAK_RESULT = NA
  if (respect_mode == "diff" ) { 
    # peak and diff
    TOTAL_PEAK_RESULT=.get.table.peak.result(PEAK,ANNOTATION,READS_COUNT,SAMPLE_ID,PARAMETERS,ANNOTATION_BATCH_ID,PEAK$loci2peak_merged)
    DIFF_PEAK_RESULT = .report.diff.peak.based.on.result(TOTAL_PEAK_RESULT,DIFF,DIFF.consis,PARAMETERS)
  }
  
  # save result
  if (PARAMETERS$SAVE_RESULT_ON_DISK==TRUE) {
  tmp_rs =list(ANNOTATION=ANNOTATION,
              ANNOTATION_BATCH_ID=ANNOTATION_BATCH_ID,
              PARAMETERS=PARAMETERS,
              READS_COUNT=READS_COUNT,
              SAMPLE_ID=SAMPLE_ID,PEAK=PEAK,
              DIFF=DIFF,DIFF.consis=DIFF.consis,
              BAM_CHRS=BAM_CHRS,
              BAM=bam)
  save("tmp_rs", file=paste(dir,'exomePeak.Rdata',sep='/'))
  }
  
  # notation
  result=.exomepeak_notification(PARAMETERS,PEAK_RESULT,DIFF_PEAK_RESULT,dir)
  return(result)
}
lzcyzm/exomePeak documentation built on May 21, 2019, 9:16 a.m.