R/simulate_circRNA.R

Defines functions simulate_circRNA

#internal function to simulate tandemRNA from tandemInfo input
simulate_circRNA <- function(circInfo,error_rate,genes.exon.all,fasta_genome,tx.all.fasta,tx.all.NAME,out_name,out_dir="./"){
  sim_dir = paste0(getwd(),"//tem_dir")
  dir.create(sim_dir) #make a temporary directory to simulate reads
  
  circ.tx.fa=DNAStringSet("ATGC")
  circInfo$normID=paste(circInfo$Chr,circInfo$start_EXONSTART,circInfo$end_EXONEND,sep="__")
  #circInfo$cCount= sample(100:1000,nrow(tandemInfo), replace = TRUE)
  TX_num=TXNAME=NULL
  for(i in 1: nrow(circInfo)){
    mycirR=circInfo[i,]
    #for simulated dataset
    num_frag =mycirR$cCount
    circRNA_normID = circInfo$normID[i]
    mygene=as.character(mycirR$GENEID)
    myexonInfo=genes.exon.all[genes.exon.all$GENEID==mygene,]
    start=mycirR$start_EXONSTART
    end=mycirR$end_EXONEND
    #get all exons completely inside the boundary
    pick1=myexonInfo$EXONSTART == mycirR$start_EXONSTART
    pick2=myexonInfo$EXONEND == mycirR$end_EXONEND
    mytx=intersect(unique(myexonInfo[pick1,]$TXNAME), unique(myexonInfo[pick2,]$TXNAME))
    #select a single random transcript and generate circRNA from that transcript
    txNum=length(mytx)

    #restart for each loop
    circRNA.seq = NULL
    circRNA.name = NULL
    count = NULL
    juction_break = NULL

    if (txNum > 0){ # if existing transcripts containing both EXONSTART and EXONEND
      mytx=sample(mytx,1)
      for (atx in mytx){ #do for every tx
        mytxex=myexonInfo[myexonInfo$TXNAME==atx,]
        mytxex=mytxex[order(mytxex$EXONSTART),]
        mytxex.len = mytxex$LENGTH  
        #covert to transcript location
        mytxex.end.all =cumsum(mytxex$LENGTH)
        mytxex.stat.all = mytxex.end.all - mytxex.len +1 
        #select the coding area inside the circRNA region
        pick= which(mytxex$EXONSTART >= start & mytxex$EXONEND <= end) #pick by indexes of exons  
        startR= mytxex.stat.all[pick]
        endR= mytxex.end.all[pick]  
        ## generate pseudo circularRNAs sequences
        # - strand  
        if( mytxex$EXONSTRAND[1] == "-"){
            #pick transcript sequence
            transcript = convertReverseComplement(as.character(tx.all.fasta[which(tx.all.NAME == atx)]))
          }else{
            transcript = as.character(tx.all.fasta[which(tx.all.NAME == atx)])
          }  
        x=substring(transcript,startR,endR)
        txSeq=paste(x,collapse="")
        
        # correct to the right strand for "-"
        #if( mytxex$EXONSTRAND[1] == "-") txSeq=convertReverseComplement(txSeq)
        

        set.seed(2018)
        cvPos=sample(nchar(txSeq),num_frag,replace=TRUE)
        cv.fre=table(cvPos)
        cv.point=as.integer(names(cv.fre))
        startP=1
        endP=nchar(txSeq)
        r = substring(txSeq,cv.point)
        l= substring(txSeq,1,cv.point-1)
        l[which((cv.point-1)==0)] = "" #make sure no error for endding point =0
        atx.break = nchar(r)+1
        atx.seq = paste0(r,l)
        atx.name = paste0(circInfo$normID[i]," ",atx,",",txNum,",circRNA"," juction:",atx.break)
        circRNA.name = c(circRNA.name,atx.name)
        names(atx.seq) = atx.name
        atx.count = as.integer(cv.fre,names=FALSE)
        circRNA.seq = c(circRNA.seq,atx.seq)
        count = c(count,atx.count)
        juction_break =c(juction_break,atx.break)
      }
      circRNA.seq.fa = DNAStringSet(circRNA.seq)
      sim_dir_i = paste0(sim_dir,"//sim_",i)
      fasta_i = paste0(sim_dir,"//fasta_",i,".fasta")
      writeXStringSet(circRNA.seq.fa, fasta_i)
      unif.countmat = as.matrix(count)
      simulate_experiment_countmat(fasta_i, readmat=unif.countmat, outdir=sim_dir_i,error_rate = error_rate , strand_specific=FALSE)
      system(paste0("rm ",fasta_i))     
    }else{ # if not existing the transcript, build circRNA from the exon contig between two ends
      mytx="NONE"
      #get all exons completely inside the boundary
      pick=myexonInfo$EXONSTART >= mycirR$start_EXONSTART & myexonInfo$EXONEND <= mycirR$end_EXONEND
      myex=myexonInfo[pick,]

      #in case of no exon available, skip this circRNA
      if(nrow(myex)==0){
      		mytx="skiping"
  		}else{
	      #select the coding area inside the region. 
	      #NOTE: this can not always satisfy the canonical splicing conditions (GT-AG), but we do not care about it this momment
	      myex=myex[order(myex$EXONSTART),]
	      exstart=myex$EXONSTART/1e6
	      exend=myex$EXONEND/1e6
	      #get clusters of exons
	      exCluster=rep(-1,length(exstart))
	      for (j in 1:length(exstart)){
	        if (exCluster[j]==-1){
	          exCluster[j]=j
	          pick=(exstart[j]-exstart)*(exstart[j]-exend)<=0 | (exend[j]-exstart)*(exend[j]-exend)<=0
	          #assign to the cluster with min index
	          x=exCluster[pick]
	          x=x[x>0]
	          x=min(x)
	          exCluster[pick]=x
	        }
	      }
	      #update exCluster
	      exCluster_u=exCluster
	      repeat{
	        exCluster=exCluster[exCluster]
	        if (sum(exCluster_u!=exCluster)==0) break()
	        exCluster_u=exCluster
	      }
	      #create exon regions from the exon clusters
	      clusterID=sort(unique(exCluster))
	      startR=endR=NULL
	      for (j in 1:length(clusterID)){
	        cID=clusterID[j]
	        cEx=myex[exCluster==cID,]
	        startR=c(startR,min(cEx$EXONSTART))
	        endR=c(endR,max(cEx$EXONEND))
	      }
	      #sort again startR
	      myorder=order(startR)
	      startR=startR[myorder]
	      endR=endR[myorder]
	      # generate pseudo circularRNAs sequences
	      #extract sequences and create a new transcript sequence
	      chrID=as.character(myex$EXONCHROM)[1]
	      x=substring(fasta_genome[chrID],startR,endR)
	      txSeq=paste(x,collapse="")
	      
	      # correct to the right strand for "-"
	      #if(myex$EXONSTRAND[1] == "-") txSeq=convertReverseComplement(txSeq)
	      
	      set.seed(2018) 
	      cvPos=sample(nchar(txSeq),num_frag,replace=TRUE)
	      cv.fre=table(cvPos)
	      cv.point=as.integer(names(cv.fre))
	      startP=1
	      endP=nchar(txSeq)
	      r = substring(txSeq,cv.point)
	      l= substring(txSeq,1,cv.point-1)
	      l[which((cv.point-1)==0)] = "" #make sure no error for endding point =0     
	      juction_break = nchar(r)+1
	      circRNA.seq = paste0(r,l)
	      circRNA.name = paste0(circInfo$normID[i]," ",mytx,",",txNum,",circRNA"," juction:",juction_break)
	      names(circRNA.seq) = circRNA.name
	      circRNA.seq.fa = DNAStringSet(circRNA.seq)
	      count = as.integer(cv.fre,names=FALSE)      
	      sim_dir_i = paste0(sim_dir,"//sim_",i)
	      fasta_i = paste0(sim_dir,"//fasta_",i,".fasta")
	      writeXStringSet(circRNA.seq.fa, fasta_i)      
	      unif.countmat = as.matrix(count)      
	      simulate_experiment_countmat(fasta_i, readmat=unif.countmat, outdir=sim_dir_i,error_rate = error_rate , strand_specific=FALSE)
	      system(paste0("rm ",fasta_i))  
	    }
	}    
    TX_num=c(TX_num,txNum)
    TXNAME=c( TXNAME,mytx)
  }
  
  circ_out_dir = paste(out_dir,"/",out_name,"_circRNA_data",sep="")
  dir.create(circ_out_dir)

  circRead1=paste(circ_out_dir,"/read_1.fasta",sep="")
  circRead2=paste(circ_out_dir,"/read_2.fasta",sep="")

  cmd_cat1 = paste0("for fn in $(find ",sim_dir,' -type f -name "sample_01_1.fasta"); do cat $fn >> ',circRead1,"; done")
  cmd_cat2 = paste0("for fn in $(find ",sim_dir,' -type f -name "sample_01_2.fasta"); do cat $fn >> ',circRead2,"; done")
  system(cmd_cat1)
  system(cmd_cat2)
  system(paste0("rm -r ",sim_dir))
  res=cbind(circInfo,TX_num,TXNAME)
  pick=res$TXNAME=="skiping" #exclude the skipped circRNA candidates
  res=res[!pick,]
  return(res)
}
datngu/CircRNA_simulator documentation built on Feb. 22, 2020, 12:19 a.m.