R/simulate_tandem.R

Defines functions simulate_tandem

#internal function to simulate tandemRNA from tandemInfo input
simulate_tandem <- function(tandemInfo,error_rate,genes.exon.all,fasta_genome,tx.all.fasta,tx.all.NAME,out_name,out_dir="./"){
  tandem.tx.fa=DNAStringSet("ATGC")
  tandemInfo$normID=paste(tandemInfo$Chr,tandemInfo$start_EXONSTART,tandemInfo$end_EXONEND,sep="__")
  #tandemInfo$cCount= sample(100:1000,nrow(tandemInfo), replace = TRUE)
  for(i in 1: nrow(tandemInfo)){
      my_tandem=tandemInfo[i,]
      mynormID=my_tandem$normID
      tandem_gene=as.character(my_tandem$GENEID)
      tandem_exonInfo=genes.exon.all[genes.exon.all$GENEID==tandem_gene,]
      #get all exons completely inside the boundary
      pick1=tandem_exonInfo$EXONSTART == my_tandem$start_EXONSTART
      pick2=tandem_exonInfo$EXONEND == my_tandem$end_EXONEND
      tandem_tx=intersect(unique(tandem_exonInfo[pick1,]$TXNAME), unique(tandem_exonInfo[pick2,]$TXNAME))
      #select a single random transcript and generate tandem duplication from that transcript
      txNum=length(tandem_tx)
      start=my_tandem$start_EXONSTART
      end=my_tandem$end_EXONEND
      mytandem.tx.fa=DNAStringSet("ATGC") # starting tandem.tx.fa
      if (txNum > 0){mytx=sample(tandem_tx,1)}else{next}
      #Only genertate tandem duplication if existing transcripts containing both EXONSTART and EXONEND
        
        for (atx in mytx){ #do for every tx
          mytxex=tandem_exonInfo[tandem_exonInfo$TXNAME==atx,]
          mytxex=mytxex[order(mytxex$EXONSTART),]
          mytxex.len = mytxex$LENGTH # 
          #covert to transcript location
          mytxex.end.all =cumsum(mytxex$LENGTH)
          mytxex.stat.all = mytxex.end.all - mytxex.len +1
    
          #select the coding area inside the circRNA region
          pick= which(mytxex$EXONSTART >= start & mytxex$EXONEND <= end) #pick by indexes of exons # 
          startR= mytxex.stat.all[pick]
          endR= mytxex.end.all[pick]
          
          ## generate pseudo circularRNAs sequences
          # - strand    

          if( mytxex$EXONSTRAND[1] == "-"){
          #pick transcript sequence
            transcript = convertReverseComplement(as.character(tx.all.fasta[which(tx.all.NAME == atx)]))
          }else{
            transcript = as.character(tx.all.fasta[which(tx.all.NAME == atx)])
          }    

          x=substring(transcript,startR,endR)
          circSeq=paste(x,collapse="") # 
          #pick the left side
          pick.Left=which(mytxex$EXONEND < start) #pick by indexes
          x=""  
          start.Left= mytxex.stat.all[pick.Left]
          end.Left= mytxex.end.all[pick.Left]
          if(length(pick.Left)>0) x = substring(transcript,start.Left,end.Left)
          tandem_left.seq=paste0(x,collapse ="") 
          #pick the right side
          pick.Right=which(mytxex$EXONSTART > end)#pick by indexes
          x=""  
          start.Right= mytxex.stat.all[pick.Right]
          end.Right= mytxex.end.all[pick.Right]
          if(length(pick.Right)>0) x = substring(transcript,start.Right,end.Right)
          tandem_right.seq=paste0(x,collapse ="") # 
          #record tandem break point
          ##paste left side + circSeq and count nchar()
          A.seq = paste0(tandem_left.seq,circSeq)
          #tamdem_break[i] = as.numeric(nchar(A.seq))
          dup_point = as.numeric(nchar(A.seq))+1
          #tandem_normID = my_tandem$normID[1]
          tandem_stat = as.numeric(nchar(tandem_left.seq))+1
          tandem_end = dup_point + as.numeric(nchar(circSeq))
          # finish tandem.seq by pasting again circSeq + tandem_right.seq
          tandem.seq = paste0(A.seq,circSeq,tandem_right.seq) 
          
          ##convert back to the correct strand for - strand 
          #if( mytxex$EXONSTRAND[1] == "-") tandem.seq=convertReverseComplement(tandem.seq)
          

          #create tandemRNA.fa and return tadem.tx.fa
          tandem.name=paste(mynormID," ",atx," ",txNum," tandemRNA ",tandem_stat," ",dup_point," ",tandem_end, sep="")
          tandem.tx = DNAStringSet(tandem.seq)
          names(tandem.tx)= tandem.name
          mytandem.tx.fa=c(mytandem.tx.fa,tandem.tx)
        }
      mytandem.tx.fa=mytandem.tx.fa[-1]
      tandem.tx.fa=c(tandem.tx.fa,mytandem.tx.fa)
    }
  tandem.tx.fa=tandem.tx.fa[-1]
  simulated_info=lapply(names(tandem.tx.fa), function(x){
     y=unlist(strsplit(x," "))
     return(y)
    })
  #update tandemInfo
  simulated_info=do.call(rbind,simulated_info)
  simulated_info= as.data.frame(simulated_info,stringsAsFactors=FALSE)
  colnames(simulated_info)= c("normID","TXNAME","TX_num","tag","tandem_stat","tandem_break","tandem_end")
  tandemInfo = tandemInfo[tandemInfo$normID %in% simulated_info$normID,]
  tandemInfo= cbind(tandemInfo,simulated_info[,2:7])  

  #export to file
  #to RData # 
  #save(tandem.tx.fa, tandemInfo, file=paste(out_name,"_tandem_info.RData",sep="")) # 
  #to fa
  exportFasta=c(tandem.tx.fa) # 
  fastaFile_tandem=paste(out_dir,"/",out_name,"_tandem.fa",sep="")
  writeXStringSet(exportFasta, fastaFile_tandem)  # 
    
  #simulation for tandem dataset
  simPosdir = paste(out_dir,"/",out_name,"_tandem_data",sep="") # 
  library("polyester")
  library("Biostrings")
  unif.countmat=tandemInfo$cCount # 
  unif.countmat=as.matrix(unif.countmat)
  simulate_experiment_countmat(fastaFile_tandem, readmat=unif.countmat, outdir=simPosdir,error_rate = error_rate, strand_specific=FALSE) # 
  return(tandemInfo)
}
datngu/CircRNA_simulator documentation built on Feb. 22, 2020, 12:19 a.m.