#internal function to simulate tandemRNA from tandemInfo input
simulate_circRNA <- function(circInfo,error_rate,genes.exon.all,fasta_genome,tx.all.fasta,tx.all.NAME,out_name,out_dir="./"){
sim_dir = paste0(getwd(),"//tem_dir")
dir.create(sim_dir) #make a temporary directory to simulate reads
circ.tx.fa=DNAStringSet("ATGC")
circInfo$normID=paste(circInfo$Chr,circInfo$start_EXONSTART,circInfo$end_EXONEND,sep="__")
#circInfo$cCount= sample(100:1000,nrow(tandemInfo), replace = TRUE)
TX_num=TXNAME=NULL
for(i in 1: nrow(circInfo)){
mycirR=circInfo[i,]
#for simulated dataset
num_frag =mycirR$cCount
circRNA_normID = circInfo$normID[i]
mygene=as.character(mycirR$GENEID)
myexonInfo=genes.exon.all[genes.exon.all$GENEID==mygene,]
start=mycirR$start_EXONSTART
end=mycirR$end_EXONEND
#get all exons completely inside the boundary
pick1=myexonInfo$EXONSTART == mycirR$start_EXONSTART
pick2=myexonInfo$EXONEND == mycirR$end_EXONEND
mytx=intersect(unique(myexonInfo[pick1,]$TXNAME), unique(myexonInfo[pick2,]$TXNAME))
#select a single random transcript and generate circRNA from that transcript
txNum=length(mytx)
#restart for each loop
circRNA.seq = NULL
circRNA.name = NULL
count = NULL
juction_break = NULL
if (txNum > 0){ # if existing transcripts containing both EXONSTART and EXONEND
mytx=sample(mytx,1)
for (atx in mytx){ #do for every tx
mytxex=myexonInfo[myexonInfo$TXNAME==atx,]
mytxex=mytxex[order(mytxex$EXONSTART),]
mytxex.len = mytxex$LENGTH
#covert to transcript location
mytxex.end.all =cumsum(mytxex$LENGTH)
mytxex.stat.all = mytxex.end.all - mytxex.len +1
#select the coding area inside the circRNA region
pick= which(mytxex$EXONSTART >= start & mytxex$EXONEND <= end) #pick by indexes of exons
startR= mytxex.stat.all[pick]
endR= mytxex.end.all[pick]
## generate pseudo circularRNAs sequences
# - strand
if( mytxex$EXONSTRAND[1] == "-"){
#pick transcript sequence
transcript = convertReverseComplement(as.character(tx.all.fasta[which(tx.all.NAME == atx)]))
}else{
transcript = as.character(tx.all.fasta[which(tx.all.NAME == atx)])
}
x=substring(transcript,startR,endR)
txSeq=paste(x,collapse="")
# correct to the right strand for "-"
#if( mytxex$EXONSTRAND[1] == "-") txSeq=convertReverseComplement(txSeq)
set.seed(2018)
cvPos=sample(nchar(txSeq),num_frag,replace=TRUE)
cv.fre=table(cvPos)
cv.point=as.integer(names(cv.fre))
startP=1
endP=nchar(txSeq)
r = substring(txSeq,cv.point)
l= substring(txSeq,1,cv.point-1)
l[which((cv.point-1)==0)] = "" #make sure no error for endding point =0
atx.break = nchar(r)+1
atx.seq = paste0(r,l)
atx.name = paste0(circInfo$normID[i]," ",atx,",",txNum,",circRNA"," juction:",atx.break)
circRNA.name = c(circRNA.name,atx.name)
names(atx.seq) = atx.name
atx.count = as.integer(cv.fre,names=FALSE)
circRNA.seq = c(circRNA.seq,atx.seq)
count = c(count,atx.count)
juction_break =c(juction_break,atx.break)
}
circRNA.seq.fa = DNAStringSet(circRNA.seq)
sim_dir_i = paste0(sim_dir,"//sim_",i)
fasta_i = paste0(sim_dir,"//fasta_",i,".fasta")
writeXStringSet(circRNA.seq.fa, fasta_i)
unif.countmat = as.matrix(count)
simulate_experiment_countmat(fasta_i, readmat=unif.countmat, outdir=sim_dir_i,error_rate = error_rate , strand_specific=FALSE)
system(paste0("rm ",fasta_i))
}else{ # if not existing the transcript, build circRNA from the exon contig between two ends
mytx="NONE"
#get all exons completely inside the boundary
pick=myexonInfo$EXONSTART >= mycirR$start_EXONSTART & myexonInfo$EXONEND <= mycirR$end_EXONEND
myex=myexonInfo[pick,]
#in case of no exon available, skip this circRNA
if(nrow(myex)==0){
mytx="skiping"
}else{
#select the coding area inside the region.
#NOTE: this can not always satisfy the canonical splicing conditions (GT-AG), but we do not care about it this momment
myex=myex[order(myex$EXONSTART),]
exstart=myex$EXONSTART/1e6
exend=myex$EXONEND/1e6
#get clusters of exons
exCluster=rep(-1,length(exstart))
for (j in 1:length(exstart)){
if (exCluster[j]==-1){
exCluster[j]=j
pick=(exstart[j]-exstart)*(exstart[j]-exend)<=0 | (exend[j]-exstart)*(exend[j]-exend)<=0
#assign to the cluster with min index
x=exCluster[pick]
x=x[x>0]
x=min(x)
exCluster[pick]=x
}
}
#update exCluster
exCluster_u=exCluster
repeat{
exCluster=exCluster[exCluster]
if (sum(exCluster_u!=exCluster)==0) break()
exCluster_u=exCluster
}
#create exon regions from the exon clusters
clusterID=sort(unique(exCluster))
startR=endR=NULL
for (j in 1:length(clusterID)){
cID=clusterID[j]
cEx=myex[exCluster==cID,]
startR=c(startR,min(cEx$EXONSTART))
endR=c(endR,max(cEx$EXONEND))
}
#sort again startR
myorder=order(startR)
startR=startR[myorder]
endR=endR[myorder]
# generate pseudo circularRNAs sequences
#extract sequences and create a new transcript sequence
chrID=as.character(myex$EXONCHROM)[1]
x=substring(fasta_genome[chrID],startR,endR)
txSeq=paste(x,collapse="")
# correct to the right strand for "-"
#if(myex$EXONSTRAND[1] == "-") txSeq=convertReverseComplement(txSeq)
set.seed(2018)
cvPos=sample(nchar(txSeq),num_frag,replace=TRUE)
cv.fre=table(cvPos)
cv.point=as.integer(names(cv.fre))
startP=1
endP=nchar(txSeq)
r = substring(txSeq,cv.point)
l= substring(txSeq,1,cv.point-1)
l[which((cv.point-1)==0)] = "" #make sure no error for endding point =0
juction_break = nchar(r)+1
circRNA.seq = paste0(r,l)
circRNA.name = paste0(circInfo$normID[i]," ",mytx,",",txNum,",circRNA"," juction:",juction_break)
names(circRNA.seq) = circRNA.name
circRNA.seq.fa = DNAStringSet(circRNA.seq)
count = as.integer(cv.fre,names=FALSE)
sim_dir_i = paste0(sim_dir,"//sim_",i)
fasta_i = paste0(sim_dir,"//fasta_",i,".fasta")
writeXStringSet(circRNA.seq.fa, fasta_i)
unif.countmat = as.matrix(count)
simulate_experiment_countmat(fasta_i, readmat=unif.countmat, outdir=sim_dir_i,error_rate = error_rate , strand_specific=FALSE)
system(paste0("rm ",fasta_i))
}
}
TX_num=c(TX_num,txNum)
TXNAME=c( TXNAME,mytx)
}
circ_out_dir = paste(out_dir,"/",out_name,"_circRNA_data",sep="")
dir.create(circ_out_dir)
circRead1=paste(circ_out_dir,"/read_1.fasta",sep="")
circRead2=paste(circ_out_dir,"/read_2.fasta",sep="")
cmd_cat1 = paste0("for fn in $(find ",sim_dir,' -type f -name "sample_01_1.fasta"); do cat $fn >> ',circRead1,"; done")
cmd_cat2 = paste0("for fn in $(find ",sim_dir,' -type f -name "sample_01_2.fasta"); do cat $fn >> ',circRead2,"; done")
system(cmd_cat1)
system(cmd_cat2)
system(paste0("rm -r ",sim_dir))
res=cbind(circInfo,TX_num,TXNAME)
pick=res$TXNAME=="skiping" #exclude the skipped circRNA candidates
res=res[!pick,]
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.