#internal function to simulate tandemRNA from tandemInfo input
simulate_tandem <- function(tandemInfo,error_rate,genes.exon.all,fasta_genome,tx.all.fasta,tx.all.NAME,out_name,out_dir="./"){
tandem.tx.fa=DNAStringSet("ATGC")
tandemInfo$normID=paste(tandemInfo$Chr,tandemInfo$start_EXONSTART,tandemInfo$end_EXONEND,sep="__")
#tandemInfo$cCount= sample(100:1000,nrow(tandemInfo), replace = TRUE)
for(i in 1: nrow(tandemInfo)){
my_tandem=tandemInfo[i,]
mynormID=my_tandem$normID
tandem_gene=as.character(my_tandem$GENEID)
tandem_exonInfo=genes.exon.all[genes.exon.all$GENEID==tandem_gene,]
#get all exons completely inside the boundary
pick1=tandem_exonInfo$EXONSTART == my_tandem$start_EXONSTART
pick2=tandem_exonInfo$EXONEND == my_tandem$end_EXONEND
tandem_tx=intersect(unique(tandem_exonInfo[pick1,]$TXNAME), unique(tandem_exonInfo[pick2,]$TXNAME))
#select a single random transcript and generate tandem duplication from that transcript
txNum=length(tandem_tx)
start=my_tandem$start_EXONSTART
end=my_tandem$end_EXONEND
mytandem.tx.fa=DNAStringSet("ATGC") # starting tandem.tx.fa
if (txNum > 0){mytx=sample(tandem_tx,1)}else{next}
#Only genertate tandem duplication if existing transcripts containing both EXONSTART and EXONEND
for (atx in mytx){ #do for every tx
mytxex=tandem_exonInfo[tandem_exonInfo$TXNAME==atx,]
mytxex=mytxex[order(mytxex$EXONSTART),]
mytxex.len = mytxex$LENGTH #
#covert to transcript location
mytxex.end.all =cumsum(mytxex$LENGTH)
mytxex.stat.all = mytxex.end.all - mytxex.len +1
#select the coding area inside the circRNA region
pick= which(mytxex$EXONSTART >= start & mytxex$EXONEND <= end) #pick by indexes of exons #
startR= mytxex.stat.all[pick]
endR= mytxex.end.all[pick]
## generate pseudo circularRNAs sequences
# - strand
if( mytxex$EXONSTRAND[1] == "-"){
#pick transcript sequence
transcript = convertReverseComplement(as.character(tx.all.fasta[which(tx.all.NAME == atx)]))
}else{
transcript = as.character(tx.all.fasta[which(tx.all.NAME == atx)])
}
x=substring(transcript,startR,endR)
circSeq=paste(x,collapse="") #
#pick the left side
pick.Left=which(mytxex$EXONEND < start) #pick by indexes
x=""
start.Left= mytxex.stat.all[pick.Left]
end.Left= mytxex.end.all[pick.Left]
if(length(pick.Left)>0) x = substring(transcript,start.Left,end.Left)
tandem_left.seq=paste0(x,collapse ="")
#pick the right side
pick.Right=which(mytxex$EXONSTART > end)#pick by indexes
x=""
start.Right= mytxex.stat.all[pick.Right]
end.Right= mytxex.end.all[pick.Right]
if(length(pick.Right)>0) x = substring(transcript,start.Right,end.Right)
tandem_right.seq=paste0(x,collapse ="") #
#record tandem break point
##paste left side + circSeq and count nchar()
A.seq = paste0(tandem_left.seq,circSeq)
#tamdem_break[i] = as.numeric(nchar(A.seq))
dup_point = as.numeric(nchar(A.seq))+1
#tandem_normID = my_tandem$normID[1]
tandem_stat = as.numeric(nchar(tandem_left.seq))+1
tandem_end = dup_point + as.numeric(nchar(circSeq))
# finish tandem.seq by pasting again circSeq + tandem_right.seq
tandem.seq = paste0(A.seq,circSeq,tandem_right.seq)
##convert back to the correct strand for - strand
#if( mytxex$EXONSTRAND[1] == "-") tandem.seq=convertReverseComplement(tandem.seq)
#create tandemRNA.fa and return tadem.tx.fa
tandem.name=paste(mynormID," ",atx," ",txNum," tandemRNA ",tandem_stat," ",dup_point," ",tandem_end, sep="")
tandem.tx = DNAStringSet(tandem.seq)
names(tandem.tx)= tandem.name
mytandem.tx.fa=c(mytandem.tx.fa,tandem.tx)
}
mytandem.tx.fa=mytandem.tx.fa[-1]
tandem.tx.fa=c(tandem.tx.fa,mytandem.tx.fa)
}
tandem.tx.fa=tandem.tx.fa[-1]
simulated_info=lapply(names(tandem.tx.fa), function(x){
y=unlist(strsplit(x," "))
return(y)
})
#update tandemInfo
simulated_info=do.call(rbind,simulated_info)
simulated_info= as.data.frame(simulated_info,stringsAsFactors=FALSE)
colnames(simulated_info)= c("normID","TXNAME","TX_num","tag","tandem_stat","tandem_break","tandem_end")
tandemInfo = tandemInfo[tandemInfo$normID %in% simulated_info$normID,]
tandemInfo= cbind(tandemInfo,simulated_info[,2:7])
#export to file
#to RData #
#save(tandem.tx.fa, tandemInfo, file=paste(out_name,"_tandem_info.RData",sep="")) #
#to fa
exportFasta=c(tandem.tx.fa) #
fastaFile_tandem=paste(out_dir,"/",out_name,"_tandem.fa",sep="")
writeXStringSet(exportFasta, fastaFile_tandem) #
#simulation for tandem dataset
simPosdir = paste(out_dir,"/",out_name,"_tandem_data",sep="") #
library("polyester")
library("Biostrings")
unif.countmat=tandemInfo$cCount #
unif.countmat=as.matrix(unif.countmat)
simulate_experiment_countmat(fastaFile_tandem, readmat=unif.countmat, outdir=simPosdir,error_rate = error_rate, strand_specific=FALSE) #
return(tandemInfo)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.