# Build and Reload Package: 'Ctrl + Shift + B'
# Check Package: 'Ctrl + Shift + E'
# Test Package: 'Ctrl + Shift + T'
extract_region <- function(ref=NULL,qry_fld=NULL,temp_dir=NULL,len_thresh=NULL) {
if (is.null(ref)) stop("Please provide a fasta file containing reference sequence data")
if (is.null(temp_dir)) temp_dir=tempdir()
ref=normalizePath(ref)
temp_dir=normalizePath(temp_dir)
print("This function extracts DNA regions that best match the provided reference sequence from all sequences in a specified folder, using BLASTN.")
print("The folder should contain files with the extensions .fa .fas .fna .fasta")
print(paste0("Temporary files will be written to this location: ",temp_dir))
lst=list.files(path=qry_fld,pattern=".fa$|.fas$|.fasta|.fna",full.names = T)
nm=length(lst)
print(paste0("The data folder contains ", nm," files to be queried."))
print("Reading reference fasta file.")
tryCatch(expr={fas=Biostrings::readDNAStringSet(ref)},error=function(e){stop("ERROR: Please provide a valid fasta-format reference file.")})
print(paste0("The reference file contains ",length(fas)," sequence(s)."))
print("Generating BLAST database")
pt=normalizePath(paste0(temp_dir,"/sequences"),winslash = "\\",mustWork = F)
print(pt)
pt2=normalizePath(paste0(temp_dir,"/results"),winslash = "\\",mustWork = F)
print(pt2)
system(paste0("makeblastdb -dbtype nucl -input_type fasta -in ",ref," -out ",pt))
print("Done.")
print("BLASTing all query sequences, and extracting the best hits")
res=list()
for (x in 1:nm){
print(paste0("BLASTing sequence ",x))
print(paste0("blastn -num_threads 4 -query ",normalizePath(lst[x])," -db ",pt," -outfmt '6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq' -out ",pt2))
system(paste0("blastn -num_threads 4 -query ",normalizePath(lst[x])," -db ",pt," -outfmt \"6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq\" -out ",pt2))
#
if(file.info(pt2)$size>0){
tab=read.table(pt2,header=F,stringsAsFactors = F)
colnames(tab)=c("qaccver", "saccver", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qseq", "sseq")
res[[x]]= tab %>% dplyr::group_by(qaccver) %>% dplyr::summarise(best_hit=saccver[order(evalue,-rank(bitscore))[1]],qseq=qseq[order(evalue,-rank(bitscore))[1]],score=evalue[order(evalue,-rank(bitscore))[1]],length=length[order(evalue,-rank(bitscore))[1]],file=basename(lst[x]))
}else{
print(paste0("Sequence ",x," (",basename(lst[x]),") has no BLAST hits"))
}
}
res=do.call(rbind,res)
if(!is.null(len_thresh)){
res=res[res$length>len_thresh,]
}
return(res)
}
BLAST_single_ref <- function(ref=NULL,qry_fld=NULL,temp_dir=NULL,len_thresh=NULL,ident_thresh=NULL) {
if (is.null(ref)) stop("Please provide a fasta file containing reference sequence data")
if (is.null(temp_dir)) temp_dir=tempdir()
ref=normalizePath(ref)
temp_dir=normalizePath(temp_dir)
print("This function extracts DNA regions that best match the provided reference sequence from all sequences in a specified folder, using BLASTN.")
print("The folder should contain files with the extensions .fa .fas .fna .fasta")
print(paste0("Temporary files will be written to this location: ",temp_dir))
lst=list.files(path=qry_fld,pattern=".fa$|.fas$|.fasta|.fna",full.names = T)
nm=length(lst)
print(paste0("The data folder contains ", nm," files to be queried."))
print("Reading reference fasta file.")
tryCatch(expr={fas=Biostrings::readDNAStringSet(ref)},error=function(e){stop("ERROR: Please provide a valid fasta-format reference file.")})
print(paste0("The reference file contains ",length(fas)," sequence(s)."))
print("Generating BLAST database")
pt=normalizePath(paste0(temp_dir,"/sequences"),winslash = "\\",mustWork = F)
print(pt)
pt2=normalizePath(paste0(temp_dir,"/results"),winslash = "\\",mustWork = F)
print(pt2)
system(paste0("makeblastdb -dbtype nucl -input_type fasta -in ",ref," -out ",pt))
print("Done.")
print("BLASTing all query sequences, and extracting the best hits")
res=list()
for (x in 1:nm){
print(paste0("BLASTing sequence ",x))
print(paste0("blastn -num_threads 4 -query ",normalizePath(lst[x])," -db ",pt," -outfmt '6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq' -out ",pt2))
system(paste0("blastn -num_threads 4 -query ",normalizePath(lst[x])," -db ",pt," -outfmt \"6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq\" -out ",pt2))
#
if(file.info(pt2)$size>0){
tab=read.table(pt2,header=F,stringsAsFactors = F)
colnames(tab)=c("qaccver", "saccver", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qseq", "sseq")
res[[x]]= tab %>% dplyr::group_by(qaccver) %>% dplyr::summarise(best_hit=saccver[order(evalue,-rank(bitscore))[1]],
qseq=qseq[order(evalue,-rank(bitscore))[1]],
score=evalue[order(evalue,-rank(bitscore))[1]],
length=length[order(evalue,-rank(bitscore))[1]],
sstart=sstart[order(evalue,-rank(bitscore))[1]],
send=send[order(evalue,-rank(bitscore))[1]],
qstart=qstart[order(evalue,-rank(bitscore))[1]],
qend=qend[order(evalue,-rank(bitscore))[1]],
pident=pident[order(evalue,-rank(bitscore))[1]],
file=basename(lst[x]))
}else{
print(paste0("Sequence ",x," (",basename(lst[x]),") has no BLAST hits"))
}
}
res=do.call(rbind,res)
if(!is.null(len_thresh)){
res=res[res$length>len_thresh,]
}
if(!is.null(ident_thresh)){
res=res[res$pident>=ident_thresh,]
}
return(res)
}
plot_coverage=function(df=NULL,marker1=NULL,marker2=NULL){
dfp=data.frame(pos=1:max(df$send),cov=0)
for (x in 1:dim(df)[1]){
dfp$cov[df$sstart[x]:df$send[x]]=dfp$cov[df$sstart[x]:df$send[x]]+1
}
ls=c(1,which(diff(dfp$cov)!=0),max(df$send))
ltmp=lapply(1:(length(ls)-1),function(x){
data.frame(xmin=ls[x],xmax=ls[x+1],ymin=0,ymax=dfp$cov[ls[x+1]])
})
ltmp=do.call(rbind,ltmp)
a=ggplot(ltmp)+geom_rect(aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),fill="lightblue")+
geom_rect(xmin=1,xmax=max(df$send),ymin=-700,ymax=-100,fill="grey50")+
geom_text(x=max(df$send)/2,y=-450,label="Genome",size=3)+
theme_minimal()+
ylim(-1200,max(ltmp$ymax)*1.5)+
theme(legend.position="none")+
ggtitle("Coverage of different hits")+
geom_hline(yintercept = 100,colour="red",linetype="dashed")+
geom_hline(yintercept = 300,colour="yellow",linetype="dashed")+
geom_hline(yintercept = 1000,colour="green",linetype="dashed")
if(!is.null(marker1)){
a=a+geom_vline(xintercept=marker1)
}
if(!is.null(marker2)){
a=a+geom_vline(xintercept=marker2)
}
a
}
plot_similarity=function(df=NULL,marker1=NULL){
a=ggplot(df,aes(x=pident))+geom_histogram(bins=100)
if(!is.null(marker1)){
a=a+geom_vline(xintercept = marker1)
}
a
}
crop_region=function(df=NULL,start=NULL,end=NULL,len_thresh=90,ref=NULL){
df=df[df$send>(end-((end-start)*((100-len_thresh)/200))) & df$sstart<(start+((end-start)*((100-len_thresh)/200))),]
s=df$qstart+(start-df$sstart)
s[s<1]=1
e=df$qend+(end-df$send)
df2=data.frame(file=df$qaccver,qseq=gsub("-","",substr(df$qseq,s,e)))
td=tempdir()
dir.create(paste0(td,"/tmp_seq/"))
write.fasta(sequences = as.list(df2$qseq),names = as.list(df2$file),file.out = paste0(td,"/tmp_seq/cropped_sequences.fasta"))
df=BLAST_single_ref(ref=ref,qry_fld= paste0(td,"/tmp_seq/"))
system(paste0("rm -r ",td,"/tmp_seq/"))
return(df)
}
crop_similarity=function(df=NULL,pident_thresh=NULL,keep_outgroup=T){
if(keep_outgroup){
wch=which(df$pident<pident_thresh)
og=which(df$pident==max(df$pident[wch]))[1]
print("Sequence used as outgroup:")
print(df$qaccver[og])
ls=c(og,which(df$pident>pident_thresh))
} else {
ls=which(df$pident>pident_thresh)
}
df=df[ls,]
return(df)
}
align_df=function(df=NULL,temp_dir=NULL){
if (is.null(temp_dir)) temp_dir=tempdir()
print("This function aligns the sequences in the output from function extract_region using system calls to mafft.")
print(paste0("Temporary files will be written to this location: ",temp_dir))
print("Writing sequence data to disk")
seqinr::write.fasta(sequences = as.list(gsub("-| ","",df$qseq)),names = as.list(df$file),file.out = paste0(temp_dir,"/aln.fasta"),open="w")
print("Calling mafft")
setwd(temp_dir)
print(paste0("mafft --adjustdirection --thread 10 aln.fasta > aln2.fasta"))
system(paste0("mafft --adjustdirection --thread 10 aln.fasta > aln2.fasta"),intern=T)
dna=Biostrings::readDNAStringSet("aln2.fasta")
print("Masking Alignment")
dna=DNAStringSet(MaskAlignment(dna,includeTerminalGaps = T))
df=data.frame(file=names(dna),id=df$qaccver,aln=paste(dna))
un=unique(df$aln)
df$allele=match(df$aln,un)
return(df)
}
get_source_data=function(ids=NULL){
md=lapply(1:length(ids),function(y){
print(paste0(y," of ",length(ids)))
URL=paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=",ids[y],"&rettype=gb&retmode=text")
data <- tryCatch(scan(file = URL, what = "", sep = "\n", quiet = TRUE),error=function(cond){NA},warning=function(cond){NA})
if(!any(is.na(data))){
pos1=which(unlist(sapply(1:length(data),function(x) length(grep(" source",data[x]))==1)))+1
pos2=which(unlist(sapply(1:length(data),function(x) length(grep(" ",data[x]))==0)) & sapply(1:length(data),function(x) x>pos1))[1]-1
tmp=lapply(pos1:pos2,function(x){
tmp=str_split(str_match(data[x], "^ /(.*?)\"$")[[2]],"=\"",simplify = T)
dftmp=data.frame(tmp=tmp[2])
colnames(dftmp)=tmp[1]
return(dftmp)
})
return(cbind(data.frame(id=ids[y],stringsAsFactors = F),bind_cols(tmp[!is.na(unlist(tmp))])))
}else{
return(data.frame(id=ids[y],stringsAsFactors = F))
}
})
md=bind_rows(md)
return(md[,!grepl("V[0-9]",colnames(md))])
}
write_alleles=function(df=NULL,file.out=NULL){
un=unique(df$aln)
cor=data.frame(id=df$id,allele=paste0("allele_",match(df$aln,un)),stringsAsFactors = F)
write.table(cor,paste0(dirname(file.out),"/allele_correspondence.csv"),row.names = F)
write.fasta(as.list(un),lapply(1:length(un),function(x) paste0("allele_",x)),file.out)
}
make_tree=function(df=NULL,dir_out=NULL,outgroup=NULL){
dna=DNAStringSet(df$aln)
names(dna)=df$id
tf=tempfile(fileext = ".fa")
dna=DNAStringSet(MaskAlignment(dna,includeTerminalGaps = T))
writeXStringSet(dna,paste0(dir_out,"/alignment_for_tree.fa"),format="fasta")
system(paste0("iqtree -T AUTO -o ",outgroup," -s ",dir_out, "/",tf))
tree=read.tree(paste0(dir_out, "/",".treefile"))
return(tree)
}
make_contigs = function(tab=NULL,outfld=NULL){
tab=read.table(tab,header=T)
res=sapply(1:dim(tab)[1],function(x){
print(paste0("Attempting to generate contig for sequence ",tab$name[x]," using method M2"))
sancon=sangeranalyseR::SangerContig(parentDirectory = tab$folder[x],
suffixForwardRegExp = tab$forward[x],
suffixReverseRegExp = tab$reverse[x],
TrimmingMethod = "M2",
processorsNum=2,
M1TrimmingCutoff = NULL,
M2CutoffQualityScore = 40,
M2SlidingWindowSize = 30,
contigName = tab$name[x])
if(length(sancon@contigSeq)==0){
print("Method M2 was NOT SUCCESSFUL")
return(0)
}else{
print("Writing contig to file")
sangeranalyseR::writeFastaSC(sancon,outputDir = tab$folder[x],selection = "contig")
return(1)
}
}
)
system(paste(c("cat",paste0(tab$folder[which(res==1)],tab$name[which(res==1)],"_contig.fa"), ">",paste0(outfld,"sequences.fasta")),collapse=" "))
}
align_seqs=function(list=NULL,outfld=NULL){
tmp=tempfile(tmpdir = outfld)
tmp2=tempfile(tmpdir = outfld)
system(paste0("cat ",paste(list,collapse=" ")," > ",tmp))
system(paste0("mafft --auto --thread 2 --adjustdirectionaccurately ",tmp," > ",tmp2))
system(paste0("cp ",tmp2," ",outfld,"/aligned_sequences.fasta"))
system(paste0("rm ",tmp," ",tmp2))
}
get_dists=function(aln=NULL,fileout=NULL){
tmp=tempfile()
system(paste0("snp-dists -b ",aln," > ",tmp))
tab=as.matrix(read.table(tmp,header=T))
tab2=reshape2::melt(tab, varnames = c("From", "To"),value.name = "SNPs",na.rm=T)
write.table(tab2,fileout,row.names = F)
return(tab2)
}
get_tips=function(tree=NULL){
d = fortify(tree)
d = subset(d, isTip)
tips=rev(with(d, label[order(y, decreasing=T)]))
return(tips)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.