bossa-refpkg.R
In BoSSA: A Bunch of Structure and Sequence Analysis

## ----style, echo = FALSE, results = 'asis'------------------------------------

## -----------------------------------------------------------------------------
library("rentrez")
library("httr")
library("XML")
library("ape")
library("BoSSA")
set_config(config(http_version = 0))

## ---- eval=F------------------------------------------------------------------
#  r_search <- entrez_search(db="taxonomy", term="polerovirus")
#  r_search$ids

## ---- eval=F------------------------------------------------------------------
#  r_search <- entrez_search(db="nucleotide", term="txid119164[orgn] NOT srcdb_refseq[PROP]")
#  r_search$count

## ---- eval=F------------------------------------------------------------------
#  r_search <- entrez_search(db="nucleotide", term="txid119164[orgn] NOT srcdb_refseq[PROP] AND 5000:6500[slen]")
#  r_search$count

## ---- eval=F------------------------------------------------------------------
#  r_search <- entrez_search(db="nucleotide", term="txid119164[orgn] NOT srcdb_refseq[PROP] AND 5000:6500[slen]",retmax=1000)
#  gi <- r_search$ids
#  gi[1:10]

## ---- eval=F------------------------------------------------------------------
#  r_search <- entrez_search(db="nucleotide", term="txid119164[orgn] NOT srcdb_refseq[PROP] AND 5000:6500[slen]",use_history=TRUE)
#  all_recs <- NULL
#  for(seq_start in seq(1,ceiling(r_search$count/200)*200,200)){
#       all_recs <- c(all_recs,entrez_fetch(db="nuccore", web_history=r_search$web_history,rettype="gbc",retmode="xml",retmax=200,retstart=seq_start))
#  }
#  rec_list <- do.call(c,lapply(all_recs,xmlToList))

## ---- eval=F------------------------------------------------------------------
#  write(sapply(rec_list,function(X){paste(">",X$INSDSeq_locus,"\n",X$INSDSeq_sequence,sep="")}),"polerovirus_from_genbank.fasta")

## ---- eval=F------------------------------------------------------------------
#  source_name <- t(sapply(rec_list,function(X){c(X$INSDSeq_locus,X$INSDSeq_organism)}))

## -----------------------------------------------------------------------------
tree <- read.tree(paste(find.package("BoSSA"),"/extdata/polerovirus_from_genbank_MAFFT.tre",sep=""))
plot(tree,cex=0.4,no.margin=TRUE)
nodelabels(cex=0.8)

## -----------------------------------------------------------------------------
root_tree <- root(tree,node=231,resolve=TRUE)
plot(root_tree,cex=0.4,no.margin=TRUE)

## -----------------------------------------------------------------------------
write.tree(root_tree,"polerovirus_ROOTED.tre")

## ---- eval=F------------------------------------------------------------------
#  r_search <- entrez_search(db="taxonomy", term="txid119164[orgn]",retmax=1000)
#  tax_ids <-  r_search$ids
#  write(tax_ids,"taxonomy_polerovirus.id")

## ---- eval=F------------------------------------------------------------------
#  taxo <- read.csv(paste(find.package("BoSSA"),"/extdata/polerovirus_taxonomy.csv",sep=""))
#  tax_id <- taxo$tax_id[match(source_name[,2],taxo$tax_name)]
#  info <- data.frame(seqname=source_name[,1],accession=source_name[,1],tax_id=tax_id,species=source_name[,2],is_type=rep("no",nrow(source_name)))
#  write.table(info,"polerovirus_info.csv",sep=",",row.names=FALSE)

## -----------------------------------------------------------------------------
refpkg_path <- paste(find.package("BoSSA"),"/extdata/polerovirus.refpkg",sep="")

refpkg(refpkg_path)

## ----tree1, fig.width=5, fig.height=5-----------------------------------------
refpkg(refpkg_path,type="tree",cex.text=0.3,rank_tree="species")

## ----pie1, fig.width=5, fig.height=5------------------------------------------
refpkg(refpkg_path,type="pie",rank_pie="species",cex.text=0.6)

## -----------------------------------------------------------------------------
refpkg(refpkg_path,type="krona")

## ---- eval=F------------------------------------------------------------------
#  d <- cophenetic.phylo(root_tree)
#  
#  ids <- NULL
#  for(i in 1:length(unique(info$species))){
#      usp <- unique(info$species)[i]
#    sub <- info[as.character(info$species)==usp,]
#    acc <- as.character(sub[,1])
#  
#    # the following code line prevent the code from crashing
#    # i.e. new sequences not available in the example may be uploaded
#    # when you will run the code
#    acc <- acc[!is.na(match(acc,colnames(d)))]
#  
#    if(length(acc)<=10){
#      ids <- c(ids,acc)
#    }
#    if(length(acc)>10){
#      h <- hclust(as.dist(d[acc,acc]))
#      grp <- cutree(h,k=10)
#      ids <- c(ids,names(grp)[match(1:10,grp)])
#    }
#  }
#  
#  to_remove <- root_tree$tip.label[!root_tree$tip.label%in%ids]

## ---- eval=F------------------------------------------------------------------
#  root_tree2 <- drop.tip(root_tree,to_remove)
#  write.tree(root_tree2,"polerovirus_ROOTED_SUBSAMPLED.tre")

## -----------------------------------------------------------------------------
citation("BoSSA")