tmp/runTS.R

#-----------------------------------------------------------------------------#
# Parameters for the analysis, uncomment and edit them if you prefer to leave
# it automated for future use.

#Root dir where TS is installed

setwd("/Users/chico/projects/TaxonSampling/TS/")



## Input/output files

#path to tabular file linking NCBI taxon IDs to sequence IDs
idsFile <- "data/metadata/TaxID2SeqID.txt"

#path to multi-fasta file from where sequences should be sampled
multifasta <- "data/fasta/mit_vertebrata.fasta"

#path to file linking NCBI taxon IDs to sequence IDs
knownSppFile <- "data/metadata/TaxID2sppCounts.tsv"

#output directory
outDir <- "results/"

#path to output file
outFile <- paste0(outDir, "output_TS.fasta")

#path to NCBI taxonomy files (execute "install.sh" to automatically download them)
taxondir <- "data/taxdump/"


## Parameters for TS

# where should we start looking? (root node where to start algorithm)

#mammalia
root_taxon <- 40674

#vertebrata
#root_taxon <- 7742

#whether/how to randomize (options are "yes", "no" and "after_first_round")
#randomize <- "yes"
#randomize <- "no"
randomize <- "after_first_round"

#sampling procedure (either "agnostic" or "known_species")
sampling <- "agnostic"
#sampling <- "known_species"

#number of sequences to sample
m <- 200

#IDs to be ignored during sampling procedure (either terminal or internal taxons)
ignoreIDs <- NULL

#required IDs to be present in final output file (only terminal taxa - species)
#requireIDs <- c(2026169, 8364, 57393, 241292, 61967)
requireIDs <- NULL


#from here on, users do not need to edit this file
#-----------------------------------------------------------------------------#

source("bin/TaxonSampling/TaxonSampling.R")

# These variables are highly experimental and probably
# will not to work, but future versions of TS are likely to support
# them, so we decided to keep them here for now.

#method to use (either 'diversity' or 'balance').
#"Balance" is heavily outdated, and currently does not support
#distinct sampling strategies. We haven't been using it for a
#while.
method <- "diversity"

# should we ignore the following non-leaf IDs while sampling?
ignoreNonLeafID <- NULL

#allow TS to repeat IDs if needed? ('no' is better to get a higher diversity)
replacement <- "no"

#get all nodes from NCBI taxonomy
print("Parsing NCBI Taxonomy data")
nodes <- suppressMessages(getnodes(taxondir))
print("Done.")

print("Computing sequence counts per taxon")
#count how many sequences per node
countIDs <- TS_TaxonomyData(idsFile, nodes)
print("Done.")

print("Computing species count per taxon")
#count how many known spp per node
countSpp <- TS_SpeciesData(knownSppFile, countIDs)
print("Done.")

print("Simplifying taxonomic space")
#simplify taxonomic space by
#1) removing nodes that are not children of root node
#2) removing nodes that have no sequences to be sampled in current sequence database
nodes <- Simplify_Nodes(nodes, countIDs)
print("Done.")

#list to store output IDs
outputIDs1 <- list()

#selecting user-defined IDs that must be present in output file
if (!is.null(requireIDs)) {
  total_requiredIDs <- length(requireIDs)
  if (!all(is.element(requireIDs, nodes[,1]))) {
    stop("One of the required IDs provided is not present in database")
  }
  outputIDs1 <- intersect(requireIDs, nodes[,1])
  m <- m - total_requiredIDs
}

#no required IDs anymore
requireIDs <- NULL

#creating output directory, if needed
if (dir.exists(outDir)){
} else {
  dir.create(file.path(outDir), recursive=TRUE)
}

print("Running TaxonSampling")
outputIDs2 <- TS_Algorithm(root_taxon, m, nodes, countIDs, method, randomize,
                           replacement, ignoreIDs, requireIDs,
                           ignoreNonLeafID, sampling)
print("Done.")

print("Writing output file")

#Merging user-defined IDs with the ones selectec by TS
outputIDs <- c(outputIDs1, outputIDs2)
#Writing output fasta file
WriteFasta(idsFile, multifasta, outputIDs, outFile)
print("Done.")
print("TaxonSampling analysis finished.")
fcampelo/TaxonSampling documentation built on Jan. 29, 2022, 7:11 a.m.