Nothing
#' Load tables generated by \code{sqm2tables.py}, \code{sqmreads2tables.py} or \code{combine-sqm-tables.py} into R.
#'
#' This function takes the path to the output directory generated by \code{sqm2tables.py}, \code{sqmreads2tables.py} or \code{combine-sqm-tables.py} a SQMlite object.
#' The SQMlite object will contain taxonomic and functional profiles, but no detailed information on ORFs, contigs or bins. However, it will also have a much smaller memory footprint.
#' A SQMlite object can be used for plotting and exporting, but it can not be subsetted.
#'
#' @section The SQMlite object structure:
#' {
#' The SQMlite object is a nested list which contains the following information:
#' \tabular{lllllll}{
#' \bold{lvl1} \tab \bold{lvl2} \tab \bold{lvl3} \tab \bold{type} \tab \bold{rows/names} \tab \bold{columns} \tab \bold{data} \cr
#' \bold{$taxa} \tab \bold{$superkingdom} \tab \bold{$abund} \tab \emph{numeric matrix} \tab superkingdoms \tab samples \tab abundances \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab superkingdoms \tab samples \tab percentages \cr
#' \tab \bold{$phylum} \tab \bold{$abund} \tab \emph{numeric matrix} \tab phyla \tab samples \tab abundances \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab phyla \tab samples \tab percentages \cr
#' \tab \bold{$class} \tab \bold{$abund} \tab \emph{numeric matrix} \tab classes \tab samples \tab abundances \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab classes \tab samples \tab percentages \cr
#' \tab \bold{$order} \tab \bold{$abund} \tab \emph{numeric matrix} \tab orders \tab samples \tab abundances \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab orders \tab samples \tab percentages \cr
#' \tab \bold{$family} \tab \bold{$abund} \tab \emph{numeric matrix} \tab families \tab samples \tab abundances \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab families \tab samples \tab percentages \cr
#' \tab \bold{$genus} \tab \bold{$abund} \tab \emph{numeric matrix} \tab genera \tab samples \tab abundances \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab genera \tab samples \tab percentages \cr
#' \tab \bold{$species} \tab \bold{$abund} \tab \emph{numeric matrix} \tab species \tab samples \tab abundances \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab species \tab samples \tab percentages \cr
#' \bold{$functions} \tab \bold{$KEGG} \tab \bold{$abund} \tab \emph{numeric matrix} \tab KEGG ids \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$bases} \tab \emph{numeric matrix} \tab KEGG ids \tab samples \tab abundances (bases) \cr
#' \tab \tab \bold{$tpm} \tab \emph{numeric matrix} \tab KEGG ids \tab samples \tab tpm \cr
#' \tab \tab \bold{$copy_number} \tab \emph{numeric matrix} \tab KEGG ids \tab samples \tab avg. copies \cr
#' \tab \bold{$COG} \tab \bold{$abund} \tab \emph{numeric matrix} \tab COG ids \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$bases} \tab \emph{numeric matrix} \tab COG ids \tab samples \tab abundances (bases) \cr
#' \tab \tab \bold{$tpm} \tab \emph{numeric matrix} \tab COG ids \tab samples \tab tpm \cr
#' \tab \tab \bold{$copy_number} \tab \emph{numeric matrix} \tab COG ids \tab samples \tab avg. copies \cr
#' \tab \bold{$PFAM} \tab \bold{$abund} \tab \emph{numeric matrix} \tab PFAM ids \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$bases} \tab \emph{numeric matrix} \tab PFAM ids \tab samples \tab abundances (bases) \cr
#' \tab \tab \bold{$tpm} \tab \emph{numeric matrix} \tab PFAM ids \tab samples \tab tpm \cr
#' \tab \tab \bold{$copy_number} \tab \emph{numeric matrix} \tab PFAM ids \tab samples \tab avg. copies \cr
#' \bold{$total_reads} \tab \tab \tab \emph{numeric vector} \tab samples \tab (n/a) \tab total reads \cr
#' \bold{$misc} \tab \bold{$project_name} \tab \tab \emph{character vector} \tab (empty) \tab (n/a) \tab project name \cr
#' \tab \bold{$samples} \tab \tab \emph{character vector} \tab (empty) \tab (n/a) \tab samples \cr
#' \tab \bold{$tax_names_long} \tab \bold{$superkingdom} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$phylum} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$class} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$order} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$family} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$genus} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$species} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \bold{$tax_names_short} \tab \tab \emph{character vector} \tab full names \tab (n/a) \tab short names \cr
#' \tab \bold{$KEGG_names} \tab \tab \emph{character vector} \tab KEGG ids \tab (n/a) \tab KEGG names \cr
#' \tab \bold{$KEGG_paths} \tab \tab \emph{character vector} \tab KEGG ids \tab (n/a) \tab KEGG hiararchy \cr
#' \tab \bold{$COG_names} \tab \tab \emph{character vector} \tab COG ids \tab (n/a) \tab COG names \cr
#' \tab \bold{$COG_paths} \tab \tab \emph{character vector} \tab COG ids \tab (n/a) \tab COG hierarchy \cr
#' \tab \bold{$ext_annot_sources} \tab \tab \emph{character vector} \tab (empty) \tab (n/a) \tab external databases \cr
#' }
#' If external databases for functional classification were provided to SqueezeMeta or SqueezeMeta_reads via the \code{-extdb} argument, the corresponding abundance, tpm and copy number profiles will be present in \code{SQM$functions} (e.g. results for the CAZy database would be present in \code{SQM$functions$CAZy}). Additionally, the extended names of the features present in the external database will be present in \code{SQM$misc} (e.g. \code{SQM$misc$CAZy_names}). Note that results generated by SqueezeMeta_reads will contain only read abundances, but not bases, tpm or copy number estimations.
#' }
#' @param tables_path character, tables directory generated by \code{sqm2table.py}, \code{sqmreads2tables.py} or \code{combine-sqm-tables.py}.
#' @param tax_mode character, which taxonomic classification should be loaded? SqueezeMeta applies the identity thresholds described in \href{https://pubmed.ncbi.nlm.nih.gov/24589583/}{Luo \emph{et al.}, 2014}. Use \code{allfilter} for applying the minimum identity threshold to all taxa (default), \code{prokfilter} for applying the threshold to Bacteria and Archaea, but not to Eukaryotes, and \code{nofilter} for applying no thresholds at all.
#' @return SQMlite object containing the parsed tables.
#' @seealso \code{\link{plotBars}} and \code{\link{plotFunctions}} will plot the most abundant taxa and functions in a SQMlite object. \code{\link{exportKrona}} will generate Krona charts reporting the taxonomy in a SQMlite object.
#' @examples
#' \dontrun{
#' ## (outside R)
#' ## Run SqueezeMeta on the test data.
#' /path/to/SqueezeMeta/scripts/SqueezeMeta.pl -p Hadza -f raw -m coassembly -s test.samples
#' ## Generate the tabular outputs!
#' /path/to/SqueezeMeta/utils/sqm2tables.py Hadza Hadza/results/tables
#' ## Now go into R.
#' library(SQMtools)
#' Hadza = loadSQMlite("Hadza/results/tables")
#' # Where Hadza is the path to the SqueezeMeta output directory.
#' # Note that this is not the whole SQM project, just the directory containing the tables.
#' # It would also work with tables generated by sqmreads2tables.py, or combine-sqm-tables.py
#' plotTaxonomy(Hadza)
#' plotFunctions(Hadza)
#' exportKrona(Hadza, 'myKronaTest.html')
#' }
#' @importFrom utils read.table
#' @export
loadSQMlite = function(tables_path, tax_mode = 'allfilter')
{
if(!tax_mode %in% c('allfilter', 'prokfilter', 'nofilter'))
{
stop('tax_mode must be either "allfilter" (apply minimum identity threshold for all taxa), "prokfilter" (don\'t apply thresholds to Eukaryotes) or "nofilter" (don\'t apply thresholds at all).')
}
SQM = list()
allFiles = strsplit(list.files(tables_path), '.', fixed=TRUE)
if(length(allFiles) == 0)
{
stop(sprintf('Directory "%s" does not seem to contain valid SqueezeMeta tables', tables_path))
}
project_name = allFiles[sapply(allFiles, function(x) x[2] == 'superkingdom' & x[3] =='allfilter' & x[4] == 'abund' & x[5] == 'tsv')][[1]][1]
SQM$misc = list()
SQM$misc$project_name = project_name
message('Loading taxonomies\n')
SQM$taxa = list()
SQM$taxa$superkingdom = list()
SQM$taxa$phylum = list()
SQM$taxa$class = list()
SQM$taxa$order = list()
SQM$taxa$family = list()
SQM$taxa$genus = list()
SQM$taxa$species = list()
### Check that this is a valid SQM project.
if(is.null(project_name))
{
stop(sprintf('Directory "%s" does not seem to contain valid SqueezeMeta tables', tables_path))
}
SQM$taxa$superkingdom$abund = as.matrix(read.table(sprintf('%s/%s.superkingdom.%s.abund.tsv', tables_path, project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$phylum$abund = as.matrix(read.table(sprintf('%s/%s.phylum.%s.abund.tsv', tables_path, project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$class$abund = as.matrix(read.table(sprintf('%s/%s.class.%s.abund.tsv', tables_path, project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$order$abund = as.matrix(read.table(sprintf('%s/%s.order.%s.abund.tsv', tables_path, project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$family$abund = as.matrix(read.table(sprintf('%s/%s.family.%s.abund.tsv', tables_path, project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$genus$abund = as.matrix(read.table(sprintf('%s/%s.genus.%s.abund.tsv', tables_path, project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$species$abund = as.matrix(read.table(sprintf('%s/%s.species.%s.abund.tsv', tables_path, project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$superkingdom$percent = 100 * t(t(SQM$taxa$superkingdom$abund) / colSums(SQM$taxa$superkingdom$abund))
SQM$taxa$phylum$percent = 100 * t(t(SQM$taxa$phylum$abund) / colSums(SQM$taxa$phylum$abund ))
SQM$taxa$class$percent = 100 * t(t(SQM$taxa$class$abund) / colSums(SQM$taxa$class$abund ))
SQM$taxa$order$percent = 100 * t(t(SQM$taxa$order$abund) / colSums(SQM$taxa$order$abund ))
SQM$taxa$family$percent = 100 * t(t(SQM$taxa$family$abund) / colSums(SQM$taxa$family$abund ))
SQM$taxa$genus$percent = 100 * t(t(SQM$taxa$genus$abund) / colSums(SQM$taxa$genus$abund ))
SQM$taxa$species$percent = 100 * t(t(SQM$taxa$species$abund) / colSums(SQM$taxa$species$abund ))
SQM$misc$tax_names_long = list()
SQM$misc$tax_names_long$superkingdom = rownames(SQM$tax$superkingdom$abund)
names(SQM$misc$tax_names_long$superkingdom) = gsub('^k_', '', SQM$misc$tax_names_long$superkingdom)
SQM$misc$tax_names_long$phylum = rownames(SQM$tax$phylum$abund)
names(SQM$misc$tax_names_long$phylum) = sapply(strsplit(SQM$misc$tax_names_long$phylum, split=';p_'), FUN = function(x) x[2])
SQM$misc$tax_names_long$class = rownames(SQM$tax$class$abund)
names(SQM$misc$tax_names_long$class) = sapply(strsplit(SQM$misc$tax_names_long$class, split=';c_'), FUN = function(x) x[2])
SQM$misc$tax_names_long$order = rownames(SQM$tax$order$abund)
names(SQM$misc$tax_names_long$order) = sapply(strsplit(SQM$misc$tax_names_long$order, split=';o_'), FUN = function(x) x[2])
SQM$misc$tax_names_long$family = rownames(SQM$tax$family$abund)
names(SQM$misc$tax_names_long$family) = sapply(strsplit(SQM$misc$tax_names_long$family, split=';f_'), FUN = function(x) x[2])
SQM$misc$tax_names_long$genus = rownames(SQM$tax$genus$abund)
names(SQM$misc$tax_names_long$genus) = sapply(strsplit(SQM$misc$tax_names_long$genus, split=';g_'), FUN = function(x) x[2])
SQM$misc$tax_names_long$species = rownames(SQM$tax$species$abund)
names(SQM$misc$tax_names_long$species) = sapply(strsplit(SQM$misc$tax_names_long$species, split=';s_'), FUN = function(x) x[2])
SQM$misc$tax_names_short = unlist(lapply(SQM$misc$tax_names_long, names))
names(SQM$misc$tax_names_short) = unlist(SQM$misc$tax_names_long)
# Use short names for taxonomy tables, since it makes it easier to search for specific taxa
rownames(SQM$taxa$superkingdom$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$superkingdom$abund)]
rownames(SQM$taxa$phylum$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$phylum$abund)]
rownames(SQM$taxa$class$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$class$abund)]
rownames(SQM$taxa$order$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$order$abund)]
rownames(SQM$taxa$family$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$family$abund)]
rownames(SQM$taxa$genus$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$genus$abund)]
rownames(SQM$taxa$species$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$species$abund)]
rownames(SQM$taxa$superkingdom$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$superkingdom$percent)]
rownames(SQM$taxa$phylum$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$phylum$percent)]
rownames(SQM$taxa$class$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$class$percent)]
rownames(SQM$taxa$order$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$order$percent)]
rownames(SQM$taxa$family$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$family$percent)]
rownames(SQM$taxa$genus$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$genus$percent)]
rownames(SQM$taxa$species$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$species$percent)]
# Now the rownames are a character vector which actually has its own names inside. Some R packages don't like this, so we correct it.
for(taxlevel in names(SQM$taxa))
{
for(counts in names(SQM$taxa[[taxlevel]]))
{
names(rownames(SQM$taxa[[taxlevel]][[counts]])) = NULL
}
}
message('Loading functions\n')
SQM$functions = list()
funAbundFiles = allFiles[sapply(allFiles, function(x) x[1] == project_name & x[3] == 'abund' & x[4] == 'tsv')]
funMethods = sapply(funAbundFiles, function(x) x[2])
for(method in funMethods)
{
if(method == 'KO') { methodn = 'KEGG' } else { methodn = method }
SQM$functions[[methodn]] = list()
methodFiles = allFiles[sapply(allFiles, function(x) x[1] == project_name & x[2] == method & x[4] == 'tsv')]
for(fvec in methodFiles)
{
fname = paste(fvec, collapse = '.')
count = fvec[3]
if(count == 'copyNumber') { count = 'copy_number' }
if(count == 'names')
{
fieldn = sprintf('%s_names', methodn)
fieldp = sprintf('%s_paths', methodn)
ta = read.table(sprintf('%s/%s', tables_path, fname),
header=TRUE, row.names=1, sep='\t', comment.char='', quote='', as.is=TRUE)
SQM$misc[[fieldn]] = ta[,'Name']
names(SQM$misc[[fieldn]]) = rownames(ta)
if(methodn %in% c('KEGG', 'COG'))
{
SQM$misc[[fieldp]] = ta[,'Path']
names(SQM$misc[[fieldp]]) = rownames(ta) # :'(
}
}else
{
SQM$functions[[methodn]][[count]] = as.matrix(read.table(sprintf('%s/%s', tables_path, fname),
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
}
}
}
SQM$misc$ext_annot_sources = funMethods[!funMethods %in% c('KO', 'COG', 'PFAM')]
### Finish.
SQM$misc$samples = colnames(SQM$tax$superkingdom$abund) # This should contain all samples, user is responsible for inconsistencies in the data.
class(SQM) = 'SQMlite'
return(SQM)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.