Nothing
library(reshape2)
library(data.table)
#' Load a SqueezeMeta project into R
#'
#' This function takes the path to a project directory generated by \href{https://github.com/jtamames/SqueezeMeta}{SqueezeMeta} (whose name is specified in the \code{-p} parameter of the SqueezeMeta.pl script) and parses the results into a SQM object. Alternatively, it can load the project data from a zip file produced by \code{sqm2zip.py}.
#'
#' @section Prerequisites:
#' {
#' Run \href{https://github.com/jtamames/SqueezeMeta}{SqueezeMeta}! An example call for running it would be:
#'
#' \code{/path/to/SqueezeMeta/scripts/SqueezeMeta.pl}\cr
#' \code{-m coassembly -f fastq_dir -s samples_file -p project_dir}
#' }
#'
#' @section The SQM object structure:
#' {
#' The SQM object is a nested list which contains the following information:
#' \tabular{lllllll}{
#' \bold{lvl1} \tab \bold{lvl2} \tab \bold{lvl3} \tab \bold{type} \tab \bold{rows/names} \tab \bold{columns} \tab \bold{data} \cr
#' \bold{$orfs} \tab \bold{$table} \tab \tab \emph{dataframe} \tab orfs \tab misc. data \tab misc. data \cr
#' \tab \bold{$abund} \tab \tab \emph{numeric matrix} \tab orfs \tab samples \tab abundances (reads) \cr
#' \tab \bold{$bases} \tab \tab \emph{numeric matrix} \tab orfs \tab samples \tab abundances (bases) \cr
#' \tab \bold{$cov} \tab \tab \emph{numeric matrix} \tab orfs \tab samples \tab coverages \cr
#' \tab \bold{$cpm} \tab \tab \emph{numeric matrix} \tab orfs \tab samples \tab covs. / 10^6 reads \cr
#' \tab \bold{$tpm} \tab \tab \emph{numeric matrix} \tab orfs \tab samples \tab tpm \cr
#' \tab \bold{$seqs} \tab \tab \emph{character vector} \tab orfs \tab (n/a) \tab sequences \cr
#' \tab \bold{$tax} \tab \tab \emph{character matrix} \tab orfs \tab tax. ranks \tab taxonomy \cr
#' \tab \bold{$tax16S} \tab \tab \emph{character vector} \tab orfs \tab (n/a) \tab 16S rRNA taxonomy \cr
#' \tab \bold{$markers} \tab \tab \emph{list} \tab orfs \tab (n/a) \tab CheckM1 markers \cr
#' \bold{$contigs} \tab \bold{$table} \tab \tab \emph{dataframe} \tab contigs \tab misc. data \tab misc. data \cr
#' \tab \bold{$abund} \tab \tab \emph{numeric matrix} \tab contigs \tab samples \tab abundances (reads) \cr
#' \tab \bold{$bases} \tab \tab \emph{numeric matrix} \tab contigs \tab samples \tab abundances (bases) \cr
#' \tab \bold{$cov} \tab \tab \emph{numeric matrix} \tab contigs \tab samples \tab coverages \cr
#' \tab \bold{$cpm} \tab \tab \emph{numeric matrix} \tab contigs \tab samples \tab covs. / 10^6 reads \cr
#' \tab \bold{$tpm} \tab \tab \emph{numeric matrix} \tab contigs \tab samples \tab tpm \cr
#' \tab \bold{$seqs} \tab \tab \emph{character vector} \tab contigs \tab (n/a) \tab sequences \cr
#' \tab \bold{$tax} \tab \tab \emph{character matrix} \tab contigs \tab tax. ranks \tab taxonomies \cr
#' \tab \bold{$bins} \tab \tab \emph{character matrix} \tab contigs \tab bin. methods \tab bins \cr
#' $bins \tab \bold{$table} \tab \tab \emph{dataframe} \tab bins \tab misc. data \tab misc. data \cr
#' \tab \bold{$length} \tab \tab \emph{numeric vector} \tab bins \tab (n/a) \tab length \cr
#' \tab \bold{$abund} \tab \tab \emph{numeric matrix} \tab bins \tab samples \tab abundances (reads) \cr
#' \tab \bold{$percent} \tab \tab \emph{numeric matrix} \tab bins \tab samples \tab abundances (reads) \cr
#' \tab \bold{$bases} \tab \tab \emph{numeric matrix} \tab bins \tab samples \tab abundances (bases) \cr
#' \tab \bold{$cov} \tab \tab \emph{numeric matrix} \tab bins \tab samples \tab coverages \cr
#' \tab \bold{$cpm} \tab \tab \emph{numeric matrix} \tab bins \tab samples \tab covs. / 10^6 reads \cr
#' \tab \bold{$tax} \tab \tab \emph{character matrix} \tab bins \tab tax. ranks \tab taxonomy \cr
#' \bold{$taxa} \tab \bold{$superkingdom} \tab \bold{$abund} \tab \emph{numeric matrix} \tab superkingdoms \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab superkingdoms \tab samples \tab percentages \cr
#' \tab \bold{$phylum} \tab \bold{$abund} \tab \emph{numeric matrix} \tab phyla \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab phyla \tab samples \tab percentages \cr
#' \tab \bold{$class} \tab \bold{$abund} \tab \emph{numeric matrix} \tab classes \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab classes \tab samples \tab percentages \cr
#' \tab \bold{$order} \tab \bold{$abund} \tab \emph{numeric matrix} \tab orders \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab orders \tab samples \tab percentages \cr
#' \tab \bold{$family} \tab \bold{$abund} \tab \emph{numeric matrix} \tab families \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab families \tab samples \tab percentages \cr
#' \tab \bold{$genus} \tab \bold{$abund} \tab \emph{numeric matrix} \tab genera \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab genera \tab samples \tab percentages \cr
#' \tab \bold{$species} \tab \bold{$abund} \tab \emph{numeric matrix} \tab species \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$percent} \tab \emph{numeric matrix} \tab species \tab samples \tab percentages \cr
#' \bold{$functions} \tab \bold{$KEGG} \tab \bold{$abund} \tab \emph{numeric matrix} \tab KEGG ids \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$bases} \tab \emph{numeric matrix} \tab KEGG ids \tab samples \tab abundances (bases) \cr
#' \tab \tab \bold{$cov} \tab \emph{numeric matrix} \tab KEGG ids \tab samples \tab coverages \cr
#' \tab \tab \bold{$cpm} \tab \emph{numeric matrix} \tab KEGG ids \tab samples \tab covs. / 10^6 reads \cr
#' \tab \tab \bold{$tpm} \tab \emph{numeric matrix} \tab KEGG ids \tab samples \tab tpm \cr
#' \tab \tab \bold{$copy_number} \tab \emph{numeric matrix} \tab KEGG ids \tab samples \tab avg. copies \cr
#' \tab \bold{$COG} \tab \bold{$abund} \tab \emph{numeric matrix} \tab COG ids \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$bases} \tab \emph{numeric matrix} \tab COG ids \tab samples \tab abundances (bases) \cr
#' \tab \tab \bold{$cov} \tab \emph{numeric matrix} \tab COG ids \tab samples \tab coverages \cr
#' \tab \tab \bold{$cpm} \tab \emph{numeric matrix} \tab COG ids \tab samples \tab covs. / 10^6 reads \cr
#' \tab \tab \bold{$tpm} \tab \emph{numeric matrix} \tab COG ids \tab samples \tab tpm \cr
#' \tab \tab \bold{$copy_number} \tab \emph{numeric matrix} \tab COG ids \tab samples \tab avg. copies \cr
#' \tab \bold{$PFAM} \tab \bold{$abund} \tab \emph{numeric matrix} \tab PFAM ids \tab samples \tab abundances (reads) \cr
#' \tab \tab \bold{$bases} \tab \emph{numeric matrix} \tab PFAM ids \tab samples \tab abundances (bases) \cr
#' \tab \tab \bold{$cov} \tab \emph{numeric matrix} \tab PFAM ids \tab samples \tab coverages \cr
#' \tab \tab \bold{$cpm} \tab \emph{numeric matrix} \tab PFAM ids \tab samples \tab covs. / 10^6 reads \cr
#' \tab \tab \bold{$tpm} \tab \emph{numeric matrix} \tab PFAM ids \tab samples \tab tpm \cr
#' \tab \tab \bold{$copy_number} \tab \emph{numeric matrix} \tab PFAM ids \tab samples \tab avg. copies \cr
#' \bold{$total_reads} \tab \tab \tab \emph{numeric vector} \tab samples \tab (n/a) \tab total reads \cr
#' \bold{$misc} \tab \bold{$project_name} \tab \tab \emph{character vector} \tab (empty) \tab (n/a) \tab project name \cr
#' \tab \bold{$samples} \tab \tab \emph{character vector} \tab (empty) \tab (n/a) \tab samples \cr
#' \tab \bold{$tax_names_long} \tab \bold{$superkingdom} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$phylum} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$class} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$order} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$family} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$genus} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \tab \bold{$species} \tab \emph{character vector} \tab short names \tab (n/a) \tab full names \cr
#' \tab \bold{$tax_names_short} \tab \tab \emph{character vector} \tab full names \tab (n/a) \tab short names \cr
#' \tab \bold{$KEGG_names} \tab \tab \emph{character vector} \tab KEGG ids \tab (n/a) \tab KEGG names \cr
#' \tab \bold{$KEGG_paths} \tab \tab \emph{character vector} \tab KEGG ids \tab (n/a) \tab KEGG hiararchy \cr
#' \tab \bold{$COG_names} \tab \tab \emph{character vector} \tab COG ids \tab (n/a) \tab COG names \cr
#' \tab \bold{$COG_paths} \tab \tab \emph{character vector} \tab COG ids \tab (n/a) \tab COG hierarchy \cr
#' \tab \bold{$ext_annot_sources} \tab \tab \emph{character vector} \tab COG ids \tab (n/a) \tab external databases \cr
#' }
#' If external databases for functional classification were provided to SqueezeMeta via the \code{-extdb} argument, the corresponding abundance (reads and bases), coverages, tpm and copy number profiles will be present in \code{SQM$functions} (e.g. results for the CAZy database would be present in \code{SQM$functions$CAZy}). Additionally, the extended names of the features present in the external database will be present in \code{SQM$misc} (e.g. \code{SQM$misc$CAZy_names}).
#' }
#' @param project_path character, a vector of project directories generated by SqueezeMeta, and/or zip files generated by \code{sqm2zip.py}.
#' @param tax_mode character, which taxonomic classification should be loaded? SqueezeMeta applies the identity thresholds described in \href{https://pubmed.ncbi.nlm.nih.gov/24589583/}{Luo \emph{et al.}, 2014}. Use \code{allfilter} for applying the minimum identity threshold to all taxa, \code{prokfilter} for applying the threshold to Bacteria and Archaea, but not to Eukaryotes, and \code{nofilter} for applying no thresholds at all (default \code{prokfilter}).
#' @param trusted_functions_only logical. If \code{TRUE}, only highly trusted functional annotations (best hit + best average) will be considered when generating aggregated function tables. If \code{FALSE}, best hit annotations will be used (default \code{FALSE}). Will only have an effect if \code{project_path} is not a zip file, and \code{project_path/results/tables} is not already present.
#' @param single_copy_genes character, source of single copy genes for copy number normalization, either \code{RecA} (COG0468, RecA/RadA), \code{MGOGs} (COGs for 10 single copy and housekeeping genes, Salazar, G \emph{et al.} 2019), \code{MGKOs} (KOs for 10 single copy and housekeeping genes, Salazar, G \emph{et al.}, 2019) or \code{USiCGs} (KOs for 15 single copy genes, Carr \emph{et al.}, 2013. Table S1). For \code{MGOGs}, \code{MGKOs} and \code{USiCGs}, the median coverage of a set of single copy genes will be used for normalization. Default \code{MGOGs}.
#' @param load_sequences logical. If \code{TRUE}, contig and orf sequences will be loaded in the SQM object. Setting it to \code{FALSE} will reduce memory usage. Default \code{TRUE}.
#' @param engine character. Engine used to load the ORFs and contigs tables. Either \code{data.frame} or \code{data.table} (significantly faster if your project is large). Default \code{data.table}.
#' @return SQM object containing the parsed project. If more than one path is provided in \code{project_path} this function will return a SQMbunch object instead. The structure of this object is similar to that of a SQMlite object (see \code{\link{loadSQMlite}}) but with an extra entry named \code{projects} that contains one SQM object for input project. SQM and SQMbunch objects will otherwise behave similarly when used with the subset and plot functions from this package.
#' @examples
#' \dontrun{
#' ## (outside R)
#' ## Run SqueezeMeta on the test data.
#' /path/to/SqueezeMeta/scripts/SqueezeMeta.pl -p Hadza -f raw -m coassembly -s test.samples
#' ## Now go into R.
#' library(SQMtools)
#' Hadza = loadSQM("Hadza") # Where Hadza is the path to the SqueezeMeta output directory.
#' }
#'
#' data(Hadza) # We will illustrate the structure of the SQM object on the test data
#' # Which are the ten most abundant KEGG IDs in our data?
#' topKEGG = names(sort(rowSums(Hadza$functions$KEGG$tpm), decreasing=TRUE))[1:11]
#' topKEGG = topKEGG[topKEGG!="Unclassified"]
#' # Which functions do those KEGG IDs represent?
#' Hadza$misc$KEGG_names[topKEGG]
#' # What is the relative abundance of the Negativicutes class across samples?
#' Hadza$taxa$class$percent["Negativicutes",]
#' # Which information is stored in the orf, contig and bin tables?
#' colnames(Hadza$orfs$table)
#' colnames(Hadza$contigs$table)
#' colnames(Hadza$bins$table)
#' # What is the GC content distribution of my metagenome?
#' boxplot(Hadza$contigs$table[,"GC perc"]) # Not weighted by contig length or abundance!
#' @export
loadSQM = function(project_path, tax_mode = 'prokfilter', trusted_functions_only = FALSE, single_copy_genes = 'MGOGs', load_sequences = TRUE, engine = 'data.table')
{
if(length(project_path) == 1)
{
SQM = loadSQM_(project_path, tax_mode, trusted_functions_only, single_copy_genes, load_sequences, engine)
} else
{
projs = list()
for(p in project_path)
{
message(sprintf('Loading project in %s', p))
projs = c(projs, list(loadSQM_(p, tax_mode, trusted_functions_only, single_copy_genes, load_sequences, engine)))
}
SQM = combineSQM(projs)
}
return(SQM)
}
#' @importFrom utils read.table tail
#' @importFrom stats aggregate
loadSQM_ = function(project_path, tax_mode = 'prokfilter', trusted_functions_only = FALSE, single_copy_genes = 'MGOGs', load_sequences = TRUE, engine = 'data.table')
{
if(!tax_mode %in% c('allfilter', 'prokfilter', 'nofilter'))
{
stop('tax_mode must be either "allfilter" (apply minimum identity threshold for all taxa), "prokfilter" (don\'t apply thresholds to Eukaryotes) or "nofilter" (don\'t apply thresholds at all).')
}
if(!single_copy_genes %in% c('RecA', 'MGOGs', 'MGKOs', 'USiCGs'))
{
stop('single_copy_genes must be either "RecA", "MGOGs", "MGKOs", or "USiCGs"')
}
if(!engine %in% c('data.frame', 'data.table'))
{
stop('engine must be either "data.frame" or "data.table"')
}
project_name = tail(unlist(strsplit(project_path, split='/')), 1)
zipmode = endsWith(project_name, '.zip')
project_name = gsub('.zip','',project_name)
### Check that this is a valid SQM project.
if(!file.exists.zip(project_path, 'SqueezeMeta_conf.pl'))
{
stop(sprintf('Directory "%s" does not seem to contain a valid SqueezeMeta project', project_path))
}
### Check whether this project was created with an old version SQM
sqmtools_version = sprintf('v%s', .__NAMESPACE__.$spec['version'])
if(file.exists.zip(project_path, sprintf('results/20.%s.contigtable', project_name)))
{
stop('Your project was created with a version of SqueezeMeta prior to 1.5. Running utils/versionchange.pl might fix this.')
} else if(!file.exists.zip(project_path,'creator.txt'))
{
warning(sprintf('Your project was created with SqueezeMeta v1.5, while this is SQMtools %s. You can ignore this message if things are working fine for you, but if you experience any issue consider using the right version of SQMtools for this project', sqmtools_version)
)
} else {
conn = open_conn_zip(project_path, 'creator.txt')
project_version = gsub(',','',scan(conn, what = 'character', quiet = T)[2])
close(conn)
if(project_version != sqmtools_version)
{
warning(sprintf('Your project was created with SqueezeMeta %s, while this is SQMtools %s. You can ignore this message if things are working fine for you, but if you experience any issue consider using the right version of SQMtools for this project', project_version, sqmtools_version))
}
}
### Check whether we need to create the projectdir/results/tables directory.
if(zipmode)
{
if(!file.exists.zip(project_path, sprintf('results/tables/%s.superkingdom.%s.abund.tsv', project_name, tax_mode)))
{ stop('Projects provided as zip files must contain the results/tables directories. This should have been generated by sqm2zip.py.')}
warning('You are trying to load a zipped project. If this doesn\'t work, see workaround in https://github.com/jtamames/SqueezeMeta/issues/755.')
}
else if(!file.exists(sprintf('%s/results/tables/%s.superkingdom.%s.abund.tsv', project_path, project_name, tax_mode)))
{
message(sprintf('Generating tabular outputs for project in %s', project_path))
lines = readLines(sprintf('%s/SqueezeMeta_conf.pl', project_path))
lines = trimws(lines)
lines = gsub(pattern = "#[^\\\n]*", replacement = "", x = lines) # Remove comments.
install_path = lines[grepl("^\\$installpath",lines)]
if(length(install_path)==0) { stop(sprintf('$installpath can not be found in conf file \"%s/SqueezeMeta_conf.pl\".', project_path)) }
install_path = gsub(';|\"', '', trimws(strsplit(install_path, split = '=')[[1]][2]))
if(trusted_functions_only)
{
extra_args = '--trusted-functions'
}else
{
extra_args = ''
}
command = sprintf('%s/utils/sqm2tables.py %s %s/results/tables %s', install_path, project_path, project_path, extra_args)
ecode = system(command)
if(ecode != 0) { stop('An error occurred while running sqm2tables.py') }
}
### Do stuff.
SQM = list()
SQM$misc = list()
SQM$misc$project_name = project_name
SQM$misc$single_copy_genes = single_copy_genes
message('Loading total reads')
conn = open_conn_zip(project_path, sprintf('results/10.%s.mappingstat', project_name))
lines = readLines(conn)
close(conn)
evilLines = (substr(lines,1,1) == '#' & substr(lines,1,8) != '# Sample') | substr(lines,1,1) == ''
lines = lines[!evilLines]
SQM$total_reads = as.matrix(
read.table(text = lines,
header=TRUE, sep='\t', row.names=1, comment.char='')
)[,'Total.reads']
message('Loading orfs')
SQM$orfs = list()
message(' table...') # option to remove table from memory after getting abund & TPM?
SQM$orfs$table = read.generic.table.zip(project_path, sprintf('results/13.%s.orftable', project_name), engine = engine,
header=TRUE, sep='\t', row.names=1, quote='', comment.char='', skip=1, as.is=TRUE, check.names=FALSE)
message(' abundances...')
SQM$orfs$abund = as.matrix(SQM$orfs$table[,grepl('Raw read count', colnames(SQM$orfs$table)),drop=FALSE])
colnames(SQM$orfs$abund) = gsub('Raw read count ', '', colnames(SQM$orfs$abund), fixed=TRUE)
storage.mode(SQM$orfs$abund) = 'numeric'
SQM$orfs$bases = as.matrix(SQM$orfs$table[,grepl('Raw base count', colnames(SQM$orfs$table)),drop=FALSE])
colnames(SQM$orfs$bases) = gsub('Raw base count ', '', colnames(SQM$orfs$abund), fixed=TRUE)
storage.mode(SQM$orfs$bases) = 'numeric'
SQM$orfs$cov = as.matrix(SQM$orfs$table[,grepl('Coverage', colnames(SQM$orfs$table)),drop=FALSE])
colnames(SQM$orfs$cov) = gsub('Coverage ', '', colnames(SQM$orfs$cov), fixed=TRUE)
SQM$orfs$cpm = t(t(SQM$orfs$cov) / (SQM$total_reads / 1000000))
SQM$orfs$tpm = as.matrix(SQM$orfs$table[,grepl('TPM', colnames(SQM$orfs$table)),drop=FALSE])
colnames(SQM$orfs$tpm) = gsub('TPM ', '', colnames(SQM$orfs$tpm), fixed=TRUE)
SQM$misc$samples = colnames(SQM$orfs$abund)
goodORFcols = colnames(SQM$orfs$table)[!grepl('Raw read|TPM|Coverage|Tax|Raw base', colnames(SQM$orfs$table))]
goodORFcols = c('startPos', 'endPos', goodORFcols)
start_end = sapply(strsplit(rownames(SQM$orfs$table), split='_'), FUN=function(x) x[length(x)])
SQM$orfs$table$startPos = as.numeric(sapply(strsplit(start_end, split = '-'), FUN=function(x) x[1]))
SQM$orfs$table$endPos = as.numeric(sapply(strsplit(start_end, split = '-'), FUN=function(x) x[2]))
SQM$orfs$table = SQM$orfs$table[,goodORFcols]
if(load_sequences)
{
message(' sequences...')
SQM$orfs$seqs = read.namedvector.zip(project_path, sprintf('results/tables/%s.orf.sequences.tsv', project_name), engine=engine)
}
message(' taxonomy...')
SQM$orfs$tax = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.orf.tax.%s.tsv', project_name, tax_mode), engine = engine,
header=TRUE, row.names=1, sep='\t'))
if(file.exists.zip(project_path, sprintf('results/tables/%s.orf.16S.tsv', project_name)))
{
SQM$orfs$tax16S = read.namedvector.zip(project_path, sprintf('results/tables/%s.orf.16S.tsv', project_name), engine=engine)
}
if(file.exists.zip(project_path, sprintf('results/18.%s.bintable', project_name)) & file.exists.zip(project_path, sprintf('results/tables/%s.orf.marker.genes.tsv', project_name)))
{
message(' checkm markers...')
conn = open_conn_zip(project_path, sprintf('results/tables/%s.orf.marker.genes.tsv', project_name))
lines = readLines(conn)
close(conn)
lines = strsplit(lines, split='\t')
SQM$orfs$markers = sapply(lines, FUN = function(x) strsplit(x[2], split=','))
names(SQM$orfs$markers) = sapply(lines, FUN = function(x) x[1]) # this is also the same ORF order as in SQM$orfs$table
}
# THIS SHOULD NOT BE NEEDED ANYMORE (BUT ALAS IT IS! >_<)
SQM$orfs$table = SQM$orfs$table[rownames(SQM$orfs$table) %in% rownames(SQM$orfs$tax),]
SQM$orfs$abund = SQM$orfs$abund[rownames(SQM$orfs$table),,drop=FALSE]
SQM$orfs$tpm = SQM$orfs$tpm[rownames(SQM$orfs$table),,drop=FALSE]
if(load_sequences)
{
SQM$orfs$seqs = SQM$orfs$seqs[rownames(SQM$orfs$table)[rownames(SQM$orfs$table) %in% names(SQM$orfs$seqs)]]
}
message('Loading contigs')
SQM$contigs = list()
message(' table...') # option to remove table from memory after getting abund & TPM?
SQM$contigs$table = read.generic.table.zip(project_path, sprintf('results/19.%s.contigtable', project_name), engine = engine,
header=TRUE, sep='\t', row.names=1, quote='', comment.char='', skip=1, as.is=TRUE, check.names=FALSE)
message(' abundances...')
# In issue #589 we found out that bin bases and coverages were not being calculated correctly
# This was because data.table::fread was stoif(bycol) { data = t(data) }ring some columns containing large values as integer64
# When casted from generic.data.frame to matrix with as.matrix, some integer64 became very small numerics (like e-318)
# In theory data.table::fread has the option `integer64="double"` to avoid integer64 being created in the first place
# But it fails in some cases as described in https://github.com/Rdatatable/data.table/issues/2607
# Thus I manually cast all columns containing abund and bases to numeric before casting to matrixz
abundcols = colnames(SQM$contigs$table)[grepl('Raw read count', colnames(SQM$contigs$table), fixed=TRUE)]
abund = SQM$contigs$table[,abundcols,drop=FALSE]
for(col in abundcols) { abund[,col] = as.numeric(abund[,col]) }
abund = as.matrix(abund)
colnames(abund) = colnames(abund) = gsub('Raw read count ', '', abundcols, fixed = TRUE)
SQM$contigs$abund = abund
basescols = colnames(SQM$contigs$table)[grepl('Raw base count', colnames(SQM$contigs$table), fixed=TRUE)]
bases = SQM$contigs$table[,basescols,drop=FALSE]
for(col in basescols) { bases[,col] = as.numeric(bases[,col]) }
bases = as.matrix(bases)
colnames(bases) = gsub('Raw base count ', '', basescols, fixed = TRUE)
SQM$contigs$bases = bases
# I treat the cov, cpm and tpm matrices normally, as I don't expect them to contain very large values
SQM$contigs$cov = as.matrix(SQM$contigs$table[,grepl('Coverage', colnames(SQM$contigs$table)),drop=FALSE])
colnames(SQM$contigs$cov) = gsub('Coverage ', '', colnames(SQM$contigs$cov), fixed=TRUE)
SQM$contigs$cpm = t(t(SQM$contigs$cov) / (SQM$total_reads / 1000000))
SQM$contigs$tpm = as.matrix(SQM$contigs$table[,grepl('TPM', colnames(SQM$contigs$table)),drop=FALSE])
colnames(SQM$contigs$tpm) = gsub('TPM ', '', colnames(SQM$contigs$tpm), fixed=TRUE)
goodContigCols = colnames(SQM$contigs$table)[!grepl('Raw read|TPM|Coverage|Tax|Raw base', colnames(SQM$contigs$table))]
SQM$contigs$table = SQM$contigs$table[,goodContigCols,drop=FALSE]
if(load_sequences)
{
message(' sequences...')
SQM$contigs$seqs = read.namedvector.zip(project_path, sprintf('results/tables/%s.contig.sequences.tsv', project_name), engine=engine)
SQM$contigs$seqs = SQM$contigs$seqs[rownames(SQM$contigs$table)]
}
message(' taxonomy...')
SQM$contigs$tax = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.contig.tax.%s.tsv', project_name, tax_mode), engine = engine,
header=TRUE, row.names=1, sep='\t'))
SQM$contigs$tax = SQM$contigs$tax[rownames(SQM$contigs$table),,drop=FALSE]
if(file.exists.zip(project_path, sprintf('results/18.%s.bintable', project_name)))
{
message(' binning info...')
inBins = read.generic.table.zip(project_path, sprintf('intermediate/18.%s.contigsinbins', project_name), engine = 'data.frame',
header=TRUE, sep='\t', quote='', comment.char='', skip=1, as.is=TRUE)
binmethods = unique(inBins$Method)
if(max(table(paste(inBins$X..Contig, inBins$Method)))>1) {
warning('Some contigs are assigned to more than one bin with the same method, something went wrong...')
}
SQM$contigs$bins = matrix(NA, nrow = nrow(inBins), ncol = length(binmethods),
dimnames = list(inBins$X..Contig, binmethods))
for(bm in binmethods)
{
ibm = inBins[inBins$Method == bm,]
SQM$contigs$bins[ibm$X..Contig,] = ibm$Bin.ID
}
notInBins = setdiff(rownames(SQM$contigs$table), rownames(SQM$contigs$bins))
notInBins = matrix(NA, nrow=length(notInBins), ncol=ncol(SQM$contigs$bins), dimnames=list(notInBins, colnames(SQM$contigs$bins)))
SQM$contigs$bins = rbind(SQM$contigs$bins, notInBins)
SQM$contigs$bins = SQM$contigs$bins[rownames(SQM$contigs$table),,drop=FALSE]
SQM$contigs$bins[is.na(SQM$contigs$bins)] = 'No_bin'
message('Loading bins')
message(' table...')
SQM$bins = list()
SQM$bins$table = read.generic.table.zip(project_path, sprintf('results/18.%s.bintable', project_name),
engine = 'data.frame', header=TRUE, sep='\t', row.names=1,
quote='', comment.char='', skip=1, as.is=TRUE, check.names=FALSE)
message(' abundances...')
bin_abunds = get.bin.abunds(SQM, track_unmapped=TRUE)
SQM$bins$abund = bin_abunds[['abund']]
SQM$bins$percent = bin_abunds[['percent']]
SQM$bins$bases = bin_abunds[['bases']]
SQM$bins$length = bin_abunds[['length']]
SQM$bins$cov = bin_abunds[['cov']]
SQM$bins$cpm = bin_abunds[['cpm']]
message(' taxonomy...')
SQM$bins$tax = as.matrix(read.generic.table.zip(project_path,
sprintf('results/tables/%s.bin.tax.tsv', project_name),
engine = 'data.frame',
header=TRUE, row.names=1, sep='\t'))
if('Tax GTDB-Tk' %in% colnames(SQM$bins$table))
{
goodcols = c('Method', 'Num contigs', 'GC perc', 'Tax 16S', 'Tax GTDB-Tk')
} else
{
goodcols = c('Method', 'Num contigs', 'GC perc', 'Tax 16S')
}
goodcols = c(goodcols, c('Disparity', 'Completeness','Contamination'))
SQM$bins$table = SQM$bins$table[,goodcols,drop=FALSE]
} else
{
warning(' There are no binning results in your project. Skipping...')
}
message('Loading taxonomies')
SQM$taxa = list()
SQM$taxa$superkingdom = list()
SQM$taxa$phylum = list()
SQM$taxa$class = list()
SQM$taxa$order = list()
SQM$taxa$family = list()
SQM$taxa$genus = list()
SQM$taxa$species = list()
SQM$taxa$superkingdom$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.superkingdom.%s.abund.tsv', project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$phylum$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.phylum.%s.abund.tsv' , project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$class$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.class.%s.abund.tsv' , project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$order$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.order.%s.abund.tsv' , project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$family$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.family.%s.abund.tsv' , project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$genus$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.genus.%s.abund.tsv' , project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$species$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.species.%s.abund.tsv' , project_name, tax_mode),
header=TRUE, sep='\t', row.names=1, check.names=FALSE))
SQM$taxa$superkingdom$percent = 100 * t(t(SQM$taxa$superkingdom$abund) / colSums(SQM$taxa$superkingdom$abund))
SQM$taxa$phylum$percent = 100 * t(t(SQM$taxa$phylum$abund) / colSums(SQM$taxa$phylum$abund ))
SQM$taxa$class$percent = 100 * t(t(SQM$taxa$class$abund) / colSums(SQM$taxa$class$abund ))
SQM$taxa$order$percent = 100 * t(t(SQM$taxa$order$abund) / colSums(SQM$taxa$order$abund ))
SQM$taxa$family$percent = 100 * t(t(SQM$taxa$family$abund) / colSums(SQM$taxa$family$abund ))
SQM$taxa$genus$percent = 100 * t(t(SQM$taxa$genus$abund) / colSums(SQM$taxa$genus$abund ))
SQM$taxa$species$percent = 100 * t(t(SQM$taxa$species$abund) / colSums(SQM$taxa$species$abund ))
SQM$misc$tax_names_long = list()
SQM$misc$tax_names_long$superkingdom = rownames(SQM$tax$superkingdom$abund)
names(SQM$misc$tax_names_long$superkingdom) = gsub('^k_', '', SQM$misc$tax_names_long$superkingdom)
SQM$misc$tax_names_long$phylum = rownames(SQM$tax$phylum$abund)
names(SQM$misc$tax_names_long$phylum) = sapply(strsplit(SQM$misc$tax_names_long$phylum, split=';p_'), FUN = function(x) x[2])
SQM$misc$tax_names_long$class = rownames(SQM$tax$class$abund)
names(SQM$misc$tax_names_long$class) = sapply(strsplit(SQM$misc$tax_names_long$class, split=';c_'), FUN = function(x) x[2])
SQM$misc$tax_names_long$order = rownames(SQM$tax$order$abund)
names(SQM$misc$tax_names_long$order) = sapply(strsplit(SQM$misc$tax_names_long$order, split=';o_'), FUN = function(x) x[2])
SQM$misc$tax_names_long$family = rownames(SQM$tax$family$abund)
names(SQM$misc$tax_names_long$family) = sapply(strsplit(SQM$misc$tax_names_long$family, split=';f_'), FUN = function(x) x[2])
SQM$misc$tax_names_long$genus = rownames(SQM$tax$genus$abund)
names(SQM$misc$tax_names_long$genus) = sapply(strsplit(SQM$misc$tax_names_long$genus, split=';g_'), FUN = function(x) x[2])
SQM$misc$tax_names_long$species = rownames(SQM$tax$species$abund)
names(SQM$misc$tax_names_long$species) = sapply(strsplit(SQM$misc$tax_names_long$species, split=';s_'), FUN = function(x) x[2])
SQM$misc$tax_names_short = unlist(lapply(SQM$misc$tax_names_long, names))
names(SQM$misc$tax_names_short) = unlist(SQM$misc$tax_names_long)
# Use short names for taxonomy tables, since it makes it easier to search for specific taxa
rownames(SQM$taxa$superkingdom$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$superkingdom$abund)]
rownames(SQM$taxa$phylum$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$phylum$abund)]
rownames(SQM$taxa$class$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$class$abund)]
rownames(SQM$taxa$order$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$order$abund)]
rownames(SQM$taxa$family$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$family$abund)]
rownames(SQM$taxa$genus$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$genus$abund)]
rownames(SQM$taxa$species$abund) = SQM$misc$tax_names_short[rownames(SQM$taxa$species$abund)]
rownames(SQM$taxa$superkingdom$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$superkingdom$percent)]
rownames(SQM$taxa$phylum$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$phylum$percent)]
rownames(SQM$taxa$class$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$class$percent)]
rownames(SQM$taxa$order$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$order$percent)]
rownames(SQM$taxa$family$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$family$percent)]
rownames(SQM$taxa$genus$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$genus$percent)]
rownames(SQM$taxa$species$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$species$percent)]
# Now the rownames are a character vector which actually has its own names inside. Some R packages don't like this, so we correct it.
for(taxlevel in names(SQM$taxa))
{
for(counts in names(SQM$taxa[[taxlevel]]))
{
names(rownames(SQM$taxa[[taxlevel]][[counts]])) = NULL
}
}
message('Loading functions')
result_files = strsplit(list.files.zip(project_path, 'results/tables'), '.', fixed=TRUE)
external_annotation_results = result_files[sapply(result_files, FUN=function(x) x[3]=='abund' & !x[2] %in% c('COG', 'KO', 'PFAM'))]
SQM$misc$ext_annot_sources = sapply(external_annotation_results, FUN=function(x) x[2])
SQM$functions = list()
### KEGG
if(file.exists.zip(project_path, sprintf('results/tables/%s.KO.abund.tsv', project_name)))
{
has_KEGG = TRUE
SQM$functions$KEGG = list()
SQM$functions$KEGG$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.KO.abund.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions$KEGG$bases = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.KO.bases.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions$KEGG$cov = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.KO.cov.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions$KEGG$cpm = t(t(SQM$functions$KEGG$cov) / (SQM$total_reads / 1000000))
SQM$functions$KEGG$tpm = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.KO.tpm.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
funinfo = read.generic.table.zip(project_path, sprintf('results/tables/%s.KO.names.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote='', as.is=TRUE)
SQM$misc$KEGG_names = funinfo[,1]
names(SQM$misc$KEGG_names) = rownames(funinfo)
SQM$misc$KEGG_paths = funinfo[,2]
names(SQM$misc$KEGG_paths) = rownames(funinfo)
}else
{
warning(' There are no KEGG results in your project. Skipping...')
has_KEGG = FALSE
}
### COG
if(file.exists.zip(project_path, sprintf('results/tables/%s.COG.abund.tsv', project_name)))
{
has_COG = TRUE
SQM$functions$COG = list()
SQM$functions$COG$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.COG.abund.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions$COG$bases = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.COG.bases.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions$COG$cov = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.COG.cov.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions$COG$cpm = t(t(SQM$functions$COG$cov) / (SQM$total_reads / 1000000))
SQM$functions$COG$tpm = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.COG.tpm.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
funinfo = read.generic.table.zip(project_path, sprintf('results/tables/%s.COG.names.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote='', as.is=TRUE)
SQM$misc$COG_names = funinfo[,1]
names(SQM$misc$COG_names) = rownames(funinfo)
SQM$misc$COG_paths = funinfo[,2]
names(SQM$misc$COG_paths) = rownames(funinfo)
}else
{
warning(' There are no COG results in your project. Skipping...')
has_COG = FALSE
}
### PFAM
if(file.exists.zip(project_path, sprintf('results/tables/%s.PFAM.abund.tsv', project_name)))
{
has_PFAM = TRUE
SQM$functions$PFAM = list()
SQM$functions$PFAM$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.PFAM.abund.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions$PFAM$bases = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.PFAM.bases.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions$PFAM$cov = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.PFAM.cov.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions$PFAM$cpm = t(t(SQM$functions$PFAM$cov) / (SQM$total_reads / 1000000))
SQM$functions$PFAM$tpm = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.PFAM.tpm.tsv', project_name), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
}else
{
warning(' There are no PFAM results in your project. Skipping...')
has_PFAM = FALSE
}
### EXTERNAL ANNOTATION SOURCES
for(method in SQM$misc$ext_annot_sources)
{
SQM$functions[[method]] = list()
SQM$functions[[method]]$abund = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.%s.abund.tsv', project_name, method), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions[[method]]$bases = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.%s.bases.tsv', project_name, method), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions[[method]]$cov = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.%s.cov.tsv', project_name, method), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
SQM$functions[[method]]$cpm = t(t(SQM$functions[[method]]$cov) / (SQM$total_reads / 1000000))
SQM$functions[[method]]$tpm = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.%s.tpm.tsv', project_name, method), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
funinfo = read.generic.table.zip(project_path, sprintf('results/tables/%s.%s.names.tsv', project_name, method), engine = 'data.frame',
header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote='', as.is=TRUE)
field = sprintf('%s_names', method)
SQM$misc[[field]] = funinfo[,1]
names(SQM$misc[[field]]) = rownames(funinfo)
}
### CODING FRACTION FOR CORRECTING TPMs
SQM$misc$coding_fraction = list()
for(method in names(SQM$functions))
{
#SQM$misc$coding_fraction[[method]] = 1 - (SQM$functions[[method]]$tpm['Unmapped',] / colSums(SQM$functions[[method]]$tpm))
SQM$misc$coding_fraction[[method]] = 1 # this was used to correct TPMs ignoring the effect of unmapped reads, but is not needed anymore since we don't track unmapped reads for functions anymore
}
### COPY NUMBERS
SQM$misc$single_copy_cov = get_median_single_copy_cov(SQM)
if(!any(is.na(SQM$misc$single_copy_cov)))
{
message(sprintf(' Using %s for copy number normalization', SQM$misc$single_copy_genes))
for(method in names(SQM$functions))
{
SQM$functions[[method]]$copy_number = t(t(SQM$functions[[method]]$cov) / SQM$misc$single_copy_cov)
}
}
class(SQM) = 'SQM'
return(SQM)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.