R/loadSQMlite.R
In SQMtools: Analyze Results Generated by the 'SqueezeMeta' Pipeline

Documented in loadSQMlite

#' Load tables generated by \code{sqm2tables.py}, \code{sqmreads2tables.py} or \code{combine-sqm-tables.py} into R.
#'
#' This function takes the path to the output directory generated by \code{sqm2tables.py}, \code{sqmreads2tables.py} or \code{combine-sqm-tables.py} a SQMlite object.
#' The SQMlite object will contain taxonomic and functional profiles, but no detailed information on ORFs, contigs or bins. However, it will also have a much smaller memory footprint.
#' A SQMlite object can be used for plotting and exporting, but it can not be subsetted.
#'
#' @section The SQMlite object structure:
#' {
#' The SQMlite object is a nested list which contains the following information:
#' \tabular{lllllll}{
#' \bold{lvl1}         \tab \bold{lvl2}               \tab \bold{lvl3}          \tab \bold{type}             \tab \bold{rows/names} \tab \bold{columns} \tab \bold{data}        \cr
#' \bold{$taxa}        \tab \bold{$superkingdom}      \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab superkingdoms     \tab samples        \tab abundances         \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab superkingdoms     \tab samples        \tab percentages        \cr
#'                     \tab \bold{$phylum}            \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab phyla             \tab samples        \tab abundances         \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab phyla             \tab samples        \tab percentages        \cr
#'                     \tab \bold{$class}             \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab classes           \tab samples        \tab abundances         \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab classes           \tab samples        \tab percentages        \cr
#'                     \tab \bold{$order}             \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab orders            \tab samples        \tab abundances         \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab orders            \tab samples        \tab percentages        \cr
#'                     \tab \bold{$family}            \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab families          \tab samples        \tab abundances         \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab families          \tab samples        \tab percentages        \cr
#'                     \tab \bold{$genus}             \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab genera            \tab samples        \tab abundances         \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab genera            \tab samples        \tab percentages        \cr
#'                     \tab \bold{$species}           \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab species           \tab samples        \tab abundances         \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab species           \tab samples        \tab percentages        \cr
#' \bold{$functions}   \tab \bold{$KEGG}              \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$bases}        \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab abundances (bases) \cr
#'                     \tab                           \tab \bold{$tpm}          \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab tpm                \cr
#'                     \tab                           \tab \bold{$copy_number}  \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab avg. copies        \cr
#'                     \tab \bold{$COG}               \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$bases}        \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab abundances (bases) \cr
#'                     \tab                           \tab \bold{$tpm}          \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab tpm                \cr
#'                     \tab                           \tab \bold{$copy_number}  \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab avg. copies        \cr
#'                     \tab \bold{$PFAM}              \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$bases}        \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab abundances (bases) \cr
#'                     \tab                           \tab \bold{$tpm}          \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab tpm                \cr
#'                     \tab                           \tab \bold{$copy_number}  \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab avg. copies        \cr
#' \bold{$total_reads} \tab                           \tab                      \tab \emph{numeric vector}   \tab samples           \tab (n/a)          \tab total reads        \cr
#' \bold{$misc}        \tab \bold{$project_name}      \tab                      \tab \emph{character vector} \tab (empty)           \tab (n/a)          \tab project name       \cr
#'                     \tab \bold{$samples}           \tab                      \tab \emph{character vector} \tab (empty)           \tab (n/a)          \tab samples            \cr
#'                     \tab \bold{$tax_names_long}    \tab \bold{$superkingdom} \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$phylum}       \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$class}        \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$order}        \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$family}       \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$genus}        \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$species}      \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab \bold{$tax_names_short}   \tab                      \tab \emph{character vector} \tab full names        \tab (n/a)          \tab short names        \cr
#'                     \tab \bold{$KEGG_names}        \tab                      \tab \emph{character vector} \tab KEGG ids          \tab (n/a)          \tab KEGG names         \cr
#'                     \tab \bold{$KEGG_paths}        \tab                      \tab \emph{character vector} \tab KEGG ids          \tab (n/a)          \tab KEGG hiararchy     \cr
#'                     \tab \bold{$COG_names}         \tab                      \tab \emph{character vector} \tab COG ids           \tab (n/a)          \tab COG names          \cr
#'                     \tab \bold{$COG_paths}         \tab                      \tab \emph{character vector} \tab COG ids           \tab (n/a)          \tab COG hierarchy      \cr
#'                     \tab \bold{$ext_annot_sources} \tab                      \tab \emph{character vector} \tab (empty)           \tab (n/a)          \tab external databases \cr
#' }
#' If external databases for functional classification were provided to SqueezeMeta or SqueezeMeta_reads via the \code{-extdb} argument, the corresponding abundance, tpm and copy number profiles will be present in \code{SQM$functions} (e.g. results for the CAZy database would be present in \code{SQM$functions$CAZy}). Additionally, the extended names of the features present in the external database will be present in \code{SQM$misc} (e.g. \code{SQM$misc$CAZy_names}). Note that results generated by SqueezeMeta_reads will contain only read abundances, but not bases, tpm or copy number estimations.

#' }
#' @param tables_path character, tables directory generated by \code{sqm2table.py}, \code{sqmreads2tables.py} or \code{combine-sqm-tables.py}.
#' @param tax_mode character, which taxonomic classification should be loaded? SqueezeMeta applies the identity thresholds described in \href{https://pubmed.ncbi.nlm.nih.gov/24589583/}{Luo \emph{et al.}, 2014}. Use \code{allfilter} for applying the minimum identity threshold to all taxa (default), \code{prokfilter} for applying the threshold to Bacteria and Archaea, but not to Eukaryotes, and \code{nofilter} for applying no thresholds at all.
#' @return SQMlite object containing the parsed tables.
#' @seealso \code{\link{plotBars}} and \code{\link{plotFunctions}} will plot the most abundant taxa and functions in a SQMlite object. \code{\link{exportKrona}} will generate Krona charts reporting the taxonomy in a SQMlite object.
#' @examples
#' \dontrun{
#' ## (outside R)
#' ## Run SqueezeMeta on the test data.
#' /path/to/SqueezeMeta/scripts/SqueezeMeta.pl -p Hadza -f raw -m coassembly -s test.samples
#' ## Generate the tabular outputs!
#' /path/to/SqueezeMeta/utils/sqm2tables.py Hadza Hadza/results/tables
#' ## Now go into R.
#' library(SQMtools)
#' Hadza = loadSQMlite("Hadza/results/tables")
#' # Where Hadza is the path to the SqueezeMeta output directory.
#' # Note that this is not the whole SQM project, just the directory containing the tables.
#' # It would also work with tables generated by sqmreads2tables.py, or combine-sqm-tables.py
#' plotTaxonomy(Hadza)
#' plotFunctions(Hadza)
#' exportKrona(Hadza, 'myKronaTest.html')
#' }
#' @importFrom utils read.table
#' @export

loadSQMlite = function(tables_path, tax_mode = 'allfilter')
    {

    if(!tax_mode %in% c('allfilter', 'prokfilter', 'nofilter'))
        {
        stop('tax_mode must be either "allfilter" (apply minimum identity threshold for all taxa), "prokfilter" (don\'t apply thresholds to Eukaryotes) or "nofilter" (don\'t apply thresholds at all).')
        }
     
    SQM                           = list()

    allFiles                      = strsplit(list.files(tables_path), '.', fixed=TRUE)
    if(length(allFiles) == 0)
        {
        stop(sprintf('Directory "%s" does not seem to contain valid SqueezeMeta tables', tables_path))
        }
    
    project_name                  = allFiles[sapply(allFiles, function(x) x[2] == 'superkingdom' & x[3] =='allfilter' & x[4] == 'abund' & x[5] == 'tsv')][[1]][1]
    SQM$misc                      = list()
    SQM$misc$project_name         = project_name

    message('Loading taxonomies\n')                                   
    SQM$taxa                      = list()
    SQM$taxa$superkingdom         = list()
    SQM$taxa$phylum               = list()
    SQM$taxa$class                = list()
    SQM$taxa$order                = list()
    SQM$taxa$family               = list()
    SQM$taxa$genus                = list()
    SQM$taxa$species              = list()

    ### Check that this is a valid SQM project.
    if(is.null(project_name))
        {
        stop(sprintf('Directory "%s" does not seem to contain valid SqueezeMeta tables', tables_path))
        }

    
    SQM$taxa$superkingdom$abund   = as.matrix(read.table(sprintf('%s/%s.superkingdom.%s.abund.tsv', tables_path, project_name, tax_mode),
                                                         header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$phylum$abund         = as.matrix(read.table(sprintf('%s/%s.phylum.%s.abund.tsv', tables_path, project_name, tax_mode),
                                                         header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$class$abund          = as.matrix(read.table(sprintf('%s/%s.class.%s.abund.tsv', tables_path, project_name, tax_mode),
                                                         header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$order$abund          = as.matrix(read.table(sprintf('%s/%s.order.%s.abund.tsv', tables_path, project_name, tax_mode),
                                                         header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$family$abund         = as.matrix(read.table(sprintf('%s/%s.family.%s.abund.tsv', tables_path, project_name, tax_mode),
                                                         header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$genus$abund          = as.matrix(read.table(sprintf('%s/%s.genus.%s.abund.tsv', tables_path, project_name, tax_mode),
                                                         header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$species$abund        = as.matrix(read.table(sprintf('%s/%s.species.%s.abund.tsv', tables_path, project_name, tax_mode),
                                                         header=TRUE, sep='\t', row.names=1, check.names=FALSE))
                                                          
     
    SQM$taxa$superkingdom$percent = 100 * t(t(SQM$taxa$superkingdom$abund) / colSums(SQM$taxa$superkingdom$abund))
    SQM$taxa$phylum$percent       = 100 * t(t(SQM$taxa$phylum$abund)       / colSums(SQM$taxa$phylum$abund      ))
    SQM$taxa$class$percent        = 100 * t(t(SQM$taxa$class$abund)        / colSums(SQM$taxa$class$abund       )) 
    SQM$taxa$order$percent        = 100 * t(t(SQM$taxa$order$abund)        / colSums(SQM$taxa$order$abund       ))
    SQM$taxa$family$percent       = 100 * t(t(SQM$taxa$family$abund)       / colSums(SQM$taxa$family$abund      ))
    SQM$taxa$genus$percent        = 100 * t(t(SQM$taxa$genus$abund)        / colSums(SQM$taxa$genus$abund       ))
    SQM$taxa$species$percent      = 100 * t(t(SQM$taxa$species$abund)      / colSums(SQM$taxa$species$abund     ))

   
    # Make vectors translating long taxonomy strings into the name of the lowest rank 
    SQM$misc$tax_names_long                 = list()

    SQM$misc$tax_names_long$superkingdom    = rownames(SQM$tax$superkingdom$abund)
    names(SQM$misc$tax_names_long$superkingdom) = gsub('^k_', '', SQM$misc$tax_names_long$superkingdom)

    SQM$misc$tax_names_long$phylum          = rownames(SQM$tax$phylum$abund)
    names(SQM$misc$tax_names_long$phylum)   = sapply(strsplit(SQM$misc$tax_names_long$phylum,  split=';p_'), FUN = function(x) x[2])

    SQM$misc$tax_names_long$class           = rownames(SQM$tax$class$abund)
    names(SQM$misc$tax_names_long$class)    = sapply(strsplit(SQM$misc$tax_names_long$class,   split=';c_'), FUN = function(x) x[2])

    SQM$misc$tax_names_long$order           = rownames(SQM$tax$order$abund)
    names(SQM$misc$tax_names_long$order)    = sapply(strsplit(SQM$misc$tax_names_long$order,   split=';o_'), FUN = function(x) x[2])

    SQM$misc$tax_names_long$family          = rownames(SQM$tax$family$abund)
    names(SQM$misc$tax_names_long$family)   = sapply(strsplit(SQM$misc$tax_names_long$family,  split=';f_'), FUN = function(x) x[2])

    SQM$misc$tax_names_long$genus           = rownames(SQM$tax$genus$abund)
    names(SQM$misc$tax_names_long$genus)    = sapply(strsplit(SQM$misc$tax_names_long$genus,   split=';g_'), FUN = function(x) x[2])

    SQM$misc$tax_names_long$species         = rownames(SQM$tax$species$abund)
    names(SQM$misc$tax_names_long$species)  = sapply(strsplit(SQM$misc$tax_names_long$species, split=';s_'), FUN = function(x) x[2]) 

    # Different species can have the same name, notorious example is `Buchnera`, both an eukaryote and a prokaryote
    # Make a fix to avoid duplicate names
    for(rank in names(SQM$misc$tax_names_long))
        {
        names(SQM$misc$tax_names_long[[rank]]) = make.unique(names(SQM$misc$tax_names_long[[rank]]))
        }

    # Now make a named vector connecting short names and long names
    SQM$misc$tax_names_short                = unlist(lapply(SQM$misc$tax_names_long, names))
    names(SQM$misc$tax_names_short)         = unlist(SQM$misc$tax_names_long)

    # Use short names for taxonomy tables, since it makes it easier to search for specific taxa
    rownames(SQM$taxa$superkingdom$abund)   = SQM$misc$tax_names_short[rownames(SQM$taxa$superkingdom$abund)]
    rownames(SQM$taxa$phylum$abund)         = SQM$misc$tax_names_short[rownames(SQM$taxa$phylum$abund)]
    rownames(SQM$taxa$class$abund)          = SQM$misc$tax_names_short[rownames(SQM$taxa$class$abund)]
    rownames(SQM$taxa$order$abund)          = SQM$misc$tax_names_short[rownames(SQM$taxa$order$abund)]
    rownames(SQM$taxa$family$abund)         = SQM$misc$tax_names_short[rownames(SQM$taxa$family$abund)]
    rownames(SQM$taxa$genus$abund)          = SQM$misc$tax_names_short[rownames(SQM$taxa$genus$abund)]
    rownames(SQM$taxa$species$abund)        = SQM$misc$tax_names_short[rownames(SQM$taxa$species$abund)]

    rownames(SQM$taxa$superkingdom$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$superkingdom$percent)]
    rownames(SQM$taxa$phylum$percent)       = SQM$misc$tax_names_short[rownames(SQM$taxa$phylum$percent)]
    rownames(SQM$taxa$class$percent)        = SQM$misc$tax_names_short[rownames(SQM$taxa$class$percent)]
    rownames(SQM$taxa$order$percent)        = SQM$misc$tax_names_short[rownames(SQM$taxa$order$percent)]
    rownames(SQM$taxa$family$percent)       = SQM$misc$tax_names_short[rownames(SQM$taxa$family$percent)]
    rownames(SQM$taxa$genus$percent)        = SQM$misc$tax_names_short[rownames(SQM$taxa$genus$percent)]
    rownames(SQM$taxa$species$percent)      = SQM$misc$tax_names_short[rownames(SQM$taxa$species$percent)]


    # Now the rownames are a character vector which actually has its own names inside. Some R packages don't like this, so we correct it.
    for(taxlevel in names(SQM$taxa))
        {
        for(counts in names(SQM$taxa[[taxlevel]]))
            {
            names(rownames(SQM$taxa[[taxlevel]][[counts]])) = NULL
            }
        }



    message('Loading functions\n')
    SQM$functions                           = list()

    
    funAbundFiles                           = allFiles[sapply(allFiles, function(x) x[1] == project_name & x[3] == 'abund' & x[4] == 'tsv')]
    funMethods                              = sapply(funAbundFiles, function(x) x[2])

    for(method in funMethods)
        {
        if(method == 'KO') { methodn = 'KEGG' } else { methodn = method }
	SQM$functions[[methodn]]            = list()
        methodFiles                         = allFiles[sapply(allFiles, function(x) x[1] == project_name & x[2] == method & x[4] == 'tsv')]
	for(fvec in methodFiles)
            {
            fname                           = paste(fvec, collapse = '.')
	    count                           = fvec[3]
	    if(count == 'copyNumber') { count = 'copy_number' }
	    if(count == 'names')
                {
                fieldn                      = sprintf('%s_names', methodn)
		fieldp                      = sprintf('%s_paths', methodn)
                ta                          = read.table(sprintf('%s/%s', tables_path, fname),
				                         header=TRUE, row.names=1, sep='\t', comment.char='', quote='', as.is=TRUE)
		SQM$misc[[fieldn]]          = ta[,'Name']
		names(SQM$misc[[fieldn]])   = rownames(ta)
		if(methodn %in% c('KEGG', 'COG'))
                    {
                    SQM$misc[[fieldp]]      = ta[,'Path']
		    names(SQM$misc[[fieldp]]) = rownames(ta) # :'(
		    }	
            }else
	        {
                SQM$functions[[methodn]][[count]] = as.matrix(read.table(sprintf('%s/%s', tables_path, fname),
                                                              header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
                }
            }
        }	
        SQM$misc$ext_annot_sources = funMethods[!funMethods %in% c('KO', 'COG', 'PFAM')]
 
    ### Finish.
    SQM$misc$samples = colnames(SQM$tax$superkingdom$abund) # This should contain all samples, user is responsible for inconsistencies in the data.
    class(SQM)       = 'SQMlite'
    return(SQM)

    }