R/loadSQM.R

Defines functions loadSQM_ loadSQM

Documented in loadSQM

library(reshape2)
library(data.table)


#' Load a SqueezeMeta project into R
#'
#' This function takes the path to a project directory generated by \href{https://github.com/jtamames/SqueezeMeta}{SqueezeMeta} (whose name is specified in the \code{-p} parameter of the SqueezeMeta.pl script) and parses the results into a SQM object. Alternatively, it can load the project data from a zip file produced by \code{sqm2zip.py}.
#'
#' @section Prerequisites:
#' {
#' Run \href{https://github.com/jtamames/SqueezeMeta}{SqueezeMeta}! An example call for running it would be:
#'
#'         \code{/path/to/SqueezeMeta/scripts/SqueezeMeta.pl}\cr
#'         \code{-m coassembly -f fastq_dir -s samples_file -p project_dir}
#' }
#' 
#' @section The SQM object structure:
#' {
#' The SQM object is a nested list which contains the following information:
#' \tabular{lllllll}{
#' \bold{lvl1}         \tab \bold{lvl2}               \tab \bold{lvl3}          \tab \bold{type}             \tab \bold{rows/names} \tab \bold{columns} \tab \bold{data}        \cr
#' \bold{$orfs}        \tab \bold{$table}             \tab                      \tab \emph{dataframe}        \tab orfs              \tab misc. data     \tab misc. data         \cr
#'                     \tab \bold{$abund}             \tab                      \tab \emph{numeric matrix}   \tab orfs              \tab samples        \tab abundances (reads) \cr
#'                     \tab \bold{$bases}             \tab                      \tab \emph{numeric matrix}   \tab orfs              \tab samples        \tab abundances (bases) \cr
#'                     \tab \bold{$cov}               \tab                      \tab \emph{numeric matrix}   \tab orfs              \tab samples        \tab coverages          \cr
#'                     \tab \bold{$cpm}               \tab                      \tab \emph{numeric matrix}   \tab orfs              \tab samples        \tab covs. / 10^6 reads \cr
#'                     \tab \bold{$tpm}               \tab                      \tab \emph{numeric matrix}   \tab orfs              \tab samples        \tab tpm                \cr
#'                     \tab \bold{$seqs}              \tab                      \tab \emph{character vector} \tab orfs              \tab (n/a)          \tab sequences          \cr
#'                     \tab \bold{$tax}               \tab                      \tab \emph{character matrix} \tab orfs              \tab tax. ranks     \tab taxonomy           \cr
#'                     \tab \bold{$tax16S}            \tab                      \tab \emph{character vector} \tab orfs              \tab (n/a)          \tab 16S rRNA taxonomy       \cr
#'                     \tab \bold{$markers}           \tab                      \tab \emph{list}             \tab orfs              \tab (n/a)          \tab CheckM1 markers    \cr
#' \bold{$contigs}     \tab \bold{$table}             \tab                      \tab \emph{dataframe}        \tab contigs           \tab misc. data     \tab misc. data         \cr
#'                     \tab \bold{$abund}             \tab                      \tab \emph{numeric matrix}   \tab contigs           \tab samples        \tab abundances (reads) \cr
#'                     \tab \bold{$bases}             \tab                      \tab \emph{numeric matrix}   \tab contigs           \tab samples        \tab abundances (bases) \cr
#'                     \tab \bold{$cov}               \tab                      \tab \emph{numeric matrix}   \tab contigs           \tab samples        \tab coverages          \cr
#'                     \tab \bold{$cpm}               \tab                      \tab \emph{numeric matrix}   \tab contigs           \tab samples        \tab covs. / 10^6 reads \cr
#'                     \tab \bold{$tpm}               \tab                      \tab \emph{numeric matrix}   \tab contigs           \tab samples        \tab tpm                \cr
#'                     \tab \bold{$seqs}              \tab                      \tab \emph{character vector} \tab contigs           \tab (n/a)          \tab sequences          \cr
#'                     \tab \bold{$tax}               \tab                      \tab \emph{character matrix} \tab contigs           \tab tax. ranks     \tab taxonomies         \cr
#'                     \tab \bold{$bins}              \tab                      \tab \emph{character matrix} \tab contigs           \tab bin. methods   \tab bins               \cr
#' $bins               \tab \bold{$table}             \tab                      \tab \emph{dataframe}        \tab bins              \tab misc. data     \tab misc. data         \cr
#'                     \tab \bold{$length}            \tab                      \tab \emph{numeric vector}   \tab bins              \tab (n/a)          \tab length             \cr
#'                     \tab \bold{$abund}             \tab                      \tab \emph{numeric matrix}   \tab bins              \tab samples        \tab abundances (reads) \cr
#'                     \tab \bold{$percent}           \tab                      \tab \emph{numeric matrix}   \tab bins              \tab samples        \tab abundances (reads) \cr
#'                     \tab \bold{$bases}             \tab                      \tab \emph{numeric matrix}   \tab bins              \tab samples        \tab abundances (bases) \cr
#'                     \tab \bold{$cov}               \tab                      \tab \emph{numeric matrix}   \tab bins              \tab samples        \tab coverages          \cr
#'                     \tab \bold{$cpm}               \tab                      \tab \emph{numeric matrix}   \tab bins              \tab samples        \tab covs. / 10^6 reads \cr
#'                     \tab \bold{$tax}               \tab                      \tab \emph{character matrix} \tab bins              \tab tax. ranks     \tab taxonomy           \cr
#' \bold{$taxa}        \tab \bold{$superkingdom}      \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab superkingdoms     \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab superkingdoms     \tab samples        \tab percentages        \cr
#'                     \tab \bold{$phylum}            \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab phyla             \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab phyla             \tab samples        \tab percentages        \cr
#'                     \tab \bold{$class}             \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab classes           \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab classes           \tab samples        \tab percentages        \cr
#'                     \tab \bold{$order}             \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab orders            \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab orders            \tab samples        \tab percentages        \cr
#'                     \tab \bold{$family}            \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab families          \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab families          \tab samples        \tab percentages        \cr
#'                     \tab \bold{$genus}             \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab genera            \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab genera            \tab samples        \tab percentages        \cr
#'                     \tab \bold{$species}           \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab species           \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$percent}      \tab \emph{numeric matrix}   \tab species           \tab samples        \tab percentages        \cr
#' \bold{$functions}   \tab \bold{$KEGG}              \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$bases}        \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab abundances (bases) \cr
#'                     \tab                           \tab \bold{$cov}          \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab coverages          \cr
#'                     \tab                           \tab \bold{$cpm}          \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab covs. / 10^6 reads \cr
#'                     \tab                           \tab \bold{$tpm}          \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab tpm                \cr
#'                     \tab                           \tab \bold{$copy_number}  \tab \emph{numeric matrix}   \tab KEGG ids          \tab samples        \tab avg. copies        \cr
#'                     \tab \bold{$COG}               \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$bases}        \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab abundances (bases) \cr
#'                     \tab                           \tab \bold{$cov}          \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab coverages          \cr
#'                     \tab                           \tab \bold{$cpm}          \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab covs. / 10^6 reads \cr
#'                     \tab                           \tab \bold{$tpm}          \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab tpm                \cr
#'                     \tab                           \tab \bold{$copy_number}  \tab \emph{numeric matrix}   \tab COG ids           \tab samples        \tab avg. copies        \cr
#'                     \tab \bold{$PFAM}              \tab \bold{$abund}        \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab abundances (reads) \cr
#'                     \tab                           \tab \bold{$bases}        \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab abundances (bases) \cr
#'                     \tab                           \tab \bold{$cov}          \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab coverages          \cr
#'                     \tab                           \tab \bold{$cpm}          \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab covs. / 10^6 reads \cr
#'                     \tab                           \tab \bold{$tpm}          \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab tpm                \cr
#'                     \tab                           \tab \bold{$copy_number}  \tab \emph{numeric matrix}   \tab PFAM ids          \tab samples        \tab avg. copies        \cr
#' \bold{$total_reads} \tab                           \tab                      \tab \emph{numeric vector}   \tab samples           \tab (n/a)          \tab total reads        \cr
#' \bold{$misc}        \tab \bold{$project_name}      \tab                      \tab \emph{character vector} \tab (empty)           \tab (n/a)          \tab project name       \cr
#'                     \tab \bold{$samples}           \tab                      \tab \emph{character vector} \tab (empty)           \tab (n/a)          \tab samples            \cr
#'                     \tab \bold{$tax_names_long}    \tab \bold{$superkingdom} \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$phylum}       \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$class}        \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$order}        \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$family}       \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$genus}        \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab                           \tab \bold{$species}      \tab \emph{character vector} \tab short names       \tab (n/a)          \tab full names         \cr
#'                     \tab \bold{$tax_names_short}   \tab                      \tab \emph{character vector} \tab full names        \tab (n/a)          \tab short names        \cr
#'                     \tab \bold{$KEGG_names}        \tab                      \tab \emph{character vector} \tab KEGG ids          \tab (n/a)          \tab KEGG names         \cr
#'                     \tab \bold{$KEGG_paths}        \tab                      \tab \emph{character vector} \tab KEGG ids          \tab (n/a)          \tab KEGG hiararchy     \cr
#'                     \tab \bold{$COG_names}         \tab                      \tab \emph{character vector} \tab COG ids           \tab (n/a)          \tab COG names          \cr
#'                     \tab \bold{$COG_paths}         \tab                      \tab \emph{character vector} \tab COG ids           \tab (n/a)          \tab COG hierarchy      \cr
#'                     \tab \bold{$ext_annot_sources} \tab                      \tab \emph{character vector} \tab COG ids           \tab (n/a)          \tab external databases \cr
#' }
#' If external databases for functional classification were provided to SqueezeMeta via the \code{-extdb} argument, the corresponding abundance (reads and bases), coverages, tpm and copy number profiles will be present in \code{SQM$functions} (e.g. results for the CAZy database would be present in \code{SQM$functions$CAZy}). Additionally, the extended names of the features present in the external database will be present in \code{SQM$misc} (e.g. \code{SQM$misc$CAZy_names}).
#' }
#' @param project_path character, a vector of project directories generated by SqueezeMeta, and/or zip files generated by \code{sqm2zip.py}.
#' @param tax_mode character, which taxonomic classification should be loaded? SqueezeMeta applies the identity thresholds described in \href{https://pubmed.ncbi.nlm.nih.gov/24589583/}{Luo \emph{et al.}, 2014}. Use \code{allfilter} for applying the minimum identity threshold to all taxa, \code{prokfilter} for applying the threshold to Bacteria and Archaea, but not to Eukaryotes, and \code{nofilter} for applying no thresholds at all (default \code{prokfilter}).
#' @param trusted_functions_only logical. If \code{TRUE}, only highly trusted functional annotations (best hit + best average) will be considered when generating aggregated function tables. If \code{FALSE}, best hit annotations will be used (default \code{FALSE}). Will only have an effect if \code{project_path} is not a zip file, and \code{project_path/results/tables} is not already present.
#' @param single_copy_genes character, source of single copy genes for copy number normalization, either \code{RecA} (COG0468, RecA/RadA), \code{MGOGs} (COGs for 10 single copy and housekeeping genes, Salazar, G \emph{et al.} 2019), \code{MGKOs} (KOs for 10 single copy and housekeeping genes, Salazar, G \emph{et al.}, 2019) or \code{USiCGs} (KOs for 15 single copy genes, Carr \emph{et al.}, 2013. Table S1). For \code{MGOGs}, \code{MGKOs} and \code{USiCGs}, the median coverage of a set of single copy genes will be used for normalization. Default \code{MGOGs}.
#' @param load_sequences logical. If \code{TRUE}, contig and orf sequences will be loaded in the SQM object. Setting it to \code{FALSE} will reduce memory usage. Default \code{TRUE}.
#' @param engine character. Engine used to load the ORFs and contigs tables. Either \code{data.frame} or \code{data.table} (significantly faster if your project is large). Default \code{data.table}.
#' @return SQM object containing the parsed project. If more than one path is provided in \code{project_path} this function will return a SQMbunch object instead. The structure of this object is similar to that of a SQMlite object (see \code{\link{loadSQMlite}}) but with an extra entry named \code{projects} that contains one SQM object for input project. SQM and SQMbunch objects will otherwise behave similarly when used with the subset and plot functions from this package.
#' @examples
#' \dontrun{
#' ## (outside R)
#' ## Run SqueezeMeta on the test data.
#'  /path/to/SqueezeMeta/scripts/SqueezeMeta.pl -p Hadza -f raw -m coassembly -s test.samples
#' ## Now go into R.
#' library(SQMtools)
#' Hadza = loadSQM("Hadza") # Where Hadza is the path to the SqueezeMeta output directory.
#' }
#'
#' data(Hadza) # We will illustrate the structure of the SQM object on the test data
#' # Which are the ten most abundant KEGG IDs in our data?
#' topKEGG = names(sort(rowSums(Hadza$functions$KEGG$tpm), decreasing=TRUE))[1:11]
#' topKEGG = topKEGG[topKEGG!="Unclassified"]
#' # Which functions do those KEGG IDs represent?
#' Hadza$misc$KEGG_names[topKEGG]
#' # What is the relative abundance of the Negativicutes class across samples?
#' Hadza$taxa$class$percent["Negativicutes",]
#' # Which information is stored in the orf, contig and bin tables?
#' colnames(Hadza$orfs$table)
#' colnames(Hadza$contigs$table)
#' colnames(Hadza$bins$table)
#' # What is the GC content distribution of my metagenome?
#' boxplot(Hadza$contigs$table[,"GC perc"]) # Not weighted by contig length or abundance!
#' @export
loadSQM = function(project_path, tax_mode = 'prokfilter', trusted_functions_only = FALSE, single_copy_genes = 'MGOGs', load_sequences = TRUE, engine = 'data.table')
    {
    if(length(project_path) == 1)
        {
        SQM = loadSQM_(project_path, tax_mode, trusted_functions_only, single_copy_genes, load_sequences, engine)
    } else
        {
        projs = list()
        for(p in project_path)
            {
            message(sprintf('Loading project in %s', p))
	    projs = c(projs, list(loadSQM_(p, tax_mode, trusted_functions_only, single_copy_genes, load_sequences, engine)))
    }
        SQM = combineSQM(projs)
        }
    return(SQM)
    }


#' @importFrom utils read.table tail
#' @importFrom stats aggregate
loadSQM_ = function(project_path, tax_mode = 'prokfilter', trusted_functions_only = FALSE, single_copy_genes = 'MGOGs', load_sequences = TRUE, engine = 'data.table')
    {
    if(!tax_mode %in% c('allfilter', 'prokfilter', 'nofilter'))
        {
        stop('tax_mode must be either "allfilter" (apply minimum identity threshold for all taxa), "prokfilter" (don\'t apply thresholds to Eukaryotes) or "nofilter" (don\'t apply thresholds at all).')
        }

    if(!single_copy_genes %in% c('RecA', 'MGOGs', 'MGKOs', 'USiCGs'))
        {
        stop('single_copy_genes must be either "RecA", "MGOGs", "MGKOs", or "USiCGs"')
        }

    if(!engine %in% c('data.frame', 'data.table'))
        {
        stop('engine must be either "data.frame" or "data.table"')
        }

    project_name = tail(unlist(strsplit(project_path, split='/')), 1)
    zipmode = endsWith(project_name, '.zip')
    project_name = gsub('.zip','',project_name)

    ### Check that this is a valid SQM project.
    if(!file.exists.zip(project_path, 'SqueezeMeta_conf.pl'))
        {
        stop(sprintf('Directory "%s" does not seem to contain a valid SqueezeMeta project', project_path))
        }


    ### Check whether this project was created with an old version SQM
    sqmtools_version = sprintf('v%s', .__NAMESPACE__.$spec['version'])
    
    if(file.exists.zip(project_path, sprintf('results/20.%s.contigtable', project_name)))
        {
        stop('Your project was created with a version of SqueezeMeta prior to 1.5. Running utils/versionchange.pl might fix this.')
    } else if(!file.exists.zip(project_path,'creator.txt'))
        {
	warning(sprintf('Your project was created with SqueezeMeta v1.5, while this is SQMtools %s. You can ignore this message if things are working fine for you, but if you experience any issue consider using the right version of SQMtools for this project', sqmtools_version)
	    )
    } else {
	conn = open_conn_zip(project_path, 'creator.txt')
	project_version = gsub(',','',scan(conn, what = 'character', quiet = T)[2])
	close(conn)
        if(project_version != sqmtools_version)
	    {
	    warning(sprintf('Your project was created with SqueezeMeta %s, while this is SQMtools %s. You can ignore this message if things are working fine for you, but if you experience any issue consider using the right version of SQMtools for this project', project_version, sqmtools_version))	
            }
	}


    ### Check whether we need to create the projectdir/results/tables directory.
    if(zipmode)
        {
        if(!file.exists.zip(project_path, sprintf('results/tables/%s.superkingdom.%s.abund.tsv', project_name, tax_mode)))
	    { stop('Projects provided as zip files must contain the results/tables directories. This should have been generated by sqm2zip.py.')}
        warning('You are trying to load a zipped project. If this doesn\'t work, see workaround in https://github.com/jtamames/SqueezeMeta/issues/755.')
        }
    else if(!file.exists(sprintf('%s/results/tables/%s.superkingdom.%s.abund.tsv', project_path, project_name, tax_mode)))
        {
	message(sprintf('Generating tabular outputs for project in %s', project_path))
        lines = readLines(sprintf('%s/SqueezeMeta_conf.pl', project_path))
	lines = trimws(lines)
        lines = gsub(pattern = "#[^\\\n]*", replacement = "", x = lines) # Remove comments.
	install_path = lines[grepl("^\\$installpath",lines)]
	if(length(install_path)==0) { stop(sprintf('$installpath can not be found in conf file \"%s/SqueezeMeta_conf.pl\".', project_path)) }
        install_path = gsub(';|\"', '', trimws(strsplit(install_path, split = '=')[[1]][2]))
	if(trusted_functions_only)
            {
            extra_args = '--trusted-functions'
        }else
            {
            extra_args = ''
	    }
	command = sprintf('%s/utils/sqm2tables.py %s %s/results/tables %s', install_path, project_path, project_path, extra_args)
	ecode = system(command)
	if(ecode != 0) { stop('An error occurred while running sqm2tables.py') }
	}

    ### Do stuff.
    SQM                          = list()
    
    SQM$misc                     = list()
    SQM$misc$project_name        = project_name
    SQM$misc$single_copy_genes   = single_copy_genes

    message('Loading total reads')
    conn            = open_conn_zip(project_path, sprintf('results/10.%s.mappingstat', project_name))
    lines           = readLines(conn)
    close(conn)
    evilLines       = (substr(lines,1,1) == '#' & substr(lines,1,8) != '# Sample') | substr(lines,1,1) == ''
    lines           = lines[!evilLines]
    SQM$total_reads = as.matrix(
                                read.table(text = lines,
                                           header=TRUE, sep='\t', row.names=1, comment.char='')
                               )[,'Total.reads']

    message('Loading orfs')
    SQM$orfs                     = list()

    message('    table...')        # option to remove table from memory after getting abund & TPM?
    SQM$orfs$table               = read.generic.table.zip(project_path, sprintf('results/13.%s.orftable', project_name), engine = engine,
                                                          header=TRUE, sep='\t', row.names=1, quote='', comment.char='', skip=1, as.is=TRUE, check.names=FALSE)
    message('    abundances...')
    SQM$orfs$abund               = as.matrix(SQM$orfs$table[,grepl('Raw read count', colnames(SQM$orfs$table)),drop=FALSE])
    colnames(SQM$orfs$abund)     = gsub('Raw read count ', '', colnames(SQM$orfs$abund), fixed=TRUE)
    storage.mode(SQM$orfs$abund) = 'numeric'
    SQM$orfs$bases               = as.matrix(SQM$orfs$table[,grepl('Raw base count', colnames(SQM$orfs$table)),drop=FALSE])
    colnames(SQM$orfs$bases)     = gsub('Raw base count ', '', colnames(SQM$orfs$abund), fixed=TRUE)
    storage.mode(SQM$orfs$bases) = 'numeric'
    SQM$orfs$cov                 = as.matrix(SQM$orfs$table[,grepl('Coverage', colnames(SQM$orfs$table)),drop=FALSE])
    colnames(SQM$orfs$cov)       = gsub('Coverage ', '', colnames(SQM$orfs$cov), fixed=TRUE)
    SQM$orfs$cpm                 = t(t(SQM$orfs$cov) / (SQM$total_reads / 1000000))
    SQM$orfs$tpm                 = as.matrix(SQM$orfs$table[,grepl('TPM', colnames(SQM$orfs$table)),drop=FALSE])
    colnames(SQM$orfs$tpm)       = gsub('TPM ', '', colnames(SQM$orfs$tpm), fixed=TRUE)
    SQM$misc$samples             = colnames(SQM$orfs$abund)

    goodORFcols                  = colnames(SQM$orfs$table)[!grepl('Raw read|TPM|Coverage|Tax|Raw base', colnames(SQM$orfs$table))]
    goodORFcols                  = c('startPos', 'endPos', goodORFcols)
    start_end                    = sapply(strsplit(rownames(SQM$orfs$table), split='_'), FUN=function(x) x[length(x)])
    SQM$orfs$table$startPos      = as.numeric(sapply(strsplit(start_end, split = '-'), FUN=function(x) x[1]))
    SQM$orfs$table$endPos        = as.numeric(sapply(strsplit(start_end, split = '-'), FUN=function(x) x[2]))
    SQM$orfs$table               = SQM$orfs$table[,goodORFcols]

    if(load_sequences)
        { 
        message('    sequences...')
        SQM$orfs$seqs            = read.namedvector.zip(project_path, sprintf('results/tables/%s.orf.sequences.tsv', project_name), engine=engine)
        }
    
    message('    taxonomy...')
    SQM$orfs$tax                 = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.orf.tax.%s.tsv', project_name, tax_mode), engine = engine,
							            header=TRUE, row.names=1, sep='\t'))
    if(file.exists.zip(project_path, sprintf('results/tables/%s.orf.16S.tsv', project_name)))
        {
        SQM$orfs$tax16S = read.namedvector.zip(project_path, sprintf('results/tables/%s.orf.16S.tsv', project_name), engine=engine)
        }

    if(file.exists.zip(project_path, sprintf('results/18.%s.bintable', project_name)) & file.exists.zip(project_path, sprintf('results/tables/%s.orf.marker.genes.tsv', project_name)))
        {
        message('    checkm markers...')
        conn                     = open_conn_zip(project_path, sprintf('results/tables/%s.orf.marker.genes.tsv', project_name))
        lines                    = readLines(conn)
        close(conn)
	lines                    = strsplit(lines, split='\t')
	SQM$orfs$markers         = sapply(lines, FUN = function(x) strsplit(x[2], split=','))
	names(SQM$orfs$markers)  = sapply(lines, FUN = function(x) x[1]) # this is also the same ORF order as in SQM$orfs$table 
	}

    # THIS SHOULD NOT BE NEEDED ANYMORE (BUT ALAS IT IS! >_<)
    SQM$orfs$table               = SQM$orfs$table[rownames(SQM$orfs$table) %in% rownames(SQM$orfs$tax),]
    SQM$orfs$abund               = SQM$orfs$abund[rownames(SQM$orfs$table),,drop=FALSE]
    SQM$orfs$tpm                 = SQM$orfs$tpm[rownames(SQM$orfs$table),,drop=FALSE]
    if(load_sequences)
        {
        SQM$orfs$seqs            = SQM$orfs$seqs[rownames(SQM$orfs$table)[rownames(SQM$orfs$table) %in% names(SQM$orfs$seqs)]]
        }

    message('Loading contigs')
    SQM$contigs                   = list()

    message('    table...')                    # option to remove table from memory after getting abund & TPM?
    SQM$contigs$table             = read.generic.table.zip(project_path, sprintf('results/19.%s.contigtable', project_name), engine = engine,
                                                           header=TRUE, sep='\t', row.names=1, quote='', comment.char='', skip=1, as.is=TRUE, check.names=FALSE)
    
    message('    abundances...') 
    # In issue #589 we found out that bin bases and coverages were not being calculated correctly
    # This was because data.table::fread was stoif(bycol) { data = t(data) }ring some columns containing large values as integer64
    # When casted from generic.data.frame to matrix with as.matrix, some integer64 became very small numerics (like e-318)
    # In theory data.table::fread has the option `integer64="double"` to avoid integer64 being created in the first place
    # But it fails in some cases as described in https://github.com/Rdatatable/data.table/issues/2607
    # Thus I manually cast all columns containing abund and bases to numeric before casting to matrixz
    abundcols = colnames(SQM$contigs$table)[grepl('Raw read count', colnames(SQM$contigs$table), fixed=TRUE)]
    abund = SQM$contigs$table[,abundcols,drop=FALSE]
    for(col in abundcols) { abund[,col] = as.numeric(abund[,col]) }
    abund = as.matrix(abund)
    colnames(abund) = colnames(abund) = gsub('Raw read count ', '', abundcols, fixed = TRUE)
    SQM$contigs$abund             = abund

    basescols = colnames(SQM$contigs$table)[grepl('Raw base count', colnames(SQM$contigs$table), fixed=TRUE)]
    bases = SQM$contigs$table[,basescols,drop=FALSE]
    for(col in basescols) { bases[,col] = as.numeric(bases[,col]) }
    bases = as.matrix(bases)
    colnames(bases) = gsub('Raw base count ', '', basescols, fixed = TRUE)
    SQM$contigs$bases             = bases

    # I treat the cov, cpm and tpm matrices normally, as I don't expect them to contain very large values
    SQM$contigs$cov               = as.matrix(SQM$contigs$table[,grepl('Coverage', colnames(SQM$contigs$table)),drop=FALSE])
    colnames(SQM$contigs$cov)     = gsub('Coverage ', '', colnames(SQM$contigs$cov), fixed=TRUE)
    SQM$contigs$cpm               = t(t(SQM$contigs$cov) / (SQM$total_reads / 1000000))
    SQM$contigs$tpm               = as.matrix(SQM$contigs$table[,grepl('TPM', colnames(SQM$contigs$table)),drop=FALSE])
    colnames(SQM$contigs$tpm)     = gsub('TPM ', '', colnames(SQM$contigs$tpm), fixed=TRUE)

    goodContigCols                = colnames(SQM$contigs$table)[!grepl('Raw read|TPM|Coverage|Tax|Raw base', colnames(SQM$contigs$table))]
    SQM$contigs$table             = SQM$contigs$table[,goodContigCols,drop=FALSE]

    if(load_sequences)
        {
        message('    sequences...')
        SQM$contigs$seqs          = read.namedvector.zip(project_path, sprintf('results/tables/%s.contig.sequences.tsv', project_name), engine=engine)
        SQM$contigs$seqs          = SQM$contigs$seqs[rownames(SQM$contigs$table)]
        }

    message('    taxonomy...')
    SQM$contigs$tax               = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.contig.tax.%s.tsv', project_name, tax_mode), engine = engine,
                                                                     header=TRUE, row.names=1, sep='\t'))
    SQM$contigs$tax               = SQM$contigs$tax[rownames(SQM$contigs$table),,drop=FALSE]

					  
    if(file.exists.zip(project_path, sprintf('results/18.%s.bintable', project_name)))
        {
        message('    binning info...')
        inBins                    = read.generic.table.zip(project_path, sprintf('intermediate/18.%s.contigsinbins', project_name), engine = 'data.frame',
                                                           header=TRUE, sep='\t', quote='', comment.char='', skip=1, as.is=TRUE)
        binmethods                = unique(inBins$Method)

        if(max(table(paste(inBins$X..Contig, inBins$Method)))>1) { 
	    warning('Some contigs are assigned to more than one bin with the same method, something went wrong...')
	}

        SQM$contigs$bins          = matrix(NA, nrow = nrow(inBins), ncol = length(binmethods),
					   dimnames = list(inBins$X..Contig, binmethods))

        for(bm in binmethods)
	    {
            ibm = inBins[inBins$Method == bm,]
            SQM$contigs$bins[ibm$X..Contig,] = ibm$Bin.ID
            }
 
        notInBins                 = setdiff(rownames(SQM$contigs$table), rownames(SQM$contigs$bins))
        notInBins                 = matrix(NA, nrow=length(notInBins), ncol=ncol(SQM$contigs$bins), dimnames=list(notInBins, colnames(SQM$contigs$bins)))
        SQM$contigs$bins          = rbind(SQM$contigs$bins, notInBins)
        SQM$contigs$bins          = SQM$contigs$bins[rownames(SQM$contigs$table),,drop=FALSE]
        SQM$contigs$bins[is.na(SQM$contigs$bins)] = 'No_bin'
        message('Loading bins')
        message('    table...')
        SQM$bins                  = list()
        SQM$bins$table            = read.generic.table.zip(project_path, sprintf('results/18.%s.bintable', project_name),
                                                           engine = 'data.frame', header=TRUE, sep='\t', row.names=1,
                                                           quote='', comment.char='', skip=1, as.is=TRUE, check.names=FALSE)
	message('    abundances...')
	bin_abunds                = get.bin.abunds(SQM, track_unmapped=TRUE)
        SQM$bins$abund            = bin_abunds[['abund']]
        SQM$bins$percent          = bin_abunds[['percent']]
        SQM$bins$bases            = bin_abunds[['bases']]
        SQM$bins$length           = bin_abunds[['length']]
        SQM$bins$cov              = bin_abunds[['cov']]
        SQM$bins$cpm              = bin_abunds[['cpm']]
	
        message('    taxonomy...')
        SQM$bins$tax              = as.matrix(read.generic.table.zip(project_path,
                                                                     sprintf('results/tables/%s.bin.tax.tsv', project_name),
                                                                     engine = 'data.frame',
							             header=TRUE, row.names=1, sep='\t'))
        if('Tax GTDB-Tk' %in% colnames(SQM$bins$table))
            {
            goodcols = c('Method', 'Num contigs', 'GC perc', 'Tax 16S', 'Tax GTDB-Tk')
        } else 
            {
            goodcols = c('Method', 'Num contigs', 'GC perc', 'Tax 16S')
            }
        goodcols = c(goodcols, c('Disparity', 'Completeness','Contamination'))
	SQM$bins$table            = SQM$bins$table[,goodcols,drop=FALSE]
    } else
        {
	warning('    There are no binning results in your project. Skipping...')
	}
    message('Loading taxonomies')                                   
    SQM$taxa                      = list()
    SQM$taxa$superkingdom         = list()
    SQM$taxa$phylum               = list()
    SQM$taxa$class                = list()
    SQM$taxa$order                = list()
    SQM$taxa$family               = list()
    SQM$taxa$genus                = list()
    SQM$taxa$species              = list()
    

    SQM$taxa$superkingdom$abund   = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.superkingdom.%s.abund.tsv', project_name, tax_mode),
                                                                     header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$phylum$abund         = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.phylum.%s.abund.tsv'      , project_name, tax_mode),
                                                                     header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$class$abund          = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.class.%s.abund.tsv'       , project_name, tax_mode),
                                                                     header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$order$abund          = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.order.%s.abund.tsv'       , project_name, tax_mode),
                                                                     header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$family$abund         = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.family.%s.abund.tsv'      , project_name, tax_mode),
                                                                     header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$genus$abund          = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.genus.%s.abund.tsv'       , project_name, tax_mode),
                                                                     header=TRUE, sep='\t', row.names=1, check.names=FALSE))
    SQM$taxa$species$abund        = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.species.%s.abund.tsv'     , project_name, tax_mode),
                                                                     header=TRUE, sep='\t', row.names=1, check.names=FALSE))
                                                          
     
    SQM$taxa$superkingdom$percent = 100 * t(t(SQM$taxa$superkingdom$abund) / colSums(SQM$taxa$superkingdom$abund))
    SQM$taxa$phylum$percent       = 100 * t(t(SQM$taxa$phylum$abund)       / colSums(SQM$taxa$phylum$abund      ))
    SQM$taxa$class$percent        = 100 * t(t(SQM$taxa$class$abund)        / colSums(SQM$taxa$class$abund       )) 
    SQM$taxa$order$percent        = 100 * t(t(SQM$taxa$order$abund)        / colSums(SQM$taxa$order$abund       ))
    SQM$taxa$family$percent       = 100 * t(t(SQM$taxa$family$abund)       / colSums(SQM$taxa$family$abund      ))
    SQM$taxa$genus$percent        = 100 * t(t(SQM$taxa$genus$abund)        / colSums(SQM$taxa$genus$abund       ))
    SQM$taxa$species$percent      = 100 * t(t(SQM$taxa$species$abund)      / colSums(SQM$taxa$species$abund     ))

    
    SQM$misc$tax_names_long                 = list()

    SQM$misc$tax_names_long$superkingdom    = rownames(SQM$tax$superkingdom$abund)
    names(SQM$misc$tax_names_long$superkingdom) = gsub('^k_', '', SQM$misc$tax_names_long$superkingdom)

    SQM$misc$tax_names_long$phylum          = rownames(SQM$tax$phylum$abund)
    names(SQM$misc$tax_names_long$phylum)   = sapply(strsplit(SQM$misc$tax_names_long$phylum,  split=';p_'), FUN = function(x) x[2])

    SQM$misc$tax_names_long$class           = rownames(SQM$tax$class$abund)
    names(SQM$misc$tax_names_long$class)    = sapply(strsplit(SQM$misc$tax_names_long$class,   split=';c_'), FUN = function(x) x[2])

    SQM$misc$tax_names_long$order           = rownames(SQM$tax$order$abund)
    names(SQM$misc$tax_names_long$order)    = sapply(strsplit(SQM$misc$tax_names_long$order,   split=';o_'), FUN = function(x) x[2])

    SQM$misc$tax_names_long$family          = rownames(SQM$tax$family$abund)
    names(SQM$misc$tax_names_long$family)   = sapply(strsplit(SQM$misc$tax_names_long$family,  split=';f_'), FUN = function(x) x[2])

    SQM$misc$tax_names_long$genus           = rownames(SQM$tax$genus$abund)
    names(SQM$misc$tax_names_long$genus)    = sapply(strsplit(SQM$misc$tax_names_long$genus,   split=';g_'), FUN = function(x) x[2])

    SQM$misc$tax_names_long$species         = rownames(SQM$tax$species$abund)
    names(SQM$misc$tax_names_long$species)  = sapply(strsplit(SQM$misc$tax_names_long$species, split=';s_'), FUN = function(x) x[2]) 

    SQM$misc$tax_names_short                = unlist(lapply(SQM$misc$tax_names_long, names))
    names(SQM$misc$tax_names_short)         = unlist(SQM$misc$tax_names_long)


    # Use short names for taxonomy tables, since it makes it easier to search for specific taxa
    rownames(SQM$taxa$superkingdom$abund)   = SQM$misc$tax_names_short[rownames(SQM$taxa$superkingdom$abund)]
    rownames(SQM$taxa$phylum$abund)         = SQM$misc$tax_names_short[rownames(SQM$taxa$phylum$abund)]
    rownames(SQM$taxa$class$abund)          = SQM$misc$tax_names_short[rownames(SQM$taxa$class$abund)]
    rownames(SQM$taxa$order$abund)          = SQM$misc$tax_names_short[rownames(SQM$taxa$order$abund)]
    rownames(SQM$taxa$family$abund)         = SQM$misc$tax_names_short[rownames(SQM$taxa$family$abund)]
    rownames(SQM$taxa$genus$abund)          = SQM$misc$tax_names_short[rownames(SQM$taxa$genus$abund)]
    rownames(SQM$taxa$species$abund)        = SQM$misc$tax_names_short[rownames(SQM$taxa$species$abund)]

    rownames(SQM$taxa$superkingdom$percent) = SQM$misc$tax_names_short[rownames(SQM$taxa$superkingdom$percent)]
    rownames(SQM$taxa$phylum$percent)       = SQM$misc$tax_names_short[rownames(SQM$taxa$phylum$percent)]
    rownames(SQM$taxa$class$percent)        = SQM$misc$tax_names_short[rownames(SQM$taxa$class$percent)]
    rownames(SQM$taxa$order$percent)        = SQM$misc$tax_names_short[rownames(SQM$taxa$order$percent)]
    rownames(SQM$taxa$family$percent)       = SQM$misc$tax_names_short[rownames(SQM$taxa$family$percent)]
    rownames(SQM$taxa$genus$percent)        = SQM$misc$tax_names_short[rownames(SQM$taxa$genus$percent)]
    rownames(SQM$taxa$species$percent)      = SQM$misc$tax_names_short[rownames(SQM$taxa$species$percent)]

    # Now the rownames are a character vector which actually has its own names inside. Some R packages don't like this, so we correct it.
    for(taxlevel in names(SQM$taxa))
        {
        for(counts in names(SQM$taxa[[taxlevel]]))
            {
            names(rownames(SQM$taxa[[taxlevel]][[counts]])) = NULL
            }
        }


    message('Loading functions')

    result_files                 = strsplit(list.files.zip(project_path, 'results/tables'), '.', fixed=TRUE)
    external_annotation_results  = result_files[sapply(result_files, FUN=function(x) x[3]=='abund' & !x[2] %in% c('COG', 'KO', 'PFAM'))]
    SQM$misc$ext_annot_sources   = sapply(external_annotation_results, FUN=function(x) x[2])
 
    SQM$functions                  = list()
    ### KEGG
    if(file.exists.zip(project_path, sprintf('results/tables/%s.KO.abund.tsv', project_name)))
        {
        has_KEGG                               = TRUE
	SQM$functions$KEGG                     = list()
        SQM$functions$KEGG$abund               = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.KO.abund.tsv', project_name), engine = 'data.frame',
                                                                                  header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
        SQM$functions$KEGG$bases               = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.KO.bases.tsv', project_name), engine = 'data.frame',
                                                                                  header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
        SQM$functions$KEGG$cov                 = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.KO.cov.tsv',   project_name), engine = 'data.frame',
                                                                                  header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
	SQM$functions$KEGG$cpm                 = t(t(SQM$functions$KEGG$cov) / (SQM$total_reads / 1000000))

        SQM$functions$KEGG$tpm                 = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.KO.tpm.tsv',   project_name), engine = 'data.frame',
                                                                                  header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
	funinfo                                = read.generic.table.zip(project_path, sprintf('results/tables/%s.KO.names.tsv', project_name), engine = 'data.frame',
                                                                        header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote='', as.is=TRUE)
	SQM$misc$KEGG_names                    = funinfo[,1]
	names(SQM$misc$KEGG_names)             = rownames(funinfo)
        SQM$misc$KEGG_paths                    = funinfo[,2]
        names(SQM$misc$KEGG_paths)             = rownames(funinfo)
    }else
        {
        warning('    There are no KEGG results in your project. Skipping...')
        has_KEGG                               = FALSE
        }
    ### COG              
    if(file.exists.zip(project_path, sprintf('results/tables/%s.COG.abund.tsv', project_name)))
        {
        has_COG = TRUE
        SQM$functions$COG                      = list()
        SQM$functions$COG$abund                = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.COG.abund.tsv', project_name), engine = 'data.frame',
                                                                                  header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
        SQM$functions$COG$bases                = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.COG.bases.tsv', project_name), engine = 'data.frame',
                                                                                  header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
        SQM$functions$COG$cov                  = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.COG.cov.tsv',   project_name), engine = 'data.frame',
                                                                                  header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
	SQM$functions$COG$cpm                  = t(t(SQM$functions$COG$cov) / (SQM$total_reads / 1000000))

        SQM$functions$COG$tpm                  = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.COG.tpm.tsv',   project_name), engine = 'data.frame',
                                                                                  header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
        funinfo                                = read.generic.table.zip(project_path, sprintf('results/tables/%s.COG.names.tsv', project_name), engine = 'data.frame',
                                                                        header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote='', as.is=TRUE)
        SQM$misc$COG_names                     = funinfo[,1]
        names(SQM$misc$COG_names)              = rownames(funinfo)
        SQM$misc$COG_paths                     = funinfo[,2]
        names(SQM$misc$COG_paths)              = rownames(funinfo)

    }else
        {
        warning('    There are no COG results in your project. Skipping...')
        has_COG                            = FALSE
        }
    ### PFAM
    if(file.exists.zip(project_path, sprintf('results/tables/%s.PFAM.abund.tsv', project_name)))
        {
        has_PFAM = TRUE
        SQM$functions$PFAM                 = list()
        SQM$functions$PFAM$abund           = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.PFAM.abund.tsv', project_name), engine = 'data.frame',
                                                                              header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
        SQM$functions$PFAM$bases           = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.PFAM.bases.tsv', project_name), engine = 'data.frame',
                                                                              header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
        SQM$functions$PFAM$cov             = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.PFAM.cov.tsv',   project_name), engine = 'data.frame',
                                                                              header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
	SQM$functions$PFAM$cpm             = t(t(SQM$functions$PFAM$cov) / (SQM$total_reads / 1000000))
        SQM$functions$PFAM$tpm             = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.PFAM.tpm.tsv',   project_name), engine = 'data.frame',
                                                                              header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
    }else
        {
        warning('    There are no PFAM results in your project. Skipping...')
        has_PFAM                           = FALSE
        }

    ### EXTERNAL ANNOTATION SOURCES
    for(method in SQM$misc$ext_annot_sources)
        {
        SQM$functions[[method]] = list()
	SQM$functions[[method]]$abund      = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.%s.abund.tsv', project_name, method), engine = 'data.frame',
                                                                              header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
        SQM$functions[[method]]$bases      = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.%s.bases.tsv', project_name, method), engine = 'data.frame',
                                                                              header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
        SQM$functions[[method]]$cov        = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.%s.cov.tsv',   project_name, method), engine = 'data.frame',
                                                                              header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
	SQM$functions[[method]]$cpm        = t(t(SQM$functions[[method]]$cov) / (SQM$total_reads / 1000000))

        SQM$functions[[method]]$tpm        = as.matrix(read.generic.table.zip(project_path, sprintf('results/tables/%s.%s.tpm.tsv',   project_name, method), engine = 'data.frame',
                                                                              header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote=''))
	funinfo                            = read.generic.table.zip(project_path, sprintf('results/tables/%s.%s.names.tsv', project_name, method), engine = 'data.frame',
                                                                    header=TRUE, sep='\t', row.names=1, check.names=FALSE, comment.char='', quote='', as.is=TRUE)
	field                              = sprintf('%s_names', method)
	SQM$misc[[field]]                  = funinfo[,1]
        names(SQM$misc[[field]])           = rownames(funinfo)	
        }

    ### CODING FRACTION FOR CORRECTING TPMs
    SQM$misc$coding_fraction               = list()
    for(method in names(SQM$functions))
        {
        #SQM$misc$coding_fraction[[method]] = 1 - (SQM$functions[[method]]$tpm['Unmapped',] / colSums(SQM$functions[[method]]$tpm))
	SQM$misc$coding_fraction[[method]] = 1 # this was used to correct TPMs ignoring the effect of unmapped reads, but is not needed anymore since we don't track unmapped reads for functions anymore
        }

    ### COPY NUMBERS
    SQM$misc$single_copy_cov = get_median_single_copy_cov(SQM)
    
    if(!any(is.na(SQM$misc$single_copy_cov)))
        {
        message(sprintf('    Using %s for copy number normalization', SQM$misc$single_copy_genes))
        for(method in names(SQM$functions))
            {
            SQM$functions[[method]]$copy_number = t(t(SQM$functions[[method]]$cov) / SQM$misc$single_copy_cov)
            }
        }

    class(SQM)      = 'SQM'

    return(SQM)

    }

Try the SQMtools package in your browser

Any scripts or data that you put into this service are public.

SQMtools documentation built on April 3, 2025, 6:16 p.m.