Nothing
#' Combine several SQM objects
#'
#' Combine an arbitrary number of SQM objects into a single SQM object (if the input objects contain the same samples, i.e. they come from the same SqueezeMeta run) or a single SQMbunch object. For combining results from sqm_reads.pl or sqm_longreads.pl please check \code{\link{combineSQMlite}}. The parameters below (other than ...) will take only effect if the input objects contain the same samples. Otherwise the input objects will be taken as they are, with no recalculation of taxonomy, function or rescaling,
#' @param ... an arbitrary number of SQM objects. Alternatively, a single list containing an arbitrary number of SQM objects.
#' @param tax_source character. Features used for calculating aggregated abundances at the different taxonomic ranks. Either \code{"orfs"} or \code{"contigs"} (default \code{"orfs"}). If the objects being combined contain a subset of taxa or bins, this parameter can be set to \code{TRUE}.
#' @param trusted_functions_only logical. If \code{TRUE}, only highly trusted functional annotations (best hit + best average) will be considered when generating aggregated function tables. If \code{FALSE}, best hit annotations will be used (default \code{FALSE}).
#' @param ignore_unclassified_functions logical. If \code{FALSE}, ORFs with no functional classification will be aggregated together into an "Unclassified" category. If \code{TRUE}, they will be ignored (default \code{FALSE}).
#' @param rescale_tpm logical. If \code{TRUE}, TPMs for KEGGs, COGs, and PFAMs will be recalculated (so that the TPMs in the subset actually add up to 1 million). Otherwise, per-function TPMs will be calculated by aggregating the TPMs of the ORFs annotated with that function, and will thus keep the scaling present in the parent object (default \code{TRUE}).
#' @param rescale_copy_number logical. If \code{TRUE}, copy numbers with be recalculated using the RecA/RadA coverages in the subset. Otherwise, RecA/RadA coverages will be taken from the parent object with the highest RecA/RadA coverages. By default it is set to \code{TRUE}, which means that the returned copy numbers will represent the average copy number per function \emph{in the genomes of the selected bins or contigs}. If any SQM objects that are being combined contain a functional subset rather than a contig/bins subset, this parameter should be set to \code{FALSE}.
#' @param recalculate_bin_stats logical. If \code{TRUE}, bin stats and taxonomy are recalculated based on the contigs present in the subsetted object (default \code{TRUE}).
#' @return A SQM or SQMbunch object
#' @seealso \code{\link{subsetFun}}, \code{\link{subsetTax}}, \code{\link{combineSQMlite}}
#' @examples
#' data(Hadza)
#' # Select Carbohydrate metabolism ORFs in Bacteroidota,
#' # and Amino acid metabolism ORFs in Proteobacteria
#' bact = subsetTax(Hadza, "phylum", "Bacteroidota")
#' bact.carb = subsetFun(bact, "Carbohydrate metabolism")
#' baci = subsetTax(Hadza, "phylum", "Bacillota")
#' baci.amins = subsetFun(baci, "Amino acid metabolism")
#' bact.carb_proteo.amins = combineSQM(bact.carb, baci.amins, rescale_copy_number=FALSE)
#' @export
combineSQM = function(..., tax_source = 'orfs', trusted_functions_only = FALSE, ignore_unclassified_functions = FALSE, rescale_tpm = TRUE, rescale_copy_number = TRUE, recalculate_bin_stats = TRUE)
{
inSQM = list(...)
### if there is only one argument and this argument is a list, treat it as a list containing SQM objects
if(length(inSQM) == 1 & inherits(inSQM[[1]], 'list')) { inSQM = list(...)[[1]] }
### check whether the input objects come from the same project or not
projNames = sapply(inSQM, FUN=function(proj) proj$misc$project_name)
if(length(unique(projNames))==1)
{
# intermediate function so that we can pass extra args to combineSQM
myFun = function(SQM1, SQM2) combineSQM_(SQM1, SQM2, tax_source, trusted_functions_only, ignore_unclassified_functions, rescale_tpm, rescale_copy_number, recalculate_bin_stats)
combSQM = Reduce(myFun, inSQM)
} else
{
combSQM = combineSQMlite(inSQM)
combSQM$projects = inSQM
names(combSQM$projects) = projNames
class(combSQM) = 'SQMbunch'
}
return(combSQM)
}
#' @importFrom stats aggregate
combineSQM_ = function(SQM1, SQM2, tax_source = 'orfs', trusted_functions_only = FALSE, ignore_unclassified_functions = FALSE, rescale_tpm = TRUE, rescale_copy_number = TRUE, recalculate_bin_stats = TRUE)
{
if(!inherits(SQM1, 'SQM') | !inherits(SQM2, 'SQM')) { stop('This function only accepts SQM objects') }
if (!identical(colnames(SQM1$orfs$table), colnames(SQM2$orfs$table)))
{
stop('The input objects do not seem to come from the same SQM project')
}
combSQM = SQM1
### ORFs
extraORFs = setdiff(rownames(SQM2$orfs$table), rownames(SQM1$orfs$table))
# Table
combSQM$orfs$table = rbind(combSQM$orfs$table, SQM2$orfs$table[extraORFs,,drop=FALSE])
combSQM$orfs$table = combSQM$orfs$table[sort(rownames(combSQM$orfs$table)),,drop=FALSE]
# Abundances
combSQM$orfs$abund = rbind(combSQM$orfs$abund, SQM2$orfs$abund[extraORFs,,drop=FALSE])
combSQM$orfs$abund = combSQM$orfs$abund[rownames(combSQM$orfs$table),,drop=FALSE]
combSQM$orfs$bases = rbind(combSQM$orfs$bases, SQM2$orfs$bases[extraORFs,,drop=FALSE])
combSQM$orfs$bases = combSQM$orfs$bases[rownames(combSQM$orfs$table),,drop=FALSE]
combSQM$orfs$cov = rbind(combSQM$orfs$cov, SQM2$orfs$cov[extraORFs,,drop=FALSE])
combSQM$orfs$cov = combSQM$orfs$cov[rownames(combSQM$orfs$table),,drop=FALSE]
combSQM$orfs$cpm = t(t(combSQM$orfs$cov) / (combSQM$total_reads / 1000000))
combSQM$orfs$tpm = rbind(combSQM$orfs$tpm, SQM2$orfs$tpm[extraORFs,,drop=FALSE])
combSQM$orfs$tpm = combSQM$orfs$tpm[rownames(combSQM$orfs$table),,drop=FALSE]
# Sequences
if(!is.null(combSQM$orfs$seqs))
{
combSQM$orfs$seqs = c(combSQM$orfs$seqs, SQM2$orfs$seqs[extraORFs])
combSQM$orfs$seqs = combSQM$orfs$seqs[rownames(combSQM$orfs$table)]
}
# Taxonomy
combSQM$orfs$tax = rbind(combSQM$orfs$tax, SQM2$orfs$tax[extraORFs,,drop=FALSE])
combSQM$orfs$tax = combSQM$orfs$tax[rownames(combSQM$orfs$table),,drop=FALSE]
if('tax16S' %in% names(combSQM$orfs))
{
combSQM$orfs$tax16S = c(combSQM$orfs$tax16S, SQM2$orfs$tax16S[extraORFs])
combSQM$orfs$tax16S = combSQM$orfs$tax16S[rownames(combSQM$orfs$table)]
}
# Markers
if('markers' %in% names(combSQM$orfs))
{
combSQM$orfs$markers = c(combSQM$orfs$markers, SQM2$orfs$markers[extraORFs])
combSQM$orfs$markers = combSQM$orfs$markers[rownames(combSQM$orfs$table)]
}
### Contigs
extraContigs = setdiff(rownames(SQM2$contigs$table), rownames(SQM1$contigs$table))
# Table
combSQM$contigs$table = rbind(combSQM$contigs$table, SQM2$contigs$table[extraContigs,,drop=FALSE])
combSQM$contigs$table = combSQM$contigs$table[sort(rownames(combSQM$contigs$table)),]
# Abundances
combSQM$contigs$abund = rbind(combSQM$contigs$abund, SQM2$contigs$abund[extraContigs,,drop=FALSE])
combSQM$contigs$abund = combSQM$contigs$abund[rownames(combSQM$contigs$table),,drop=FALSE]
combSQM$contigs$bases = rbind(combSQM$contigs$bases, SQM2$contigs$bases[extraContigs,,drop=FALSE])
combSQM$contigs$bases = combSQM$contigs$bases[rownames(combSQM$contigs$table),,drop=FALSE]
combSQM$contigs$cov = rbind(combSQM$contigs$cov, SQM2$contigs$cov[extraContigs,,drop=FALSE])
combSQM$contigs$cov = combSQM$contigs$cov[rownames(combSQM$contigs$table),,drop=FALSE]
combSQM$contigs$cpm = t(t(combSQM$contigs$cov) / (combSQM$total_reads / 1000000))
combSQM$contigs$tpm = rbind(combSQM$contigs$tpm, SQM2$contigs$tpm[extraContigs,,drop=FALSE])
combSQM$contigs$tpm = combSQM$contigs$tpm[rownames(combSQM$contigs$table),,drop=FALSE]
# Sequences
if(!is.null(combSQM$contigs$seqs))
{
combSQM$contigs$seqs = c(combSQM$contigs$seqs, SQM2$contigs$seqs[extraContigs])
combSQM$contigs$seqs = combSQM$contigs$seqs[rownames(combSQM$contigs$table)]
}
# Taxonomy
combSQM$contigs$tax = rbind(combSQM$contigs$tax, SQM2$contigs$tax[extraContigs,,drop=FALSE])
combSQM$contigs$tax = combSQM$contigs$tax[rownames(combSQM$contigs$table),,drop=FALSE]
# Binning info
if('bins' %in% names(combSQM))
{
combSQM$contigs$bins = rbind(combSQM$contigs$bins, SQM2$contigs$bins[extraContigs,,drop=FALSE])
combSQM$contigs$bins = combSQM$contigs$bins[rownames(combSQM$contigs$table),,drop=FALSE]
### Bins
if(recalculate_bin_stats & (!('tax16S' %in% names(combSQM$orfs)) & 'markers' %in% names(combSQM$orfs)))
{
warning('You requested to recalculate bin stats but 16S or marker gene info are missing. Will not recalculate bin stats')
}
# Table and Taxonomy
if(recalculate_bin_stats & ('tax16S' %in% names(combSQM$orfs) & 'markers' %in% names(combSQM$orfs)))
{
bin_stats = get.bin.stats(combSQM)
combSQM$bins$table = bin_stats[['table']]
combSQM$bins$tax = bin_stats[['tax']]
}else
{
extraBins = setdiff(rownames(SQM2$bins$table), rownames(SQM1$bins$table))
combSQM$bins$table = rbind(combSQM$bins$table, SQM2$bins$table[extraBins,,drop=FALSE])
combSQM$bins$table = combSQM$bins$table[sort(rownames(combSQM$bins$table)),,drop=FALSE]
combSQM$bins$tax = rbind(combSQM$bins$tax, SQM2$bins$tax[extraBins,,drop=FALSE])
combSQM$bins$tax = combSQM$bins$tax[rownames(combSQM$bins$table),,drop=FALSE]
}
# Abundances
bin_abunds = get.bin.abunds(combSQM)
combSQM$bins$abund = bin_abunds[['abund']]
combSQM$bins$percent = bin_abunds[['percent']]
combSQM$bins$bases = bin_abunds[['bases']]
combSQM$bins$length = bin_abunds[['length']]
combSQM$bins$cov = bin_abunds[['cov']]
combSQM$bins$cpm = bin_abunds[['cpm']]
}
### Taxonomy
combSQM$taxa$superkingdom$abund = aggregate_taxa(combSQM, 'superkingdom', tax_source)
combSQM$taxa$phylum$abund = aggregate_taxa(combSQM, 'phylum' , tax_source)
combSQM$taxa$class$abund = aggregate_taxa(combSQM, 'class' , tax_source)
combSQM$taxa$order$abund = aggregate_taxa(combSQM, 'order' , tax_source)
combSQM$taxa$family$abund = aggregate_taxa(combSQM, 'family' , tax_source)
combSQM$taxa$genus$abund = aggregate_taxa(combSQM, 'genus' , tax_source)
combSQM$taxa$species$abund = aggregate_taxa(combSQM, 'species' , tax_source)
combSQM$taxa$superkingdom$percent = 100 * t(t(combSQM$taxa$superkingdom$abund) / combSQM$total_reads) #colSums(combSQM$taxa$superkingdom$abund))
combSQM$taxa$phylum$percent = 100 * t(t(combSQM$taxa$phylum$abund) / combSQM$total_reads) #colSums(combSQM$taxa$phylum$abund))
combSQM$taxa$class$percent = 100 * t(t(combSQM$taxa$class$abund) / combSQM$total_reads) #colSums(combSQM$taxa$class$abund))
combSQM$taxa$order$percent = 100 * t(t(combSQM$taxa$order$abund) / combSQM$total_reads) #colSums(combSQM$taxa$order$abund))
combSQM$taxa$family$percent = 100 * t(t(combSQM$taxa$family$abund) / combSQM$total_reads) #colSums(combSQM$taxa$family$abund))
combSQM$taxa$genus$percent = 100 * t(t(combSQM$taxa$genus$abund) / combSQM$total_reads) #colSums(combSQM$taxa$genus$abund))
combSQM$taxa$species$percent = 100 * t(t(combSQM$taxa$species$abund) / combSQM$total_reads) #colSums(combSQM$taxa$species$abund))
### Functions
if('KEGG' %in% names(combSQM$functions))
{
KEGG = aggregate_fun(combSQM, 'KEGG', trusted_functions_only, ignore_unclassified_functions)
combSQM$functions$KEGG$abund = KEGG$abund
combSQM$functions$KEGG$bases = KEGG$bases
combSQM$functions$KEGG$cov = KEGG$cov
combSQM$functions$KEGG$cpm = t(t(combSQM$functions$KEGG$cov) / (combSQM$total_reads / 1000000))
}
if('COG' %in% names(combSQM$functions))
{
COG = aggregate_fun(combSQM, 'COG' , trusted_functions_only, ignore_unclassified_functions)
combSQM$functions$COG$abund = COG$abund
combSQM$functions$COG$bases = COG$bases
combSQM$functions$COG$cov = COG$cov
combSQM$functions$COG$cpm = t(t(combSQM$functions$COG$cov) / (combSQM$total_reads / 1000000))
}
if('PFAM' %in% names(combSQM$functions))
{
PFAM = aggregate_fun(combSQM, 'PFAM', trusted_functions_only, ignore_unclassified_functions)
combSQM$functions$PFAM$abund = PFAM$abund
combSQM$functions$PFAM$bases = PFAM$bases
combSQM$functions$PFAM$cov = PFAM$cov
combSQM$functions$PFAM$cpm = t(t(combSQM$functions$PFAM$cov) / (combSQM$total_reads / 1000000))
}
ext_annots = list()
for(method in combSQM$misc$ext_annot_sources)
{
ext_annots[[method]] = aggregate_fun(combSQM, method, trusted_functions_only, ignore_unclassified_functions)
combSQM$functions[[method]]$abund = ext_annots[[method]]$abund
combSQM$functions[[method]]$bases = ext_annots[[method]]$bases
combSQM$functions[[method]]$cov = ext_annots[[method]]$cov
combSQM$functions[[method]]$cpm = t(t(combSQM$functions[[method]]$cov) / (combSQM$total_reads / 1000000))
}
if(rescale_tpm)
{
if('KEGG' %in% names(combSQM$functions)) { combSQM$functions$KEGG$tpm = KEGG$tpm_rescaled }
if('COG' %in% names(combSQM$functions)) { combSQM$functions$COG$tpm = COG$tpm_rescaled }
if('PFAM' %in% names(combSQM$functions)) { combSQM$functions$PFAM$tpm = PFAM$tpm_rescaled }
for(method in combSQM$misc$ext_annot_sources)
{
combSQM$functions[[method]]$tpm = ext_annots[[method]]$tpm_rescaled
}
}else
{
if('KEGG' %in% names(combSQM$functions)) { combSQM$functions$KEGG$tpm = KEGG$tpm }
if('COG' %in% names(combSQM$functions)) { combSQM$functions$COG$tpm = COG$tpm }
if('PFAM' %in% names(combSQM$functions)) { combSQM$functions$PFAM$tpm = PFAM$tpm }
for(method in combSQM$misc$ext_annot_sources)
{
combSQM$functions[[method]]$tpm = ext_annots[[method]]$tpm
}
}
### COPY NUMBERS
combSQM$misc$single_copy_cov = get_median_single_copy_cov(combSQM)
if(any(is.na(combSQM$misc$single_copy_cov)) | rescale_copy_number) # use the largest and hope for the best
{
combSQM$misc$single_copy_cov = pmax(SQM1$misc$single_copy_cov, SQM2$misc$single_copy_cov)
}
for(method in names(combSQM$functions))
{
combSQM$functions[[method]]$copy_number = t(t(combSQM$functions[[method]]$cov) / combSQM$misc$single_copy_cov)
}
return(combSQM)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.