Nothing
#' @include aaa.R
#' @include pgVirtual.R
NULL
#' @describeIn graphGrouping graph grouping for all pgVirtual subclasses
#'
#' @param similarity A similarity matrix with rows and columns corresponding to
#' the genes in the pangenome.
#'
#' @param algorithm A string naming the algorithm. See
#' \code{\link[igraph]{communities}}
#' for an overview. The trailing '.community' can be omitted from the name.
#' Default is 'infomap', which is also the recommended.
#'
setMethod(
'graphGrouping', 'pgVirtual',
function(object, similarity, algorithm, ...) {
time1 <- proc.time()['elapsed']
.fillDefaults(defaults(object))
members <- igGroup(similarity, algorithm, ...)
object <- groupGenes(object, as.integer(members))
time2 <- proc.time()['elapsed']
message('Grouping resulted in ', nGeneGroups(object), ' gene groups (',
formatSeconds(time2 - time1), ' elapsed)')
object
}
)
#' @describeIn gpcGrouping gpc grouping for all pgVirtual subclasses
#'
#' @param lowMem logical. Should low memory footprint be ensured over
#' computation speed
#'
#' @param kmerSize The size of the kmer's used for the comparison. If two values
#' are given and the 'tree' argument is missing, the second value is used for
#' tree generation. If only one value is given it is recycled.
#'
#' @param tree An optional tree of class dendrogram (or that can be coerced to
#' one) to guide the recursive algorithm. If none is supplied it will be
#' generated by clustering the organisms by their total kmer numbers (summing up
#' for each of their genes).
#'
#' @param lowerLimit A numeric giving the lower bounds of similarity below which
#' it will be set to zero.
#'
#' @param pParam An optional BiocParallelParam object that defines the workers
#' used for parallelisation.
#'
#' @param cacheDB A \code{\linkS4class{filehash}} object or a path to a
#' directory where cached results should be stored. If omitted caching will not
#' be done. Highly recommended for long running instances.
#'
#' @param precluster Logical. Should genes be preclustered using CD-Hit.
#' Defaults to TRUE.
#'
#' @importFrom kebabs getExRep spectrumKernel
#' @importFrom stats as.dendrogram
#' @importFrom filehash dbCreate dbInit
#'
setMethod(
'gpcGrouping', 'pgVirtual',
function(object, lowMem, kmerSize, tree, lowerLimit, pParam, cacheDB,
precluster = TRUE, ...) {
.fillDefaults(defaults(object))
args <- mget(ls())
time1 <- proc.time()['elapsed']
if (precluster) {
clusters <- precluster(object, kmerSize[1], ...)
clusters <- rep(seq_along(clusters), lengths(clusters))[order(unlist(clusters))]
} else {
clusters <- NA
}
time2 <- proc.time()['elapsed']
args$lowMem <- NULL
args$object <- NULL
args$precluster <- NULL
args$pangenome <- object
args$clusters <- clusters
if (!missing(cacheDB)) {
if (!inherits(cacheDB, 'filehash')) {
if (suppressMessages(dbCreate(cacheDB, type = 'RDS'))) {
cacheDB <- dbInit(cacheDB, type = 'RDS')
} else {
stop('Failed to create cache databse')
}
}
args$cacheDB <- cacheDB
}
if (missing(tree)) {
tree <- orgTree(object, type = 'kmer',
kmerSize = rep(kmerSize, length = 2)[2],
dist = 'euclidean', clust = 'ward.D2')
}
args$tree <- as.dendrogram(tree)
if (!lowMem) {
args$kmerSize <- NULL
args$er <- getExRep(genes(object), spectrumKernel(kmerSize))
}
groups <- do.call(recurseCompare, args)
object <- manualGrouping(object, groups)
time3 <- proc.time()['elapsed']
message('Grouping resulted in ', nGeneGroups(object), ' gene groups (',
formatSeconds(time3 - time2), ' elapsed)')
message('Total time elapsed was ', formatSeconds(time3 - time1))
object
}
)
#' @describeIn manualGrouping manual grouping defined by integer vector
#'
setMethod(
'manualGrouping', c('pgVirtual', 'integer'),
function(object, groups) {
groupGenes(object, groups)
}
)
#' @describeIn manualGrouping manual grouping defined by list
#'
setMethod(
'manualGrouping', c('pgVirtual', 'list'),
function(object, groups) {
groups <- groups[order(lengths(groups), decreasing = TRUE)]
members <- convertGrouping(groups)
manualGrouping(object, members)
}
)
#' @describeIn cdhitGrouping Grouping using cdhit for all pgVirtual subclasses
#'
#' @param kmerSize The size of the kmer's used for the comparison. If two values
#' are given the first will be used for the CD-HIT algorithm and the second will
#' be used for the cosine similarity calculations.
#'
#' @param lowerLimit A numeric giving the lower bounds of similarity below which
#' it will be set to zero.
#'
#' @param maxLengthDif The maximum deviation in sequence length to allow during
#' preclustering with CD-HIT. Below 1 it describes a percentage. Above 1 it
#' describes a fixed length.
#'
#' @param geneChunkSize The maximum number of genes to pass to the CD-HIT
#' algorithm. If object contains more genes than this, CD-HIT will be run in
#' chunks and combined with a second CD-HIT pass before the final cosine
#' similarity grouping.
#'
#' @param cdhitOpts Additional arguments passed on to CD-HIT. It should be a
#' named list with names corresponding to the arguments expected in the CD-HIT
#' algorithm (without the dash). i, n and s/S will be overwritten based on the
#' other parameters given to this function and all values in cdhitOpts will be
#' converted to character using as.character
#'
#' @param cdhitIter Logical. Should the preclustered groups be grouped by
#' gradually lowering the threshold in CD-Hit or by directly calculating kmer
#' similarities between all preclusters and group by that. Defaults to TRUE
#'
#' @param nrep If \code{cdhitIter = TRUE}, controls how many iterations should
#' be performed at each threshold level. Defaults to 1.
#'
#' @param from The start similarity threshold to use for the iterative CD-Hit
#' grouping. Together with by and nrep it defines the number of times and levels
#' CD-Hit is run. Defaults to 0.9
#'
#' @param by The step size to use for the iterative CD-Hit grouping. Defaults to
#' 0.05
#'
#' @importFrom Biostrings order
#' @importFrom kebabs getExRep spectrumKernel
#'
setMethod(
'cdhitGrouping', 'pgVirtual',
function(object, kmerSize, lowerLimit, maxLengthDif, geneChunkSize,
cdhitOpts, cdhitIter = TRUE, nrep = 1, from = 0.9, by = 0.05) {
time1 <- proc.time()['elapsed']
.fillDefaults(defaults(object))
groups <- precluster(object, kmerSize[1], maxLengthDif, geneChunkSize,
cdhitOpts)
time2 <- proc.time()['elapsed']
if (cdhitIter) {
cdhitOpts$l <- kmerSize[1]
if (maxLengthDif < 1) {
cdhitOpts$s <- 1 - maxLengthDif
} else {
cdhitOpts$S <- maxLengthDif
}
cdhitOpts <- lapply(cdhitOpts, as.character)
opts <- data.frame(lowerLimit = seq(from, lowerLimit, by = -by),
kmerSize = 5)
opts$kmerSize[opts$lowerLimit < 0.7] <- 4
opts$kmerSize[opts$lowerLimit < 0.6] <- 3
opts$kmerSize[opts$lowerLimit < 0.5] <- 2
opts <- opts[opts$lowerLimit < 1, ]
for (i in seq_len(nrow(opts))) {
for (j in seq_len(nrep)) {
reps <- sapply(groups, function(x) {
x[sample.int(length(x), size = 1)]
})
seqs <- genes(object, subset = reps)
cdhitOpts$n <- as.character(opts$kmerSize[i])
cdhitOpts$c <- as.character(opts$lowerLimit[i])
if (!(i == 1 && j == 1) && interactive()) cat('\n')
groupsGroups <- cdhit(seqs, cdhitOpts, 'Grouping ')
groups <- lapply(split(groups, groupsGroups), unlist)
}
}
} else {
reps <- sapply(groups, function(x) {
x[sample.int(length(x), size = 1)]
})
seqs <- genes(object, subset = reps)
groupsGroups <- lkFMF(getExRep(seqs, spectrumKernel(rep(kmerSize, 2)[2])),
order = order(seqs), lowerLimit = lowerLimit,
upperLimit = lowerLimit)
groups <- lapply(split(groups, groupsGroups), unlist)
}
object <- manualGrouping(object, groups)
time3 <- proc.time()['elapsed']
message('Grouping resulted in ', nGeneGroups(object), ' gene groups (',
formatSeconds(time3 - time2), ' elapsed)')
message('Total time elapsed was ', formatSeconds(time3 - time1))
object
}
)
setMethod(
'precluster', 'pgVirtual',
precluster <- function(object, kmerSize, maxLengthDif, geneChunkSize,
cdhitOpts) {
time1 <- proc.time()['elapsed']
.fillDefaults(defaults(object))
cdhitOpts$n <- kmerSize
cdhitOpts$l <- kmerSize
if (maxLengthDif < 1) {
cdhitOpts$s <- 1 - maxLengthDif
} else {
cdhitOpts$S <- maxLengthDif
}
cdhitOpts <- lapply(cdhitOpts, as.character)
nChunks <- ceiling(nGenes(object)/geneChunkSize)
chunks <- ceiling(nGenes(object)/nChunks) * seq_len(nChunks)
chunks[nChunks] <- nGenes(object)
chunks <- data.frame(start = c(1, chunks[-nChunks] + 1), end = chunks)
groups <- lapply(seq_len(nChunks), function(i) {
if (i != 1 && interactive()) cat('\n')
cdhit(genes(object, subset = seq.int(chunks$start[i], chunks$end[i])),
cdhitOpts, 'Preclustering')
})
if (nChunks > 1) {
nClusters <- sapply(groups, max)
offset <- c(0, cumsum(nClusters)[-nChunks])
groups <- Map(function(group, offset) {
group + offset
}, group = groups, offset = offset)
groups <- split(seq_len(nGenes(object)), unlist(groups))
reps <- sapply(groups, function(x) {
x[sample.int(length(x), size = 1)]
})
cat('\n')
groupsGroups <- cdhit(genes(object, subset = reps), cdhitOpts, 'Merging ')
groups <- lapply(split(groups, groupsGroups), unlist)
} else {
groups <- split(seq_len(nGenes(object)), unlist(groups))
}
time2 <- proc.time()['elapsed']
message('Preclustering resulted in ', length(groups), ' gene groups (',
formatSeconds(time2 - time1), ' elapsed)')
groups
}
)
### GROUPING HELPER FUNCTIONS
#' Extract community membership based on similarity matrix
#'
#' This function creates an igraph object and runs community detection on it,
#' returning the community memberships of the vertices.
#'
#' @param similarity A similarity matrix with weights in the lower triangle.
#' Diagonal is ignored.
#'
#' @param algorithm The community detection algorithm to use
#'
#' @param ... Parameters passed on to the community detection
#'
#' @return An integer vector with an element for each vertice
#'
#' @importFrom igraph graph_from_adjacency_matrix
#'
#' @noRd
#'
igGroup <- function(similarity, algorithm, ...) {
graph <- graph_from_adjacency_matrix(similarity, mode = 'lower',
weighted = TRUE, diag = FALSE)
igMembers(graph, algorithm, ...)
}
#' Extract community membership from a graph object
#'
#' This function runs community detection on a graph based on the name of the
#' algorithm.
#'
#' @param graph An igraph object
#'
#' @param algorithm The community detection algorithm to use
#'
#' @param ... Parameters passed on to the community detection
#'
#' @return An integer vector with an element for each vertice
#'
#' @importFrom igraph membership cluster_edge_betweenness cluster_fast_greedy cluster_infomap cluster_label_prop cluster_leading_eigen cluster_louvain cluster_optimal cluster_spinglass cluster_walktrap
#'
#' @noRd
#'
igMembers <- function(graph, algorithm, ...) {
algParam <- list(...)
algParam$graph <- graph
if (!grepl(pattern = 'cluster_', x = algorithm)) {
algorithm <- paste0('cluster_', algorithm)
}
community <- do.call(algorithm, algParam)
membership(community)
}
#' Create a combined graph of all-vs-all within groups comparisons
#'
#' This function calculates all-vs-all similarities for members within each
#' group and return the result as a weighted graph.
#'
#' @param pangenome A pgVirtual subclass
#'
#' @param groups A list with group member indexes
#'
#' @param er ExplicitRepresentation for all genes given in groups
#'
#' @param lowerLimit The lower threshold for similarity, below which it will be
#' set to 0
#'
#' @return An igraph object containing all groups as disconnected subgraphs
#'
#' @importFrom kebabs linearKernel
#' @importFrom igraph as_data_frame graph_from_adjacency_matrix graph_from_data_frame
#'
#' @noRd
#'
groupToGraph <- function(pangenome, groups, er, lowerLimit) {
edges <- lapply(groups, function(members, er, lowerLimit) {
sim <- linearKernel(er, selx = members, sparse = TRUE, diag = FALSE,
lowerLimit = lowerLimit)
as_data_frame(graph_from_adjacency_matrix(sim, mode = 'lower',
weighted = TRUE,
diag = FALSE))
}, er = er, lowerLimit = lowerLimit)
edges <- do.call(rbind, edges)
graph_from_data_frame(edges, directed = FALSE,
vertices = data.frame(name = geneNames(pangenome)))
}
#' @importFrom Biostrings writeXStringSet
#'
#' @noRd
#'
cdhit <- function(seqs, options, name = 'CD-Hit', showProgress = interactive()) {
options$i <- tempfile()
writeXStringSet(seqs, options$i)
on.exit(unlink(options$i))
switch(
class(seqs),
AAStringSet = cdhitC(options, name, showProgress) + 1,
DNAStringSet = cdhitestC(options, name, showProgress) + 1,
stop('seqs must be either AAStringSet or DNAStringSet')
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.