R/grouping.R

Defines functions cdhit groupToGraph igMembers igGroup

#' @include aaa.R
#' @include pgVirtual.R
NULL

#' @describeIn graphGrouping graph grouping for all pgVirtual subclasses
#' 
#' @param similarity A similarity matrix with rows and columns corresponding to
#' the genes in the pangenome.
#' 
#' @param algorithm A string naming the algorithm. See 
#' \code{\link[igraph]{communities}}
#' for an overview. The trailing '.community' can be omitted from the name. 
#' Default is 'infomap', which is also the recommended.
#' 
setMethod(
    'graphGrouping', 'pgVirtual',
    function(object, similarity, algorithm, ...) {
        time1 <- proc.time()['elapsed']
        .fillDefaults(defaults(object))
        
        members <- igGroup(similarity, algorithm, ...)
        object <- groupGenes(object, as.integer(members))
        time2 <- proc.time()['elapsed']
        message('Grouping resulted in ', nGeneGroups(object), ' gene groups (',
                formatSeconds(time2 - time1), ' elapsed)')
        object
    }
)

#' @describeIn gpcGrouping gpc grouping for all pgVirtual subclasses
#' 
#' @param lowMem logical. Should low memory footprint be ensured over 
#' computation speed
#' 
#' @param kmerSize The size of the kmer's used for the comparison. If two values
#' are given and the 'tree' argument is missing, the second value is used for 
#' tree generation. If only one value is given it is recycled.
#'   
#' @param tree An optional tree of class dendrogram (or that can be coerced to 
#' one) to guide the recursive algorithm. If none is supplied it will be 
#' generated by clustering the organisms by their total kmer numbers (summing up
#' for each of their genes).
#'   
#' @param lowerLimit A numeric giving the lower bounds of similarity below which
#' it will be set to zero.
#'   
#' @param pParam An optional BiocParallelParam object that defines the workers 
#' used for parallelisation.
#'   
#' @param cacheDB A \code{\linkS4class{filehash}} object or a path to a 
#' directory where cached results should be stored. If omitted caching will not
#' be done. Highly recommended for long running instances.
#' 
#' @param precluster Logical. Should genes be preclustered using CD-Hit. 
#' Defaults to TRUE.
#' 
#' @importFrom kebabs getExRep spectrumKernel
#' @importFrom stats as.dendrogram
#' @importFrom filehash dbCreate dbInit
#' 
setMethod(
    'gpcGrouping', 'pgVirtual',
    function(object, lowMem, kmerSize, tree, lowerLimit, pParam, cacheDB,
             precluster = TRUE, ...) {
        .fillDefaults(defaults(object))
        args <- mget(ls())
        time1 <- proc.time()['elapsed']
        
        if (precluster) {
            clusters <- precluster(object, kmerSize[1], ...)
            clusters <- rep(seq_along(clusters), lengths(clusters))[order(unlist(clusters))]
        } else {
            clusters <- NA
        }
        time2 <- proc.time()['elapsed']
        
        args$lowMem <- NULL
        args$object <- NULL
        args$precluster <- NULL
        args$pangenome <- object
        args$clusters <- clusters
        
        if (!missing(cacheDB)) {
            if (!inherits(cacheDB, 'filehash')) {
                if (suppressMessages(dbCreate(cacheDB, type = 'RDS'))) {
                    cacheDB <- dbInit(cacheDB, type = 'RDS')
                } else {
                    stop('Failed to create cache databse')
                }
            }
            args$cacheDB <- cacheDB
        }
        if (missing(tree)) {
            tree <- orgTree(object, type = 'kmer', 
                            kmerSize = rep(kmerSize, length = 2)[2], 
                            dist = 'euclidean', clust = 'ward.D2')
        }
        args$tree <- as.dendrogram(tree)
        if (!lowMem) {
            args$kmerSize <- NULL
            args$er <- getExRep(genes(object), spectrumKernel(kmerSize))
        }
        groups <- do.call(recurseCompare, args)
        object <- manualGrouping(object, groups)
        time3 <- proc.time()['elapsed']
        message('Grouping resulted in ', nGeneGroups(object), ' gene groups (',
                formatSeconds(time3 - time2), ' elapsed)')
        message('Total time elapsed was ', formatSeconds(time3 - time1))
        object
    }
)

#' @describeIn manualGrouping manual grouping defined by integer vector
#' 
setMethod(
    'manualGrouping', c('pgVirtual', 'integer'),
    function(object, groups) {
        groupGenes(object, groups)
    }
)
#' @describeIn manualGrouping manual grouping defined by list
#' 
setMethod(
    'manualGrouping', c('pgVirtual', 'list'),
    function(object, groups) {
        groups <- groups[order(lengths(groups), decreasing = TRUE)]
        members <- convertGrouping(groups)
        
        manualGrouping(object, members)
    }
)
#' @describeIn cdhitGrouping Grouping using cdhit for all pgVirtual subclasses
#' 
#' @param kmerSize The size of the kmer's used for the comparison. If two values
#' are given the first will be used for the CD-HIT algorithm and the second will
#' be used for the cosine similarity calculations.
#' 
#' @param lowerLimit A numeric giving the lower bounds of similarity below which
#' it will be set to zero.
#' 
#' @param maxLengthDif The maximum deviation in sequence length to allow during
#' preclustering with CD-HIT. Below 1 it describes a percentage. Above 1 it 
#' describes a fixed length.
#' 
#' @param geneChunkSize The maximum number of genes to pass to the CD-HIT
#' algorithm. If object contains more genes than this, CD-HIT will be run in 
#' chunks and combined with a second CD-HIT pass before the final cosine 
#' similarity grouping.
#' 
#' @param cdhitOpts Additional arguments passed on to CD-HIT. It should be a 
#' named list with names corresponding to the arguments expected in the CD-HIT
#' algorithm (without the dash). i, n and s/S will be overwritten based on the
#' other parameters given to this function and all values in cdhitOpts will be
#' converted to character using as.character
#' 
#' @param cdhitIter Logical. Should the preclustered groups be grouped by 
#' gradually lowering the threshold in CD-Hit or by directly calculating kmer
#' similarities between all preclusters and group by that. Defaults to TRUE
#' 
#' @param nrep If \code{cdhitIter = TRUE}, controls how many iterations should
#' be performed at each threshold level. Defaults to 1.
#' 
#' @param from The start similarity threshold to use for the iterative CD-Hit
#' grouping. Together with by and nrep it defines the number of times and levels
#' CD-Hit is run. Defaults to 0.9
#' 
#' @param by The step size to use for the iterative CD-Hit grouping. Defaults to
#' 0.05
#' 
#' @importFrom Biostrings order
#' @importFrom kebabs getExRep spectrumKernel
#' 
setMethod(
    'cdhitGrouping', 'pgVirtual',
    function(object, kmerSize, lowerLimit, maxLengthDif, geneChunkSize, 
             cdhitOpts, cdhitIter = TRUE, nrep = 1, from = 0.9, by = 0.05) {
        time1 <- proc.time()['elapsed']
        .fillDefaults(defaults(object))
        groups <- precluster(object, kmerSize[1], maxLengthDif, geneChunkSize, 
                             cdhitOpts)
        
        time2 <- proc.time()['elapsed']
        
        if (cdhitIter) {
            cdhitOpts$l <- kmerSize[1]
            if (maxLengthDif < 1) {
                cdhitOpts$s <- 1 - maxLengthDif
            } else {
                cdhitOpts$S <- maxLengthDif
            }
            cdhitOpts <- lapply(cdhitOpts, as.character)
            opts <- data.frame(lowerLimit = seq(from, lowerLimit, by = -by), 
                               kmerSize = 5)
            opts$kmerSize[opts$lowerLimit < 0.7] <- 4
            opts$kmerSize[opts$lowerLimit < 0.6] <- 3
            opts$kmerSize[opts$lowerLimit < 0.5] <- 2
            opts <- opts[opts$lowerLimit < 1, ]
            
            for (i in seq_len(nrow(opts))) {
                for (j in seq_len(nrep)) {
                    reps <- sapply(groups, function(x) {
                        x[sample.int(length(x), size = 1)]
                    })
                    seqs <- genes(object, subset = reps)
                    
                    cdhitOpts$n <- as.character(opts$kmerSize[i])
                    cdhitOpts$c <- as.character(opts$lowerLimit[i])
                    if (!(i == 1 && j == 1) && interactive()) cat('\n')
                    groupsGroups <- cdhit(seqs, cdhitOpts, 'Grouping     ')
                    groups <- lapply(split(groups, groupsGroups), unlist)
                }
            }
        } else {
            reps <- sapply(groups, function(x) {
                x[sample.int(length(x), size = 1)]
            })
            seqs <- genes(object, subset = reps)
            
            groupsGroups <- lkFMF(getExRep(seqs, spectrumKernel(rep(kmerSize, 2)[2])),
                                  order = order(seqs), lowerLimit = lowerLimit, 
                                  upperLimit = lowerLimit)
            groups <- lapply(split(groups, groupsGroups), unlist)
        }
        object <- manualGrouping(object, groups)
        time3 <- proc.time()['elapsed']
        message('Grouping resulted in ', nGeneGroups(object), ' gene groups (',
                formatSeconds(time3 - time2), ' elapsed)')
        message('Total time elapsed was ', formatSeconds(time3 - time1))
        object
    }
)
setMethod(
    'precluster', 'pgVirtual',
    precluster <- function(object, kmerSize, maxLengthDif, geneChunkSize, 
                           cdhitOpts) {
        time1 <- proc.time()['elapsed']
        .fillDefaults(defaults(object))
        cdhitOpts$n <- kmerSize
        cdhitOpts$l <- kmerSize
        if (maxLengthDif < 1) {
            cdhitOpts$s <- 1 - maxLengthDif
        } else {
            cdhitOpts$S <- maxLengthDif
        }
        cdhitOpts <- lapply(cdhitOpts, as.character)
        nChunks <- ceiling(nGenes(object)/geneChunkSize)
        chunks <- ceiling(nGenes(object)/nChunks) * seq_len(nChunks)
        chunks[nChunks] <- nGenes(object)
        chunks <- data.frame(start = c(1, chunks[-nChunks] + 1), end = chunks)
        
        groups <- lapply(seq_len(nChunks), function(i) {
            if (i != 1 && interactive()) cat('\n')
            cdhit(genes(object, subset = seq.int(chunks$start[i], chunks$end[i])), 
                  cdhitOpts, 'Preclustering')
        })
        if (nChunks > 1) {
            nClusters <- sapply(groups, max)
            offset <- c(0, cumsum(nClusters)[-nChunks])
            groups <- Map(function(group, offset) {
                group + offset
            }, group = groups, offset = offset)
            groups <- split(seq_len(nGenes(object)), unlist(groups))
            reps <- sapply(groups, function(x) {
                x[sample.int(length(x), size = 1)]
            })
            cat('\n')
            groupsGroups <- cdhit(genes(object, subset = reps), cdhitOpts, 'Merging      ')
            groups <- lapply(split(groups, groupsGroups), unlist)
        } else {
            groups <- split(seq_len(nGenes(object)), unlist(groups))
        }
        time2 <- proc.time()['elapsed']
        message('Preclustering resulted in ', length(groups), ' gene groups (',
                formatSeconds(time2 - time1), ' elapsed)')
        groups
    }
)
### GROUPING HELPER FUNCTIONS

#' Extract community membership based on similarity matrix
#' 
#' This function creates an igraph object and runs community detection on it,
#' returning the community memberships of the vertices.
#' 
#' @param similarity A similarity matrix with weights in the lower triangle. 
#' Diagonal is ignored.
#' 
#' @param algorithm The community detection algorithm to use
#' 
#' @param ... Parameters passed on to the community detection
#' 
#' @return An integer vector with an element for each vertice
#' 
#' @importFrom igraph graph_from_adjacency_matrix
#' 
#' @noRd
#' 
igGroup <- function(similarity, algorithm, ...) {
    graph <- graph_from_adjacency_matrix(similarity, mode = 'lower', 
                                         weighted = TRUE, diag = FALSE)
    igMembers(graph, algorithm, ...)
}

#' Extract community membership from a graph object
#' 
#' This function runs community detection on a graph based on the name of the
#' algorithm.
#' 
#' @param graph An igraph object
#' 
#' @param algorithm The community detection algorithm to use
#' 
#' @param ... Parameters passed on to the community detection
#' 
#' @return An integer vector with an element for each vertice
#' 
#' @importFrom igraph membership cluster_edge_betweenness cluster_fast_greedy cluster_infomap cluster_label_prop cluster_leading_eigen cluster_louvain cluster_optimal cluster_spinglass cluster_walktrap
#' 
#' @noRd
#' 
igMembers <- function(graph, algorithm, ...) {
    algParam <- list(...)
    algParam$graph <- graph
    if (!grepl(pattern = 'cluster_', x = algorithm)) {
        algorithm <- paste0('cluster_', algorithm)
    }
    community <- do.call(algorithm, algParam)
    membership(community)
}

#' Create a combined graph of all-vs-all within groups comparisons
#' 
#' This function calculates all-vs-all similarities for members within each 
#' group and return the result as a weighted graph.
#' 
#' @param pangenome A pgVirtual subclass
#' 
#' @param groups A list with group member indexes
#' 
#' @param er ExplicitRepresentation for all genes given in groups
#' 
#' @param lowerLimit The lower threshold for similarity, below which it will be
#' set to 0
#' 
#' @return An igraph object containing all groups as disconnected subgraphs
#' 
#' @importFrom kebabs linearKernel
#' @importFrom igraph as_data_frame graph_from_adjacency_matrix graph_from_data_frame
#' 
#' @noRd
#' 
groupToGraph <- function(pangenome, groups, er, lowerLimit) {
    edges <- lapply(groups, function(members, er, lowerLimit) {
        sim <- linearKernel(er, selx = members, sparse = TRUE, diag = FALSE, 
                            lowerLimit = lowerLimit)
        as_data_frame(graph_from_adjacency_matrix(sim, mode = 'lower', 
                                                  weighted = TRUE, 
                                                  diag = FALSE))
    }, er = er, lowerLimit = lowerLimit)
    edges <- do.call(rbind, edges)
    graph_from_data_frame(edges, directed = FALSE, 
                          vertices = data.frame(name = geneNames(pangenome)))
}
#' @importFrom Biostrings writeXStringSet
#' 
#' @noRd
#' 
cdhit <- function(seqs, options, name = 'CD-Hit', showProgress = interactive()) {
    options$i <- tempfile()
    writeXStringSet(seqs, options$i)
    on.exit(unlink(options$i))
    switch(
        class(seqs),
        AAStringSet = cdhitC(options, name, showProgress) + 1,
        DNAStringSet = cdhitestC(options, name, showProgress) + 1,
        stop('seqs must be either AAStringSet or DNAStringSet')
    )
}

Try the FindMyFriends package in your browser

Any scripts or data that you put into this service are public.

FindMyFriends documentation built on Nov. 8, 2020, 6:46 p.m.