#' @name PgR6MS
#' @title PgR6 class with Methods and Sequences.
#' @description PgR6 with Methods and Sequences. Final users should use \code{\link{pagoo}}
#' instead of this, since is more easy to understand.
#' Inherits: \code{\link[pagoo]{PgR6M}}
#' @importFrom GenomicRanges mcols mcols<-
#' @importFrom Biostrings BStringSet DNAStringSet
#' @importFrom S4Vectors split
#' @export
PgR6MS <- R6Class('PgR6MS',
inherit = PgR6M,
private = list(
.sequences = NULL
),
public = list(
#' @description
#' Create a \code{PgR6MS} object.
#' @param data A \code{data.frame} or \code{\link[S4Vectors:DataFrame-class]{DataFrame}} containing at least the
#' following columns: \code{gene} (gene name), \code{org} (organism name to which the gene belongs to),
#' and \code{cluster} (group of orthologous to which the gene belongs to). More columns can be added as metadata
#' for each gene.
#' @param org_meta (optional) A \code{data.frame} or \code{\link[S4Vectors:DataFrame-class]{DataFrame}}
#' containing additional metadata for organisms. This \code{data.frame} must have a column named "org" with
#' valid organisms names (that is, they should match with those provided in \code{data}, column \code{org}), and
#' additional columns will be used as metadata. Each row should correspond to each organism.
#' @param cluster_meta (optional) A \code{data.frame} or \code{\link[S4Vectors:DataFrame-class]{DataFrame}}
#' containing additional metadata for clusters. This \code{data.frame} must have a column named "cluster" with
#' valid organisms names (that is, they should match with those provided in \code{data}, column \code{cluster}), and
#' additional columns will be used as metadata. Each row should correspond to each cluster.
#' @param core_level The initial core_level (that's the percentage of organisms a core cluster must be in to be
#' considered as part of the core genome). Must be a number between 100 and 85, (default: 95). You can change it
#' later by using the \code{$core_level} field once the object was created.
#' @param sep A separator. By default is '__'(two underscores). It will be used to
#' create a unique \code{gid} (gene identifier) for each gene. \code{gid}s are created by pasting
#' \code{org} to \code{gene}, separated by \code{sep}.
#' @param DF Deprecated. Use \code{data} instead.
#' @param group_meta Deprecated. Use \code{cluster_meta} instead.
#' @param sequences Can accept: 1) a named \code{list} of named \code{character} vector. Name of list
#' are names of organisms, names of character vector are gene names; or 2) a named \code{list} of \code{\link[Biostrings:XStringSet-class]{DNAStringSetList}}
#' objects (same requirements as (1), but with BStringSet names as gene names); or 3) a \code{\link[Biostrings:XStringSetList-class]{DNAStringSetList}}
#' (same requirements as (2) but \code{DNAStringSetList} names are organisms names).
#' @param verbose \code{logical}. Whether to display progress messages when loading class.
#' @return An R6 object of class PgR6MS. It contains basic fields and methods for analyzing a pangenome. It also
#' contains additional statistical methods for analyze it, methods to make basic exploratory plots, and methods
#' for sequence manipulation.
initialize = function(data,
org_meta,
cluster_meta,
core_level = 95,
sep = '__',
DF,
group_meta,
sequences,
verbose = TRUE){
# Deprecated args
if (!missing(DF)){
data <- DF
warning('Argument "DF" is deprecated. You should use "data" instead. ',
'This still exists for compatibility with previous versions, ',
'but will throw an error in the future.',
immediate. = TRUE, noBreaks. = TRUE)
}
if (!missing(group_meta)){
cluster_meta <- group_meta
warning('Argument "group_meta" is deprecated. You should use "cluster_meta" instead. ',
'This still exists for compatibility with previous versions, ',
'but will throw an error in the future.',
immediate. = TRUE, noBreaks. = TRUE)
}
super$initialize(data = data,
org_meta,
cluster_meta,
core_level = core_level,
sep = sep,
verbose = verbose)
# Check input sequences
if (verbose) message('Checking input sequences.')
if (class(sequences)%in%c('list',
'BStringSetList',
'DNAStringSetList')){
norgs <- length(sequences)
orgNames <- names(sequences)
if (length(orgNames)==0){
stop('Unnamed list.')
}
clss <- unique(sapply(sequences, class))
genNames <- lapply(sequences, names)
if (clss%in%c('character',
'BStringSet',
'DNAStringSet')){
if (length(genNames)==0){
stop('Unnamed sequences.')
}
gids <- mapply(paste,
orgNames, genNames,
MoreArgs = list(sep = sep))
gids <- unlist(gids, use.names = FALSE)
if (clss=='character'){
sqs <- unlist(sequences, use.names = FALSE)
sqs <- DNAStringSet(sqs)
names(sqs) <- gids
}else{
sqs <- DNAStringSetList(sequences)
sqs <- unlist(sqs, use.names = FALSE)
names(sqs) <- gids
}
private$.sequences <- sqs
}else{
stop('Unrecognized sequences format.')
}
}else{
stop('Unrecognized sequences format')
}
if (verbose) message('Checking that sequence names matches with DataFrame.')
dfgid <- private$.data[, 'gid']
if (!all(dfgid%in%gids)){
stop('Missing sequences: some gid do not match any sequence name.')
}
if (any(!gids%in%dfgid)){
warning('Missing gid: some sequence names do not match to any gid. Continuing anyway..\n', immediate. = TRUE)
}
# Add metadata to sequences
if (verbose) message('Adding metadata to sequences.')
spl <- strsplit(gids, sep)
mcols(private$.sequences)$org <- vapply(spl, '[', 1,
FUN.VALUE = NA_character_)
mcols(private$.sequences)$cluster <- as.character(private$.data$cluster[match(gids, dfgid)])
},
# Methods for sequences
#' @description
#' A field for obtaining core gene sequences is available (see below), but for creating a phylogeny with this
#' sets is useful to: 1) have the possibility of extracting just one sequence of each organism on each cluster, in
#' case paralogues are present, and 2) filling gaps with empty sequences in case the core_level was set below 100\%,
#' allowing more genes (some not in 100\% of organisms) to be incorporated to the phylogeny. That is the purpose of
#' this special function.
#' @param max_per_org Maximum number of sequences of each organism to be taken from each cluster.
#' @param fill \code{logical}. If fill \code{DNAStringSet} with empty \code{DNAString} in cases where
#' \code{core_level} is set below 100\%, and some clusters with missing organisms are also considered.
#' @return A \code{DNAStringSetList} with core genes. Order of organisms on each cluster is conserved, so it is easier
#' to concatenate them into a super-gene suitable for phylogenetic inference.
#' @importFrom S4Vectors mcols
#' @importFrom BiocGenerics table
core_seqs_4_phylo = function(max_per_org = 1, fill = TRUE){
if (!class(max_per_org)%in%c('logical', 'NULL', 'numeric', 'integer'))
stop('"max_per_org" is not numeric, NULL, or NA')
if (!is.logical(fill))
stop('"fill" is not logical')
sqs <- unlist(self$core_sequences, use.names = FALSE)
mcls <- mcols(sqs)
ccl <- unique(mcls$cluster)
if (!(is.na(max_per_org) | is.null(max_per_org))){
wma <- which(self$pan_matrix[, ccl] > max_per_org, arr.ind = T)
if (dim(wma)[1]){
ogs2fix <- ccl[wma[, 2]]
orgs2fix <- rownames(wma)
remrw <- apply(cbind(ogs2fix, orgs2fix), 1, function(x){
wh <- which(mcls$cluster==x[1] & mcls$org==x[2])
wh[-seq_len(max_per_org)]
})
torm <- unlist(remrw)
if (length(torm)){
sqs <- sqs[-torm]
mcls <- mcols(sqs)
}
}
}
# Fill '0' cells with empthy DNAStringSet
# spl <- split(sqs, mcls$group)
# nro <- elementNROWS(spl)
if (fill){
tbl <- table(mcls)
tofll <- which(tbl==0, arr.ind = TRUE)
if (dim(tofll)[1]){
sqs <- append(sqs, DNAStringSet(rep('', dim(tofll)[1])))
mcls <- mcols(sqs)
mfill <- which(is.na(mcls$org))
mcols(sqs)$org[mfill] <- rownames(tofll)
mcols(sqs)$cluster[mfill] <- ccl[tofll[, 2]]
mcls <- mcols(sqs)
}
}
## Order ! ! !
orgs <- as.character(self$organisms$org)
sqs <- sqs[order(match(mcls$org, orgs))]
mcls <- mcols(sqs)
# Return
split(sqs, mcls$cluster)
}
),
active = list(
#' @field sequences A \code{\link[Biostrings:XStringSetList-class]{DNAStringSetList}} with the
#' set of sequences grouped by cluster. Each group is accessible as were a list. All
#' \code{Biostrings} methods are available.
sequences = function(){
sqs <- private$.sequences
sset <- unlist(self$genes)$gid
split(sqs[sset], mcols(sqs[sset])$cluster)
},
#' @field core_sequences Like \code{$sequences}, but only showing core
#' sequences.
core_sequences = function(){
sqs <- private$.sequences
sset <- unlist(self$core_genes)$gid
split(sqs[sset], mcols(sqs[sset])$cluster)
},
#' @field cloud_sequences Like \code{$sequences}, but only showing cloud
#' sequences as defined above.
cloud_sequences = function(){
sqs <- private$.sequences
sset <- unlist(self$cloud_genes)$gid
split(sqs[sset], mcols(sqs[sset])$cluster)
},
#' @field shell_sequences Like \code{$sequences}, but only showing shell
#' sequences, as defined above.
shell_sequences = function(){
sqs <- private$.sequences
sset <- unlist(self$shell_genes)$gid
split(sqs[sset], mcols(sqs[sset])$cluster)
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.