#' @name RMSobject
#' @title Constructing an RMS object
#'
#' @description Constructs an RMS object with information about a set of genomes.
#'
#' @param genome.tbl A table (data.frame or tibble) with genome information, see below.
#' @param frg.dir Path to folder with fragment fasta files.
#' @param vsearch.exe Text with the VSEARCH executable command.
#' @param identity The sequence identity for clustering fragments (0.0-1.0).
#' @param min.length Minimum fragment length (integer).
#' @param max.length Maximum fragment length (integer).
#' @param verbose Turn on/off output text during processing (logical).
#' @param threads Number of threads to be used by \code{vsearch} (integer).
#' @param tmp.dir Name of folder for temporary output, will be created if not already existing.
#'
#' @details The \code{genome.tbl} has a row for each genome to include in the RMS database.
#' There must be a column named \code{genome_file}, containing fasta filenames. These must be the
#' names of the fasta files containing the RMS fragments from each genome. Use \code{\link{getRMSfragments}}
#' to create these fasta files, ensuring the fasta headers follow the pattern
#' <genome.ID>_RMSx, where <genome.ID> is some text unique to each genome and x is some integer.
#' The \code{genome.tbl} may contain other columns as well, but \code{genome_file} is required.
#'
#' The \code{vsearch.exe} is the exact command to invoke the VSEARCH software. This is normally just "vsearch",
#' but if you run this as a singularity container (or any other container) it may be something like
#' "srun singularity exec <container_name> vsearch".
#'
#' @return A list with the following objects: \code{Cluster.tbl}, \code{Cpn.mat}
#' and \code{Genome.tbl}.
#'
#' The \code{Cluster.tbl} is a \code{\link{tibble}} with data about all fragment clusters.
#' It contains columns with data about each cluster, including the centroid \code{Sequence}
#' and its \code{Header}, making it possible to write the table to a fasta file using
#' \code{\link{writeFasta}}.
#'
#' The \code{Cpn.mat} is the copy number matrix, implemented as a sparse dgeMatrix from the
#' \code{\link{Matrix}} package. It has one row for each fragment cluster and one column
#' for each genome. This is the central data structure for de-convolving the genome
#' content from read-count data, see \code{\link{rmscols}}.
#'
#' The \code{Genome.tbl} is a copy of the argument \code{genome.tbl}, but with columns
#' \code{N_cluster} and \code{N_unique} added, containing the
#' number of clusters and the number of unique fragment clusters to each genome.
#'
#'
#' @author Lars Snipen.
#'
#' @seealso \code{\link{getRMSfragments}}.
#'
#' @importFrom microseq readFasta
#' @importFrom data.table fread
#' @importFrom Matrix Matrix rowSums colSums
#' @importFrom stringr word str_length str_remove str_c
#' @importFrom dplyr mutate filter select group_by slice summarise
#' @importFrom tibble tibble
#'
#' @examples
#'
#' @export RMSobject
#'
RMSobject <- function(genome.tbl, frg.dir, vsearch.exe = "vsearch", identity = 0.99,
min.length = 30, max.length = 500, verbose = TRUE, threads = 1,
tmp.dir = "tmp"){
if(length(grep("genome_id", colnames(genome.tbl))) == 0) stop("The genome.tbl must contain a column 'genome_id' with unique texts")
if(length(genome.tbl$genome_id) != length(unique(genome.tbl$genome_id))) stop("The genome_id's must be unique for each genome (row)")
if(length(grep("genome_file", colnames(genome.tbl))) == 0) stop("The genome.tbl must contain a column 'genome_file' with filenames")
frg_files <- normalizePath(file.path(frg.dir, genome.tbl$genome_file))
ok <- file.exists(frg_files)
idx <- which(!ok)
if(length(idx) > 0) stop("The file(s)", frg_files[idx], "does not exist")
is.gz <- unique(str_detect(genome.tbl$genome_file, "\\.gz$"))
if(length(is.gz) != 1) stop("Either all or none of the fragment files must be compressed")
ext <- ifelse(is.gz, ".fasta.gz", ".fasta")
if(!base::dir.exists(tmp.dir)) dir.create(tmp.dir, recursive = T)
all.frg <- file.path(tmp.dir, str_c("all_frg", ext))
ok <- file.append(file1 = all.frg, file2 = frg_files)
if(min(ok) == 0) stop("Could not copy all fragment fasta files from", frg.dir)
if(is.gz){
R.utils::gunzip(all.frg)
all.frg <- str_remove(all.frg, "\\.gz$")
}
### The VSEARCH clustering
if(verbose) cat("VSEARCH clustering of RMS fragments...\n")
if(!dir.exists(tmp.dir)) dir.create(tmp.dir)
ctr.file <- file.path(tmp.dir, "centroid.fasta")
uc.file <- file.path(tmp.dir, "uc.txt")
cmd <- paste(vsearch.exe,
"--threads", threads,
"--cluster_fast", all.frg,
"--id", identity,
"--minseqlength", min.length,
"--maxseqlength", max.length,
"--iddef", "2",
"--qmask", "none",
"--strand plus --sizeout",
"--relabel CLST",
"--relabel_keep",
"--uc", uc.file,
"--centroids", ctr.file)
system(cmd)
readFasta(ctr.file) %>%
mutate(Cluster = word(Header, 1, 1, sep = ";")) -> centroids
if(verbose) cat("...produced", nrow(centroids), "clusters\n...the cluster table...")
fread(uc.file, sep = "\t", header = F, drop = c(3,4,5,6,7,8,10), data.table = F) %>%
filter(V1 != "C") %>%
select(Cluster = V2, Tag = V9) %>%
mutate(Cluster = Cluster + 1) %>%
mutate(Cluster = str_remove(str_c("CLST", format(Cluster, scientific = F)), " +")) %>%
mutate(Genome.id = str_remove(Tag, "_RMS[0-9]+$")) -> uc.tbl
uc.tbl %>%
group_by(Cluster) %>%
summarise(Members = str_c(Tag, collapse = ";"), N.genomes = length(unique(Genome.id))) %>%
right_join(centroids, by = "Cluster") %>%
mutate(Length = str_length(Sequence)) %>%
mutate(GC = baseCount(Sequence, c("C", "G")) / Length) %>%
select(Cluster, Length, GC, N.genomes, Members, Header, Sequence) -> cluster.tbl
if(verbose) cat("done\n")
if(verbose) cat("...the copy number matrix\n")
cpn <- Matrix(0, nrow = nrow(cluster.tbl), ncol= nrow(genome.tbl))
colnames(cpn) <- genome.tbl$genome_id
rownames(cpn) <- cluster.tbl$Cluster
for(j in 1:nrow(genome.tbl)){
if(verbose) cat("genome", j, "/", nrow(genome.tbl), "\r")
uc.tbl %>%
filter(Genome.id == genome.tbl$genome_file[j]) %>%
group_by(Cluster) %>%
summarise(Count = n()) -> tbl
cpn[match(tbl$Cluster, cluster.tbl$Cluster),j] <- tbl$Count
}
if(verbose) cat("\n...the genome table...")
pa <- (cpn > 0)
pau <- pa[Matrix::rowSums(pa) == 1,]
genome.tbl %>%
mutate(N_clusters = Matrix::colSums(pa)) %>%
mutate(N_unique = Matrix::colSums(pau)) -> genome.tbl
### Cleaning up
file.remove(all.frg, ctr.file, uc.file)
rms.obj <- list(Cluster.tbl = cluster.tbl,
Cpn.mat = cpn,
Genome.tbl = genome.tbl)
if(verbose) cat("done\n")
return(rms.obj)
}
# local function
baseCount <- function(seq, bases){
return(sapply(strsplit(seq, split = ""), function(x){sum(x %in% bases)}))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.