#' @title Grade of Membership (GoM) model clustering of aDNA samples
#' using DNA damage patterns
#'
#' @description Performs a mixed membership model clustering of aDNA samples
#' using DNA damage patterns- mutation, flanking base, distance from read end
#' and strand break information. The default implementation of this model
#' follows the modeling framework of Shiraishi et al (2015).
#'
#' @param dat Either
#' (a) output from \code{archaic_prepare} or
#' (b) a vector of directories hosting the MFF files that the user
#' wants to jointly model
#' (c) a matrix of counts with samples along the rows and the mismatch
#' signatures along the columns with entries reporting the counts.
#' @param K the number of clusters to fit to the model.
#' @param tol The tolerance level of convergence of the GoM model fit
#' @param labs The factor of labels used to group the samples in visualization.
#' May be used to distinguish
#' samples from different labs, or different library prep.
#' @param gom_method The GoM method type. Defaults to \code{independent} model proposed by
#' Shiraishi2015. The other option is to use the \code{full}
#' model which is uses the implementation due to Taddy2012.
#' @param gom.control Control parameters for the GoM model fit.
#' @param output_dir The output directory where the model is saved.
#' If NULL, it picks the current working directory.
#'
#' @return Fits a GoM model on the aggregated data from \code{archaic_prepare}
#' and outputs both the clusters (represented by mismatch signature frequencies)
#' and the mixing proportion of clusters represented in each sample/MFF file.
#' It also returns an assessment score like the BIC, to compare the models.
#'
#' @references
#' Taddy2012.
#' Taddy, M., 2012, March. On estimation and selection for topic models.
#' In Artificial Intelligence and Statistics (pp. 1184-1193).
#'
#' Shiraishi2015.
#' Shiraishi, Y., Tremmel, G., Miyano, S. and Stephens, M., 2015.
#' A simple model-based approach to inferring and visualizing cancer
#' mutation signatures. PLoS genetics, 11(12), p.e1005657.
#'
#' @importFrom CountClust compGoM
#' @import maptpx
#' @export
archaic_fit <- function(dat,
K,
tol=0.1,
labs = NULL,
gom_method = "independent",
gom.control = list(),
output_dir = NULL){
flanking_bases = 1
gom.control.default <- list(bf = FALSE, kill = 2, ord = TRUE, verb = 1, admix = TRUE,
nbundles = 1, use_squarem = FALSE, init.adapt = FALSE, type = "full",
light = 1, method_admix = 1, sample_init = TRUE, tmax = 10000)
gom.control <- modifyList(gom.control.default, gom.control)
if(class(dat) == "list"){
datalist <- dat
if(is.null(labs)) {
labs <- c()
for(numdir in 1:length(datalist)){
labs <- c(labs, rep(names(datalist)[numdir], dim(datalist[[numdir]])[1]))
}
}
cat("The data is read as a list of matrices - processed by archaic_prepare() \n")
sig_names <- colnames(datalist[[1]])
row_names_pool <- rownames(datalist[[1]])
if(length(datalist) >= 2){
for(num in 2:length(datalist)){
sig_names <- union(sig_names, colnames(datalist[[num]]))
row_names_pool <- c(row_names_pool, rownames(datalist[[num]]))
}
}
pooled_data <- matrix(0, length(row_names_pool), length(sig_names))
rownames(pooled_data) <- row_names_pool
colnames(pooled_data) <- sig_names
for(num in 1:length(datalist)){
pooled_data[match(rownames(datalist[[num]]), rownames(pooled_data)),
match(colnames(datalist[[num]]), sig_names)] <- as.matrix(datalist[[num]])
}
}else if(class(dat) == "character"){
message("The dat is read as names of folders")
folders <- dat
for(numdir in 1:length(folders)){
if(regmatches(folders[numdir],regexpr(".$", folders[numdir])) != "/") folders[numdir] <- paste0(folders[numdir], "/")
}
datalist <- list()
for(numdir in 1:length(folders)){
if(file.exists(paste0(folders[numdir], tail(strsplit(folders[numdir], "/")[[1]],1), ".rda"))){
datalist[[numdir]] <- get(load(paste0(folders[numdir], tail(strsplit(folders[numdir], "/")[[1]],1), ".rda")))
cat("Successfully read .RData file from the folder, ", folders[numdir], "CHECK : \n")
}else{
message(".RData file not found in folder", folders[numdir], "running
archaic_prepare on the MFF files in the folder")
proc_out <- archaic_prepare(folders[numdir])
datalist[[numdir]] <- proc_out
}
}
if(is.null(labs)){
labs <- c()
for(numdir in 1:length(folders)){
labs <- c(labs, rep(tail(strsplit(folders[numdir], "/")[[1]],1), dim(datalist[[numdir]])[1]))
}
}
sig_names <- colnames(datalist[[1]])
row_names_pool <- rownames(datalist[[1]])
if(length(datalist) >= 2){
for(num in 2:length(datalist)){
sig_names <- union(sig_names, colnames(datalist[[num]]))
row_names_pool <- c(row_names_pool, rownames(datalist[[num]]))
}
}
pooled_data <- matrix(0, length(row_names_pool), length(sig_names))
rownames(pooled_data) <- row_names_pool
colnames(pooled_data) <- sig_names
for(num in 1:length(datalist)){
pooled_data[match(rownames(datalist[[num]]), rownames(pooled_data)),
match(colnames(datalist[[num]]), sig_names)] <- as.matrix(datalist[[num]])
}
}else if (class(dat) == "matrix" | class(dat) == "data.frame"){
pooled_data <- dat
if(is.null(labs)) labs <- 1:nrow(pooled_data)
if(is.null(levels)) levels <- labs
if (min( pooled_data %% 1) !=0 ) stop ("dat must be a counts data matrix ")
if (min(pooled_data) < 0 ) stop ("dat must contain only positive integers ")
}else{
stop("class of the dat input must be a character or a vector of characters
indicating dorectory names, or a
matrix of data frame of counts")
}
if(length(labs) != dim(pooled_data)[1]) stop(paste0("the length of labs is :", length(labs),
" which does not match the dimension of the data matrix compiled: ", dim(pooled_data)[1]))
zero_sum_rows <- which(rowSums(pooled_data) == 0)
if(length(zero_sum_rows) > 0){
pooled_data <- pooled_data[-zero_sum_rows, ]
labs <- labs[-zero_sum_rows]
}
pooled_data_2 <- filter_out_strand(pooled_data)
if(gom_method == "full"){
message("Fitting the Grade of Membership Model - full version - due to Matt Taddy")
suppressWarnings(topic_clus <- do.call(maptpx::topics, append(list(counts = pooled_data_2, K=K, tol=tol, model = "full", signatures = NULL), topics.control)))
}else if(gom_method == "independent"){
message("Fitting the Grade of Membership Model - independent version - due to Y. Shiraichi and M. Stephens")
signature_set <- colnames(pooled_data_2)
sig_split <- t(sapply(1:length(signature_set), function(x) return(strsplit(signature_set[x], "")[[1]][1:8])))
new_sig_split <- matrix(0, dim(sig_split)[1], 3);
new_sig_split[,1] <- sig_split[,flanking_bases]
new_sig_split[,2] <- sapply(1:length(signature_set), function(x) return(paste(sig_split[x,(flanking_bases+1):(flanking_bases+4)], collapse="")))
new_sig_split[,3] <- sig_split[,(flanking_bases+5)]
levels(new_sig_split[,1]) <- c("0", "1", "2", "3", "4")
pos <- t(sapply(1:length(signature_set), function(x)
{
y = strsplit(signature_set[x], "")[[1]]
return(paste(y[10:length(y)], collapse=""))
}))
mat <- matrix(0, dim(new_sig_split)[1], dim(new_sig_split)[2])
for(k in 1:dim(new_sig_split)[2]){
temp <- as.factor(new_sig_split[,k])
mat[,k] <- as.numeric(as.matrix(plyr::mapvalues(temp, from = levels(temp),
to = 0:(length(levels(temp))-1))))
}
pos <- as.numeric(pos)
pos <- pos - min(pos)
pos <- factor(pos, levels = 0:21)
signatures <- mat;
signature_pos <- cbind.data.frame(signatures, pos)
suppressWarnings(topic_clus <- do.call(maptpx::topics,
append(list(counts = pooled_data_2, K=K, tol=tol,
model = "full", signatures = signatures), gom.control)))
}
model_assessment <- CountClust::compGoM(pooled_data_2, topic_clus)
ll <- list("omega" = topic_clus$omega,
"theta" = topic_clus$theta,
"assessment" = model_assessment,
"labs" = labs)
if(is.null(output_dir)){ output_dir <- paste0(getwd(),"/")}else{
if(regmatches(output_dir,regexpr(".$", output_dir)) != "/"){output_dir <- paste0(output_dir, "/")}
}
save(ll, file = paste0(output_dir, "model.rda"))
return(ll)
}
filter_out_strand <- function(mat){
strand_out <- as.character(sapply(as.character(colnames(mat)), function(x) return (paste0(strsplit(x, "_")[[1]][-2], collapse="_"))))
mat_filtered <- as.numeric()
for(l in 1:dim(mat)[1]){
mat_filtered <- rbind(mat_filtered, tapply(mat[l,], strand_out, sum))
}
rownames(mat_filtered) <- rownames(mat)
return(mat_filtered)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.