###########################################################################################
#' checkInput
#'
#' Checks the input of EMMLi to make sure it's all what is expected. Throws errors with
#' a descriptive error when it is not.
#' @name checkInput
#' @param corr Correlation matrix
#' @param mod Model file (landmarks classified into modules)
#' @param abs Logical - whether to use abosulte values of correlations
#' @param pprob posterior probability cut off
#' @param saveAs filename to save output too
#' @keywords internal
checkInput <- function(corr, mod, N_sample, abs, pprob, saveAs) {
# Check inputs
if(!is.numeric(corr) & !is.data.frame(corr)){
stop('corr should be a (square) matrix or data frame.')
}
if(!(is.factor(mod[, 1]) | is.character(mod[, 1]))){
stop('The first column of mod should be landmark names (factor or character).')
}
# Make elements in mod, after the first column, integers or NAs
modClasses <- sapply(mod[, -1], class)
modClasses[modClasses == 'integer'] <- 'numeric'
if(!all(modClasses == 'factor') & !all(modClasses == 'numeric') & !all(modClasses == 'character')){
stop('mod should contain landmark names in the first column and integers, factors or character vectors in subsequent columns')
}
# check that numerics are integers
if(all(modClasses == 'numeric')){
if(!all(sapply(mod[, -1], function(x) all(abs(stats::na.omit(x) - round(stats::na.omit(x))) < .Machine$double.eps^0.5)))){
stop('mod should contain landmark names in the first column and integers, factors or character vectors in subsequent columns')
}
}
if(all(modClasses == 'factor')){
mod[, -1] <- sapply(mod[, -1], function(x) as.numeric(x))
}
if(all(modClasses == 'character')){
mod[, -1] <- sapply(mod[, -1], function(x) as.numeric(factor(x)))
}
# Check corr
if(!dim(corr)[1] == dim(corr)[2]) stop('corr should be a square matrix')
# Check other parameters
stopifnot(is.numeric(N_sample), N_sample > 0, is.logical(abs), pprob > 0, pprob < 1)
if(!is.null(saveAs)){
stopifnot(is.character(saveAs))
if(!grepl('\\.csv$', saveAs)){
warning("Output will be saved as a csv but saveAs does not end in '.csv'")
}
}
}
###########################################################################################
#' getModules
#'
#' Breaks correlation matrix up into discrete modules (within and between) based on
#' a model table (landmarks classified into modules)
#' @name getModules
#' @param varlist internal variable from EMMLi
#' @param symmet internal variable from EMMLi
#' @param corr correlation matrix
#' @param mod Landmark classifications
#' @param corr_list internal variable from EMMLi
#' @keywords internal
getModules <- function(corr, mod) {
# Create varlist variable
varlist = paste0('mod$', names(mod)[-1])
# make the upper triangle of corr NA, we only use the lower triangle.
corr[upper.tri(corr, diag = T)] = NA
# array of coefficient matrix, NAs are removed.
corr_list = (as.array(corr[!is.na(corr)]))
symmet = corr
# a symmetric matrix formed from the coefficient matrix, only used to find intermodular coefficients.
symmet[upper.tri(symmet)] = t(symmet)[upper.tri(symmet)]
all_modules = list()
for(colnam in varlist){
col = array(eval(parse(text = colnam)))
# na.omit() used to remove unclassified landmarks.
modNF = stats::na.omit(cbind(1:nrow(mod), col))
w = unique(modNF[, 2])
modules = list()
for(i in seq(length(w))){
# identify landmarks within a class
fg = modNF[modNF[, 2] == w[i], ]
# coefficients between identified landmarks.
l <- corr[as.numeric(fg[, 1]), as.numeric(fg[, 1])]
modules[[i]] = (as.array(l[!is.na(l)]))
}
names(modules) = paste("Module", w)
between_mod = list()
betweenModules = list()
withinModules = list()
unintegrated = list()
betweenFloat = list()
if (length(w) > 1){ #check that the num. of modules is greater than 1
# all possible combination of modules
cb = utils::combn(w, 2)
for (i in seq(dim(cb)[2])){
fg1 = modNF[modNF[, 2] == cb[1, i], ]
fg2 = modNF[modNF[, 2] == cb[2, i], ]
# setdiff(A,B) - present in A but not in B
between = symmet[as.integer(setdiff(fg2[, 1], fg1[, 1])), as.integer(setdiff(fg1[, 1], fg2[, 1]))]
between_mod[[i]] = between[!is.na(between)]
}
names(between_mod) = paste(cb[1, ], "to", cb[2, ])
betweenModules['betweenModules'] = list(as.vector(rle(unlist(between_mod))$values))
withinModules['withinModules'] = list(as.vector(rle(unlist(modules))$values))
}
unintegrated_list = setdiff(corr_list,unlist(c(modules,between_mod)))
unintegrated['unintegrated'] = list(as.vector(rle(unlist(unintegrated_list))$values))
# unintegrated['unintegrated+between'] = list(as.vector(rle(unlist(c(unintegrated_list,betweenModules)))$values))
if (length(unintegrated_list) != 0){
betweenFloat['betweenFloat'] = list(as.vector(rle(unlist(c(betweenModules['betweenModules'], unintegrated['unintegrated'])))$values))
all_modules[[colnam]] = c(modules, between_mod, withinModules, betweenModules, betweenFloat, unintegrated)
} else {
all_modules[[colnam]] = c(modules, between_mod, withinModules, betweenModules, unintegrated)
}
}
return(all_modules)
}
###########################################################################################
#' moduleLikelihoods
#'
#' Calculates the maximum likelihood value of rho for each module (between/within) from
#' getModules
#' @name moduleLikelihoods
#' @param all_modules output from getModules - internal to EMMLi
#' @param abs use absolute values or not
#' @param N_sample sample size
#' @param correction Experimental - whether to calculate number of parameters normally,
#' or number of parameters + number of modules - 1
#' @keywords internal
moduleLikelihoods <- function(all_modules, abs, N_sample, correction = "normal") {
LogL = function(z_r, z_p) {
-0.5 * log(var) - ((z_r - z_p)^2) / (2 * var)
}
output = list()
maxlogL = list()
logp = list()
for (m in seq(length(all_modules))){
# table of max. likelihood and rho
maxres = matrix(, nrow = 2, ncol = (length(all_modules[[m]])))
dimnames(maxres) = list(c("MaxL", "MaxL_p"), c(names(all_modules[[m]][seq((length(all_modules[[m]])))])))
###################################
# Coulds for loop be a seperate function? Seems a good target.
for(g in seq((length(all_modules[[m]])))){
r = unlist(unname(all_modules[[m]][g]))
n_value = length(r)
# Calculate z_r differently depending on abs argument in original call.
if(abs){
z_r = 0.5 * log((1 + abs(r)) / (1 - abs(r)))
#rho
p = seq(0, 0.99, 0.01)
} else if(!abs) {
z_r = 0.5 * log((1 + r) / (1 - r))
#rho
p = seq(-0.99, 0.99, 0.01)
}
n = N_sample
var = 1 / (n - 3)
#zeta
z_p = 0.5 * log((1 + p)/(1 - p))
LogL_table = outer(z_r, z_p, LogL)
Likelihoods = colSums(LogL_table)
MaxIndex = which.max(Likelihoods)
MaxL = Likelihoods[MaxIndex]
MaxL_p = p[MaxIndex]
maxres[1, g] = MaxL
maxres[2, g] = MaxL_p
}
# list of maximum likelihood for all modules.
output[[names(all_modules)[m]]] = maxres
#calculate sum of modular likelihood and k
if (dim(output[[m]])[2] == 2){
maxlogL[[names(all_modules)[m]]][['default']] = c(output[[m]][1], 2)
logp[[names(all_modules)[m]]][['default']] = output[[m]][, 1, drop = FALSE]
} else if (dim(output[[m]])[2] == 6){
#if(output[[m]][1,'unintegrated'] == 0){output[[m]] = output[[m]][,colnames(output[[m]]) != 'unintegrated']}
mod_between = output[[m]]['MaxL', grep('Module |betweenModules|unintegrated', names(output[[m]][1, ]))]
mod_between_p = output[[m]][,grep('Module |betweenModules|unintegrated', names(output[[m]][1, ]))]
if (mod_between['unintegrated'] == 0){
K = length(mod_between) - 1
} else {
K = length(mod_between)
}
maxlogL[[names(all_modules)[m]]][['sep.Mod + same.between']] = c(sum(mod_between), K + 1)
logp[[names(all_modules)[m]]][['sep.Mod + same.between']] = mod_between_p
within_between = output[[m]]['MaxL', c('withinModules','betweenModules','unintegrated')]
within_between_p = output[[m]][, c('withinModules','betweenModules','unintegrated')]
if (within_between['unintegrated'] == 0){
K = length(within_between) - 1
} else {
K = length(within_between)
}
maxlogL[[names(all_modules)[m]]][['same.Mod + same.between']] = c(sum(within_between), K+1)
logp[[names(all_modules)[m]]][['same.Mod + same.between']] = within_between_p
} else {
#if(output[[m]][1,'unintegrated'] == 0){output[[m]] = output[[m]][,colnames(output[[m]]) != 'unintegrated']}
mod_to = output[[m]]['MaxL', grep('Module |to |unintegrated', names(output[[m]][1, ]))]
mod_to_p = output[[m]][, grep('Module |to |unintegrated', names(output[[m]][1,]))]
if (mod_to['unintegrated'] == 0){
K = length(mod_to)-1
} else {
K = length(mod_to)
}
maxlogL[[names(all_modules)[m]]][['sep.Mod + sep.between']] = c(sum(mod_to), K + 1)
logp[[names(all_modules)[m]]][['sep.Mod + sep.between']] = mod_to_p
within_between = output[[m]]['MaxL', c('withinModules', 'betweenModules', 'unintegrated')]
within_between_p = output[[m]][, c('withinModules', 'betweenModules', 'unintegrated')]
if (within_between['unintegrated'] == 0){
K = length(within_between) - 1
} else {
K = length(within_between)
}
maxlogL[[names(all_modules)[m]]][['same.Mod + same.between']] = c(sum(within_between), K + 1)
logp[[names(all_modules)[m]]][['same.Mod + same.between']] = within_between_p
mod_between = output[[m]]['MaxL', grep('Module |betweenModules|unintegrated', names(output[[m]][1, ]))]
mod_between_p = output[[m]][, grep('Module |betweenModules|unintegrated', names(output[[m]][1, ]))]
if (mod_between['unintegrated'] == 0){
K = length(mod_between) - 1
} else {
K = length(mod_between)
}
maxlogL[[names(all_modules)[m]]][['sep.Mod + same.between']] = c(sum(mod_between), K + 1)
logp[[names(all_modules)[m]]][['sep.Mod + same.between']] = mod_between_p
to_within = output[[m]]['MaxL', grep('to |withinModules|unintegrated', names(output[[m]][1, ]))]
to_within_p = output[[m]][, grep('to |withinModules|unintegrated', names(output[[m]][1, ]))]
if (to_within['unintegrated'] == 0){
K = length(to_within) - 1
} else {
K = length(to_within)
}
maxlogL[[names(all_modules)[m]]][['same.Mod + sep.between']] = c(sum(to_within), K + 1)
logp[[names(all_modules)[m]]][['same.Mod + sep.between']] = to_within_p
if (output[[m]]['MaxL', grep('unintegrated', names(output[[m]][1, ]))] != 0){
sepmod_samebetweenunintegrated = output[[m]][, grep('Module |betweenFloat',names(output[[m]][1, ]))]
K = length(sepmod_samebetweenunintegrated['MaxL', ])
maxlogL[[names(all_modules)[m]]][['sep.mod + same.between.unintegrated']] = c(sum(sepmod_samebetweenunintegrated['MaxL', ]), K + 1)
logp[[names(all_modules)[m]]][['sep.mod + same.between.unintegrated']] = sepmod_samebetweenunintegrated
samemod_samebetweenunintegrated = output[[m]][, grep('withinModules|betweenFloat', names(output[[m]][1, ]))]
K = length(samemod_samebetweenunintegrated['MaxL', ])
maxlogL[[names(all_modules)[m]]][['same.mod + same.between.unintegrated']] = c(sum(samemod_samebetweenunintegrated['MaxL', ]), K + 1)
logp[[names(all_modules)[m]]][['same.mod + same.between.unintegrated']] = samemod_samebetweenunintegrated
}
}
}
# Experimental correction to parameter number. Adds the number of modules - 1 to K, the
# parameter count. Extra penalisation, to see if EMMLi then has less of a tendency to
# overparamaterise.
if (correction == "new") {
for (i in seq_along(maxlogL)) {
n_modules <- length(grep("^Module", names(all_modules[[i]])))
for (g in seq_along(maxlogL[[i]])) {
maxlogL[[i]][[g]][2] <- maxlogL[[i]][[g]][2] + (n_modules - 1)
}
}
}
rets <- list(maxlogL = maxlogL, logp = logp)
return(rets)
}
##### Title #####
#' Evaluating modularity with maximum likelihood
#'
#### Description #####
#' Calculates the AICc values, model likelihoods, and posterior probabilities of different models of modularity, as described in Goswami and Finarelli (2016).
#'
#### Details #####
#' The publication describing this analysis is A. Goswami and J. Finarelli
#' (2016) EMMLi: A maximum likelihood approach to the analysis of modularity.
#' Evolution \url{http://onlinelibrary.wiley.com/doi/10.1111/evo.12956/abstract}.
#'
#'@param corr Lower triangle or full correlation matrix. n x n square matrix for n landmarks.
#'@param N_sample The number of specimens
#'@param mod A data frame defining the models. The first column should contain the landmark names.
#'Subsequent columns should define which landmarks are contained within each module with integers,
#'factors or characters. If a landmark should be ignored for a specific model (i.e., it is
#'unintegrated in any module), the element should be NA.
#'@param saveAs A character string defining the filename and path for where to save output. If NULL,
#'the output is not saved to file
#'@param abs Logical denoting whether absolute values should be used. Default is TRUE, as in Goswami
#'and Finarelli (2016)
#'@param pprob posterior probability cutoff for reporting of models. Default is 0.05, as suggested in
#'Goswami and Finarelli (2016)
#'@param correction If "normal" then AIC is calculated normally, if "new" then the number of modules - 1
#' is added to the n parameter penalisation during AIC calculation. This is experimental and is not
#' recommended!
#'
#'@export
#'@return A list containing two elements. The first (results) gives the AIC results for each model.
#' The second (rho) gives the within and between module correlations.
#' Optionally, the output is saved to the file defined by the saveAs argument with only models with a
#' posterior probability > 0.01 being saved.
#'
#'
#'@examples
#' set.seed(1)
#'
#' # Chose a filename and directory for output
#' dir <- tempdir()
#' file <- paste0(dir, 'EMMLiTest.csv')
#'
#' # Examine a correlation matrix and model dataframe
#' dim(macacaCorrel)
#' head(macacaModels)
#'
#' # run EMMLi
#' output <- EMMLi(macacaCorrel, 20, macacaModels, file)
#'
#' unlink(file)
#'
#' # run EMMLi without writing output
#' output <- EMMLi(macacaCorrel, 20, macacaModels)
#'
#' # Raw data example to illustrate pitfalls
#' corrPath <- system.file("extdata", "M1lmcorrel.csv", package = "EMMLi")
#' corr <- read.csv(corrPath, header = FALSE)
#'
#' modelPath <- system.file("extdata", "macaca_landmarklist.csv", package = "EMMLi")
#' mod <- read.csv(modelPath, header = TRUE, row.names = 1)
#'
#' # First column should be character or factor. Subsequent columns integer
#' sapply(mod, class)
#'
#' out <- EMMLi(corr, 42, mod)
#'
EMMLi <- function(corr, N_sample, mod, saveAs = NULL, abs = TRUE, pprob = 0.05,
all_rhos = FALSE, correction = "normal"){
checkInput(corr, mod, N_sample, abs, pprob, saveAs)
# Create null model
mod$No.modules = 1
# get correlation matrices for each modules.
all_modules <- getModules(corr, mod)
# maxlogL will have the log likelihood for each module.
liks <- moduleLikelihoods(all_modules, abs, N_sample, correction = correction)
maxlogL <- liks$maxlogL
logp <- liks$logp
results = matrix(unlist(maxlogL), ncol = 2, byrow = TRUE)
a = names(unlist(maxlogL))[seq(1, dim(results)[1] * dim(results)[2], dim(results)[2])]
dimnames(results) = list(a, c('MaxL', 'K'))
n = length(which(!is.na(corr) == TRUE))
AICc = apply(results, 1, function(x) -2 * x['MaxL'] + 2 * x['K'] + (2 * x['K'] * (x['K'] + 1)) / (n - x['K'] - 1))
dAICc = AICc - min(AICc)
Model_L = exp(-0.5 * dAICc)
Post_Pob = Model_L / sum(Model_L)
results = cbind(results, n, AICc, dAICc, Model_L, Post_Pob)
n = names(all_modules)
nm = unlist(strsplit(n, split = 'mod\\$'))[seq(2, 2 * length(n), 2)]
a = names(unlist(maxlogL))[seq(1, dim(results)[1] * 2, 2)]
b = unlist(strsplit(a, split = 'mod\\$'))
b = unlist(strsplit(b, split = '1$'))
rownames(results) = b
s = 1
i = length(nm)
o = order(results[grep(nm[i], rownames(results)), 2])
t = results[grep(nm[i], rownames(results)), , drop = FALSE][o, , drop = FALSE]
s = s + length(o)
for(i in 1:(length(nm) - 1)){
o = order(results[grep(paste(nm[i], '.s', sep = ""), rownames(results)), 2])
t = rbind(t, results[grep(paste(nm[i], '.s', sep = ""), rownames(results)), , drop = FALSE][o, , drop = FALSE])
s = s + length(o)
}
results = t
rholist = list()
h = 1
for (i in 1:(length(logp))){
for (j in 1:length(logp[[i]])){
rholist[h] = logp[[i]][j]
h = h + 1
}
}
if (all_rhos) {
names(rholist) <- names(Post_Pob)
}
# build output for the csv.
rho_output = rholist[which(Post_Pob > pprob)]
rholist_name = names(which(Post_Pob > pprob))
rholist_name = unlist(strsplit(rholist_name, split = 'mod\\$'))
rholist_name = unlist(strsplit(rholist_name, split = '1$'))
return_rho <- rho_output
names(return_rho) <- rholist_name
if(!is.null(saveAs)){
utils::write.table(results, file = saveAs, row.names = TRUE, col.names = NA, sep = ",")
cat("\n\n", file = saveAs, append = TRUE)
for(q in 1:length(rho_output)){
cat(rholist_name[q], "\n", file = saveAs, append = TRUE)
write(paste(c('', colnames(rho_output[[q]])), collapse = ','), saveAs, append = TRUE)
utils::write.table(rho_output[q], saveAs, row.names = TRUE, col.names = FALSE, sep = ",", append = TRUE)
cat("\n", file = saveAs, append = TRUE)
}
}
if (all_rhos) {
res <- list(results = results, best_rho = return_rho, all_rhos = rholist)
} else {
res <- list(results = results, rho = return_rho)
}
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.