# Title : TODO
# Objective : TODO
# Created by: kf
# Created on: 5/19/18
remove_invariant_traits = function(trait_table, small_dif=0.001) {
is_small_dif = c()
for (i in 1:length(trait_table)) {
trait_min = min(trait_table[,i], na.rm=TRUE)
trait_max = max(trait_table[,i], na.rm=TRUE)
is_small_dif = c(is_small_dif, trait_max - trait_min < small_dif)
}
removed_traits = original_traits[is_small_dif]
if (length(removed_traits)) {
cat("Trait removed due to small difference (<", small_dif, '):', removed_traits, '\n')
} else {
cat('All traits passed small difference check.\n')
}
trait_table = trait_table[,!is_small_dif]
out = list(trait_table=trait_table, removed_traits=removed_traits)
return(out)
}
merge_replicates = function(trait_table, replicate_sep) {
if (replicate_sep=='') {
return(trait_table)
}
without_reps = c()
for (col in colnames(trait_table)) {
split = strsplit(col, replicate_sep)[[1]]
num_split = length(split)
if (num_split == 1) {
without_rep = split
} else if (num_split > 1) {
without_rep = split[1:(num_split-1)]
}
if (length(without_rep)>1) {
cat(col, without_rep, '\n')
}
without_reps = c(without_reps, without_rep)
}
if (length(unique(without_reps))==ncol(trait_table)) {
cat(paste0('No replicate was found with --replicate_sep="', replicate_sep, '"\n'))
return(trait_table)
} else {
cat(paste0('Replicates were found with --replicate_sep="', replicate_sep, '". Mean values will be used.\n'))
}
new_cols = unique(without_reps)
out = data.frame(matrix(ncol=length(new_cols), nrow=nrow(trait_table)))
colnames(out) = new_cols
rownames(out) = rownames(trait_table)
for (new_col in colnames(out)) {
is_col = startsWith(colnames(trait_table), paste0(new_col, replicate_sep))
if (sum(is_col)==1) {
values = trait_table[,is_col]
} else {
values = apply(trait_table[,is_col], 1, function(x){mean(x, na.rm=TRUE)})
}
out[,new_col] = values
}
return(out)
}
get_high_similarity_clades = function(tree, trait_table, method, threshold, verbose=FALSE, num_test=0) {
num_all_leaves = length(tree$tip.label)
collapse_node_nums = c()
root_num = get_root_num(tree)
subroot_nums = get_children_num(tree, root_num)
next_node_nums = subroot_nums
for (nnn in next_node_nums) {
if (length(get_tip_labels(tree, nnn))==1) {
next_node_nums = next_node_nums[next_node_nums!=nnn]
}
}
num_processed_node = 0
while (length(next_node_nums) > 0) {
current_node_num = next_node_nums[1]
tip_labels = get_tip_labels(tree, current_node_num)
num_leaves = length(tip_labels)
leaf_combinations = expand.grid(c1=1:num_leaves, c2=1:num_leaves)
leaf_combinations = subset(leaf_combinations, c1 < c2)
rownames(leaf_combinations) = NULL
num_combinations = nrow(leaf_combinations)
if (verbose) {
cat('number of next_node_nums =', length(next_node_nums), 'number of leaf combinations =', num_combinations, '\n')
}
min_similarity = 1
do_collapse = TRUE
for (i in 1:num_combinations) {
current_similarity = 1
leaf_c1 = tip_labels[leaf_combinations[i,'c1']]
leaf_c2 = tip_labels[leaf_combinations[i,'c2']]
trait_c1 = unlist(trait_table[leaf_c1,])
trait_c2 = unlist(trait_table[leaf_c2,])
if (method=='complementarity') {
current_similarity = 1 - calc_complementarity(trait_c1, trait_c2, method='weighted')
} else {
current_similarity = suppressWarnings(cor(trait_c1, trait_c2, method=method))
current_similarity = ifelse(is.na(current_similarity), 1, current_similarity)
}
min_similarity = min(min_similarity, current_similarity)
if (min_similarity < threshold) {
do_collapse = FALSE
if (verbose) {
cat('A low similarity (', min_similarity, ') found at', i, 'th leaf combinations (', num_combinations, ')\n')
}
break
}
}
if (length(next_node_nums)==1) {
next_node_nums = c()
} else {
next_node_nums = next_node_nums[2:length(next_node_nums)]
}
if (do_collapse) {
cat('node_num =', current_node_num, 'size =', num_leaves, 'Min similarity =', min_similarity, '\n')
collapse_node_nums = c(collapse_node_nums, current_node_num)
} else {
children_nums = get_children_num(tree, current_node_num)
children_nums = na.omit(children_nums)
children_nums = children_nums[children_nums > num_all_leaves]
next_node_nums = c(next_node_nums, children_nums)
if (verbose) {
cat('node_num =', current_node_num, 'size =', num_leaves, 'Min similarity =', min_similarity, '\n')
}
}
num_processed_node = num_processed_node + 1
if (num_processed_node%%100==0) {
cat('processed', num_processed_node, 'nodes\n')
}
if ((num_test!=0)&(num_processed_node==num_test)) {
cat('Reaching num_test, Exiting at the ', num_test, ' th processed node.\n')
break
}
}
return(collapse_node_nums)
}
collapse_clades = function(tree, trait_table, collapse_node_nums) {
cat('number of leaves before collapse =', length(tree$tip.label), '\n')
trait_table = trait_table[tree$tip.label,]
retain_tips = list()
drop_tips = list()
collapse_leaf_names = list()
for (cnn in collapse_node_nums) {
collapse_leaf_names[[as.character(cnn)]] = get_tip_labels(tree, cnn)
retain_tips[[cnn]] = collapse_leaf_names[[as.character(cnn)]][1]
drop_tips[[cnn]] = collapse_leaf_names[[as.character(cnn)]][2:length(collapse_leaf_names[[as.character(cnn)]])]
}
out_tree = tree
out_trait = trait_table
for (cnn in collapse_node_nums) {
out_tree = ape::drop.tip(out_tree, tip=drop_tips[[cnn]], trim.internal=TRUE)
out_tree$tip.label[out_tree$tip.label==retain_tips[[cnn]]] = cnn
subtree = ape::extract.clade(tree, cnn)
subtree_traits = trait_table[subtree$tip.label,]
subtree_root_states = c()
for (i in 1:ncol(trait_table)) {
ace_result = ape::ace(subtree_traits[,i], subtree, type='continuous', method='REML', CI=FALSE, model='BM')
subtree_root_state = ace_result$ace[names(ace_result$ace)==get_root_num(subtree)]
subtree_root_states = c(subtree_root_states, subtree_root_state)
}
df_subtree_root = data.frame(subtree_root_states)
out_trait = out_trait[!(rownames(out_trait) %in% subtree$tip.label),]
out_trait = rbind(out_trait, subtree_root_states)
rownames(out_trait)[nrow(out_trait)] = cnn
}
out_trait = out_trait[out_tree$tip.label,]
cat('number of leaves after collapse =', length(out_tree$tip.label), '\n')
out = list(tree=out_tree, trait=out_trait, collapse_leaf_names=collapse_leaf_names)
return(out)
}
map_node_num = function(tree_original, tree_collapsed, collapse_leaf_names, verbose=FALSE) {
# tree_collapsed should be a collapsed tree
stopifnot(length(tree_original$tip.label)>=length(tree_collapsed$tip.label))
df = data.frame(matrix(NA, max(tree_original$edge[,1]), 2))
colnames(df) = c('tree_original', 'tree_collapsed')
tree_original_node_nums = 1:max(tree_original$edge[,1])
tree_collapsed_node_nums = 1:max(tree_collapsed$edge[,1])
while (length(tree_original_node_nums)>0) {
nn1 = tree_original_node_nums[1]
nn1_leaf_names = sort(get_tip_labels(tree_original, nn1))
for (i in 1:length(tree_collapsed_node_nums)) {
nn2 = tree_collapsed_node_nums[i]
nn2_leaf_names = sort(get_tip_labels(tree_collapsed, nn2))
collapsed_node_nums = nn2_leaf_names[as.character(nn2_leaf_names) %in% names(collapse_leaf_names)]
any_collapse = (length(collapsed_node_nums)>0)
is_collapse_nn2 = (length(nn2_leaf_names)==1)&(any_collapse)
if (any_collapse) {
nn2_leaf_names = nn2_leaf_names[!nn2_leaf_names %in% collapsed_node_nums]
for (cnn in collapsed_node_nums) {
nn2_leaf_names = c(nn2_leaf_names, collapse_leaf_names[[as.character(cnn)]])
}
nn2_leaf_names = sort(nn2_leaf_names)
}
if (is_collapse_nn2) {
flag = all(nn1_leaf_names %in% nn2_leaf_names)
} else if (length(nn1_leaf_names)==length(nn2_leaf_names)) {
flag = identical(nn1_leaf_names, nn2_leaf_names)
} else {
flag = FALSE
}
if (verbose) {
cat('nn1:', nn1, 'nn2:', nn2,
'collapse match:', all(nn1_leaf_names %in% nn2_leaf_names),
'noncollapse match:', all(nn1_leaf_names==nn2_leaf_names),
'is_collapse_nn2:', is_collapse_nn2, '\n')
}
if (flag) {
df[nn1,] = c(nn1,nn2)
tree_original_node_nums = tree_original_node_nums[-1]
if (!is_collapse_nn2) {
tree_collapsed_node_nums = tree_collapsed_node_nums[-i]
tree_collapsed_node_nums = tree_collapsed_node_nums[tree_collapsed_node_nums!=nn2]
}
break
}
}
if (!flag) {
cat('Cannot find the original tree node:', nn1, '\n')
cat('tree_original_node_nums:', tree_original_node_nums, '\n')
cat('tree_collapsed_node_nums:', tree_collapsed_node_nums, '\n')
break
}
}
num_match_before = nrow(df)
df = df[(!is.na(df[['tree_original']])),]
df = df[(!is.na(df[['tree_collapsed']])),]
num_match_after = nrow(df)
if (num_match_before!=num_match_after) {
num_na = num_match_before-num_match_after
cat('Warning:', num_na, 'NA are produced in map_node_num().\n')
}
return(df)
}
get_tree_table = function(pcm_out, mode) {
if (mode=='l1ou') {
leaves = pcm_out$tree$tip.label
leaf2sp = function(leaf_name) {
leaf_split = strsplit(leaf_name, "_")
sp = paste(leaf_split[[1]][1], leaf_split[[1]][2], sep="_")
return(sp)
}
spp = unique(sapply(leaves, leaf2sp))
if (is.null(names(pcm_out$shift.configuration))) {
shift_conf = c("0", as.character(1:length(pcm_out$shift.configuration)))
} else {
shift_conf = c("0", attr(pcm_out$shift.configuration, "name"))
}
num_regime = length(unique(shift_conf))
num_conv_regime = length(unique(shift_conf[duplicated(shift_conf)]))
tree_table = data.frame(
num_shift = pcm_out$nShifts,
num_regime = num_regime,
num_conv_regime = num_conv_regime,
num_uniq_regime = num_regime - num_conv_regime,
num_species = length(spp),
num_leaf = length(leaves),
model_score = pcm_out$score,
stringsAsFactors = FALSE
)
} else if (mode=='PhylogeneticEM') {
pp = PhylogeneticEM::params_process(pcm_out)
df = data.frame(matrix(NA,1,4))
colnames(df) = c('num_shift','log_likelihood','num_species','num_leaf')
df$num_shift = ncol(pp[['shifts']][['values']])
df$log_likelihood = attr(pp, 'log_likelihood')
df$num_species = length(unique(suppressWarnings(leaf2species(leaf_names=pcm_out[['phylo']][['tip.label']]))))
df$num_leaf = length(pcm_out[['phylo']][['tip.label']])
tree_table = df
} else {
cat('mode "', mode, '" is not supported.\n')
}
return(tree_table)
}
get_regime_table = function(pcm_out, mode) {
if (mode=='l1ou') {
if (is.null(colnames(pcm_out$Y))) {
cols = 'trait1'
} else {
cols = colnames(pcm_out$Y)
}
column_names = c("regime", "node_name", "param", cols)
regime_table = data.frame(matrix(0,0,ncol(pcm_out$Y)+3))
colnames(regime_table) = column_names
shift_conf = sort(pcm_out$shift.configuration, decreasing=TRUE)
if (is.null(names(shift_conf))) {
regimes = 1:length(shift_conf)
} else {
regimes = as.integer(names(shift_conf))
}
branch_index = shift_conf
node_names = c()
for (ind in branch_index) {
node_names = c(node_names, get_node_name_by_num(phy=adj_data$tree, node_num=adj_data$tree$edge[ind,2]))
}
if (pcm_out$nShifts > 0) {
tmp_table = cbind(regimes, node_names, "shift_value", pcm_out$shift.values)
colnames(tmp_table) = column_names
regime_table = rbind(regime_table, tmp_table)
tmp_table = cbind(regimes, node_names, "shift_mean", pcm_out$shift.means)
colnames(tmp_table) = column_names
regime_table = rbind(regime_table, tmp_table)
}
tmp_table = cbind(NA, NA, "alpha", matrix(pcm_out$alpha, 1, length(pcm_out$alpha)))
colnames(tmp_table) = column_names
regime_table = rbind(regime_table, tmp_table)
tmp_table = cbind(NA, NA, "sigma2", matrix(pcm_out$sigma2, 1, length(pcm_out$sigma2)))
colnames(tmp_table) = column_names
regime_table = rbind(regime_table, tmp_table)
tmp_table = cbind(NA, NA, "intercept", matrix(pcm_out$intercept, 1, length(pcm_out$intercept)))
colnames(tmp_table) = column_names
regime_table = rbind(regime_table, tmp_table)
tmp_table = cbind(NA, NA, "log_likelihood", matrix(pcm_out$logLik, 1, length(pcm_out$logLik)))
colnames(tmp_table) = column_names
regime_table = rbind(regime_table, tmp_table)
} else if (mode=='PhylogeneticEM') {
pp = PhylogeneticEM::params_process(pcm_out)
traits = rownames(pcm_out[['Y_data']])
num_trait = length(traits)
shift_node_indices = pp[['shifts']][['edges']]
shift_node_nums = pcm_out[['phylo']]$edge[shift_node_indices,2]
num_shift = length(shift_node_nums)
df = data.frame(matrix(NA, num_shift+3, num_trait+3))
colnames(df) = c('regime','node_name','param',traits)
row=1
if (num_shift) {
df[row:(row+num_shift-1),'regime'] = 1:num_shift
df[row:(row+num_shift-1),'node_name'] = get_node_name_by_num(pcm_out[['phylo']], shift_node_nums)
df[row:(row+num_shift-1),'param'] = 'shift_value'
df[row:(row+num_shift-1),traits] = t(pp[['shifts']][['values']])
}
row = row+num_shift - 1
row = row+1; df[row,] = c(NA, NA, 'alpha', diag(as.matrix(pp$selection.strength)))
row = row+1; df[row,] = c(NA, NA, 'sigma2', diag(as.matrix(pp$variance)))
row = row+1; df[row,] = c(NA, NA, 'intercept', pp$optimal.value)
regime_table = df
} else {
cat('mode "', mode, '" is not supported.\n')
}
rownames(regime_table) = NULL
return(regime_table)
}
get_leaf_regimes = function(pcm_out, mode) {
if (mode=='l1ou') {
tree = pcm_out$tree
shift_nodes = c()
leaf_regimes = data.frame(regime=0, label=rownames(pcm_out$Y))
shift_conf = sort(pcm_out$shift.configuration, decreasing=TRUE)
if ((length(shift_conf)>0)&(is.null(names(shift_conf)))) {
names(shift_conf) = 1:length(shift_conf)
}
for (node_index in shift_conf) {
node_name = get_node_name_by_num(phy=tree, node_num=tree$edge[node_index,2])
regime = ifelse(is.null(names(shift_conf)[shift_conf==node_index]), 1, names(shift_conf)[shift_conf==node_index])
for (subtree in subtrees(tree)) {
if (subtree$node.label[1]==node_name) {
table_leaves = as.character(leaf_regimes$label)
subtree_leaves = as.character(subtree$tip.label)
leaf_regimes[table_leaves %in% subtree_leaves, "regime"] = regime
break
}
}
for (leaf in tree$tip.label) {
if (leaf==node_name) {
leaf_regimes[leaf_regimes$label==leaf, "regime"] = regime
break
}
}
}
} else if (mode=='PhylogeneticEM') {
tree = pcm_out[['phylo']]
pp = PhylogeneticEM::params_process(pcm_out)
shift_nodes = pp[['shifts']][['edges']]
leaf_regimes = data.frame(regime=0, leaf=tree$tip.label)
shift_conf = shift_nodes
if (!is.null(shift_conf)) {
if (is.null(names(shift_conf))) {
names(shift_conf) = 1:length(shift_conf)
}
}
shift_conf = shift_conf[order(tree$edge[shift_conf,1], decreasing=FALSE)]
for (node_index in shift_conf) {
node_name = get_node_name_by_num(phy=tree, node_num=tree$edge[node_index,2])
regime = ifelse(is.null(names(shift_conf)[shift_conf==node_index]), 1, names(shift_conf)[shift_conf==node_index])
for (subtree in subtrees(tree)) {
if (subtree$node.label[1]==node_name) {
table_leaves = as.character(leaf_regimes$label)
subtree_leaves = as.character(subtree$tip.label)
leaf_regimes[table_leaves %in% subtree_leaves, "regime"] = regime
break
}
}
for (leaf in tree$tip.label) {
if (leaf==node_name) {
leaf_regimes[leaf_regimes$label==leaf, "regime"] = regime
break
}
}
}
} else {
cat('mode "', mode, '" is not supported.\n')
}
return(leaf_regimes)
}
get_leaf_table = function(pcm_out, mode) {
if (mode=='l1ou') {
column_names = c("regime", "leaf", "param", colnames(pcm_out$Y))
leaf_table = data.frame(matrix(0,0,ncol(pcm_out$Y)+3))
colnames(leaf_table) = column_names
leaf_regimes = get_leaf_regimes(pcm_out, mode)
tmp_table = cbind(leaf_regimes, "Y", pcm_out$Y)
colnames(tmp_table) = column_names
leaf_table = rbind(leaf_table, tmp_table)
tmp_table = cbind(leaf_regimes, "optima", pcm_out$optima)
colnames(tmp_table) = column_names
leaf_table = rbind(leaf_table, tmp_table)
tmp_table = cbind(leaf_regimes, "mu", pcm_out$mu)
colnames(tmp_table) = column_names
leaf_table = rbind(leaf_table, tmp_table)
tmp_table = cbind(leaf_regimes, "residuals", pcm_out$residuals)
colnames(tmp_table) = column_names
leaf_table = rbind(leaf_table, tmp_table)
} else if (mode=='PhylogeneticEM') {
num_leaf = length(pcm_out[['phylo']][['tip.label']])
traits = rownames(pcm_out[['Y_data']])
num_trait = length(traits)
params = c('imputed','expectations')
num_param = length(params)
df_leaf_regimes = get_leaf_regimes(pcm_out, mode='PhylogeneticEM')
regimes = df_leaf_regimes$regime
df = data.frame(matrix(NA, num_leaf*num_param, num_trait+3))
colnames(df) = c('regime','leaf','param', rownames(pcm_out[['Y_data']]))
ind_start = 1
for (param in params) {
ind_end = ind_start + num_leaf - 1
df[ind_start:ind_end,'regime'] = regimes
df[ind_start:ind_end,'leaf'] = pcm_out[['phylo']][['tip.label']]
df[ind_start:ind_end,'param'] = param
df[ind_start:ind_end,traits] = t(PhylogeneticEM::imputed_traits(pcm_out, trait=1:num_trait, where='tips', what=param))
ind_start = ind_end + 1
}
leaf_table = df
} else {
cat('mode "', mode, '" is not supported.\n')
}
rownames(leaf_table) = NULL
return(leaf_table)
}
get_bootstrap_table = function(pcm_out, bootstrap_result) {
if (mode=='l1ou') {
node_names = c(pcm_out$tree$tip.label, pcm_out$tree$node.label)
nleaf = length(pcm_out$tree$tip.label)
column_names = c("node_name", "bootstrap_support")
bp_table = data.frame(matrix(0,length(node_names),length(column_names)))
colnames(bp_table) = column_names
bp_table["node_name"] = node_names
bp_table["bootstrap_support"] = c(bootstrap_result$detection.rate[1:nleaf], NA, bootstrap_result$detection.rate[(nleaf+1):length(bootstrap_result$detection.rate)])
} else {
cat('mode "', mode, '" is not supported.\n')
}
return(bp_table)
}
tree_table_collapse2original = function(tree_table, tree_original) {
num_leaf = length(tree_original$tip.label)
num_species = length(unique(leaf2species(tree_original$tip.label)))
tree_table[,'num_leaf'] = num_leaf
tree_table[,'num_species'] = num_species
return(tree_table)
}
get_deepest_node_num = function(tree, node_nums) {
depth_values = ape::node.depth(tree)
target_depth_values = depth_values[node_nums]
deepest_node_num = node_nums[target_depth_values==max(target_depth_values)]
return(deepest_node_num)
}
regime_table_collapse2original = function(regime_table, tree_original, tree_collapsed, node_num_mapping) {
node_names_collapsed = unlist(unique(na.omit(regime_table['node_name'])))
for (node_name_collapsed in node_names_collapsed) {
if (node_name_collapsed %in% c(tree_collapsed$tip.label, tree_collapsed$node.label)) {
node_num_collapsed = get_node_num_by_name(tree_collapsed, node_name_collapsed)
node_num_originals = node_num_mapping[(node_num_mapping['tree_collapsed']==node_num_collapsed),'tree_original']
node_num_original = get_deepest_node_num(tree_original, node_num_originals)
node_name_original = get_node_name_by_num(tree_original, node_num_original)
is_target = (regime_table['node_name']==node_name_collapsed)
is_target[is.na(is_target)] = FALSE
regime_table[is_target,'node_name'] = node_name_original
}
}
rownames(df) = NULL
return(regime_table)
}
leaf_table_collapse2original = function(leaf_table, tree_original, tree_collapsed, node_num_mapping) {
num_leaf_original = length(tree_original$tip.label)
node_names_collapsed = unlist(unique(na.omit(leaf_table['leaf'])))
params = unlist(unique(na.omit(leaf_table['param'])))
df = data.frame()
for (param in params) {
for (node_name_collapsed in node_names_collapsed) {
if (node_name_collapsed %in% c(tree_collapsed$tip.label, tree_collapsed$node.label)) {
node_num_collapsed = get_node_num_by_name(tree_collapsed, node_name_collapsed)
node_num_originals = node_num_mapping[(node_num_mapping['tree_collapsed']==node_num_collapsed),'tree_original']
for (node_num_original in node_num_originals) {
if (node_num_original <= num_leaf_original) {
node_name_original = get_node_name_by_num(tree_original, node_num_original)
conditions = (leaf_table['param']==param)&(leaf_table['leaf']==node_name_collapsed)
row = leaf_table[conditions,]
row[,'leaf'] = node_name_original
df = rbind(df, row)
}
}
}
}
}
rownames(df) = NULL
return(df)
}
restore_imputed_leaves = function(leaf_table, original_trait_table) {
traits = colnames(original_trait_table)
leaf_names = leaf_table$label
for (leaf_name in leaf_names) {
if (leaf_name %in% rownames(original_trait_table)) {
conditions = ((leaf_table$label==leaf_name)&(leaf_table$param=='imputed'))
leaf_table[conditions, traits] = original_trait_table[leaf_name, traits]
}
}
return(leaf_table)
}
get_placeholder_leaf = function(tree, original_trait_table) {
out = data.frame()
params = c('Y', 'optima', 'mu', 'residuals')
for (param in params) {
tmp = data.frame(
regime=rep(0,nrow(original_trait_table)),
label=rownames(original_trait_table),
param=param
)
tmp = cbind(tmp, original_trait_table)
rownames(tmp) = NULL
out = rbind(out, tmp)
}
return(out)
}
get_placeholder_regime = function(tree, original_trait_table) {
out = data.frame()
params = c('alpha', 'sigma2', 'intercept', 'log_likelihood')
for (param in params) {
tmp = c(NA, NA, param, rep(NA, ncol(original_trait_table)))
out = rbind(out, tmp)
}
colnames(out) = c('regime', 'node_name', 'param', colnames(original_trait_table))
return(out)
}
get_placeholder_tree = function(tree, original_trait_table) {
out = data.frame()
params = c('num_shift','num_regime','num_conv_regime','num_uniq_regime','num_species','num_leaf','model_score')
out = rbind(out, rep(0, length(params)))
colnames(out) = params
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.