R/sfreemap.misc.R

Defines functions remove_last_part_of_tip_name create_trees remove_outliers create_result_matrix write_to_file save_tree_file create_out_file create_out_dir create_info_file sfreemap.reduce simmap.calc_simulations_evolution simmap.mean sfreemap.diff simulation.data summary_speed_up

Documented in create_trees sfreemap.diff simmap.mean simulation.data

summary_speed_up <- function(t1, t2) {
    ret <- list(
        'mean' = mean(apply(cbind(t1,t2), 1, function(x) x[1]/x[2]))
        , 'lastf1' = tail(t1, 1)
        , 'lastf2' = tail(t2, 1)
        , 'max' = tail(t1, 1) / tail(t2, 1)
    )
    return(ret)
}

simulation.data <- function(tree) {
    # the order of columns created by the sim.history might be different
    # from what simmap and sfreemap expect.

    if (inherits(tree, 'phylo')) {
        fix_order <- tree$mapped.edge[,order(colnames(tree$mapped.edge))]
        lmt <- countSimmap(tree)$N
        emr <- apply(fix_order, 2, sum)
    } else {
        lmt <- mean(countSimmap(tree)$Tr[,1])
        emr <- t(sapply(tree, function(x) colSums(x$mapped.edge)))
        emr <- colMeans(emr)
        emr <- emr[order(names(emr))]
    }
    return (list(lmt=lmt, emr=emr))
}

sfreemap.diff <- function(a, b) {
    lmt <- abs(a$lmt - b$lmt)
    emr <- sum(abs(a$emr - b$emr))
    return (list(lmt=lmt, emr=emr))
}

simmap.mean <- function(mtrees) {
    if (inherits(mtrees, 'multiPhylo')) {
        mean_emr <- rowMeans(sapply(mtrees, function(x) colSums(x$mapped.edge)))
        mean_lmt <- colMeans((countSimmap(mtrees,message=FALSE)))[1]
    } else {
        mean_emr <- colSums(mtrees$mapped.edge)
        mean_lmt <- countSimmap(mtrees)$N
    }

    # ensure order
    n <- sort(names(mean_emr))
    mean_emr <- mean_emr[n]

    return (list(lmt=mean_lmt, emr=mean_emr))
}

# This function is suppose to test whether more simulations means more accuracy
# on simmap. The results are a bit odd, maybe I'm not thinking right about
# how to compute this. Need revision..
simmap.calc_simulations_evolution <- function(trees, plot=FALSE) {
    data <- matrix(NA, nrow=length(trees), ncol=2)
    tmean <- simmap.mean(trees[[1]])$emr
    data[1,] <- c(1, 0)
    for (i in 2:length(trees)) {
        emr <- colSums(trees[[i]]$mapped.edge)
        diff <- sum(abs(emr - tmean))
        data[i,] <- c(i, diff)
        tmean <- simmap.mean(trees[1:i])$emr
    }
    data <- data.frame(data)
    colnames(data) <- c('iteration', 'diff')

    if (isTRUE(plot)) {
        breaks <- data$iteration
        p <- ggplot(data, aes(x=iteration, y=diff)) +
                geom_line() +
                scale_x_continuous(breaks=breaks) +
                theme_bw(base_size=26) +
                xlab('Tree number') +
                ylab('Distance to mean') +
                theme(axis.title.y=element_text(vjust=1.8))
        print(p)
    }

    return(data)
}

# This function calculates the "mean tree", in other words, the mean value
# for dwelling times on states of a multiPhylo object.
sfreemap.reduce <- function(trees, type='mean') {
    reduced_trees <- list()

    start_idx <- 1
    end_idx <- 0
    cont <- 1

    keep_going <- function(t, start, end) {
        return (length(t) > end_idx && all.equal.phylo(t[[start]], t[[end]]))
    }

    while (TRUE) {

        while (keep_going(trees, start_idx, end_idx+1)) {
            end_idx <- end_idx + 1
        }

        range <- start_idx:end_idx
        current_set <- trees[range]
        base_tree <- current_set[[1]]

        if (length(range) > 1) {
            states <- colnames(base_tree$mapped.edge)
            mapped.edge <- lapply(current_set, function(x) x$mapped.edge)

            if (type == 'mean') {
                base_tree$mapped.edge <- Reduce('+', mapped.edge) / length(mapped.edge)
            } else if (type == 'median') {
                reduced <- Reduce(cbind, mapped.edge)
                for (state in states) {
                    tmp <- reduced[,colnames(reduced)==state]
                    base_tree$mapped.edge[,state] <- apply(tmp, 1, median)
                }
            } else {
                stop('unrecognized type, we only know mean and median')
            }
        }

        reduced_trees[[cont]] <- base_tree

        if (length(trees) == end_idx) {
            break
        } else {
            start_idx <- end_idx + 1
            cont <- cont + 1
        }
    }

    if (length(reduced_trees) > 1) {
        class(reduced_trees) <- 'multiPhylo'
    } else {
        reduced_trees <- reduced_trees[[1]]
        class(reduced_trees) <- 'phylo'
    }

    return(reduced_trees)
}

# print all arguments to a file
create_info_file <- function(out_dir, ...) {
	args <- list(...)
	out_file <- paste(out_dir, 'arguments.txt', sep='/')
	for (i in names(args)) {
		if (nchar(i) > 0) {
			txt <- paste(i, '=', args[[i]])
			write(txt, file=out_file, append=TRUE)
		}
	}
}

create_out_dir <- function(dest_dir, species, Q, model, outdir_suffix) {
	q_txt <- ifelse(is.matrix(Q), 'matrix', Q)
	out_dir <- paste(species, q_txt, model, outdir_suffix, sep='_')
	out_dir <- paste(dest_dir, out_dir, sep='/')
	dir.create(out_dir, showWarnings=FALSE)
	return (out_dir)
}

create_out_file <- function(out_dir, method, sim_num) {
	out_file <- paste(method, '_', sim_num, '.txt', sep='')
	out_file <- paste(out_dir, out_file, sep='/')
	return (out_file)
}

save_tree_file <- function(out_dir, tree) {
	out_tree_file <- paste(out_dir, '/tree.nexus', sep='')
	write.tree(tree, file=out_tree_file)
	return (out_tree_file)
}

write_to_file <- function(out_file, result, tree=NULL, out_dir=NULL
                            , hist=NULL) {

    if (!is.null(tree)) {
    	save_tree_file(out_dir, tree)
    }

    if (!is.null(hist)) {
    	txt <- paste('# sim.history ', 0, hist$lmt
                        , paste(hist$emr, collapse=' '))
    	write(txt, file=out_file)
    }

	txt <- paste('#',paste(colnames(result), collapse=','))
	write(txt, file=out_file, append=TRUE)

	write.table(result, file=out_file, row.names=FALSE, col.names=FALSE, append=TRUE, quote=FALSE)
}

create_result_matrix <- function(n) {

	# TODO: only works for two states, should work for any number
	metric_values <- c("n_trees", "n_species", "q_size", "time", "nsim",
                       "mode", "q", "omp", "cores")

	result <- matrix(0, nrow=n
			  , ncol=length(metric_values)
			  , dimnames=list(1:n, metric_values))

	return (result)
}

remove_outliers <- function(x, na.rm = TRUE, ...) {
  qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
  H <- 1.5 * IQR(x, na.rm = na.rm)
  y <- x
  y[x < (qnt[1] - H)] <- NA
  y[x > (qnt[2] + H)] <- NA
  y[!y %in% NA]
}

create_trees <- function(n_trees, n_species, q_size, unique=FALSE) {
    # Create Q
    QS <- matrix(1, nrow=q_size, ncol=q_size)
    diag(QS) <- -sum(QS[1,]) + 1
    rownames(QS) <- colnames(QS) <- 1:nrow(QS)

    # Create topologies
    if (isTRUE(unique)) {
        topologies <- pbtree(n=n_species, nsim=n_trees, scale=1)
        trees <- lapply(topologies, sim.history, Q=QS, message=FALSE)
    } else {
        topology <- pbtree(n=n_species, nsim=1, scale=1)
        trees <- sim.history(topology, Q=QS, message=FALSE)
        if (n_trees > 1) {
            trees <- rep(trees, n_trees)
        }
    }

    add_qs <- function(tree, Q) {
        tree[['Q']] <- Q
        return (tree)
    }

    if (n_trees > 1) {
        trees <- lapply(trees, add_qs, Q=QS)
        class(trees) <- 'multiPhylo'
    } else {
        trees[['Q']] <- QS
        class(trees) <- 'phylo'
    }

    return(trees)
}

# try passing tree$tip.label
remove_last_part_of_tip_name <- function(species) {

    remove <- function(specie) {
        return (paste(head(specie, 2), collapse="_"))
    }

    return (sapply(strsplit(species, '_'), remove))
}
dpasqualin/sfreemap.tests documentation built on May 13, 2017, 4:39 a.m.