#######################################################################
# functions to organize data around the tree structure
#######################################################################
#' Organize the data around the rooted binary weighted tree
#'
#' NB: currently this minimally built; need some checking functions
#'
#' @param Y `N` by `J` binary data matrix; rows for subjects; columns
#' for binary measurements/features
#' @param leaf_ids Character string for leaf nodes for each observation
#' @param mytree a tree (an `igraph` object) that contains the node,
#' edge, edge-length ("weight") information.
#' @param weighted_edge logical: `TRUE` for using the branch lengths then
#' the `mytree` must have this info; if `FALSE`, every edge, including
#' an imaginary edge leading to the root node, is set to have length `1`.
#' @param Z_obs Default is `NULL`; A two-column matrix of (id, class indicator);
#' the number of rows equals the number of observations;
#' an entry of 2nd column is `NA` if the subject
#' in that row has an unknown class indicator.
#' Importantly, the rows will be reordered according to the reordered `Y`.
#'
#' @return A list of data and tree information for model fitting
#' \itemize{
#' \item `Y` A matrix of `N` by `J`; binary measurements with rows ordered by
#' leaf groups (`leaf_ids`).
#' \item `A` A matrix of `p` by `p`; each column contains some `1`s, indicating
#' the node in that column is an ancestor of the node represented in the row.
#' Ancestor of a node include that node itself.
#' \item `A_leaves` A matrix of `pL` by `p`; A submatrix of `A` that represents
#' the ancestor matrix but only for leaves
#' \item `leaf_ids` A vector of `N`integers; ordered by the leaves as
#' specified by `mytree`.
#' \item `leaf_ids_units` A list of length `pL`, each element
#' is a vector of subject ids belonging to each leaf node
#' \item `leaf_ids_nodes` a list of length `p`, each element
#' is a vector of integers (between `1` and `pL`; id
#' is only for leaf nodes) indicating the leaf nodes.
#' \item `ancestors` a list of length `pL`,
#' each element is the vector of ancestors (between `1` and `p`; id is among
#' all nodes)
#' \item `edge_lengths` a list of length `pL`,
#' each element is a numeric vector of edge lengths from the root node
#' to the leaf. It is computed based on `E(mytree)$weight`. It is `NULL`
#' if `E(mytree)$weight` is `NULL`
#' \item `h_pau` a numeric vector of length `p`; each value is
#' the edge length from u to its parent (if u is a root node, then the value is 1).
#' This vector by default is all `1`s. If `weighted_edge=TRUE`, `h_pau`
#' is set to `E(mytree)$weight`, the input edge weights.
#' \item `v_units` (redundant - identical to `leaf_ids`)
#' a vector of length equal to the total number of rows in `Y`;
#' each element is an integer between `1` and `pL`,
#' indicating which leaf does the observation belong to.
#' \item `subject_id_list` a list of length `p`; each element is a vector of
#' subject ids
#' that are in the leaf descendants of node `u` (internal or leaf node)
#' \item `ord` the permutation to order the original rows to produce the final
#' ordering of the rows of `Y`.
#' }
#' @export
#' @import igraph
#'
design_tree <- function(Y,leaf_ids,mytree,weighted_edge=FALSE,Z_obs = NULL){
# by default, not weighted tree.
# warning("using hard-coded info\n")
# Y <- dat_mge[!is.na(match_ind),ind_EL] # these are not ordered yet.
# leaf_ids <- dat_mge[!is.na(match_ind),"ct_MLST"]
# mytree <- thetree_igraph
# E(mytree)$weight <- thetree$edge.length
# rootnode="Node1"
# weighted_edge <- FALSE
if (!is.character(leaf_ids)) stop("[lotR] `leaf_ids` is not a character object.")
if (!igraph::is.igraph(mytree)) stop("[lotR] 'mytree' is not a graph object.")
if (!igraph::is.directed(mytree)) stop("[lotR] 'mytree' is not directed.")
cat("\n\n [lotR] ", c("unweighted", "weighted")[weighted_edge+1], "tree...\n\n")
# extract nodes:
nodes <- names(igraph::V(mytree))
leaves <- names(igraph::V(mytree)[igraph::degree(mytree, mode = "out") == 0])
rootnode <- names(igraph::V(mytree)[igraph::degree(mytree, mode = "in") == 0])
if(!setequal(unique(leaf_ids), leaves))
{stop("[lotR] Not all `leaf_ids` are leaves of tree")}
nodes0 <- nodes
# Re-order nodes to have internal nodes first, then leaves: <------- ordered nodes.
nodes <- c(nodes[!(nodes %in% leaves)], leaves)
# Get levels for specifying groups of hyperparameters
if (is.null(igraph::V(mytree)$levels)) {
# The default is to have two levels: one for leaf nodes, one for internal nodes
levels <- rep(1, length(nodes))
levels[nodes %in% leaves] <- 2
} else {
# Otherwise, use the levels supplied; can be a single level too:
levels <- igraph::V(mytree)[nodes]$levels # <-- ordered.
levels <- levels[match(names(igraph::V(mytree)), nodes)] # <--- ordered. same as above???
}
if (sum(table(levels) < 5) > 0) {
warning("[lotR] Some levels contain fewer than five nodes: this may lead to problems
with estimating variance parameters. Recommend increasing the number
of nodes per level.")
}
# sizes:
p <- length(nodes)
pL <- length(leaves)
n <- nrow(Y)
# ancestor matrix:
A <- igraph::as_adjacency_matrix(mytree, sparse = TRUE) # directed graph; upper diag mat
A <- A[nodes, nodes] # re-order rows/columns to mirror nodes
A <- Matrix::expm(Matrix::t(A)) # get ancestors: lower diag mat (1: column j is anc for the row)
A[A > 0 ] <- 1
A <- Matrix::Matrix(A, sparse = TRUE)
# 16 x 16 sparse Matrix of class "dtCMatrix"
# [[ suppressing 16 column names ‘7.4’, ‘7.4.1’, ‘7.4.2’ ... ]]
#
# 7.4 1 . . . . . . . . . . . . . . .
# 7.4.1 1 1 . . . . . . . . . . . . . .
# 7.4.2 1 . 1 . . . . . . . . . . . . .
# 7.4.3 1 . . 1 . . . . . . . . . . . .
# 7.4.4 1 . . . 1 . . . . . . . . . . .
# 7.4.1.1 1 1 . . . 1 . . . . . . . . . .
# 7.4.1.2 1 1 . . . . 1 . . . . . . . . .
# 7.4.1.3 1 1 . . . . . 1 . . . . . . . .
# 7.4.2.0 1 . 1 . . . . . 1 . . . . . . .
# 7.4.2.1 1 . 1 . . . . . . 1 . . . . . .
# 7.4.2.2 1 . 1 . . . . . . . 1 . . . . .
# 7.4.3.0 1 . . 1 . . . . . . . 1 . . . .
# 7.4.3.1 1 . . 1 . . . . . . . . 1 . . .
# 7.4.3.2 1 . . 1 . . . . . . . . . 1 . .
# 7.4.4.1 1 . . . 1 . . . . . . . . . 1 .
# 7.4.4.2 1 . . . 1 . . . . . . . . . . 1
# Sort by leaf_ids, where order is specified by ordering in 'mytree':
ord <- order(ordered(leaf_ids, levels = leaves)) # <--- leaves.
Y <- Y[ord,,drop=FALSE]
leaf_ids <- leaf_ids[ord] # length = n.
if (!is.null(Z_obs)){
Z_obs <- Z_obs[ord,,drop=FALSE]
}
# get lists of ancestors for each leaf_ids:
d <- igraph::diameter(mytree,weights=NA)
# need to set weight=NA to prevent the use of edge lengths in determining the diameter.
ancestors <- igraph::ego(mytree, order = d + 1, nodes = leaves, mode = "in")
ancestors <- sapply(ancestors, names, simplify = F)
ancestors <- sapply(ancestors, function(a, nodes) which(nodes %in% a),
nodes = nodes, simplify = F)
names(ancestors) <- leaves
# get lists of which units correspond to each leaf_ids
leaf_ids_units <- sapply(leaves, function(v) which(leaf_ids == v), simplify = FALSE)
names(leaf_ids_units) <- leaves
# get lists of leaf_ids are descendants of each node
descendants <- igraph::ego(mytree, order = d + 1, nodes = nodes, mode = "out")
descendants <- sapply(descendants, names)
# leaf.descendants:
leaf_ids_nodes <- sapply(descendants, function(d, leaves) which(leaves %in% d),
leaves = leaves,simplify = FALSE)
# Change leaf_ids to integers
leaf_ids <- sapply(leaf_ids, function(o) which(leaves == o))
# The branch lengths (NB: need to check this is correct.)
edge_lengths <- vector("list",length=pL)
names(edge_lengths) <- names(ancestors)
for (j in 1:pL){
res <- shortest_paths(mytree, # here we are using the original mytree, not "nodes" which is reordered.
which(vertex_attr(mytree)$name==rootnode), # root.
which(vertex_attr(mytree)$name==leaves[j]), # leaf.
output="both")
edge_lengths[[j]] <- E(mytree)$weight[sapply(res$epath[[1]],
function(e) which(E(mytree) == e))]
}
parent_nm <- c(rootnode,
names(unlist(lapply(igraph::ego(mytree, order = 1,
nodes = nodes, mode = "in")[-1],"[",-1))))
h_pau <- rep(1,p) # equally weighted with default lengths 1.
if (weighted_edge){
for (u in 2:p){
res <- shortest_paths(mytree, # here we are using the original mytree, not "nodes" which is reordered.
which(vertex_attr(mytree)$name==parent_nm[u]), #parent
which(vertex_attr(mytree)$name==nodes[u]), # leaf.
output="both")
h_pau[u] <- E(mytree)$weight[sapply(res$epath[[1]],
function(e) which(E(mytree) == e))]
}
h_pau <- pmax(h_pau,1e-8) # just to make the weights non-zero. <-----------------------------------------IS THIS A MUST?
}
#h_pau[1] <- mean(h_pau[-1])
A_leaves <- A[nodes%in%leaves,]
v_units <- NULL
for (v in 1:pL){
leaf_list_tmp <- leaf_ids_units[v]
units <- unlist(leaf_list_tmp)
v_units <- c(v_units,unlist(mapply(rep,v,unlist(lapply(leaf_list_tmp,length)))))
}
# leaf_ids_units: pL length, each is the ids of individuals.
# leaf_ids_nodes: p length, each is the indicator (between 1 and pL) the leaves.
subject_id_list <- list() # for each internal or leaf node
for (u in 1:p){
leaf_desc <- leaf_ids_nodes[[u]]
subject_id_list[[u]] <- unlist(leaf_ids_units[leaf_desc])
# if (u==120){
# for (v in leaf_desc){
# print(leaves[v])
# print(c(unlist(leaf_ids_units[v])))
# }
# }
}
# print(subject_id_list)
mytree <- permute(mytree,match(nodes,nodes0))
make_list(Y,A,A_leaves,rootnode,
leaf_ids,leaf_ids_units,leaf_ids_nodes,
ancestors,edge_lengths,h_pau, # h_pau is used in fitting, so it can be that h_pau are 1s but the edge lengths are weighted.
levels,v_units,subject_id_list,Z_obs,ord,mytree)
#NB: by now, the node ids have already been reordered; it is only that the tree
# itself has not ordered the ids in the tree.
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.