R/formula2colnames.R

Defines functions formula2colnames node_attr2colnames formula2nodes checkSelectNodes checkSelectNodesList filterByFormula filterByList calcObservedStatistic calcPermutedStatistic

Documented in calcObservedStatistic calcPermutedStatistic checkSelectNodes checkSelectNodesList filterByFormula filterByList formula2colnames formula2nodes node_attr2colnames

##' functions that are called inside permutationPval
##' @name permutationPvalHelper
##' @rdname permutationPvalHelper
##' @export permutationPvalHelper
permutationPvalHelper = NULL
##' extract and match attribute column names to data.tables that contain nodes which those attributes describe
##' @name formula2colnames
##' @rdname permutationPvalHelper
##' @param inheritParams permutationPval
##  node_attr interactions2permute associations2test select_nodes also_permuteYZ
##' @param cols list of column names
##' @param nodes list of node names (column names for columns that contain node names)
##' @param nodes_call list of node names, each class call (to be evaluated within data.table expressions DT[,eval(nodes_call$nodeX)])
##' @param data_list list that contains data.tables necessary for these functions to work
##' @param by_cols modified left-hand side of \code{statistic} (\code{\link[MItools]{permutationPval}}), class call
##' @param exprs right-hand side of \code{statistic} (\code{\link[MItools]{permutationPval}}), class call, \code{exprs} is evaluated by \code{by_cols}; in data.table synthax: DT[, statistic := eval(exprs), by = .(eval(by_cols))]
##' @param includeAssociations logical, if calculating statistic requires columns that contain attribute of X-Z pair associations table will be merged
##' @details \code{formula2colnames} extracts nodes, extracts attributes, attaches attributes to a table if all nodes are present in that table and in case of associations table both varibles in a formula should be nodeX and nodeZ
##' @return \code{formula2colnames} returns list of column names for XY, YZ and association tables
##' @import data.table
##' @author Vitalii Kleshchevnikov
##' @examples \dontrun{cols = formula2colnames(node_attr, cols, nodes)}
formula2colnames = function(node_attr, cols, nodes){
  # if not formula is provided
  if(!rlang::is_formula(node_attr)) stop(paste("node_attr is provided but is not a formula: class - ", class(node_attr), "; content - ",paste(paste0("[[",1:length(node_attr),"]]"), node_attr, collapse = " ")))

  # extract nodes
  node_temp = all.vars(node_attr[[2]])
  if(mean(node_temp %in% unlist(nodes)) != 1) stop(paste0("node_attr is supplied for node: ",node_temp," , however, one or more of these nodes are not supplied in interactions2permute or associations2test"))
  # extract attributes
  attribute_temp = all.vars(node_attr[[3]])
  # attach attributes to a table if all nodes are present in that table
  if(mean(node_temp %in% c(nodes$nodeX, nodes$nodeY)) == 1) cols$interactionsXY_cols = c(cols$interactionsXY_cols, attribute_temp)
  if(mean(node_temp %in% c(nodes$nodeY, nodes$nodeZ)) == 1) cols$interactionsYZ_cols = c(cols$interactionsYZ_cols, attribute_temp)
  # and in case of associations table both varibles in a formula should be nodeX and nodeZ
  if(mean(c(nodes$nodeX, nodes$nodeZ) %in% node_temp) == 1) cols$associations_cols = c(cols$associations_cols, attribute_temp)
  return(cols)
}

##' @name node_attr2colnames
##' @rdname permutationPvalHelper
##' @return \code{node_attr2colnames} returns the same as \code{formula2colnames} but when node_attr2colnames is a list of formulas
##' @examples \dontrun{cols = node_attr2colnames(node_attr, cols, nodes)}
node_attr2colnames = function(node_attr, cols, nodes) {

  if(is.list(node_attr)){
    # how many formulas in a list?
    N_attr = length(node_attr)
    list_names = character(N_attr)
    for(i in 1:N_attr){ # for each formula extract elements
      form_temp = node_attr[[i]]
      list_names[i] = as.character(as.expression(form_temp[[2]]))
      cols = formula2colnames(node_attr = form_temp, cols, nodes)
    }
    # give names to the elements of the list
    names(node_attr) = list_names
  } else if(rlang::is_formula(node_attr)){ # extract element from single formula
    cols = formula2colnames(node_attr = node_attr, cols, nodes)
  } else if(is.null(node_attr)) NULL else
    stop("node_attr is provided but is neither a list nor a formula")

  return(cols)
}

##' @name formula2nodes
##' @rdname permutationPvalHelper
##' @details \code{formula2nodes} reorders node column names so that regardless of their order in formula we later permute XY interactions and test for XZ associations
##' @return \code{formula2nodes} returns list of 3 node column names extracted from formulas supplied through \code{interactions2permute}, \code{associations2test}
##' @examples \dontrun{nodes = formula2nodes(interactions2permute, associations2test)}
formula2nodes = function(interactions2permute, associations2test){
  # if not formulas are provided
  if(!rlang::is_formula(interactions2permute)) stop("interactions2permute is provided but is not a formula")
  if(!rlang::is_formula(associations2test)) stop("associations2test is provided but is not a formula")

  # extract nodes to permute from formula ([[2]] extracts the first element of the formula, [[3]] extracts the second)
  nodeX = all.vars(interactions2permute[[2]])
  nodeY = all.vars(interactions2permute[[3]])
  # extract nodes for which to test association from formula
  nodeXtest = all.vars(associations2test[[2]])
  nodeYtest = all.vars(associations2test[[3]])
  # generate standard set of variable names: find the name of the thing that is asked to be associated with nodeX
  if(nodeX == nodeXtest) nodeZ = nodeYtest
  if(nodeX == nodeYtest) nodeZ = nodeXtest
  # generate standard set of variable names: find the name of the thing that is asked to be associated with nodeY
  if(nodeY == nodeXtest) nodeZ = nodeYtest
  if(nodeY == nodeYtest) nodeZ = nodeXtest
  # if formula suggest to find association between nodeY and some other column - reorder
  if(nodeY == nodeXtest | nodeY == nodeYtest) {
    nodeX_temp = nodeY
    nodeY = nodeX
    nodeX = nodeX_temp
  }
  # check if all 3 are different
  if(nodeX == nodeY | nodeX == nodeZ | nodeY == nodeZ) stop(paste0("formulas interactions2permute and associations2test should provide different input, such as: \n interactions2permute = nodeX ~ nodeY, associations2test = nodeX ~ nodeZ"))

  nodes = list(nodeX = nodeX, nodeY = nodeY, nodeZ = nodeZ)
  return(nodes)
}

##' @name checkSelectNodes
##' @rdname permutationPvalHelper
##' @details \code{checkSelectNodes} checks if you want to select nodes based on the attributes of those nodes + splits a formula which selects a combination of nodes based on their shared properties (like count of X and Z pairs) into a list of formulas each selecting separate nodes
##' @return \code{checkSelectNodes} returns a formula or a list of formulas if they have passed the check - error if otherwise
##' @examples \dontrun{select_nodes = checkSelectNodes(select_nodes, node_attr, nodes, cols)}
checkSelectNodes = function(select_nodes, node_attr, nodes, cols){

  if(!(rlang::is_formula(select_nodes))) stop("select_nodes is not formula")

  select_node = all.vars(select_nodes[[2]])
  select_attribute = all.vars(select_nodes[[3]])
  all_nodes = unlist(nodes)
  all_cols = unique(unlist(cols))

  if(length(select_node) >= 3) stop("you cannot select 3 or more nodes based on their shared attribute, try filtering the data before using permutationPval")
  if(mean(select_node %in% all_nodes) != 1) stop(paste0("select_nodes is supplied for node(s): ",paste0(select_node, collapse = " and ")," , however, one or more of these nodes are not supplied in interactions2permute or associations2test"))
  if(mean(select_attribute %in% all_cols) != 1) stop(paste0("select_nodes is supplied with attribute(s): ",paste0(select_attribute, collapse = " and ")," , however, one or more of these attributes are not supplied in node_attr"))

  if(is.null(node_attr)){
    if(!((mean(select_node %in% all_nodes) == 1) &
         (mean(select_attribute %in% all_nodes) == 1))) stop(paste0("select_nodes is supplied for node(s): ",paste0(select_node, collapse = " and ")," , however, one or more of these nodes are not supplied in interactions2permute or associations2test"))
  } else {
    if(!(rlang::is_formula(node_attr))) stop("node_attr is not formula")
    node = all.vars(node_attr[[2]])
    node_attribute = all.vars(node_attr[[3]])

    # if select_nodes select nodes by attribute in node_attr
    if((mean(select_node %in% node) == 1) & (mean(node %in% select_node) == 1)){
      if(mean(select_attribute %in% c(all_nodes, node_attribute)) != 1) stop(paste0("select_nodes asks to select ",paste0(select_node, collapse = " and ")," by attribute (",paste0(select_attribute, collapse = " "),") not specified in node_attr as the attribute of ",paste0(select_node, collapse = " and ")))
    }

  }

  if(length(select_node) == 1) return(select_nodes)
  if(length(select_node) == 2) {
    select_node_list = list()
    select_node_list[[1]] = as.formula(paste0(select_node[1],"~", as.expression(select_nodes[[3]])))
    select_node_list[[2]] = as.formula(paste0(select_node[2],"~", as.expression(select_nodes[[3]])))
    return(select_node_list)
  }
}

##' @name checkSelectNodesList
##' @rdname permutationPvalHelper
##' @details \code{checkSelectNodesList} calls \code{checkSelectNodes} for each element in a list (or formula) \code{select_nodes} and each element in a list (or formula) \code{node_attr}. If \code{select_nodes} or \code{node_attr} is a formula then it is converted to a list of 1L
##' @return \code{checkSelectNodesList} returns a list of formulas (even of lenght 1) if they have passed the check - error if otherwise
##' @examples \dontrun{select_nodes = checkSelectNodesList(select_nodes, node_attr, nodes, cols)}
checkSelectNodesList = function(select_nodes, node_attr, nodes, cols){
  if(rlang::is_formula(select_nodes)) select_nodes = list(select_nodes)
  select_nodes_new = list()
  if(rlang::is_formula(node_attr)) node_attr = list(node_attr)
  for (select_formula in select_nodes) {
    for (node_attr_formula in node_attr) {
      select_nodes_temp = checkSelectNodes(select_formula, node_attr_formula, nodes, cols)
      select_nodes_new = c(select_nodes_new, select_nodes_temp)
    }
  }
  select_nodes = unique(select_nodes_new)
}

##' @name filterByFormula
##' @rdname permutationPvalHelper
##' @details \code{filterByFormula} filters one or 2 of the XY, YZ or association tables based on a condition defining which nodes to keep (only one node type: X, Y or Z)
##' @return \code{filterByFormula} returns data_list in which interactionsXY, interactionsYZ, associations data.table-s were filtered by formula
##' @examples \dontrun{data_list = filterByFormula(data_list, select_nodes, cols, nodes)}
filterByFormula = function(data_list, select_nodes, cols, nodes){
  # if not formula is provided
  if(!rlang::is_formula(select_nodes)) stop("select_nodes is provided but is not a formula")

  # extract node column
  node_temp = all.vars(select_nodes[[2]])
  # extract node column (class name)
  node_ftemp = select_nodes[[2]]
  # extract condition (class call)
  condition_ftemp = select_nodes[[3]]
  # extract condition attribute names
  attribute_temp = all.vars(select_nodes[[3]])
  if(length(node_temp) > 1) stop("selecting multiple nodes by single condition is not implemented yet, however, you can cheat by supplying the same condition twice for two different node names")

  # find which nodes to keep
  # finding which data.table contains attribute necessary for filtering AND has node type that we want to filter ->
  # -> then creates a vector of nodeIDs of specific node type by evaluating the expression on the right hand side of the formula
  if(mean(attribute_temp %in% cols$interactionsXY_cols) == 1 & mean(node_temp %in% c(nodes$nodeX, nodes$nodeY)) == 1) node2keep = data_list$interactionsXY[eval(condition_ftemp), eval(node_ftemp)]
  if(mean(attribute_temp %in% cols$interactionsYZ_cols) == 1 & mean(node_temp %in% c(nodes$nodeY, nodes$nodeZ)) == 1) node2keep = data_list$interactionsYZ[eval(condition_ftemp), eval(node_ftemp)]
  if(mean(attribute_temp %in% cols$associations_cols) == 1 & mean(node_temp %in% c(nodes$nodeX, nodes$nodeZ)) == 1) node2keep = data_list$associations[eval(condition_ftemp), eval(node_ftemp)]

  # filter data.table using condition if the node (nodes) are present in that table
  if(mean(node_temp %in% c(nodes$nodeX, nodes$nodeY)) == 1) data_list$interactionsXY = data_list$interactionsXY[eval(node_ftemp) %in% node2keep,]
  if(mean(node_temp %in% c(nodes$nodeY, nodes$nodeZ)) == 1) data_list$interactionsYZ = data_list$interactionsYZ[eval(node_ftemp) %in% node2keep,]
  if(mean(node_temp %in% c(nodes$nodeX, nodes$nodeZ)) == 1) data_list$associations = data_list$associations[eval(node_ftemp) %in% node2keep,]
  return(data_list)
}

##' @name filterByList
##' @rdname permutationPvalHelper
##' @details \code{filterByList} filters one or 2 of the XY, YZ or association tables based on a condition defining which nodes to keep (only one node type: X, Y or Z), takes a list of formulas or a formula as an argument
##' @return \code{filterByList} returns data_list in which interactionsXY, interactionsYZ, associations data.table-s were filtered by each formula if select_nodes was a list
##' @examples \dontrun{data_list = filterByList(data_list, select_nodes, cols, nodes)}
filterByList = function(data_list, select_nodes, cols, nodes){
  # extract nodes to filter and apply conditions
  if(is.list(select_nodes)){
    # how many formulas in a list?
    N_attr = length(select_nodes)
    list_names = character(N_attr)
    for(i in 1:N_attr){ # for each formula extract elements
      form_temp = select_nodes[[i]]
      list_names[i] = as.character(as.expression(form_temp[[2]]))

      data_list = filterByFormula(data_list, form_temp, cols, nodes)
    }
    # give list elements names
    names(select_nodes) = list_names
  } else if(rlang::is_formula(select_nodes)){ # extract element from single formula
    data_list = filterByFormula(data_list, select_nodes, cols, nodes)
  } else if(is.null(select_nodes)) NULL else stop("select_nodes is provided but is neither a list nor a formula")
  return(data_list)
}

##' @name calcObservedStatistic
##' @rdname permutationPvalHelper
##' @details \code{calcObservedStatistic} calculates Observed Statistic by evaluating \code{exprs} by \code{by_cols}, also calculates how many nodes Y are missing node Z for each node X, saves result to \code{data_list$observed}. includeAssociations specifies if \code{exprs} required any attributes of both X and Z.
##' @return \code{calcObservedStatistic} returns \code{data_list}
##' @examples \dontrun{data_list = calcObservedStatistic(data_list, by_cols, exprs, nodes, nodes_call, includeAssociations) }
calcObservedStatistic = function(data_list, by_cols, exprs, nodes, nodes_call, includeAssociations){

  # merge without removing X and Y that don't have a match in Z, interactionsXY on the inside of "[" (in i position) means keep all interactionsXY, discard non-matching interactionsYZ
  data_list$observed = unique(data_list$interactionsYZ[data_list$interactionsXY, on = nodes$nodeY, allow.cartesian = T])
  # if calculating statistic requires some parameters of both X and Z -> merge associations table containing necessary data
  if(includeAssociations) data_list$observed = unique(data_list$associations[data_list$observed, on = c(nodes$nodeX, nodes$nodeZ), allow.cartesian = T])

  # record in how many cases statistic is missing per X
  data_list$observed[, YmissingZ_perX := sum(is.na(eval(nodes_call$nodeZ))),
                     by = eval(nodes_call$nodeX)]
  #NAs are removed otherwise statistic cannot be calculated
  data_list$observed = data_list$observed[!is.na(eval(nodes_call$nodeZ)),]

  # calculate observed statistic
  data_list$observed[, observed_statistic := eval(exprs), by = eval(by_cols)]

  return(data_list)
}

##' @name calcPermutedStatistic
##' @rdname permutationPvalHelper
##' @details \code{calcPermutedStatistic} calculates Permuted Statistic by permuting node Y column in XY (and, optionally in YZ, if \code{also_permuteYZ} is true) and calculating statistic by evaluating \code{exprs} by \code{by_cols}, unlike \code{calcObservedStatistic} doesn't calculate how many nodes Y are missing node Z for each node X. Saves result to \code{data_list$permuted}.
##' @return \code{calcPermutedStatistic} returns \code{data_list}
##' @examples \dontrun{data_list = calcPermutedStatistic(data_list, by_cols, exprs, nodes, nodes_call, includeAssociations, also_permuteYZ)}
calcPermutedStatistic = function(data_list, by_cols, exprs, nodes, nodes_call, includeAssociations, also_permuteYZ){

  setkeyv(data_list$interactionsYZ, cols = nodes$nodeY)

  # permute XY network (note: [, class character (1L) := eval(class call)])
  data_list$permuted_interactionsXY[, nodes$nodeY := sample(eval(nodes_call$nodeY))]
  setkeyv(data_list$permuted_interactionsXY, cols = nodes$nodeY)
  # if also_permuteYZ is TRUE permute YZ network
  if(also_permuteYZ) {
    data_list$permuted_interactionsYZ[, nodes$nodeY := sample(eval(nodes_call$nodeY))]
    setkeyv(data_list$permuted_interactionsYZ, cols = nodes$nodeY)
  }

  # merge without removing X that don't have a match in Z, interactionsXY on the inside of "[" (in i position) means keep all interactionsXY, discard non-matching interactionsYZ
  # if also_permuteYZ is FALSE merge permuted XY to observed YZ
  if(!also_permuteYZ) data_list$permuted = unique(data_list$permuted_interactionsXY[data_list$interactionsYZ, on = nodes$nodeY, allow.cartesian = T, nomatch = 0])
  # if also_permuteYZ is FALSE merge permuted XY to permuted YZ
  if(also_permuteYZ) data_list$permuted = unique(data_list$permuted_interactionsXY[data_list$permuted_interactionsYZ, on = nodes$nodeY, allow.cartesian = T, nomatch = 0])
  # if calculating statistic requires some parameters of both X and Z -> merge associations table containing necessary data
  if(includeAssociations) {
    setkeyv(data_list$associations, cols = c(nodes$nodeX, nodes$nodeZ))
    setkeyv(data_list$permuted, cols = c(nodes$nodeX, nodes$nodeZ))
    data_list$permuted = unique(data_list$associations[data_list$permuted, on = c(nodes$nodeX, nodes$nodeZ), allow.cartesian = T, nomatch = 0])
  }

  # record in how many cases statistic is missing per X: we don't care in permuted cases, so I don't track this
  # data_list$permuted[, YmissingZ_perX := sum(is.na(eval(nodes_call$nodeZ))),
  #                    by = eval(nodes_call$nodeX)]
  #NAs are removed otherwise statistic cannot be calculated
  data_list$permuted = data_list$permuted[!is.na(eval(nodes_call$nodeZ)),]

  # calculate permuted statistic
  data_list$permuted[, permuted_statistic := eval(exprs), by = eval(by_cols)]

  # since this statistic is being calculated to generate background distribution for nodeX - we keep only the node X and the statistic

  # collapse (unique(), to remove X-Z pairs duplicated because of many Y) counts by nodeX AND nodeZ, then remove nodeZ: otherwise identical values of counts will get collapsed into one (for example, when two nodeZ have the same frequency among interactors of one nodeX)
  net_notNA = unique(data_list$permuted[!is.na(eval(nodes_call$nodeZ)), .(eval(nodes_call$nodeX), eval(nodes_call$nodeZ), permuted_statistic)])
  setnames(net_notNA, colnames(net_notNA), c(nodes$nodeX, nodes$nodeZ,"permuted_statistic"))
  data_list$permuted = net_notNA[, nodes$nodeZ := NULL]

  return(data_list)
}

##' @name observedVSpermuted
##' @rdname permutationPvalHelper
##' @details \code{observedVSpermuted} compares Observed Statistic to Permuted Statistic, records how many times permuted statistic is at least as large as the observed statistic (\code{higher_counts}) and how many of X-Z pairs have non-NA Observed Statistic or Permuted Statistic (\code{not_missing}). In each permutation round, node X might have many nodes Z, so when calculating empirical p-value all these comparisons should be counted (not just the number of permutations). On the other hand, not in all permutation rounds a specific observed XZ pair might occur, so we need to cound non-NA "Observed Statistic to Permuted Statistic" comparisons
##' @return \code{observedVSpermuted} returns data.table contatining the following columns: "higher_counts", "not_missing", nodeX names, nodeZ names
##' @examples \dontrun{data_list_temp = MItools:::observedVSpermuted(data_list, nodes_call, nodes)}
observedVSpermuted = function(data_list, nodes_call, nodes){
  # merge keeping all observed in i and discarding all permuted in x
  setkeyv(data_list$permuted, cols = nodes$nodeX)
  setkeyv(data_list$observed, cols = nodes$nodeX)
  result = data_list$permuted[data_list$observed, on = nodes$nodeX, allow.cartesian=TRUE]
  # calculate in how many cases permuted statistic is at least as large as observed statistic
  result_pval = result[, sum(observed_statistic <= permuted_statistic, na.rm = T), by = .(eval(nodes_call$nodeX), eval(nodes_call$nodeZ))]
  setnames(result_pval, colnames(result_pval), c(nodes$nodeX, nodes$nodeZ, "higher_counts"))
  setorderv(result_pval, cols = c(nodes$nodeX, nodes$nodeZ, "higher_counts"))

  # calculate in how many cases there is a match between X and Z => count of (permutations * number of comparison in each permutation) per each X-Z pair
  result_not_missing = result[, sum(!(is.na(observed_statistic) | is.na(permuted_statistic))), by = .(eval(nodes_call$nodeX), eval(nodes_call$nodeZ))]
  setnames(result_not_missing, colnames(result_not_missing), c(nodes$nodeX, nodes$nodeZ, "not_missing"))
  setorderv(result_not_missing, cols = c(nodes$nodeX, nodes$nodeZ, "not_missing"))

  #merge these higher_counts and not_missing
  temp = result_pval[result_not_missing, on = c(nodes$nodeX, nodes$nodeZ)][!is.na(eval(nodes_call$nodeZ)),]

  return(temp[, c("higher_counts", "not_missing", nodes$nodeX, nodes$nodeZ), with = F])
}

##' @name aggregatePermutations
##' @rdname permutationPvalHelper
##' @details \code{aggregatePermutations} aggregates the result of \code{observedVSpermuted} from many permutation rounds (output of replicate, class matrix)
##' @return \code{aggregatePermutations} returns the data.table of the same format as \code{observedVSpermuted}
##' @examples \dontrun{temp2_inner = MItools:::aggregatePermutations(temp_inner, nodes, nodes_call)}
aggregatePermutations = function(temp, nodes, nodes_call){
  # aggregate attributes across permutations

  if(class(temp) == "list" & class(temp[[1]])[1] == "data.table")
  { # list of data.tables
    temp2 = data.table(higher_counts = 0, not_missing = 0, temp[[1]][, nodes$nodeX, with = F], temp[[1]][, nodes$nodeZ, with = F])
    #setnames(temp2, colnames(temp2), c("higher_counts", "not_missing", nodes$nodeX, nodes$nodeZ))
    for (j in 1:length(temp)) {
      set(temp2, j = "higher_counts", value = temp2[,higher_counts] + temp[[j]][,higher_counts])
      set(temp2, j = "not_missing", value = temp2[,not_missing] + temp[[j]][,not_missing])
    }
  } else

  { # matrix of vectors (each cell contains a vector)
    temp2 = data.table(higher_counts = 0, not_missing = 0, temp[nodes$nodeX,1][[1]], temp[nodes$nodeZ,1][[1]])
    setnames(temp2, colnames(temp2), c("higher_counts", "not_missing", nodes$nodeX, nodes$nodeZ))
    for(i in 1:ncol(temp)){
      if(!(all.equal(temp2[, eval(nodes_call$nodeX)], temp[nodes$nodeX, i][[1]]) &
           all.equal(temp2[, eval(nodes_call$nodeZ)], temp[nodes$nodeZ, i][[1]]))) stop("X and/or Z ids mismatch, you have stumbled upon a bug in permutationPval code") else{
             temp2[, higher_counts := higher_counts + temp["higher_counts", i][[1]]]
             temp2[, not_missing := not_missing + temp["not_missing",i][[1]]]
           }
    }
  }
  return(temp2)
}
vitkl/NetFeaturePval documentation built on May 19, 2019, 9:39 p.m.