R/semantic_enrich.R

Defines functions edge_tbl_to_graph prod_catchNAs min_catchNAs max_catchNAs mean_catchNAs sum_catchNAs normalize metavariable_agg metavariable_variable_descendants metavariable_info join_vars_to_ontol node_IC_zhou node_info_content validate_mapping_tbl validate_ontol_nw

Documented in edge_tbl_to_graph join_vars_to_ontol max_catchNAs mean_catchNAs metavariable_agg metavariable_info metavariable_variable_descendants min_catchNAs node_IC_zhou normalize prod_catchNAs sum_catchNAs validate_mapping_tbl validate_ontol_nw

#    Copyright (C) 2022 Queens University Belfast
#    
#    This file is part of 'eHDPrep'
#
#    'eHDPrep' is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    'eHDPrep' is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with 'eHDPrep'. If not, see <http://www.gnu.org/licenses/>.

# needed for "where" function
utils::globalVariables("where")

#' Validate ontology network for semantic enrichment
#' 
#' Performs tests on a graph object in preparation for semantic enrichment.
#' 
#' The tests are:
#' \enumerate{
#'    \item Is graph coercible to \code{\link[tidygraph:tidygraph]{tidygraph}} format?
#'    \item Is graph directed?
#'    \item Does graph contains one component (is one ontology)?
#' }
#' 
#' @param graph graph object to validate.
#'
#' @return input graph or validation errors
validate_ontol_nw <- function(graph) {
  if(!tidygraph::with_graph(tidygraph::as_tbl_graph(graph),
                           tidygraph::graph_is_directed())) {
    stop("`graph` must be directed graph.", call. = FALSE)
  } else if (!tidygraph::with_graph(tidygraph::as_tbl_graph(graph),
                                     tidygraph::graph_component_count()) == 1) {
    stop("`graph` must have one component.", call. = FALSE)
  }
  
  tidygraph::as_tbl_graph(graph)
}

#' Validate mapping table for semantic enrichment
#'
#' Applies tests to a mapping table to ensure it is valid for use with
#' the data frame and ontological graph, in preparation for semantic enrichment.
#'
#' @param mapping_tbl data frame. Contains two columns. First column contains
#'   variable names of a primary dataset. Second column contains entities in
#'   an ontological graph to which the primary dataset's variable names are mapped.
#' @param data data frame. Primary dataset which contains variable names
#'   referred to in first column of the mapping table
#' @param ontol_graph ontological graph which contains entity names/IDs referred
#'   to in second column of the mapping table
#' @importFrom dplyr mutate filter pull row_number
#' @importFrom rlang .data
#' @importFrom magrittr %>%
#' @return Any warnings and the mapping table returned invisibly
validate_mapping_tbl <- function(mapping_tbl, data, ontol_graph) {
  mapping_tbl %>%
    dplyr::mutate(matching_vals = .[[1]] == .[[2]],
                  row = dplyr::row_number()) %>%
    dplyr::filter(.data$matching_vals) %>%
    dplyr::pull(.data$row)->
    matching_rows
  
  # if not two cols
  if(ncol(mapping_tbl) != 2) {
    stop("`mapping_tbl` must have two columns.\n
         \u2716 You have supplied one with ",ncol(mapping_tbl)," columns.", call. = FALSE)
    # if two cols aren't equal
  } else if (length(matching_rows) > 0){
    stop("Dataset variables must not equal their mapped ontological entities.
         \u2716 Rows ", paste0(matching_rows, collapse = ", "), " have matching values in both columns.",
         call. = FALSE)
    
  } 
  # optionally: if any col1 not in data
  else if(!missing(data)) {
    mapping_tbl %>%
      dplyr::filter(!.[[1]] %in% names(data)) %>%
      dplyr::pull(1)->
      missing_vars
    
    if(length(missing_vars) > 0) {
      warning("\u2716 The following variable names in column 1 of `mapping_tbl` are not column names in `data`:\n",
              paste0(missing_vars, collapse = ", "), call. = FALSE)
    } else{}
    
  }
  # optionally: if any col2 not in onto
  else if(!missing(ontol_graph)) {
    validate_ontol_nw(ontol_graph)
    ontol_graph %>% 
      tidygraph::as_tbl_graph() %>%
      tidygraph::pull(1) ->
      onto_ids
    
    mapping_tbl %>%
      dplyr::filter(!.[[2]] %in% onto_ids) %>%
      dplyr::pull(2) ->
      missing_maps
    
    if(length(missing_maps) > 0) {
      warning("\u2716 The following values in column 2 of `mapping_tbl` are not in `ontol_graph`:\n",
              paste0(missing_maps, collapse = ", "), call. = FALSE)
    } else{}
    
  } else{}
  invisible(mapping_tbl)
  
  
}

#' Calculate Node Information Content
#'
#' Computes the information content for each node in a directed graph using the
#' method developed by  by Lord \emph{et al.}.
#'
#' @param graph \code{\link[tidygraph:tidygraph]{tidygraph}} directed graph.
#' @param mode Character constant specifying the directionality of the edges.
#' @note For use in semantic enrichment, this should be applied before joining
#'   an ontology with variable nodes.
#' @importFrom tidygraph with_graph as_tbl_graph graph_is_directed map_local_dbl
#'   mutate graph_order
#' @importFrom tibble as_tibble
#' @importFrom magrittr %>%
#' @return tidygraph with additional node attribute "information_content"
#' @noRd
#' @examples
#' data(example_ontology)
#' node_info_content(example_ontology)
#' @references Lord, P. W., Stevens, R. D., Brass, A. & Goble, C. A.
#'   Investigating semantic similarity measures across the Gene Ontology: the
#'   relationship between sequence and annotation. Bioinformatics 19, 1275–1283
#'   (2003).


node_info_content <- function(graph, mode = "in") {


  graph %>%
    validate_ontol_nw() %>%
    # Count number of descendants and self
    tidygraph::mutate(tmp_num_desc = tidygraph::map_local_dbl(tidygraph::graph_order(),
                                                     mode = mode,
                                                     mindist  = 0,
                                                     .f = function(graph, neighborhood, ...) {
                                                       nrow(tibble::as_tibble(neighborhood, active ='nodes'))
                                                     })) %>%
    # Calculate IC
    # graph_order = number of nodes in the graph
    tidygraph::mutate(information_content = -log2(.data$tmp_num_desc/tidygraph::graph_order())) %>%
    # remove temporary variable
    tidygraph::select(-.data$tmp_num_desc)
}

#' Calculate Node Information Content (Zhou et al 2008 method)
#'
#' Computes the information content for each node in a directed graph according
#' to the equation developed by Zhou \emph{et al.} (2008).
#'
#' @param graph \code{\link[tidygraph:tidygraph]{tidygraph}} directed graph.
#' @param mode Character constant specifying the directionality of the edges.
#'   One of "in" or "out".
#' @param root name of root node identifier in column 1 to calculate node depth
#'   from.
#' @param k numeric value to adjust the weight of the two items of information
#'   content equation (relative number of hyponyms/descendants and relative node
#'   depth). Default = 0.5
#' @note For use in semantic enrichment, this should be applied before joining
#'   an ontology with nodes representing data variables (i.e. before applying
#'   \code{\link{join_vars_to_ontol}}.
#' @importFrom tidygraph with_graph as_tbl_graph graph_is_directed map_local_dbl
#'   mutate graph_order pull
#' @importFrom tibble as_tibble
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @return tidygraph with additional node attribute "information_content"
#' @export
#' @references Zhou, Z., Wang, Y. & Gu, J. A New Model of Information Content
#'   for Semantic Similarity in WordNet. in 2008 Second International Conference
#'   on Future Generation Communication and Networking Symposia vol. 3 85–89
#'   (2008).

#' @examples 
#' data(example_ontology)
#' node_IC_zhou(example_ontology, mode = "in", root = "root")
node_IC_zhou <- function(graph, mode = "in", root, k = 0.5) {

  if ("igraph" %in% class(graph)) {
    graph <- tidygraph::as_tbl_graph(graph)
  } else{}
  
  graph %>%
    tidygraph::mutate( # calc the number of descendents (hyponyms) of concept c
      .hypo_c = tidygraph::map_local_dbl(tidygraph::graph_order(),
                                         mode = mode,
                                         mindist  = 1,
                                         .f = function(graph, neighborhood, ...) {
                                           nrow(tibble::as_tibble(neighborhood, active ='nodes'))
                                         }),
      # calc depth of concept c
      .depth_c = tidygraph::dfs_dist(which(tidygraph::pull(graph, 1) == root), mode),
      # max depth in graph
      .depth_max = max(.data$.depth_c, na.rm = T),
      # Calculate IC
      # graph_order = number of nodes in the graph
      information_content = 
        k*(1-log2(.data$.hypo_c+1) /
             log2(tidygraph::graph_order())) +
        (1-k) * (log2(.data$.depth_c)) / log2(.data$.depth_max),
      information_content = ifelse(is.infinite(.data$information_content),
                                   0, .data$information_content)
      ) %>%
    # remove temporary variables
    tidygraph::select(-c(.data$.hypo_c, .data$.depth_c, .data$.depth_max))
}


#' Join Mapping Table to Ontology Network Graph
#'
#' This function creates new nodes representing dataset variables and joins them
#' to an input ontology network using a mapping file. Prior to joining, the
#' information content of all nodes is calculated using \code{\link{node_IC_zhou}}.
#' 
#' \itemize{ \item The user-defined mappings between variables in a dataset and
#' entities/terms in an ontology are provided in an edge table
#' (\code{var2entity_tbl}). \item A node attribute column, \code{node_category} is
#' generated to describe if a node is one of "Dataset Variable", "Annotation", or
#' "Annotation Ancestor".
#' }
#' @param ontol_graph Graph containing the chosen ontology. Must be in
#'   \code{\link[tidygraph:tidygraph]{tidygraph}} format or coercible to this format.
#' @param var2entity_tbl Edge table containing dataset variable names in first
#'   column and entities in ontologies to which they are mapped in the second
#'   column.
#' @inheritParams node_IC_zhou
#' @family semantic enrichment functions
#' @seealso node_IC_zhou
#' @importFrom tidygraph as_tbl_graph graph_join mutate
#' @importFrom dplyr if_else
#' @importFrom forcats as_factor fct_relevel
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @return A \code{\link{tidygraph}} resulting from the joining of \code{var2entity_tbl}
#'   and \code{ontol_graph}.
#' @export
#' @examples data(example_ontology)
#' join_vars_to_ontol(example_ontology, example_mapping_file, root = "root", mode = "in")
join_vars_to_ontol <- function(ontol_graph, var2entity_tbl, mode = "in", root, k = 0.5) {
  
  var2entity_tbl %>%
    tidygraph::as_tbl_graph() ->
    v2e_tbl_g

  ontol_graph %>%
    node_IC_zhou(root = root, mode = mode, k = k) %>%
    tidygraph::graph_join(v2e_tbl_g, by = "name") %>%
    tidygraph::mutate(node_category = dplyr::if_else(.data$name %in% dplyr::pull(var2entity_tbl,2),
                              "Annotation",
                              dplyr::if_else(.data$name %in% dplyr::pull(var2entity_tbl,1),
                                      "Dataset Variable",
                                      "Annotation Ancestor"))) %>%
    tidygraph::mutate(node_category = forcats::as_factor(.data$node_category)) %>%
    tidygraph::mutate(node_category = forcats::fct_relevel(.data$node_category,
                                                c("Dataset Variable",
                                                  "Annotation",
                                                  "Annotation Ancestor")
                                                )
    )
}


#' Compute Metavariable Information
#'
#' Calculates attributes for each node in a graph object pertaining to their
#' suitability and rank as metavariables; primarily if they are the most
#' informative common ancestor (see \code{\link{node_IC_zhou}}) of a set of
#' nodes representing a dataset variable.
#'
#' The added attributes are:
#' \describe{\item{min_dist_to_var}{Integer. The minimum distance of an ontology
#' node in the graph to a node representing a dataset variable.}
#' \item{is_metavariable}{Logical. If the node has at least two descendants in
#' the graph which represent dataset variables.}
#' \item{variable_descendants}{List. The names of variables of which a node is
#' an ancestor.}
#' \item{variable_set}{Integer. An identifier for the unique set of descendants
#' in the graph which represent dataset variables. The assigned
#' number corresponds to the order in which a unique set was identified when
#' scanning through the node table.}
#' \item{highest_IC}{Logical. If the node possesses the highest information
#' content of all other nodes which are common ancestors of the same variable
#' set. Information content is expected to have been calculated in
#' \code{\link{join_vars_to_ontol}}.}
#' }
#' @seealso \code{\link{node_IC_zhou}}
#' @param graph Graph containing ontological and dataset nodes. Must be in
#' \code{\link[tidygraph:tidygraph]{tidygraph}} format or coercible to this format.
#' @param mode Character constant specifying the directionality of the edges.
#'   One of: "in" or "out".
#' @param IC_threshold Metavariables with IC less than this value will be
#'   omitted from output. Default = 0 (no omission).
#' @family semantic enrichment functions
#' @importFrom tidygraph mutate map_local_int as_tibble filter graph_order
#'   select pull map_local_lgl map_local arrange group_by ungroup
#' @importFrom igraph distances
#' @importFrom dplyr cur_data cur_group_id
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @return A modified graph object with additional node attributes pertaining to their
#'   status as a metavariable.
#' @export
#'
#' @examples
#' data(example_ontology)
#' require(magrittr)
#' example_ontology %>%
#' join_vars_to_ontol(example_mapping_file, root = "root") -> joined_ontol
#' 
#' metavariable_info(joined_ontol)
metavariable_info <- function(graph, mode = "in", IC_threshold = 0) {
  message("Identifying semantic commonalities...")
  start_time <- Sys.time()
  graph %>%
    # Minimum distance of each node to a variable node
    tidygraph::mutate(min_dist_to_var = tidygraph::map_local_int(tidygraph::graph_order(),
                                                                 mode = mode,
                                                                 mindist = 0,
                                                                 .f = function(neighborhood, ...) {

                                             tidygraph::as_tibble(neighborhood) %>%
                                               tidygraph::filter(.data$.central_node) %>%
                                               tidygraph::pull(.data$name) ->
                                               source_node

                                             tidygraph::as_tibble(neighborhood) %>%
                                               tidygraph::select(.data$name, .data$node_category) %>%
                                               tidygraph::filter(.data$node_category == "Dataset Variable") %>%
                                               tidygraph::pull(.data$name) ->
                                               target_nodes

                                             if(length(target_nodes) > 0) {
                                               igraph::distances(neighborhood,
                                                                 v = source_node,
                                                                 to = target_nodes,
                                                                 mode = mode) %>%
                                                 apply(1,min) %>% # min taken
                                                 as.integer()
                                             } else {NULL}

                                           })) %>%
    # remove nodes which don't connect to variables
    tidygraph::filter(!is.na(.data$min_dist_to_var)) %>%

    # is node a metavariable? (>1 descendants which are dataset variables)
    tidygraph::mutate(is_metavariable = tidygraph::map_local_lgl(tidygraph::graph_order(),
                                                                 mode = mode,
                                                                 mindist = 1,
                                                                 .f = function(neighborhood, ...) {
                                             tidygraph::as_tibble(neighborhood,active ='nodes') %>%
                                               tidygraph::filter(.data$node_category == "Dataset Variable") %>%
                                               nrow() > 1
                                           })) %>%

    # list of variables which are a descendant of each node
    tidygraph::mutate(variable_descendants = tidygraph::map_local(tidygraph::graph_order(), mode = mode, mindist = 1,
                                            .f = function(neighborhood, ...) {
                                              tidygraph::as_tibble(neighborhood,active ='nodes') %>%
                                                tidygraph::filter(.data$node_category == "Dataset Variable") %>%
                                                tidygraph::select(.data$name) %>%
                                                tidygraph::arrange(.data$name)
                                            })) %>%

    # Sets of variable descendant (1 set = unique group of variable descendants)
    # note: this also includes non metavariables
    tidygraph::group_by(.data$variable_descendants) %>%
    tidygraph::mutate(variable_set = dplyr::cur_group_id()) %>%
    tidygraph::mutate(highest_IC = 
                        .data$information_content == max(dplyr::cur_data()$information_content)) %>%
    tidygraph::ungroup() ->
    res
  

  if (IC_threshold > 0) {
    original_nrow <- nrow(as_tibble(filter(res, .data$highest_IC & .data$is_metavariable)))
    
    res <- filter(res, .data$information_content >= IC_threshold)
    thresholded_nrow <- nrow(as_tibble(filter(res, .data$highest_IC & .data$is_metavariable)))
    nrow_removed <- original_nrow - thresholded_nrow 
    
    IC_threshold_msg <- paste0("\n", nrow_removed, " semantic commonalities were omitted as they did not meet the IC threshold (", IC_threshold, ").")
    
  } else {IC_threshold_msg = ""}

  message("Complete. Duration: ", as.character(round(as.numeric(Sys.time()-start_time),2)), " secs.\n",
          nrow(as_tibble(filter(res, .data$highest_IC & .data$is_metavariable))),
          " semantic commonalities found (via common ontological ancestors).", IC_threshold_msg)

  return(res)
}


#' Extract metavariables' descendant variables
#'
#' Formats the output of \code{\link{metavariable_info}} for easier
#' interpretation of each metavariable's descendant variables
#'
#' Not part of the standard semantic enrichment pipeline as this function just
#' produces a simplified version of the output of \code{\link{metavariable_info}}.
#' 
#' The output of \code{\link{metavariable_info}} is converted to a tibble,
#' filtered to only include metavariables with highest information content for
#' the variable set. The tibble has three columns describing a metavariable, its
#' information content, and its descendant variables.
#' 
#' @seealso \code{\link{node_IC_zhou}}
#' @param metavariable_info_output Output tibble of
#'   \code{\link{metavariable_info}}
#' @family semantic enrichment functions
#' @importFrom tibble tibble
#' @importFrom tidyr unnest
#' @importFrom dplyr filter select arrange desc
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @return A tibble describing each metavariable, its
#'   information content, and its descendant variables
#' @export
#'
#' @examples
#' data(example_ontology)
#' require(magrittr)
#' example_ontology %>%
#' join_vars_to_ontol(example_mapping_file, root = "root") -> joined_ontol
#'
#' mv_info <- metavariable_info(joined_ontol)
#' metavariable_variable_descendants(mv_info)
metavariable_variable_descendants <- function(metavariable_info_output) {

  out <- metavariable_info_output %>%
    tibble::as_tibble() %>%
    dplyr::filter(.data$is_metavariable & .data$highest_IC) %>%
    dplyr::select(.data$name, .data$information_content, .data$variable_descendants) %>%
    dplyr::arrange(dplyr::desc(.data$information_content)) %>%
    rename(metavariable = .data$name) %>%
    tidyr::unnest(.data$variable_descendants) %>%
    rename(descendant_variable = .data$name)
  
  return(out)
  
}



#' Aggregate Data by Metavariable
#'
#' Variables in a numeric data frame are aggregated into metavariables via
#' their most informative common ancestors identified in an ontological graph
#' object (see \code{\link{metavariable_info}}). Metavariables are appended to
#' the data frame.
#' 
#' Metavariables are created from the aggregation of data variables via their
#' most informative common ancestor (expected to have been calculated in
#' \code{\link{metavariable_info}}). Metavariables are labelled using the
#' syntax: \code{MV_[label_attr]_[Aggregation function]}. The data variables are
#' aggregated row-wise by their maximum, minimum, mean, sum, and product.
#' Metavariables with zero entropy (no information) are not appended to the
#' data. See examples for where this function should be applied in the semantic
#' enrichment workflow.
#' @note A warning may be shown regarding the '.add' argument being deprecated,
#'   this is believed to be an issue with
#'   \code{\link[tidygraph:tidygraph]{tidygraph}} which may be resolved in a
#'   future release: <https://github.com/thomasp85/tidygraph/issues/131>.
#'   Another warning may be shown regarding the 'neimode' argument being
#'   deprecated, this is believed to be an issue with
#'   \code{\link[tidygraph:tidygraph]{tidygraph}} which may be resolved in a
#'   future release: <https://github.com/thomasp85/tidygraph/issues/156>. These
#'   warning messages are not believed to have an effect on the functionality of
#'   'eHDPrep'.
#'  
#' @param graph Graph containing ontological and dataset nodes. Must be in
#'   \code{\link{tidygraph}} format or coercible to this format. Must have been
#'   processed using \code{\link{metavariable_info}}.
#' @param data Numeric data frame or matrix containing variables which are also
#'   in \code{graph}.
#' @param label_attr Node attribute containing labels used for column names when
#'   creating metavariable aggregations. Default: "name"
#' @param normalize_vals Should values be normalized before aggregation?
#'   Default: TRUE
#' @family semantic enrichment functions
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @importFrom dplyr slice select pull as_tibble rowwise transmute ungroup
#' @importFrom tidygraph filter
#' @return \code{data} with semantic aggregations derived from common
#'   ontological ancestry (metavariables) appended as new columns, each
#'   prefixed with "MV_" and suffixed by their aggregation function (e.g. "_SUM").
#' @export
#' @examples
#' require(magrittr)
#' require(dplyr)
#' data(example_ontology)
#' data(example_mapping_file)
#' data(example_data)
#' 
#' #' # define datatypes
#' tibble::tribble(~"var", ~"datatype",
#' "patient_id", "id",
#' "tumoursize", "numeric",
#' "t_stage", "ordinal_tstage",
#' "n_stage", "ordinal_nstage",
#' "diabetes_merged", "character",
#' "hypertension", "factor",
#' "rural_urban", "factor",
#' "marital_status", "factor",
#' "SNP_a", "genotype",
#' "SNP_b", "genotype",
#' "free_text", "freetext") -> data_types
#' 
#' # create post-QC data
#' example_data %>%
#'   merge_cols(diabetes_type, diabetes, "diabetes_merged", rm_in_vars = TRUE) %>%
#'   apply_quality_ctrl(patient_id, data_types,
#'                      bin_cats =c("No" = "Yes", "rural" = "urban"),
#'                      to_numeric_matrix = TRUE) %>%
#'                      suppressMessages() ->
#'                      post_qc_data
#' 
#' # minimal example on first four coloums of example data:
#' dplyr::slice(example_ontology, 1:7,24) %>%
#'    join_vars_to_ontol(example_mapping_file[1:3,], root = "root") %>%
#'    metavariable_info() %>%
#'    metavariable_agg(post_qc_data[1:10,1:4]) -> res
#' # see Note section of documentation for information on possible warnings.
#'
#' # summary of result:
#' tibble::glimpse(res)
#'
#' \donttest{
#' # full example:
#' example_ontology %>%
#'    join_vars_to_ontol(example_mapping_file, root = "root") %>%
#'    metavariable_info() %>%
#'    metavariable_agg(post_qc_data) -> res
#'  # see Note section of documentation for information on possible warnings.
#'
#' # summary of result:
#' tibble::glimpse(res)
#' }
metavariable_agg <- function(graph, data, label_attr ="name", normalize_vals = TRUE) {
  start_time <- Sys.time()
  start_ncol <- ncol(data)
  
  message("Aggregating variables with semantic commonalities to metavariables\nand appending to `data`...\nMetavariables will be labelled by the most informative common ancestor.")
  
  # filter graph to only Most Informative Common Ancestors (MICAs)
  graph %>%
    tidygraph::filter(.data$highest_IC & .data$is_metavariable) %>%
    dplyr::as_tibble() ->
    graph_MICAs
  
  # initialise vector for # vars omitted due to zero entropy
  zev <- 0
  
  for(i in 1:nrow(graph_MICAs)) {
    
    # slice graph to include ith MICA
    dplyr::slice(graph_MICAs,i) %>%
      dplyr::select(.data$variable_descendants, label_attr) ->
      slice_i
    
    # get string of variables to aggregate
    slice_i %>%
      dplyr::pull(.data$variable_descendants) %>%
      unlist() %>%
      unique() ->
      cols2aggregate
    
    # create data frame of cols2aggregate from data
    cols2agg <- dplyr::select(data, dplyr::all_of(cols2aggregate))
    
    # TODO: check for all NAs in row. Then aggregate safely
    # these values can be normalised to be between 0 and 1
    if(normalize_vals) {cols2agg <- purrr::map_dfr(cols2agg, normalize)} else {}

    # create prefix
    prefix <- paste0("MV_", gsub('[[:punct:] ]+','_', dplyr::pull(slice_i, label_attr) ))
    
    # aggregate
    cols2agg %>%
      dplyr::rowwise() %>%
      dplyr::transmute("{prefix}_SUM" := sum_catchNAs(dplyr::c_across(everything())),
                       "{prefix}_AVG" := mean_catchNAs(dplyr::c_across(everything())),
                       "{prefix}_MAX" := max_catchNAs(dplyr::c_across(everything())),
                       "{prefix}_MIN" := min_catchNAs(dplyr::c_across(everything())),
                       "{prefix}_MUL" := prod_catchNAs(dplyr::c_across(everything()))
                       ) %>%
      dplyr::ungroup() -> 
      aggs
    
    aggs_ncol <- ncol(aggs)
    # omit aggregations with zero entropy
    aggs <- dplyr::select(aggs, where(~ entropy(.x) != 0)) ->
      aggs
    aggs_ncol2 <- ncol(aggs)
    
    
    zev <- zev + (aggs_ncol - aggs_ncol2)
    # add to dataset
    data <- cbind(data,aggs)
  }
  
  message("Complete. Duration: ", as.character(round(as.numeric(Sys.time()-start_time),2)), " secs.\n",
          "The dataset has been enriched with ", ncol(data) - start_ncol, " metavariables\n(",
          zev, " metavariables had zero entropy and were therefore not appended).")
  
  return(data)
}

#' Min max normalization
#'
#' Normalizes values in \code{x} to be between 0 and 1 using min-max
#' normalization.
#' 
#' @param x numeric vector
#' @param na.rm a logical indicating whether missing values should be removed. Default = TRUE.
#' @return normalised \code{x}
normalize <- function(x, na.rm = TRUE) {
  stopifnot(is.numeric(x))
  (x - min(x, na.rm = na.rm)) / (max(x, na.rm = na.rm) - min(x, na.rm = na.rm))
}

#' Sum vector safely for semantic enrichment
#'
#' sums values in x (ignoring NAs). If all values in x are \code{NA}, returns
#' \code{NA} (rather than 0),
#' 
#' @param x numeric vector
#' @return sum of \code{x}

sum_catchNAs <- function(x) {
  if(all(is.na(x))) {
    return(as.numeric(NA))
  } else{sum(x, na.rm = TRUE)}
}

#' Find mean of vector safely
#'
#' This low-level function is deployed as part of the semantic enrichment
#' process. Averages values in numeric vector (ignoring NAs). If all values in
#' numeric vector are \code{NA}, returns \code{NA} (rather than NaN),
#' 
#' @param x numeric vector
#' @return mean of \code{x}
mean_catchNAs <- function(x) {
  if(all(is.na(x))) {
    return(as.numeric(NA))
  } else{mean(x, na.rm = TRUE)}
}

#' Find maximum of vector safely
#'
#' This low-level function is deployed as part of the semantic enrichment
#' process.Calculates maximum of values in numeric vector (ignoring NAs). If all
#' values in input vector are \code{NA}, returns \code{NA} (rather than -Inf),
#'
#' @param x numeric vector
#' @return maximum value of \code{x}
max_catchNAs <- function(x) {
  if(all(is.na(x))) {
    return(as.numeric(NA))
  } else{max(x, na.rm = TRUE)}
}

#' Find minimum of vector safely
#'
#' This low-level function is deployed as part of the semantic enrichment
#' process. Calculates minimum of values in numeric vector (ignoring NAs). If
#' all values in numeric vector are \code{NA}, returns \code{NA} (rather than
#' Inf),
#'
#' @param x numeric vector
#' @return minimum value of \code{x}
min_catchNAs <- function(x) {
  if(all(is.na(x))) {
    return(as.numeric(NA))
  } else{min(x, na.rm = TRUE)}
}

#' Find product of vector safely
#'
#' This low-level function is deployed as part of the semantic enrichment
#' process. Calculates product of values in numeric vector (ignoring NAs). If
#' all values in numeric vector are \code{NA}, returns \code{NA} (rather than
#' Inf),
#' 
#' @param x numeric vector
#' @return product of \code{x}
prod_catchNAs <- function(x) {
  if(all(is.na(x))) {
    return(as.numeric(NA))
  } else{prod(x, na.rm = TRUE)}
}

#' Convert edge table to tidygraph graph
#' 
#' A edge table, as a data frame, is converted to a directed tidygraph
#' \code{\link[tidygraph:tidygraph]{tidygraph}}. Column 1 of the edge table is
#' interpreted as a "from" column, Column 2 is interpreted as a "to" column, and
#' any further columns are interpreted as attributes of the entity/node recorded
#' in column 1. Incomplete cases are removed from the edge table (rows) to avoid
#' redundancy
#' 
#' @param edge_tbl data frame containing 'from' nodes in column 1 and 'to' nodes
#'   in column 2 so that all nodes go 'towards' the root node
#'
#' @return \code{\link[tidygraph:tidygraph]{tidygraph}} representation of the edge table
#' @export
#'
#' @examples
#' # basic edge table
#' edge_tbl <- tibble::tribble(~from, ~to,
#' "Nstage", "TNM",
#' "Tstage", "TNM",
#' "Tumoursize", "property_of_tumour",
#' "Tstage", "property_of_tumour",
#' "property_of_tumour", "property_of_cancer",
#' "TNM", "property_of_cancer",
#' "property_of_cancer", "disease",
#' "disease", "root",
#' "root", NA)
#' 
#' graph <- edge_tbl_to_graph(edge_tbl)
#' 
#' graph
#' 
#' plot(graph)
#' 
#' 
#' # edge table with node attributes
#' ## note that root node is included in final row to include its label
#' edge_tbl <- tibble::tribble(~from, ~to, ~label,
#' "Nstage", "TNM", "N stage",
#' "Tstage", "TNM", "T stage",
#' "Tumoursize", "property_of_tumour", "Tumour size",
#' "Tstage", "property_of_tumour", "T stage",
#' "property_of_tumour", "property_of_cancer", "Property of tumour",
#' "TNM", "property_of_cancer", "TNM",
#' "property_of_cancer", "disease", "Property of cancer",
#' "disease", "root", "Disease",
#' "root", NA, "Ontology Root")
#' graph <- edge_tbl_to_graph(edge_tbl)
#' 
#' graph
#' 
#' plot(graph)
#' 
edge_tbl_to_graph <- function(edge_tbl) {
  
  if(!is.data.frame(edge_tbl)) {
    stop("`edge_tbl' must be a data frame")
  } else {}
  
  graph <- edge_tbl %>%
    dplyr::select(1,2) %>%
    dplyr::filter(stats::complete.cases(.)) %>%
    tidygraph::as_tbl_graph(directed = TRUE)
  
  # add node attributes
  if (ncol(edge_tbl) > 2) {
    message("More than two columns detected in `edge_tbl`. These will be joined to the graph as node attributes")
    
    node_attr <- edge_tbl %>%
      rename(.from = 1) %>% # rename first column
      dplyr::select(-2) %>% # remove 'to' (second) column
      dplyr::distinct(dplyr::across(dplyr::everything())) # retain only distinct rows
    
    graph <- graph %>%
      tidygraph::left_join(node_attr, by = c(name = ".from")) %>%
      mutate()
      
  } else{}
  
  # quick validation
  validate_ontol_nw(graph)
  
  return(graph)
}

Try the eHDPrep package in your browser

Any scripts or data that you put into this service are public.

eHDPrep documentation built on June 7, 2023, 6:09 p.m.