R/ontology.R

Defines functions mba_structure_id compute_children compute_hierarchical last_column_shift cleanup_taxons generate_taxons filter_mba_ontology_children generate_mba_taxons flatten_mba_ontology get_mba_ontology

Documented in cleanup_taxons compute_hierarchical filter_mba_ontology_children flatten_mba_ontology generate_mba_taxons generate_taxons get_mba_ontology mba_structure_id

#' Download the Mouse Brain Atlas ontology as a list object
#'
#' No parameters
#'
#' @return a nested list object containing the Mouse Brain Atlas ontology
#' @export
get_mba_ontology <- function() {

  # Download the ontology JSON file
  temp <- tempfile()
  download.file("http://api.brain-map.org/api/v2/structure_graph_download/1.json", temp)

  # Read the JSON
  raw_ontology <- jsonlite::fromJSON(temp)[["msg"]]

  return(raw_ontology)
}

#' Convert a nested Mouse Brain Atlas ontology to a data.frame
#'
#' @param ontology A nested ontology object
#' @param ongology_df An existing ontology data.frame. This is used for recursion. You should use the default, NULL, to extract a full ontology.
#'
#' @return a data.frame with all descriptive columns for the ontology.
#' @export
flatten_mba_ontology <- function(ontology, ontology_df = NULL) {

  l <- ontology

  if(is.null(ontology_df)) {
    ontology_df <- data.frame(l[names(l) != "children"])[0,]
    ontology_df$n_children <- numeric()
  }

  if("children" %in% names(l)) {

    child_df <- data.frame(l[names(l) != "children"])

    n_children_of_children <- purrr::map_dbl(l$children,
                                             function(x) {
                                               if("children" %in% names(x)) {
                                                 length(x$children)
                                               } else {
                                                 0
                                               }
                                             })

    child_df$n_children <- n_children_of_children

    ontology_df <- rbind(ontology_df, child_df)

    for(i in 1:length(l$children)) {

      child_list <- l$children[[i]]

      ontology_df <- flatten_mba_ontology(child_list, ontology_df)
    }
  }

  return(ontology_df)
}

#' Generate a taxon column that shows the "lineage" of each ontology term
#'
#' @param flat_ontology A flat ontology object generated by flatten_mba_ontology
#'
#' @return a data.frame with all descriptive columns for the ontology plus a new taxon column.
#' @export
generate_mba_taxons <- function(flat_ontology) {
  flat_ontology <- arrange(flat_ontology,st_level)

  taxons <- list("997" = "0")

  for(i in 2:nrow(flat_ontology)) {
    id <- as.character(flat_ontology$id[i])
    parent_id <- as.character(flat_ontology$parent_structure_id[i])

    taxons[id] <- paste0(taxons[parent_id], ";", id)
  }
  flat_ontology$taxons <- unlist(taxons)
  return(flat_ontology)
}

#' Filter the MBA ontology to find children of a given structure acronym
#'
#' @param flat_ontology A flat ontology data.frame generated by flatten_mba_ontology
#' @param parent_acronym The acronym of a brain ontology structure to find
#' @param include_parent Whether or not to include the parent structure in the output. Default = FALSE.
#'
#' @return A data.frame with the filtered ontology only for children of the parent structure.
#' @export
filter_mba_ontology_children <- function(flat_ontology,
                                       parent_acronym,
                                       include_parent = FALSE) {

  if(!parent_acronym %in% flat_ontology$acronym) {
    stop(paste0("Parent acronym ", parent_acronym," not found in ontology."))
  }

  if(!"taxons" %in% names(flat_ontology)) {
    flat_ontology <- generate_mba_taxons(flat_ontology)
  }

  parent_taxon <- flat_ontology$taxon[flat_ontology$acronym == parent_acronym]

  children_ontology <- flat_ontology[grepl(paste0("^",parent_taxon), flat_ontology$taxon),]

  if(!include_parent) {
    children_ontology <- children_ontology[children_ontology$taxon != parent_taxon,]
  }

  return(children_ontology)
}

#' Generate a taxons column for a flattened ontology
#'
#' This column is needed for use with the taxa and metacodeR packages.
#'
#' @param flat_ontology A flat ontology data.frame generated by flatten_mba_ontology
#' @param id_column The id column used to link structures. default is "id".
#' @param parent_column The column specifying the id column of the parent of each structure. default is "parent_structure_id".
#' @param value_column The column of values to use to build the taxons column. Default is "id". Using "acronym" will yield a more readable taxonomy.
#' @param taxon_column The name of the column to use for storing the resulting taxons. Default is "taxons".
#'
#' @return a data.frame with an additional column of semicolon-separated taxons based on the value_column.
#'
#' @export
generate_taxons <- function(flat_ontology,
                            id_column = "id",
                            parent_column = "parent_structure_id",
                            value_column = "id",
                            taxon_column = "taxons") {

  library(dplyr)

  flat_ontology <- flat_ontology %>%
    arrange(st_level)

  taxon_list <- list(flat_ontology[[value_column]][1])
  names(taxon_list)[1] <- flat_ontology[[id_column]][1]

  for(i in 2:nrow(flat_ontology)) {
    id <- as.character(flat_ontology[[id_column]][i])
    value <- as.character(flat_ontology[[value_column]][i])
    parent_id <- as.character(flat_ontology[[parent_column]][i])

    taxon_list[[id]] <- paste0(taxon_list[[parent_id]], ";", value)
  }

  flat_ontology[[taxon_column]] <- unlist(taxon_list)

  flat_ontology

}

#' Clean up taxons by removing structures no longer present after filtering
#'
#' @param flat_ontology A flat ontology data.frame generated by flatten_mba_ontology.
#' @param id_column The id column used to link structures. default is "id".
#' @param taxon_column The name of the column with taxons generated by generate_taxons().
#'
#' @return a data.frame with a modified taxons column that lacks any ids that are no
#' longer present in the ontology.
#'
#' @export
cleanup_taxons <- function(flat_ontology,
                           id_column = "id",
                           taxon_column = "taxons") {
  library(purrr)

  available_ids <- paste0(as.character(unique(flat_ontology[[id_column]])))

  split_taxons <- strsplit(flat_ontology[[taxon_column]], ";")

  new_taxons <- purrr::map(split_taxons,
                    function(x) {
                      x <- x[x %in% available_ids]
                      paste(x, collapse = ";", sep = "")
                    })

  flat_ontology[[taxon_column]] <- new_taxons

  flat_ontology
}

last_column_shift <- function(df,
                              col = "taxons") {

  others <- names(df)[names(df) != col]
  df[, c(others, col)]

}

#' Compute values hierarchicaly over a flat ontology
#'
#' Values accumulate from the bottom of the ontology to the top at each node,
#' and are computed only from the values of the direct children of each node.
#'
#' @param df A flat ontology data.frame with a taxons column
#' @param fun The function to run as a character object, e.g. "mean".
#' @param compute_column The column of values to use for the function computation.
#' @param result_column The name of the output column.
#' @param include_node Logical, should values assigned to non-leaf nodes be included in calculation?
#' @param na.rm If TRUE or FALSE, passed as a parameter to fun. If NULL (default), is ignored.
#' @param taxon_column The name of the taxons column. Default is "taxons"
#'
#' @return A flat ontology data.frame with the result_column added. The taxon column will be moved
#' to the last column position. This should behave similarly to mutate().
#'
#' @export
#'
#' @examples
#'
#' flat_ontology2 <- flat_ontology %>%
#'   compute_hierarchical("mean","n_children","mean_children",
#'                        include_node = TRUE)
#'
compute_hierarchical <- function(df,
                                 fun,
                                 compute_column,
                                 result_column,
                                 include_node = FALSE,
                                 na.rm = NULL,
                                 taxon_column = "taxons") {


  df[[result_column]] <- 0

  # Get the depth by counting the semicolons in taxons
  df$depth <- sapply(gregexpr(";",df[[taxon_column]]), function(x) sum(x != -1))
  # Sort by deepest first
  df <- df %>%
    dplyr::mutate(original_order = 1:n()) %>%
    dplyr::arrange(desc(depth))

  for(i in 1:nrow(df)) {

    if(df$n_children[i] == 0) {
      # Otherwise, use the row for the leaf
      df[[result_column]][i] <- df[[compute_column]][i]
    } else {
      # If this node has children, find them using the parent_structure_id column
      children <- which(df$parent_structure_id == df$id[i])

      # Get the values for the specified compute_column for the children
      children_values <- df[[result_column]][children]

      if(include_node) {
        children_values <- c(df[[compute_column]][i], children_values)
      }

      # Run the function on the children_values
      if(is.null(na.rm)) {
        df[[result_column]][i] <- do.call(fun, list(children_values))
      } else {
        df[[result_column]][i] <- do.call(fun, list(children_values, na.rm = na.rm))
      }
    }

  }

  # rearrange results to match original order
  df <- df %>%
    arrange(original_order) %>%
    select(-original_order, -depth)

  # rearrange columns so that taxons is last (required for parse_tax_data)
  df <- last_column_shift(df, taxon_column)

  df
}

compute_children <- function(df,
                             taxon_column,
                             compute_column,
                             fun,
                             na.rm = NULL) {

  res <- numeric(length = nrow(df))

  for(i in 1:nrow(df)) {
    parent_taxon <- df[[taxon_column]][i]
    children <- which(grepl(paste0("^",parent_taxon),df[[taxon_column]]))
    if(length(children) > 1) {
      children <- setdiff(children, i)
    }
    children_values <- df[[compute_column]][children]
    if(is.null(na.rm)) {
      res[i] <- do.call(fun, list(children_values))
    } else {
      res[i] <- do.call(fun, list(children_values, na.rm = na.rm))
    }
  }

  res
}

#' Get the Mouse Brain Atlas numeric structure ID(s) for given structure acronym(s)
#'
#' This retrieves the ID from an internal storage source in the package.
#'
#' @param acronym The structure acronym(s) to use.
#'
#' @return numeric value(s) for the target structure ID.
#'
#' @export
#'
#' @examples
#' MOp_id <- mba_structure_id("MOp")
mba_structure_id <- function(acronym) {

  id_table <- read.csv(system.file("extdata", "mba_structure_id_to_acronym.csv", package = "cocoframer"))
  matching_acronyms <- intersect(acronym, id_table$acronym)

  if(length(matching_acronyms) == 0) {
    stop(paste("Couldn't find a match for any acronyms provided:",
               paste(acronym, collapse = ", ")))
  }

  missing_acronyms <- setdiff(acronym, id_table$acronym)
  if(length(missing_acronyms > 0)) {
    warning(paste("Couldn't find a match for these acronym(s):",
                  paste(missing_acronyms, collapse = ", ")))
  }


  id_table$id[match(matching_acronyms, id_table$acronym)]
}
AllenInstitute/cocoframer documentation built on May 21, 2020, 3:09 a.m.