lifemapr: Data Visualisation on 'Lifemap' Tree

Documented in build_Lifemap

#' A function to construct a LifemapR object, usable by the other functions of the package.
#'
#' @param df A dataframe containing at least one column named "taxid" that contains NCBI Taxonomy Identifiers (taxid).
#' The dataframe can contain any number of additional columns defining traits/characters/values associated to
#' each taxid.
#' @param basemap Deprecated argument.
#' @param verbose If TRUE (the default), the function will print detailed information to the console.
#' If FALSE, it will run silently.
#'
#' @return A list of class lifemap_obj containing:
#' - df : a dataframe containing at least for each taxid :
#'   - The x coordinate (lon)
#'   - The y coordonate (lat)
#'   - The scientific name (sci_name)
#'   - The zoom level at which the taxa is visible (zoom)
#'   - A list of its ascendants (ascend)
#'   - Its type ("requested" or "ancestor")
#'   - Its direct ancestor
#'   - Its type (type), i.e. whether the taxid was
#' requested by the user ("requested") or if it is the anecestor of a requested taxid ("ancestor")
#' - basemap : the basemap used to get taxa's details
#'
#' @importFrom jsonlite fromJSON
#' @importFrom dplyr bind_rows distinct
#' @importFrom rlang .data
#' @importFrom RCurl url.exists
#' @importFrom fastmatch fmatch
#' @importFrom arrow read_parquet
#'
#' @export
#' @examples
#' data(eukaryotes_80)
#' \dontrun{
#' # make sure you have a good internet connection to load these very large files
#' LM <- build_Lifemap(eukaryotes_80)
#' }
build_Lifemap <- function(df, basemap = NULL, verbose = TRUE) {
  if (!is.null(basemap)) {
    warning("The basemap argument is now deprecated.")
  }
  basemap_url <- "https://lifemap-back.univ-lyon1.fr/data/lmdata_R.parquet"

  if (is.null(df$taxid)) {
    stop('The dataframe must at least contain a "taxid" column')
  }

  tryCatch(
    {
      if (verbose) {
        cat("Downloading basemap coordinates...\n")
      }
      DF <- arrow::read_parquet(basemap_url)
    },
    warning = function(w) {
      print(w)
      message("The Lifemap server or some remote lifemap files cannot be reached. Please try again later.")
      return(NA)
    },
    error = function(e) {
      print(e)
      message("The Lifemap server or some remote lifemap files cannot be reached. Please try again later.")
      return(NA)
    }
  )

  # add LUCA
  LUCA <- data.frame("taxid" = "0", "lon" = 0, "lat" = -4.226497, "sci_name" = "Luca", "zoom" = 5)
  DF <- dplyr::bind_rows(DF, LUCA)

  # get info for unique taxids (then we work with df_distinct, not df anymore)
  df_distinct <- dplyr::distinct(df, .data$taxid, .keep_all = TRUE)
  if (nrow(df_distinct) != nrow(df)) {
    warning(sprintf("%s duplicated TaxIDs were removed \n", nrow(df) - nrow(df_distinct)))
  }

  # get data
  if (verbose) {
    cat("Getting info for requested taxids...\n")
  }

  # get index of requested taxids
  indexes <- fastmatch::fmatch(df_distinct$taxid, DF$taxid)
  if (sum(is.na(indexes)) > 0) {
    warning(sprintf(
      "%s TaxID(s) could not be found: %s \n",
      sum(is.na(indexes)),
      paste(df_distinct$taxid[is.na(indexes)], sep = ",")
    ))
  }

  # create new df with only existing taxids
  df_exists <- df_distinct[!is.na(indexes), ]
  DATA0 <- DF[indexes[!is.na(indexes)], ]

  # get ancestors
  unique_ancestors <- unique(unlist(DATA0$ascend))
  real_ancestors <- setdiff(unique_ancestors, df_exists$taxid)
  ANCESTORS <- DF[fastmatch::fmatch(real_ancestors, DF$taxid), ]

  # add type
  DATA0$type <- "requested"
  ANCESTORS$type <- "ancestor"
  # bind all
  DATA1 <- dplyr::bind_rows(DATA0, ANCESTORS)

  # merge
  DATA2 <- merge(DATA1, df_exists, by = "taxid", all = TRUE)

  # replace the column 'ascend' by simply the direct ancestor
  DATA2$ancestor <- unlist(lapply(DATA2$ascend, function(x) ifelse(!is.null(x), x[1], NA)))

  lm_obj <- list(df = DATA2, basemap = basemap)
  class(lm_obj) <- c("lifemap_obj", "list")

  return(lm_obj)
}