R/get_data.R

Defines functions get_dataset_version get_df_file handle_dataset_version get_versions get_latest_version get_global_bl_mappings get_reliability get_motor_spreadsheet get_cdi_spreadsheet get_all_basiclevel

Documented in get_all_basiclevel get_cdi_spreadsheet get_dataset_version get_df_file get_global_bl_mappings get_latest_version get_motor_spreadsheet get_reliability get_versions handle_dataset_version

#' Get the all_basiclevel data
#'
#' @param version version tag to checkout
#' @param drop_basic_level_na whether to use the "*_NA" version that has all the
#' nouns, including those whose "basic_level" is set to "NA"
#' (drop_basic_level_na = FALSE) or the standard version that does not include
#' them.
#'
#' @return a dataframe containing the all_basiclevel data
#' @export
#'
#' @examples
#'
#' # get version with a specific version tag
#' all_bl <- get_all_basiclevel(version='0.3.2')
get_all_basiclevel <- function(version = NULL,
                               drop_basic_level_na = TRUE) {
  filename <- "all_basiclevel_na.csv"

  # The reason there are so many factors is for backwards compatibility with the
  # .feather versions of all_basiclevel which we used to keep. They were loaded
  # by default so some scripts might expect factors and they should get them.
  col_types <- readr::cols(
    ordinal = readr::col_integer(),
    onset = readr::col_integer(),
    offset = readr::col_integer(),
    object = readr::col_factor(),
    utterance_type = readr::col_factor(),
    object_present = readr::col_factor(),
    speaker = readr::col_factor(),
    basic_level = readr::col_factor(),
    annotid = readr::col_character(),
    id = readr::col_factor(),
    subj = readr::col_factor(),
    month = readr::col_factor(),
    SubjectNumber = readr::col_factor(),
    audio_video = readr::col_factor(),
    # With include_na = FALSE, NA values get coded as belonging to a NA level,
    # and with include_na = TRUE - to a "NA" level. The latter was easier to
    # remove in the mutate below, so that we have NA values, not a NA/"NA"
    # level.
    tier = readr::col_factor(include_na = TRUE),
    pho = readr::col_character()
  )

  # global_bl column wasn't there until version 0.5.0
  if (is.null(version) || (compareVersion(version, '0.5.0') >= 0)) {
    col_types$cols[['global_bl']] = readr::col_factor()
  }

  all_bl <- get_df_file(repo = 'all_basiclevel', filename = filename,
                        version = version, col_types = col_types)

  # Drop the "NA" level, converting the corresponding values to NA. Done for
  # the sake of consistency with the older versions of all_basiclevel when
  # we had feather files too there.
  all_bl <- all_bl %>%
    dplyr::mutate(tier = dplyr::recode_factor(all_bl$tier, "NA" = NA_character_))

  # There should only be one csv file now - the one that has rows where
  # basic_level is NA - that is the full file. Let's check that that we are
  # loading that file in case someone uses an older version.
  assertthat::assert_that(sum(is.na(all_bl$basic_level)) > 0)

  if (isTRUE(drop_basic_level_na)) {
    all_bl <- all_bl %>%
      tidyr::drop_na(basic_level)
  }

  return(all_bl)
}

#' Get the CDI spreadsheet for SEEDLingS babies
#'
#' @param version version tag to checkout
#' @param type "feather" or "csv". defaults to "feather"
#'
#' @return a tibble containing the SEEDLingS CDI spreadsheet
#' @export
#'
#' @examples
#' cdi <- get_cdi_spreadsheet(version='0.0.7')
get_cdi_spreadsheet <- function(version = NULL, type="feather") {
  col_types <- readr::cols(
    .default = readr::col_character(),
    month = readr::col_double(),
    CDIcomp = readr::col_double(),
    CDIprod = readr::col_double()
  )
  switch(type,
         "csv" = get_df_file('cdi_spreadsheet', "cdi.csv", version = version,
                             col_types = col_types),
         "feather" = get_df_file('cdi_spreadsheet', "cdi.feather",
                                 version = version)
         )
}


#' The the motor questionaire spreadsheet for the SEEDLingS babies
#'
#' @param version version tag to checkout
#' @param type "feather" or "csv". defaults to "feather"
#'
#' @return a tibble contaiing the SEEDLingS Motor Questionaire spreadsheet
#' @export
#'
#' @examples
#' motor <- get_motor_spreadsheet(version = '0.0.2')
get_motor_spreadsheet <- function(version = NULL, type="feather") {
  col_types <- readr::cols(
    .default = readr::col_character(),
    weight = readr::col_double(),
    Q8_FirstClick = readr::col_double(),
    Q8_LastClick = readr::col_double(),
    Q8_PageSubmit = readr::col_double(),
    Q8_ClickCount = readr::col_double(),
    Q26_FirstClick = readr::col_double(),
    Q26_LastClick = readr::col_double(),
    Q26_PageSubmit = readr::col_double(),
    Q26_ClickCount = readr::col_double(),
    walking_month = readr::col_double(),
    walking_day = readr::col_double(),
    walking_year = readr::col_double()
  )
  switch(type,
         "csv" = get_df_file('motor_spreadsheet', "motor.csv",
                             version = version, col_types = col_types),
         "feather" = get_df_file('motor_spreadsheet', "motor.feather",
                                 version = version)
         )
}


#' Get the reliability spreadsheets
#'
#' @param av either "audio" or "video"
#' @param month the month as a string, e.g. "06"
#' @param version version tag to checkout
#'
#' @return dataframe with data for the requested modality and month
#' @export
#'
#' @examples
#' audio_06_rel <- get_reliability("audio", "06")
get_reliability <- function(av, month, version = NULL) {
  fname <- paste0(av, "_", month, ".csv")
  get_df_file('reliability', fname, version)
}


#' Get the global basic level spreadsheets
#'
#' They are used to map every token in all_basiclevel_na to its global basic
#' level, see `map_global_basic_level` and `update_global_basic_levels`
#'
#' @param version version tag to checkout
#'
#' @return list of object_dict and
#' @export
#'
#' @examples
#' global_bl_mapping <- get_global_bl_mappings(version = '0.3.2')
get_global_bl_mappings <- function(version = NULL) {

  get_mapping <- function(filename, col_types) {
    return(
      get_df_file(
        repo = 'all_basiclevel',
        filename = glue::glue('global_basic_level_dicts/{filename}'),
        version = version,
        col_types = col_types))
  }

  object_dict <- get_mapping(
    filename = 'global_bl_dictionary.csv',
    col_types = readr::cols(
      object = readr::col_character(),
      disambiguate = readr::col_character(),
      global_bl = readr::col_character()))
  check_object_dict(object_dict)

  annotid_disambiguation <- get_mapping(
    filename = 'disambiguated_rows.csv',
    col_types = readr::cols(
      object = readr::col_character(),
      annotid = readr::col_character(),
      disambiguate = readr::col_character()))
  check_annotid_disambiguation(annotid_disambiguation)

  return(list(object_dict = object_dict,
              annotid_disambiguation = annotid_disambiguation))
}

#' Find latest version available for downloading?
#'
#' @inheritParams get_latest_tag
#'
#' @return the version string
#' @export
#'
#' @examples
#' get_latest_version('all_basiclevel')
get_latest_version <- function(repo, tags_already_updated = FALSE) {
  get_latest_tag(repo, tags_already_updated = tags_already_updated)
}


#' List available versions
#'
#' @inheritParams get_latest_tag
#'
#' @return the version string
#' @export
#'
#' @examples
#' get_versions('seedlings-nouns')
get_versions <- function(repo, tags_already_updated = FALSE) {
  get_tags(repo, tags_already_updated = tags_already_updated)
}


#' Handles the version.
#'
#' If the version isn't specified, finds the newest version and warns that not
#' specifying the version might not be a good idea.
#' If it is specified, notifies if there is a newer version available.
#'
#' @inheritParams get_all_basiclevel
#' @return version string
handle_dataset_version <- function(repo, version = NULL,
                                   tags_already_updated = FALSE,
                                   check_for_updates = TRUE) {
  if (!tags_already_updated) {update_tags(repo)}

  if (!is.null(version) && !version %in% get_versions(repo)) {
    stop(glue::glue("
      Couldn't find version {version} in {repo}.

      Please check the available versions with

      `get_versions('{repo}')`
      "))
  }

  # We will only need to get the latest version in these two cases.
  if (is.null(version) | isTRUE(check_for_updates)) {
    latest_version <- get_latest_version(
      repo = repo,
      tags_already_updated = TRUE)

    if (is.null(version)) {
      version <- latest_version
      warning(glue::glue(
        'Getting a dataset without specifying a version is highly discouraged.\n',
        'Add ", version = \'{version}\'" to the `get_*` function call.'))
    }

    if (isTRUE(check_for_updates) && (latest_version != version)) {
      message(glue::glue(
        "You've requested version {version} of {repo}.\n",
        "The latest available version is {latest_version}.\n",
        "Consider updating."))
    }
  }

  return(version)
}


#' Downloads and then loads a csv/feather file from a specified version of a
#' dataset
#'
#' @inheritParams get_all_basiclevel
#' @inheritParams get_latest_tag
#' @param filename name of a csv/feather file
#' @param col_types Passed to `readr::read_csv` when filename ends with ".csv".
#'
#' @return tibble for feather files, data.frame for csv files
#'
#' @examples
#'
#' \dontrun{
#' get_df_file('all_basiclevel', 'all_basiclevel.csv', version = '0.1.0')
#' }
get_df_file <- function(repo, filename, version = NULL, col_types = NULL,
                        version_already_handled = FALSE) {
  if (!version_already_handled) {
    version <- handle_dataset_version(repo = repo, version = version,
                                      tags_already_updated = FALSE,
                                      check_for_updates = TRUE)}

  # Download the file
  checkout_tag(repo, tag = version)

  # Load the file
  file_path <- file.path(blab_data, repo, filename)
  if (endsWith(file_path, ".csv")) {

    if (is.null(col_types)) {
      warning(glue::glue(
        "No column types specified for {filename} when blabr loaded.",
        " This might lead to unexpected behavior so please check that",
        " the column types match your expectations."))
    }

    result <- readr::read_csv(file_path, col_types = col_types,
                              show_col_types = FALSE)

    # Ensure that column in the file fully match col_types
    if (!is.null(col_types)) {
      tryCatch({
        assert_df_matches_col_types(result, col_types)},
        error = function(e) {
          message <- paste0(
            "The file ", file_path, " does not match the expected column types. ",
            "This could be due to a change in the dataset or a bug in blabr. ",
            "Please report this issue to the lab staff.\n\n",
            e$message)
          stop(message)})
    }

  } else if (endsWith(file_path, ".feather")) {
    result <- arrow::read_feather(file_path)
  }
  message("reading file: ", file_path)
  return(result)
}


#' Get the version tag of the last downloaded version of a dataset
#'
#' For a given dataset, returns the version tag of the version that was last
#' downloaded - the one that is on your computer right now.
#'
#' The main assumed usage is switching from using `get_all_basiclevel()` without
#' the version argument.
#'
#' Use interactively only and put the actual version string literal in your
#' code.
#'
#' @param dataset dataset name: 'all_basiclevel', 'reliability', etc.
#'
#' @return list with two keys: version and date
#' @export
#'
#' @examples
#' all_bl_version <- get_dataset_version('all_basiclevel')
#' print(all_bl_version$version)
#' print(all_bl_version$date)
get_dataset_version <- function(dataset) {
  version <- get_current_tag(repo = dataset, tags_already_updated = FALSE)
  commit_date <- get_current_commit_date(repo = dataset)
  return(list(version=version, date = commit_date))
}
BergelsonLab/blabr documentation built on April 19, 2024, 7:21 p.m.