R/childesr.R

Defines functions get_sql_query get_contexts get_utterances get_types get_tokens get_content get_speaker_statistics get_participants get_transcripts get_corpora get_collections get_table clear_connections connect_to_childes get_db_info resolve_connection translate_version

Documented in clear_connections connect_to_childes get_collections get_content get_contexts get_corpora get_db_info get_participants get_speaker_statistics get_sql_query get_table get_tokens get_transcripts get_types get_utterances

#' @importFrom magrittr "%>%"
#' @importFrom magrittr "%<>%"
NULL

utils::globalVariables(c("collection_id", "collection_name", "corpus_id",
                         "corpus_name", "gloss", "id", "max_age", "min_age",
                         "name", "speaker_role", "target_child_id",
                         "target_child_name", "target_child_age",
                         "utterance_id", "transcript_id", "utterance_order",
                         "replacement"))

avg_month <- 365.2425 / 12

translate_version <- function(db_version, db_args, db_info) {

  # using the childes-db hosted server
  if (db_args$host == db_info$host) {

    # current version
    if (db_version == "current") {
      db_to_use <- db_info[["current"]]
      message("Using current database version: '", db_to_use, "'.")
      return(db_to_use)

    # supported version
    } else if (db_version %in% db_info[["supported"]]) {
      db_to_use <- db_version
      message("Using supported database version: '", db_to_use, "'.")
      return(db_to_use)

    # historical version
    } else if (db_version %in% db_info[["historical"]]) {
      stop("Version '", db_version, "' is no longer hosted by ",
           "childes-db.stanford.edu; either specify a more recent version or ",
           "install MySQL Server locally and update db_args.")

    # version not recognized
    } else {
      stop("Version '", db_version, "' not found. Specify one of: 'current', ",
           paste(sprintf("'%s'", db_info$supported), collapse = ", "), ".")
    }

  # using a different server than the childes-db hosted one
  } else {
    message("Not using hosted database version; no checks will be applied to ",
            "version specification.")
    return(db_args$db_name)
  }
}

resolve_connection <- function(connection, db_version = NULL, db_args = NULL) {
  if (is.null(connection)) connect_to_childes(db_version, db_args)
  else connection
}

#' Get information on database connection options
#'
#' @return List of database info: host name, current version, supported
#'   versions, historical versions, username, password
#' @export
#'
#' @examples
#' \dontrun{
#' get_db_info()
#' }
get_db_info <- function() {
  tryCatch(jsonlite::fromJSON("https://langcog.github.io/childes-db-website/childes-db.json"),
           error = function(e) message(strwrap(
             prefix = " ", initial = "",
             "Could not retrieve childes-db connection information. Please check your
             internet connection. If this error persists please contact
             childes-db-contact@stanford.edu"
           )))
}

#' Connect to CHILDES
#'
#' @param db_version String of the name of database version to use
#' @param db_args List with host, user, and password defined
#' @return con A DBIConnection object for the CHILDES database
#' @export
#'
#' @examples
#' \dontrun{
#' con <- connect_to_childes(db_version = "current", db_args = NULL)
#' DBI::dbDisconnect(con)
#' }
connect_to_childes <- function(db_version = "current", db_args = NULL) {

  # get info from hosted json, NULL if connection error
  db_info <- get_db_info()

  # if db_info was not found, return NULL
  if (is.null(db_info)) return()

  # if db_args is unspecified, use db_info
  if (is.null(db_args)) db_args <- db_info

  tryCatch(
    expr = {
      con <- DBI::dbConnect(
        RMySQL::MySQL(),
        host = db_args$host,
        dbname = translate_version(db_version, db_args, db_info),
        user = db_args$user,
        password = db_args$password
      )
      DBI::dbGetQuery(con, "SET NAMES utf8")
      return(con)
    },
    error = function(e) message("Could not connect to childes-db"))
}

#' Clear all MySQL connections
#'
#' @export
clear_connections <- function() {
  cons <- DBI::dbListConnections(RMySQL::MySQL())
  purrr::walk(cons, DBI::dbDisconnect)
  message(sprintf("Cleared %s connections", length(cons)))
}

#' Get table
#'
#' @param connection A connection to the CHILDES database
#' @param name String of a table name
#'
#' @return A `tbl`
get_table <- function(connection, name) {
  dplyr::tbl(connection, name)
}

#' Get collections
#'
#' @inheritParams connect_to_childes
#' @param connection A connection to the CHILDES database
#'
#' @return A `tbl` of Collection data. If `connection` is supplied, the result
#'   remains a remote query, otherwise it is retrieved into a local tibble.
#' @export
#'
#' @examples
#' \dontrun{
#' get_collections()
#' }
get_collections <- function(connection = NULL, db_version = "current",
                            db_args = NULL) {

  con <- resolve_connection(connection, db_version, db_args)
  if (is.null(con)) return()

  collections <- dplyr::tbl(con, "collection") %>%
    dplyr::rename(collection_id = id) %>%
    dplyr::rename(collection_name = name)

  if (is.null(connection)) {
    collections %<>% dplyr::collect()
    DBI::dbDisconnect(con)
  }

  return(collections)
}

#' Get corpora
#'
#' @inheritParams get_collections
#'
#' @return A `tbl` of Corpus data. If `connection` is supplied, the result
#'   remains a remote query, otherwise it is retrieved into a local tibble.
#' @export
#'
#' @examples
#' \dontrun{
#' get_corpora()
#' }
get_corpora <- function(connection = NULL, db_version = "current",
                        db_args = NULL) {

  con <- resolve_connection(connection, db_version, db_args)
  if (is.null(con)) return()

  corpora <- dplyr::tbl(con, "corpus") %>%
    dplyr::rename(corpus_id = id) %>%
    dplyr::rename(corpus_name = name)

  if (is.null(connection)) {
    corpora %<>% dplyr::collect()
    DBI::dbDisconnect(con)
  }

  return(corpora)
}

#' Get transcripts
#'
#' @param collection A character vector of one or more names of collections
#' @param corpus A character vector of one or more names of corpora
#' @param target_child A character vector of one or more names of children
#' @inheritParams get_collections
#'
#' @return A `tbl` of Transcript data, filtered down by supplied arguments. If
#'   `connection` is supplied, the result remains a remote query, otherwise it
#'   is retrieved into a local tibble.
#' @export
#'
#' @examples
#' \dontrun{
#' get_transcripts()
#' }
get_transcripts <- function(collection = NULL, corpus = NULL,
                            target_child = NULL, connection = NULL,
                            db_version = "current", db_args = NULL) {

  con <- resolve_connection(connection, db_version, db_args)
  if (is.null(con)) return()

  transcripts <- get_table(con, "transcript") %>%
    dplyr::rename(transcript_id = id)

  if (!is.null(collection)) {
    transcripts %<>% dplyr::filter(collection_name %in% collection)
  }
  if (!is.null(corpus)) {
    transcripts %<>% dplyr::filter(corpus_name %in% corpus)
  }
  if (!is.null(target_child)) {
    transcripts %<>% dplyr::filter(target_child_name %in% target_child)
  }

  transcripts %<>%
    dplyr::mutate(target_child_age = target_child_age / avg_month)

  if (is.null(connection)) {
    transcripts %<>% dplyr::collect()
    DBI::dbDisconnect(con)
  }
  return(transcripts)

}

#' Get participants
#'
#' @inheritParams get_transcripts
#' @param role A character vector of one or more roles to include
#' @param role_exclude A character vector of one or more roles to exclude
#' @param age A numeric vector of an single age value or a min age value and max
#'   age value (inclusive) in months. For a single age value, participants are
#'   returned for which that age is within their age range; for two ages,
#'   participants are returned for whose age overlaps with the interval between
#'   those two ages.
#' @param sex A character vector of values "male" and/or "female"
#'
#' @return A `tbl` of Participant data, filtered down by supplied arguments. If
#'   `connection` is supplied, the result remains a remote query, otherwise it
#'   is retrieved into a local tibble.
#' @export
#'
#' @examples
#' \dontrun{
#' get_participants()
#' }
get_participants <- function(collection = NULL, corpus = NULL,
                             target_child = NULL, role = NULL,
                             role_exclude = NULL, age = NULL, sex = NULL,
                             connection = NULL, db_version = "current",
                             db_args = NULL) {

  con <- resolve_connection(connection, db_version, db_args)
  if (is.null(con)) return()

  participants <- get_table(con, "participant")

  if (!is.null(collection)) {
    participants %<>% dplyr::filter(collection_name %in% collection)
  }

  if (!is.null(corpus)) {
    participants %<>% dplyr::filter(corpus_name %in% corpus)
  }

  if (!is.null(age)) {
    days <- age * avg_month
    if (length(age) == 1) {
      participants %<>% dplyr::filter(max_age >= days & min_age <= days)
    } else if (length(age) == 2) {
      days_1 <- days[1]
      days_2 <- days[2]
      participants %<>% dplyr::filter((max_age >= days_1 & min_age <= days_2) |
                                        (min_age <= days_2 & max_age >= days_1))
    } else {
      stop("`age` argument must be of length 1 or 2")
    }
  }

  if (!is.null(sex)) {
    sex_filter <- sex
    participants %<>% dplyr::filter(sex %in% sex_filter)
  }

  if (!is.null(target_child)) {
    child_id <- participants %>%
      dplyr::filter(name == target_child) %>%
      dplyr::pull(target_child_id) %>%
      unique()
    if (length(child_id) != 1) {
      stop("Duplicate or missing child")
    } else {
      participants %<>% dplyr::filter(target_child_id == child_id)
    }
  }

  if (!is.null(role)) {
    role_filter <- role
    participants %<>% dplyr::filter(role %in% role_filter)
  }

  if (!is.null(role_exclude)) {
    participants %<>% dplyr::filter(!(role %in% role_exclude))
  }

  target_children <- get_transcripts(collection, corpus, target_child, con) %>%
    dplyr::distinct(target_child_id, target_child_name) %>%
    dplyr::select(target_child_id, target_child_name)

  # TODO remove after https://github.com/langcog/childes-db/issues/30 resolved
  participants %<>%
    dplyr::left_join(target_children, by = "target_child_id")

  participants %<>% dplyr::mutate(max_age = max_age / avg_month)
  participants %<>% dplyr::mutate(min_age = min_age / avg_month)

  if (is.null(connection)) {
    participants %<>% dplyr::collect()
    DBI::dbDisconnect(con)
  }

  return(participants)

}
#' Get speaker statistics
#'
#' @inheritParams get_participants
#' @return A `tbl` of Speaker statistics, filtered down by supplied arguments.
#'   If `connection` is supplied, the result remains a remote query, otherwise
#'   it is retrieved into a local tibble.
#' @export
#'
#' @examples
#' \dontrun{
#' get_speaker_statistics()
#' }
get_speaker_statistics <- function(collection = NULL, corpus = NULL,
                                   target_child = NULL, role = NULL,
                                   role_exclude = NULL, age = NULL, sex = NULL,
                                   connection = NULL, db_version = "current",
                                   db_args = NULL) {

  con <- resolve_connection(connection, db_version, db_args)
  if (is.null(con)) return()

  transcripts <- get_transcripts(collection, corpus, target_child, con)
  speaker_statistics <- get_table(con, "transcript_by_speaker")

  if (!is.null(collection)) {
    collection_filter <- transcripts %>%
      dplyr::distinct(collection_id, target_child_id) %>%
      dplyr::pull(target_child_id)

    speaker_statistics %<>%
      dplyr::filter(target_child_id %in% collection_filter)
  }

  if (!is.null(corpus)) {
    corpus_filter <- transcripts %>%
      dplyr::distinct(corpus_id, target_child_id) %>%
      dplyr::pull(target_child_id)

    speaker_statistics %<>% dplyr::filter(target_child_id %in% corpus_filter)
  }

  if (!is.null(age)) {
    if (!(length(age) %in% 1:2)) stop("`age` argument must be of length 1 or 2")
    days <- age * avg_month
    if (length(age) == 1) days <- c(days, days + avg_month)
    days_1 <- days[1]
    days_2 <- days[2]
    speaker_statistics %<>% dplyr::filter(target_child_age >= days_1,
                                          target_child_age <= days_2)
  }

  if (!is.null(sex)) {
    sex_filter <- sex
    speaker_statistics %<>% dplyr::filter(sex %in% sex_filter)
  }

  if (!is.null(target_child)) {
    speaker_statistics %<>% dplyr::filter(target_child_name %in% target_child)
  }

  if (!is.null(role)) {
    role_filter <- role
    speaker_statistics %<>% dplyr::filter(speaker_role %in% role_filter)
  }

  if (!is.null(role_exclude)) {
    speaker_statistics %<>% dplyr::filter(!(speaker_role %in% role_exclude))
  }

  speaker_statistics %<>%
    dplyr::mutate(target_child_age = target_child_age / avg_month)

  if (is.null(connection)) {
    suppressWarnings(speaker_statistics %<>% dplyr::collect())
    DBI::dbDisconnect(con)
  }

  return(speaker_statistics)
}

#' Get content
#'
#' @inheritParams get_participants
#' @param content_type One of "token" or "utterance"
#' @param token A character vector of one or more token patterns (`\%` matches
#'   any number of wildcard characters, `_` matches exactly one wildcard
#'   character)
#' @param stem A character vector of one or more stems
#' @param part_of_speech A character vector of one or more parts of speech
#' @param language A character vector of one or more languages
get_content <- function(content_type, collection = NULL, language = NULL,
                        corpus = NULL, role = NULL, role_exclude = NULL,
                        age = NULL, sex = NULL, target_child = NULL,
                        token = NULL, stem = NULL, part_of_speech = NULL,
                        connection) {

  transcripts <- get_transcripts(collection, corpus, target_child, connection)

  corpora <- transcripts %>%
    dplyr::distinct(corpus_id) %>%
    dplyr::collect()
  child_id <- transcripts %>%
    dplyr::distinct(target_child_id) %>%
    dplyr::pull(target_child_id)

  num_children <- length(child_id)
  num_corpora <- nrow(corpora)

  message("Getting data from ", num_children,
          ifelse(num_children == 1, " child", " children"), " in ",
          num_corpora, ifelse(num_corpora == 1, " corpus ", " corpora"), "...")

  content <- dplyr::tbl(connection, content_type)

  if (content_type %in% c("token", "token_frequency") && !is.null(token) &&
      !identical("*", token)) {

    token_string <- paste0("gloss %like% '", token, "'", collapse = " | ")
    token_expr <- parse(text = token_string)[[1]]
    content %<>% dplyr::filter(!!token_expr)
  }

  if (!is.null(stem)) {
    stem_filter <- stem
    content %<>% dplyr::filter(stem %in% stem_filter)
  }

  if (!is.null(part_of_speech)) {
    part_of_speech_filter <- part_of_speech
    content %<>% dplyr::filter(part_of_speech %in% part_of_speech_filter)
  }

  if (!num_corpora) {
    corpus_filter <- -1
    child_id <- -1
  } else {
    corpus_filter <- corpora$corpus_id
  }

  if (!is.null(collection) | !is.null(corpus)) {
    content %<>% dplyr::filter(corpus_id %in% corpus_filter)
  }

  if (!is.null(target_child)) {
    content %<>% dplyr::filter(target_child_id %in% child_id)
  }

  if (!is.null(age)) {
    if (!(length(age) %in% 1:2)) stop("`age` argument must be of length 1 or 2")
    days <- age * avg_month
    if (length(age) == 1) days <- c(days, days + avg_month)
    days_1 <- days[1]
    days_2 <- days[2]
    content %<>% dplyr::filter(target_child_age >= days_1,
                               target_child_age <= days_2)
  }

  if (!is.null(sex)) {
    sex_filter <- sex
    content %<>% dplyr::filter(sex %in% sex_filter)
  }

  if (!is.null(role)) {
    content %<>% dplyr::filter(speaker_role %in% role)
  }

  if (!is.null(role_exclude)) {
    content %<>% dplyr::filter(!(speaker_role %in% role_exclude))
  }

  if (!is.null(language)) {
    language_filter <- language
    content %<>% dplyr::filter(language %in% language_filter)
  }

  content %<>%
    dplyr::mutate(target_child_age = target_child_age / avg_month)

  return(content)
}


#' Get tokens
#'
#' @inheritParams connect_to_childes
#' @inheritParams get_content
#' @param replace A boolean indicating whether to replace "gloss" with
#'   "replacement" (i.e. phonologically assimilated form), when available
#'   (defaults to \code{TRUE})
#'
#' @return A `tbl` of Token data, filtered down by supplied arguments. If
#'   `connection` is supplied, the result remains a remote query, otherwise it
#'   is retrieved into a local tibble.
#' @export
#'
#' @examples
#' \dontrun{
#' get_tokens(token = "dog")
#' }
get_tokens <- function(collection = NULL, language = NULL, corpus = NULL,
                       target_child = NULL, role = NULL, role_exclude = NULL,
                       age = NULL, sex = NULL, token, stem = NULL,
                       part_of_speech = NULL, replace = TRUE, connection = NULL,
                       db_version = "current", db_args = NULL) {

  if (missing(token))
    stop("Argument 'token' is missing. To fetch all tokens, supply '*' for ",
         "argument 'token'. Caution: this may result in a long-running query.")

  con <- resolve_connection(connection, db_version, db_args)
  if (is.null(con)) return()

  tokens <- get_content(content_type = "token",
                        collection = collection,
                        language = language,
                        corpus = corpus,
                        role = role,
                        role_exclude = role_exclude,
                        age = age,
                        sex = sex,
                        target_child = target_child,
                        token = token,
                        stem = stem,
                        part_of_speech = part_of_speech,
                        connection = con)

  if (replace) {
    tokens %<>%
      dplyr::mutate(gloss = if (replacement == "") gloss else replacement) %>%
      dplyr::select(-replacement)
  }

  if (is.null(connection)) {
    tokens %<>% dplyr::collect()
    DBI::dbDisconnect(con)
  }
  return(tokens)
}


#' Get types
#'
#' @inheritParams connect_to_childes
#' @inheritParams get_content
#' @param type A character vector of one or more type patterns (`%` matches any
#'   number of wildcard characters, `_` matches exactly one wildcard character)
#'
#' @return A `tbl` of Type data, filtered down by supplied arguments. If
#'   `connection` is supplied, the result remains a remote query, otherwise it
#'   is retrieved into a local tibble.
#' @export
#'
#' @examples
#' \dontrun{
#' get_types()
#' }
get_types <- function(collection = NULL, language = NULL, corpus = NULL,
                      role = NULL, role_exclude = NULL, age = NULL, sex = NULL,
                      target_child = NULL, type = NULL, connection = NULL,
                      db_version = "current", db_args = NULL) {

  con <- resolve_connection(connection, db_version, db_args)
  if (is.null(con)) return()

  types <- get_content(content_type = "token_frequency",
                       collection = collection,
                       language = language,
                       corpus = corpus,
                       role = role,
                       role_exclude = role_exclude,
                       age = age,
                       sex = sex,
                       target_child = target_child,
                       token = type,
                       connection = con)

  if (is.null(connection)) {
    types %<>% dplyr::collect()
    DBI::dbDisconnect(con)
  }
  return(types)
}

#' Get utterances
#'
#' @inheritParams get_participants
#' @param language A character vector of one or more languages
#'
#' @return A `tbl` of Utterance data, filtered down by supplied arguments. If
#'   `connection` is supplied, the result remains a remote query, otherwise it
#'   is retrieved into a local tibble.
#' @export
#'
#' @examples
#' \dontrun{
#' get_utterances(target_child = "Shem")
#' }
get_utterances <- function(collection = NULL, language = NULL, corpus = NULL,
                           role = NULL, role_exclude = NULL, age = NULL,
                           sex = NULL, target_child = NULL, connection = NULL,
                           db_version = "current", db_args = NULL) {

  con <- resolve_connection(connection, db_version, db_args)
  if (is.null(con)) return()

  utterances <- get_content(content_type = "utterance",
                            collection = collection,
                            language = language,
                            corpus = corpus,
                            role = role,
                            role_exclude = role_exclude,
                            age = age,
                            sex = sex,
                            target_child = target_child,
                            connection = con)

  if (is.null(connection)) {
    utterances %<>% dplyr::collect()
    DBI::dbDisconnect(con)
  }
  return(utterances)
}

#' Get the utterances surrounding a token(s)
#'
#' @inheritParams get_utterances
#' @inheritParams get_tokens
#' @param window A length 2 numeric vector of how many utterances before and
#'   after each utterance containing the target token to retrieve
#' @param remove_duplicates A boolean indicating whether to remove duplicate
#'   utterances from the results
#'
#' @return A 'tbl' of Utterance data, filtered down by supplied arguments.
#' @export
#'
#' @examples
#' \dontrun{
#' get_contexts(target_child = "Shem", token = "dog")
#' }
get_contexts <- function(collection = NULL, language = NULL, corpus = NULL,
                        role = NULL, role_exclude = NULL, age = NULL,
                        sex = NULL, target_child = NULL, token,
                        window = c(0, 0), remove_duplicates = TRUE,
                        connection = NULL, db_version = "current",
                        db_args = NULL) {

  con <- resolve_connection(connection, db_version, db_args)
  if (is.null(con)) return()

  token_utterances <- get_tokens(collection = collection,
                                 language = language,
                                 corpus = corpus,
                                 role = role,
                                 role_exclude = role_exclude,
                                 age = age,
                                 sex = sex,
                                 target_child = target_child,
                                 token = token,
                                 connection = con) %>%
    dplyr::pull(utterance_id)

  suppressMessages(
    utterances <- get_utterances(collection = collection,
                                 language = language,
                                 corpus = corpus,
                                 role = role,
                                 role_exclude = role_exclude,
                                 age = age,
                                 sex = sex,
                                 target_child = target_child,
                                 connection = con) %>%
      dplyr::rename(utterance_id = id)
  )

  utterance_orders <- utterances %>%
    dplyr::filter(utterance_id %in% token_utterances) %>%
    dplyr::select(transcript_id, utterance_order) %>%
    dplyr::collect()

  contexts <- purrr::map2_df(utterance_orders$transcript_id,
                             utterance_orders$utterance_order,
                             function(tid, index) {
    start <- index - window[1]
    end <- index + window[2]
    utterances %>%
      dplyr::filter(transcript_id == tid, utterance_order >= start,
                    utterance_order <= end) %>%
      dplyr::collect()
  })

  if (remove_duplicates) {
    contexts %<>% dplyr::distinct(transcript_id, utterance_id, .keep_all = TRUE)
  }

  if (is.null(connection)) {
    DBI::dbDisconnect(con)
  }
  return(contexts)
}


#' Run a SQL Query script on the CHILDES database
#'
#' @inheritParams connect_to_childes
#' @param sql_query_string A valid sql query string character
#' @param connection A connection to the CHILDES database
#'
#' @return The database after calling the supplied SQL query
#' @export
#'
#' @examples
#' \dontrun{
#' get_sql_query("SELECT * FROM collection")
#' }

get_sql_query <- function(sql_query_string, connection = NULL,
                          db_version = "current", db_args = NULL) {
  con <- resolve_connection(connection, db_version, db_args)
  if (is.null(con)) return()

  returned_sql_query <- dplyr::tbl(con, dplyr::sql(sql_query_string)) %>%
    dplyr::collect()
  if (is.null(connection)) {
    DBI::dbDisconnect(con)
  }
  return(returned_sql_query)
}
langcog/childesr documentation built on Feb. 1, 2022, 9:19 a.m.