R/read_evavelo.R

Defines functions warning_empty_sheet read_compt_auto read_calendrier read_enquete read_comptage read_table_communes read_evavelo

Documented in read_calendrier read_comptage read_compt_auto read_enquete read_evavelo read_table_communes

#' Read and clean eva-velo xlsx file
#'
#' Read specific sheets from an xlsx object and perform some basic cleaning.
#'
#' @param file xlsx file, Workbook object or URL to xlsx file
#'
#'
#' @return an evadata object which is in fact a list of data.frames
#' @export

read_evavelo <- function(file){
  out <- list(
    calendrier = read_calendrier(file),
    table_communes = read_table_communes(file),
    comptage_init = read_comptage(file, init = TRUE),
    comptage = read_comptage(file),
    enquete_init = read_enquete(file, init = TRUE),
    enquete = read_enquete(file),
    comptages_automatiques = read_compt_auto(file)
  )

  ## Check if any DF is empty (except for comptages_automatiques)
  empty_dfs <- out %>%
    purrr::keep(is.data.frame) %>%
    purrr::map_lgl(is_empty_df)

  if(any(empty_dfs)){
    stop(
      "Des onglets n\'ont aucune donn\u00e9es"
    )
  }

  ## Add attributes to output (S3 Object)
  class(out) <- c("evadata", class(out))
  attr(out, "geocoded") <- FALSE
  out

}

#' Read and clean "table_communes" information
#'
#' Read a specific sheet of an xlsx object and perform some basic cleaning.
#'
#' @param file xlsx file, Workbook object or URL to xlsx file
#' @param sheet Name of the worksheet containing "table_communes" information.
#'
#' @return a data.frame
#' @keywords internal
read_table_communes <- function(file, sheet = "table_communes"){

  openxlsx::read.xlsx(file,
                      sheet,
                      startRow = 2) %>%  # We skip the first row that contains global information and not data.
    janitor::clean_names() %>%
    warning_empty_sheet(sheetname = sheet)
  # geocode_table_communes()

}



#' Read and clean "comptage" information
#'
#' Read a specific sheet of an xlsx object and perform some basic cleaning.
#'
#' @param file xlsx file, Workbook object or URL to xlsx file
#' @param init boolean to import the initial sheet instead of the _post_traitement
#'
#' @importFrom rlang .data
#'
#' @return a data.frame
#' @keywords internal
read_comptage <- function(file, init = FALSE){
  sheet <- ifelse(init, "comptages_manuels", "comptages_man_post_traitements")
  comptage <- openxlsx::read.xlsx(file, sheet)

  comptage %>%
    dplyr::select(dplyr::starts_with("[")) %>%  ## Don't take in account "old" col names that could create duplicated entries
    janitor::clean_names() %>%
    dplyr::mutate(
      dplyr::across(
        dplyr::starts_with("categorie"),
        as.character
      ),
      dplyr::across(
        dplyr::starts_with("categorie_visuelle"),
        ~ stringr::str_remove(.x, "s$") ## Remove s from plurar (common user mistake)
      )
    ) %>%
    dplyr::mutate(
      date_enq = openxlsx::convertToDate(.data$date_enq),
      id_quest = as.character(.data$id_quest)
    ) %>%
    warning_empty_sheet(sheetname = sheet)
}

#' Read and clean "enquete" information
#'
#' Read a specific sheet of an xlsx object and perform some basic cleaning.
#'
#' @param file xlsx file, Workbook object or URL to xlsx file
#' @param init boolean to import the initial sheet instead of the _post_traitement
#'
#' @importFrom rlang .data
#'
#' @return a data.frame
#' @keywords internal
read_enquete <- function(file, init = FALSE) {
  sheet <- ifelse(init, "enquetes_saisies", "enquetes_post_traitement")
  enquete <- openxlsx::read.xlsx(file, sheet)
  enquete <- enquete %>%
    janitor::clean_names() %>%
    dplyr::mutate( ## Deal with empty case that are considered as logical after import
      dplyr::across(dplyr::starts_with("type_"), as.character),
      dms = as.numeric(.data$dms)
    ) %>%
    dplyr::mutate(
      type_sortie = dplyr::case_when(
        type_sortie == "Demi journ\u00e9e" ~ "Demi-journ\u00e9e",
        type_sortie == "La journ\u00e9e" ~ "Journ\u00e9e",
        TRUE ~ type_sortie)
    ) %>%
    dplyr::mutate(
      type_trajet = dplyr::case_when(
        stringr::str_detect(type_trajet, "simple") ~ "Trajet simple",
        stringr::str_detect(type_trajet, "retour") ~ "Aller-retour",
        TRUE ~ type_trajet)
    ) %>%
    dplyr::mutate(
      date_enq = openxlsx::convertToDate(.data$date_enq),
      heure_enq = as.integer(.data$heure_enq),
      revenu = as.numeric(revenu),
      id_quest = as.character(.data$id_quest),
      pays_res = as.character(.data$pays_res),
      dms = dplyr::if_else(.data$dms != 0,.data$dms, NA_real_) ## Fix issue #44 when dms = 0
    ) %>%
    dplyr::mutate( ## Avoid leading or trailing whitespaces.
      dplyr::across(
        dplyr::all_of(c("nom_site_enq", "iti_arrivee_itineraire", "iti_depart_itineraire",
                        "ville_heb", "ville_res")),
        stringr::str_trim
      )
    ) %>%
    warning_empty_sheet(sheetname = sheet)

  enquete

}

#' Read and clean "calendrier" information
#'
#' Read a specific sheet of an xlsx object and perform some basic cleaning.
#'
#' @param file xlsx file, Workbook object or URL to xlsx file
#' @param sheet Name of the worksheet caontaining "calendrier" information.
#'
#' @return a data.frame
#' @keywords internal
read_calendrier <- function(file, sheet = "calendrier_sites"){

  calendrier <- openxlsx::read.xlsx(file,
                                    sheet,
                                    startRow = 2) %>%  # We skip the first row that contains global information and not data.
    dplyr::mutate(
      date_enq = openxlsx::convertToDate(.data$date_enq)
    ) %>%
    janitor::clean_names() %>%
    warning_empty_sheet(sheetname = sheet)

}

#' Read and clean "comptages_automatiques" information
#'
#' Read a specific sheet of an xlsx object and perform some basic cleaning.
#' Compute predictors that are later user for clustering classification
#'
#' @param file xlsx file, Workbook object or URL to xlsx file
#' @param sheet Name of the worksheet containing "comptages_automatiques" information.
#'
#' @return a data.frame
#' @export
read_compt_auto <- function(file, sheet = "comptages_automatiques"){
  # Load header data ----------------------------------------------------------------------------
  header_data <- openxlsx::read.xlsx(file,
                                     sheet,
                                     sep.names = " ",
                                     rows = 1:4)

  if(is.null(header_data)){
    warning("Aucune donn\u00e9e pr\u00e9sente dans l\'onglet comptages_automatiques")
    return(NULL)
  }

  header_data <- header_data %>%
    dplyr::select(-(1:9)) %>% ## start col is 10
    t() %>%
    dplyr::as_tibble(rownames = "site_name") %>%
    dplyr::rename_with(~ c("site_name", "id_site","id_channel", "name")) %>%
    dplyr::mutate(dplyr::across(where(is.character),
                                stringr::str_squish)) ## Remove repeated white spaces

  ##TODO add a test for uniqueness of id_channel.
  if(anyDuplicated(header_data$id_channel) != 0){
    warning("id_channels dupliqu\u00e9s dans l\'onglet comptages_automatiques")
    return(NULL)
  }


  # Load data -----------------------------------------------------------------------------------
  load_data <- openxlsx::read.xlsx(file,
                                   sheet,
                                   sep.names = " ",
                                   startRow = 4)

  if(nrow(load_data) == 0){
    warning("Aucune donn\u00e9e pr\u00e9sente dans l\'onglet comptages_automatiques")
    return(NULL)
  }

  load_data <- load_data%>%
    dplyr::select(-dplyr::any_of(c("date", "annee", "mois", "jour", "type_jour"))) %>%
    dplyr::rename(date = 1) %>%
    ## Remove non-existing date (like winter> summer time) before conversion
    dplyr::filter(dplyr::if_any(c(where(is.numeric), -date),
                                ~ !is.na(.x))) %>%
    ## Transform "x" to TRUE or FALSE
    dplyr::mutate(dplyr::across(where(is.character),
                                ~ !is.na(.x))) %>%
    ## Create useful variables
    dplyr::mutate(
      date = as.POSIXct(date*3600*24, tz = "GMT", origin = "1899-12-30"), ## to avoid BUG when converting long data using openxlsx
      week_end = lubridate::wday(.data$date) %in% c(1,7),
      july_august = lubridate::month(.data$date) %in% 7:8
    ) %>%
    tidyr::pivot_longer(where(is.numeric),
                        names_to = "name",
                        values_to = "count"
    ) %>%
    dplyr::left_join(header_data,
                     by = "name")

  # Check data produced to avoid inconsistency later on
  if(all(!c("date", "jour_ferie", "pont", "vacances", "week_end", "july_august",
            "name", "count", "site_name", "id_site", "id_channel") == names(load_data))){
    warning("Probl\u00e8me de format dans l\'onglet comptages_automatiques. Impossible d\'importer")
    return(NULL)
  }

  # Compute predictors --------------------------------------------------------------------------

  pred <- load_data %>%
    dplyr::group_by(site_name, id_site, id_channel, name) %>%
    dplyr::summarize(
      ## Weekday proportion excluding holiday (working period)
      pred_wd_wp = sum_prod(!vacances, !week_end, !pont, !jour_ferie, count) /
        sum_prod(!vacances, !jour_ferie, count),
      ## Weekday proportion during holiday
      pred_wd_ho = sum_prod(vacances, !week_end, count) / sum_prod(vacances, count),
      ## July August over total
      pred_jul_aug = sum_prod(july_august, count) / sum_prod(count),
      ## Pont 14 juillet et 15 aout dans fréquentation mois aout juillet
      pred_pont_jul_aug = sum_prod(july_august, pont, count) / sum_prod(july_august, count),
      ## Proportion of count after 17:00 and before 9:00 (on weekday / working period)
      pred_wp_17_9 = sum_prod(!dplyr::between(lubridate::hour(date),9,17), !vacances, !week_end, !pont, !jour_ferie, count) /
        sum_prod(!vacances, !week_end, !pont, !jour_ferie, count),
      ## Proportion of counts between 9h and 11h in week-end
      pred_we_09_11 = sum_prod(dplyr::between(lubridate::hour(date),9,11), week_end, count) /
        sum_prod(week_end, count),
      ## Proportion of missing data
      missing_perc = sum(is.na(count)) / dplyr::n(),
      n_missing_days = dplyr::n_distinct(as.Date(date)) - dplyr::n_distinct(as.Date(date[!is.na(count)])),
      .groups = "drop"
    )

  return(pred)
}

## Utils function to check for empty sheet and create a warning
warning_empty_sheet <- function(df, sheetname){
  if(is_empty_df(df))
    warning("L\'onglet ", sheetname, " n\'a aucune donn\u00e9e.",
            immediate. = TRUE,
            call. = FALSE)
  return(df)
}
JMPivette/evavelo documentation built on April 8, 2023, 4:20 p.m.