R/analyze_gesis_file.R

#' Analyze a GESIS file
#'
#' There is a rather cumbersome exception handling with not-unique
#' GESIS variable names, which occur when certain questions are only asked
#' in some geographical units, such as the Turkish community of Cyprus.
#' In these cases, GESIS may or may not give unique variable names. Some
#' futher problems may arrise, currently the code detects this problem.
#' @param gesis_file The full path to the GESIS SPSS file or a data.frame
#' read by haven consisting the contents of the file.
#' @param see_log  \code{TRUE} which will print messages to the screen.
#' @param create_log  It will create log files in the sr_logs director.
#' @param my_treshold  Can be \code{futile.logger::WARN},
#' \code{futile.logger::INFO}, \code{futile.logger::ERROR}.
#' @param log_prefix Defaults to \code{NA}, in which case a
#' new one will be assigned to the logs (if they are requested.) The
#' log_prefix can be directly assigned.
#' @param log_id Defaults to \code{NA}, in which case a
#' new one will be assigned to the logs (if they are requested.) The
#' log_prefix can be directly assigned.
#' @importFrom futile.logger flog.threshold flog.appender flog.remove
#' @importFrom futile.logger flog.info flog.warn flog.error WARN INFO
#' @importFrom futile.logger appender.file
#' @importFrom haven read_spss as_factor
#' @importFrom stringr str_split
#' @importFrom utils write.csv
#' @examples
#' \dontrun{
#' ##use your own file path:
#' analyse_gesis_file( path = " ... your own file path ... ",
#'                    see_log = TRUE,
#'                    create_log = TRUE,
#'                    my_treshold = futile.logger::INFO)
#' }
#' @export



analyze_gesis_file <- function ( gesis_file,
                                 see_log = TRUE,
                                 create_log = TRUE,
                                 log_prefix = NA,
                                 log_id = NA,
                                 my_treshold = futile.logger::WARN ) {

  df <- gesis_name <- n <- spss_name <- na_count <- . <- NULL
  suggested_name <- suggested_class <- NULL
  treshold <- futile.logger::flog.threshold(my_treshold)
  directory_message <- NA

  if (create_log == TRUE) {
    directory_mssage <- NA
    if (! file.exists("sr_logs")) {
      dir.create(file.path(paste0(getwd(), "/sr_logs")))
      directory_message <- paste("Created:\n", getwd(), "/sr_logs")

    }
    if (is.na(log_prefix)) {
      log_prefix <- paste0("sr_logs/", format(Sys.time(),
                                              "%Y_%b_%d_ %H_%M"))
    }
    if (is.na(log_id)) {
      log_id <- "_gesis_analysis_"
    }
    info <- futile.logger::flog.appender(
      futile.logger::appender.file(paste0(log_prefix,log_id, "info.log")), name="info")
    warning <- futile.logger::flog.appender(
      futile.logger::appender.file(paste0(log_prefix,log_id, "warning.log")), name="warning")
    error <- futile.logger::flog.appender(
      futile.logger::appender.file(paste0(log_prefix,log_id, "error.log")), name="error")
    if (!is.na(directory_message)) {
      futile.logger::flog.info ( directory_message,
                                 name="info")
      futile.logger::flog.info ( directory_message)
    }

  }

  tryCatch(
    {

      if ( "character" %in% class(gesis_file) ) {
        insert_file_name <- gesis_file
        read_df <- haven::read_spss(gesis_file)
        read_message <- paste0("Reading ", gesis_file)
        if (see_log)    futile.logger::flog.info(read_message)
        if (create_log) futile.logger::flog.info(read_message, name  ="info")

      } else if (
        ! "data.frame"  %in% class(gesis_file)
      ) {
        stop("Parameter gesis_file must be a pre-imported file or a full path.")
      } else {
        read_df <- gesis_file
        df_message <- paste0("Inputed a pre-existing data.frame with ",
                             ncol( read_df), " variables.")
          if (see_log)    futile.logger::flog.info(df_message)
          if (create_log) futile.logger::flog.info(df_message, name  ="info")
          insert_file_name <- "the data frame"
        }

      read_message <- paste0("Analyzing the GESIS file.")
      if (see_log)    futile.logger::flog.info(read_message)
      if (create_log) futile.logger::flog.info(read_message, name  ="info")

      spss_metadata <- data.frame (
        gesis_name = vector (mode = "character", length = ncol(read_df)),
        spss_name = vector (mode = "character", length = ncol(read_df)),
        suggested_name = vector (mode = "character", length = ncol(read_df)),
        suggested_class = vector (mode = "character", length = ncol(read_df)),
        label_values = vector (mode = "character", length = ncol(read_df)),
        questionnaire_item = vector (mode = "character", length = ncol(read_df)),
        spss_class = vector (mode = "character", length = ncol(read_df)),
         stringsAsFactors = FALSE
      )

      if (see_log)    futile.logger::flog.info("Getting SPSS labels")
      if (create_log) futile.logger::flog.info("Getting SPSS labels", name  ="info")
      spss_metadata$gesis_name <- sjlabelled::get_label (read_df)

      if (see_log)    futile.logger::flog.info("Getting SPSS names")
      if (create_log) futile.logger::flog.info("Getting SPSS names", name  ="info")
      spss_metadata$spss_name <- names (read_df)

      if (see_log)    futile.logger::flog.info("Getting SPSS classes")
      if (create_log) futile.logger::flog.info("Getting SPSS classes", name  ="info")
      spss_metadata$spss_class <- vapply(read_df, class, character(1))

      if (see_log)    futile.logger::flog.info("Getting unique labels")
      if (create_log) futile.logger::flog.info("Getting unique labels", name  ="info")
      spss_metadata$label_values <- vapply ( read_df, surveyreader::unique_value_labels, character(1) )

      if (see_log)    futile.logger::flog.info("Suggesting conversions.")
      if (create_log) futile.logger::flog.info("Suggesting conversions.",
                                               name  ="info")
      spss_metadata$suggested_class <- vapply ( read_df, surveyreader::suggest_variable_conversion, character(1) )


      suggest_message <- paste0("Creating the suggested variables names.")
      if (see_log)    futile.logger::flog.info(suggest_message)
      if (create_log) futile.logger::flog.info(suggest_message,
                                               name  ="info")

      a <- tolower(as.character(spss_metadata$gesis_name))
      a <- gsub ( "%", "pct", a)
      a <- gsub ( "(spont)", "_spont", a)
      a <- gsub ( "(sum)", "_sum", a)
      a <- gsub ( "(summarized)", "_sum", a)
      a <- gsub ( "(summarised)", "_sum", a)
      a <- gsub ( "(recoded)", "_rec", a)
      a <- gsub ( "(rec)", "_rec", a)
      a <- gsub( " - ", ": ", a)
      a <- gsub( "\\s", "_", a)
      a <- gsub( ":_", ":", a)
      a <- gsub ( "/", "_", a)
      a <- gsub ( "\\(", "", a)
      a <- gsub ( ")", "", a)
      a <- gsub ( "\\+", "_", a)
      a <- gsub ( "\\&", "", a)
      a <- gsub ( "___", "_", a)
      a <- gsub ( "__", "_", a)

      spss_metadata$suggested_name = gsub ( ":", "_", a)

      naming_exc_message <- paste0("Reviewing exceptions with get_naming_exception()")
      if (see_log)    futile.logger::flog.info( naming_exc_message )
      if (create_log) futile.logger::flog.info( naming_exc_message , name  ="info")

      naming_exceptions <- get_naming_exceptions()  #must not be factors
      for (i in 1:length(naming_exceptions)) {
        spss_metadata$suggested_name <- ifelse (
          spss_metadata$gesis_name == naming_exceptions$exact[i],
          naming_exceptions$new_name[i],
          spss_metadata$suggested_name)
      }

      spss_metadata <- spss_metadata %>%
        mutate ( suggested_name = ifelse (
          spss_name == "split",
          yes = "split", no = suggested_name)) %>%
        mutate ( suggested_name = ifelse (
          spss_name == "uniqid",
          yes = "uniqid", no = suggested_name)) %>%
        mutate ( suggested_name = ifelse (
          suggested_class == "multiple_choice",
          yes = paste0("mc_", suggested_name ),
          no = suggested_name)) %>%
        mutate ( suggested_name = gsub("_10p-scale", "", suggested_name)) %>%
        mutate ( suggested_name = gsub("aged_<10", "aged_10m", suggested_name )) %>%
        mutate ( suggested_name = gsub("_aged_15_", "aged_15p", suggested_name )) %>%
        mutate ( suggested_name = gsub("aged_10-14", "aged_10_14", suggested_name )) %>%
        mutate ( suggested_name = ifelse( spss_name == "wex",
                                         yes = "wex",
                                         no = suggested_name )) %>%
        mutate ( suggested_name = ifelse ( tolower(spss_name) == "w1",
                                           yes = "w1",
                                           no = suggested_name ))

      spss_metadata <- spss_metadata %>%
        dplyr::add_count ( gesis_name )

      if ( any (spss_metadata$n > 1 ) ) {
        split_message <- paste0("There is a split in the questionnaire with not unique names in the GESIS file.")
        if (see_log)    futile.logger::flog.warn(split_message)
        if (create_log) futile.logger::flog.warn(split_message,
                                                 name  ="warning")

        spss_metadata <- spss_metadata %>%
          dplyr:: mutate ( na_count = ifelse ( n > 1,
                                       spss_name,
                                       "" ))
        count_answers <- function(x) {
            if (x == "") return ( as.numeric(NA) )
            df_sub <- read_df[, which(names(read_df) == x) ]
            as.numeric(sum (! is.na(df_sub )))
        }

        spss_metadata_exc <- spss_metadata

        spss_metadata_exc$na_count <- vapply(
            spss_metadata$na_count,
            count_answers, numeric(1))  ##to determine additional questionnaire sample size


        spss_metadata_exc <-  spss_metadata_exc %>%
          dplyr::mutate ( na_count = ifelse(is.na(na_count),
                                     max(na_count, na.rm=TRUE),
                                     na_count)) %>%
          dplyr::mutate ( suggested_name = ifelse ( na_count < 12000,
                                               paste0(suggested_name, "_split_b"),
                                               suggested_name )) %>%
          dplyr::select ( -na_count, -n) %>%
          dplyr::add_count ( suggested_name )

        if ( any (spss_metadata_exc$n > 1)) {
          unknow_naming_error_message <- paste0("Unknown naming error in\n", gesis_file,
                                                "\nNaming problem exported to naming_problem.csv.")
          if (see_log)    futile.logger::flog.error(unknow_naming_error_message)
          if (create_log) futile.logger::flog.error(unknow_naming_error_message,
                                                   name  ="error")
          utils::write.csv(spss_metadata_exc, "naming_problem.csv")
          stop ( unknow_naming_error_message )
        } else {
          spss_metadata <- spss_metadata_exc
        }
      }

      spss_metadata <- dplyr::select (spss_metadata, -n)

    },
    error=function(cond) {
      gesis_analysis_error <- paste0("Not successful -> analyze_gesis_file, ", cond)
      warning(gesis_analysis_error )
      futile.logger::flog.error(gesis_analysis_error, name="error")
      futile.logger::flog.info(gesis_analysis_error, name="info")
    },
    warning=function(cond) {
      gesis_analysis_warning <- paste0("Warning -> analyze_gesis_file, ", cond)
      futile.logger::flog.warn(gesis_analysis_warning, name="warning")
      futile.logger::flog.info(gesis_analysis_warning, name="info")
    },
    finally = {
      finished_message <- paste0("Finished with the analysis of the file\n",
                                 insert_file_name,
                                 "\n with ", nrow(read_df), " observations in ",
                                 ncol(read_df), " variables.")
      if (see_log) futile.logger::flog.info (finished_message)
      futile.logger::flog.info ( finished_message,
                                 name="info")
    }
  )

  return (spss_metadata)
}
antaldaniel/surveyreader documentation built on May 16, 2019, 2:29 a.m.