RISE_FJ/Data_Dictionary/Dictionary_Creation.R

## First this is to loop through folder and bringing in file one at a time and
## creating the data dictionary. Then append the file with the next file in the
## folder. - Jeff F

library (tidyverse)
library (lubridate)
library (stringr)
library (readxl)
library (xlsx)
library (hablar)

rm(list = ls())

# First lets create the overview page.
setwd("Z:/Data Files/Practice Data")
tab1 <- read_xlsx (path="DataDictionarySchema.xlsx", col_names = FALSE)

setwd("Z:/Data Files/Practice Data/Data Dictionary")

## Now lets load all the files in the folder and the proper sheets - Jeff F
temp = list.files(pattern="*.xlsx")
temp5 = list.files(pattern="*.csv")

for (i in 1:length(temp)) {
  assign(paste0("survey_", temp[i]), readxl::read_excel(temp[i], sheet = "survey"))
}

for (i in 1:length(temp)) {
  assign(paste0("choices_", temp[i]), readxl::read_excel(temp[i], sheet = "choices"))
}

for (i in 1:length(temp)) {
  assign(paste0("settings_", temp[i]), readxl::read_excel(temp[i], sheet = "settings"))
}

for (i in 1:length(temp5)) {
  assign(temp5[i], read_csv(file = temp5[i]))
}

## Next lets create the data dictionary... gonna have to loop through all the
## different survey forms and input into just 1 file - Jeff
temp2 <- mget(ls(pattern="settings_*"))
temp3 <- mget(ls(pattern="survey_*"))
temp4 <- mget(ls(pattern ="choices_*"))
temp6 <- mget(ls(pattern ="*.csv"))
test <- NA
tab2_temp <- temp3
tab2_temp2 <- temp3
tab2 <- temp3
tab3 <- temp4

for (i in 1:length(temp)) {
  #fix column 3 name (it sometimes comes in differently):
  colnames(temp3[[i]])[3] <- 'label:English'

  #grab form title from temp2
  test <- temp2[[i]]$form_title

  #make start_date and end_date dates if the file exists:
  j <- match(paste0(temp2[[i]]$form_title, ".csv"), temp5)
  mintoday <- ifelse(is.na(j), NA, min(temp6[[j]]$today))
  maxtoday <- ifelse(is.na(j), NA, max(temp6[[j]]$today))

  #Lets create each table at a time
  tab2_temp[[i]] <- temp3[[i]] %>%
    mutate (survey_name = test, var_type = type, start_date = mintoday,
            end_date = maxtoday, comment = NA) %>%
    rename(var_label = type, variable_name = name, description = 'label:English') %>%
    select (survey_name,
      variable_name, var_label, var_type, description,
            start_date, end_date, comment) %>%
    add_row(survey_name = 'SurveyCTO name',
      variable_name = 'Name column in "survey" tab', var_label = 'in "type" column', var_type = 'factor, date, numeric, text …', description = 'label:English column - just paste in the question', start_date = 'Date column was initiated', end_date = 'Date column was discontinued', comment = 'Additional information for user', .before = 1)

  #Then move that table into a group of tables called tab2
  tab2[[i]] <- tab2_temp[[i]] %>% filter (var_label != 'note' & var_label != 'end group' &
                                var_label != 'begin group' & var_label != 'end repeat' &
                                var_label != 'begin repeat') %>%
    mutate (var_type = ifelse(var_type == 'start', 'date', var_type),
            var_type = ifelse(var_type == 'end', 'date', var_type),
            var_type = ifelse(var_type == 'calculate', 'numeric', var_type),
            var_type = ifelse(var_type == 'integer', 'numeric', var_type),
            var_type = ifelse(var_type == 'calculate', 'numeric', var_type),
            var_type = ifelse(var_type == 'text', 'string', var_type),
            var_type = ifelse(grepl('select_one', var_type), 'categorical (select one)', var_type),
            var_type = ifelse(grepl('select_multiple', var_type), 'categorical (select multiple)', var_type),
            var_type = ifelse(var_type == 'barcode', 'string', var_type)) %>%
    mutate (var_type = ifelse(variable_name == 'today', 'date', var_type))

}

## Need to do tab 3: the factor values next:
for (i in length(temp)) {
  colnames(temp4[[i]])[3] <- 'label:English'

  tab3[[i]] <- temp4[[i]] %>%
    mutate (start_date = NA, end_date = NA) %>%
    rename (dictionary_id = list_name, factor_value = value, factor_label = 'label:English') %>%
    select (dictionary_id, factor_value, factor_label, start_date, end_date)
}


## Finally all I need to do is output the data dictionary! Almost done!
## wow this runs super slow... took 45 minutes to complete 6 files (network is
## running slow today though)

for (i in seq_along(temp) ) {
  write.xlsx(as.data.frame(tab1), file = paste0("Z:/Data Files/Practice Data/Data Dictionary/Output/Data_dictionary_", tab2[[i]]$survey_name[2], ".xlsx"), sheetName = "Data Dictionary-overview", row.names = FALSE, col.names = FALSE, showNA=FALSE)
  write.xlsx(as.data.frame(tab2[[i]]), file = paste0("Z:/Data Files/Practice Data/Data Dictionary/Output/Data_dictionary_", tab2[[i]]$survey_name[2], ".xlsx"), sheetName = "Data Dictionary-1", append=TRUE, row.names = FALSE, showNA=FALSE)
  write.xlsx(as.data.frame(tab3[[i]]), file = paste0("Z:/Data Files/Practice Data/Data Dictionary/Output/Data_dictionary_", tab2[[i]]$survey_name[2], ".xlsx"), sheetName = "Data Dictionary-2", append=TRUE, row.names = FALSE, showNA=FALSE)
}

# Extra: need to do formatting but will probably not have enough time for that
Monash-RISE/riseR documentation built on Dec. 11, 2019, 9:49 a.m.