#' Read the input file from different sources
#'
#' @param input_format possible formats are [du.enum.input.format] (default = CSV)
#' @param input_path path for importfile
#'
#' @importFrom readr read_csv cols col_double
#' @importFrom haven read_dta read_sas read_spss
#'
#' @return dataframe with source data
#'
#' @noRd
du.read.source.file <- function(input_path, input_format) {
du_data <- NULL
if (input_format %in% du.enum.input.format()) {
if (input_format == du.enum.input.format()$STATA) {
data <- read_dta(input_path)
} else if (input_format == du.enum.input.format()$SPSS) {
data <- read_spss(input_path)
} else if (input_format == du.enum.input.format()$SAS) {
data <- read_sas(input_path)
} else if (input_format == du.enum.input.format()$R) {
data <- source(input_path)
} else {
data <- read_csv(input_path, show_col_types = FALSE)
}
} else {
stop(paste0(
input_format, " is not a valid input format, Possible input formats are: ",
paste(du.enum.input.format(), collapse = ", ")
))
}
return(data)
}
#' Get the table without rows containing only NA's.
#'
#' We have to remove the first column (child_id), that is generated always.
#'
#' @param dataframe dataframe to check
#'
#' @importFrom dplyr %>%
#'
#' @return dataframe without the na values
#'
#' @noRd
du.data.frame.remove.all.na.rows <- function(dataframe) {
if(ncol(dataframe) >= 1) {
df <- dataframe[-c(1)]
naLines <- df %>%
is.na() %>%
apply(MARGIN = 1, FUN = all)
return(df[!naLines, ])
} else {
return(list(0,0))
}
}
#'
#' Matched the columns in the source data.
#' You can then match the found column against the dictionary.
#'
#' @param data_columns columns obtained from raw data
#' @param dict_columns columns matched in the dictionary
#'
#' @importFrom stringr str_subset
#'
#' @return matched_columns in source data
#'
#' @noRd
du.match.columns <- function(data_columns, dict_columns) {
matched_columns <- character()
matched_columns <- data_columns[data_columns %in% dict_columns]
for (variable in dict_columns) {
matched_columns <- c(matched_columns, data_columns %>% str_subset(pattern = paste0("^",
variable, "\\d+",
sep = ""
)))
}
# Select the non-repeated measures from the full data set
return(matched_columns)
}
#'
#' Check if there are columns not matching the dictionary.
#'
#' @param dict_kind specify which dictionary you want to check
#' @param data_columns the coiumns within the data
#' @param run_mode default = NORMAL, can be TEST and NON_INTERACTIIVE
#'
#' @return stops the program if someone terminates
#'
#' @noRd
du.check.variables <- function(dict_kind, data_columns, run_mode) {
variables <- du.retrieve.dictionaries(dict_kind = dict_kind)
matched_columns <- du.match.columns(data_columns, variables$name)
columns_not_matched <- data_columns[!(data_columns %in% matched_columns)]
if (length(columns_not_matched) > 0) {
message(paste0("[WARNING] This is an unmatched column, it will be dropped : [ ", columns_not_matched, " ].", sep = '\n'))
if (run_mode != du.enum.run.mode()$NON_INTERACTIVE) {
proceed <- readline("Do you want to proceed (y/n)")
} else {
proceed <- "y"
}
} else {
proceed <- "y"
}
if (proceed == "n") {
message(paste0(columns_not_matched, sep = '\n'))
stop("Program is terminated. There are unmatched columns in your source data.")
}
}
#' Check for NA columns
#'
#' @param stripped variables without NA values
#' @param raw original variables
#' @param run_mode the run mode of the package
#'
#' @return stops the program if someone terminates
#'
#' @noRd
du.check.nas <- function(stripped, raw) {
# remove id column
# be advised the source data should not contain any column that is some sort of row_id
raw <- raw[-1]
variables_na <- raw[!(raw %in% stripped)]
if (length(variables_na) > 0) {
message(paste0("[WARNING] Variable dropped because completely missing: [ ", variables_na, " ]", sep = '\n'))
if (ds_upload.globals$run_mode != du.enum.run.mode()$NON_INTERACTIVE) {
proceed <- readline("Do you want to proceed (y/n)")
} else {
proceed <- "y"
}
} else {
proceed <- "y"
}
if (proceed == "n") {
message(paste0(variables_na, sep = '\n'))
stop("Program is terminated. There are columns in your source data that are completely missing.")
}
}
#' Generate the yearly repeated measures file and write it to your local workspace
#'
#' @param data data frame with all the data based upon the CSV file
#' @param dict_kind can be 'core' or 'outcome'
#'
#' @importFrom readr write_csv
#' @importFrom dplyr %>%
#' @importFrom readxl read_xlsx
#'
#' @noRd
du.reshape.generate.non.repeated <- function(data, dict_kind) {
message("* Generating: non-repeated measures")
# Retrieve dictionary
variables_non_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$NONREP, dict_kind)
# select the non-repeated measures from the full data set
non_repeated <- c("child_id", variables_non_repeated_dict$name)
non_repeated_measures <- data[, which(colnames(data) %in% non_repeated)]
# strip the rows with na values
stripped_non_repeated_measures <- non_repeated_measures[, colSums(is.na(non_repeated_measures)) <
nrow(non_repeated_measures)]
du.check.nas(colnames(stripped_non_repeated_measures), colnames(non_repeated_measures))
# add row_id again to preserve child_id
stripped_non_repeated_measures <- data.frame(
row_id = c(1:length(non_repeated_measures$child_id)),
non_repeated_measures
)
return(as.data.frame(stripped_non_repeated_measures))
}
#' Generate the yearly repeated measures file and write it to your local workspace
#'
#' @param data data frame with all the data based upon the CSV file
#' @param dict_kind can be 'core' or 'outcome'
#'
#' @importFrom readr write_csv
#' @importFrom dplyr %>% filter summarise bind_rows
#' @importFrom maditr dcast as.data.table %<>%
#' @importFrom tidyr gather
#'
#' @noRd
du.reshape.generate.yearly.repeated <- function(data, dict_kind) {
# workaround to avoid glpobal variable warnings, check:
# https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
orig_var <- value <- age_years <- . <- NULL
message("* Generating: yearly-repeated measures")
variables_yearly_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$YEARLY, dict_kind)
matched_columns <- du.match.columns(colnames(data), variables_yearly_repeated_dict$name)
yearly_repeated_measures <- data[matched_columns]
if (ncol(yearly_repeated_measures) <= 0 || nrow(du.data.frame.remove.all.na.rows(yearly_repeated_measures)) <= 0) {
message("[WARNING] No yearly-repeated measures found in this set")
return()
}
long_1 <- yearly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns !=
"child_id"], na.rm = TRUE)
# Create the age_years variable with the regular expression extraction of the year
long_1$age_years <- as.numeric(du.num.extract(long_1$orig_var))
# Here we remove the year indicator from the original variable name
long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var)
raw <- unique(gsub("[[:digit:]]+$", "", colnames(yearly_repeated_measures)))
du.check.nas(unique(long_1$variable_trunc), raw)
# Use the maditr package for spreading the data again, as tidyverse runs into memory
# issues
long_2 <- dcast(long_1, child_id + age_years ~ variable_trunc, value.var = "value")
# As the data table is still too big for opal, remove those rows, that have only
# missing values, but keep all rows at age_years=0, so no child_id get's lost:
# Subset of data with age_years = 0
zero_year <- long_2 %>% filter(age_years %in% 0)
for (id in unique(yearly_repeated_measures$child_id)) {
if (!(id %in% zero_year$child_id)) {
zero_year %<>% summarise(child_id = id, age_years = 0) %>% bind_rows(
zero_year,
)
}
}
# Subset of data with age_years > 0
later_year <- long_2 %>% filter(age_years > 0)
# Bind the 0 year and older data sets together
long_2 <- rbind(zero_year, later_year)
# Create a row_id so there is a unique identifier for the rows
long_2$row_id <- c(1:length(long_2$child_id))
# Arrange the variable names based on the original order
long_yearly <- long_2[, c("row_id", "child_id", "age_years", unique(long_1$variable_trunc))]
return(as.data.frame(long_yearly))
}
#' Generate the monthly repeated measures file and write it to your local workspace
#'
#' @param data data frame with all the data based upon the CSV file
#' @param dict_kind can be 'core' or 'outcome'
#'
#' @importFrom readr write_csv
#' @importFrom dplyr %>% filter summarise bind_rows
#' @importFrom maditr dcast as.data.table %<>%
#' @importFrom tidyr gather
#'
#' @noRd
du.reshape.generate.monthly.repeated <- function(data, dict_kind) {
# workaround to avoid glpobal variable warnings, check:
# https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
orig_var <- value <- age_months <- . <- NULL
message("* Generating: monthly-repeated measures")
variables_monthly_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$MONTHLY, dict_kind)
matched_columns <- du.match.columns(colnames(data), variables_monthly_repeated_dict$name)
monthly_repeated_measures <- data[, matched_columns]
if (ncol(monthly_repeated_measures) <= 0 || nrow(du.data.frame.remove.all.na.rows(monthly_repeated_measures)) <= 0) {
message("[WARNING] No monthly-repeated measures found in this set")
return()
}
long_1 <- monthly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns !=
"child_id"], na.rm = TRUE)
# Create the age_years and age_months variables with the regular expression
# extraction of the year
long_1$age_years <- as.integer(as.numeric(du.num.extract(long_1$orig_var)) / 12)
long_1$age_months <- as.numeric(du.num.extract(long_1$orig_var))
# Here we remove the year indicator from the original variable name
long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var)
raw <- unique(gsub("[[:digit:]]+$", "", colnames(monthly_repeated_measures)))
du.check.nas(unique(long_1$variable_trunc), raw)
# Use the maditr package for spreading the data again, as tidyverse ruins into memory
# issues
long_2 <- dcast(long_1, child_id + age_years + age_months ~ variable_trunc, value.var = "value")
# As the data table is still too big for opal, remove those rows, that have only
# missing values, but keep all rows at age_years=0, so no child_id get's lost:
# Subset of data with age_months = 0
zero_monthly <- long_2 %>% filter(age_months %in% 0)
for (id in unique(monthly_repeated_measures$child_id)) {
if (!(id %in% zero_monthly$child_id)) {
zero_monthly %<>% summarise(child_id = id, age_months = 0) %>% bind_rows(
zero_monthly,
)
}
}
# Subset of data with age_months > 0
later_monthly <- long_2 %>% filter(age_months > 0)
# Bind the 0 year and older data sets together
long_2 <- rbind(zero_monthly, later_monthly)
# Create a row_id so there is a unique identifier for the rows
long_2$row_id <- c(1:length(long_2$child_id))
# Arrange the variable names based on the original order
long_monthly <- long_2[, c("row_id", "child_id", "age_years", "age_months", unique(long_1$variable_trunc))]
return(as.data.frame(long_monthly))
}
#' Generate the weekly repeated measures file and write it to your local workspace
#'
#' @param data data frame with all the data based upon the CSV file
#' @param dict_kind can be 'core' or 'outcome'
#'
#' @importFrom readr write_csv
#' @importFrom dplyr %>% filter summarise bind_rows
#' @importFrom maditr dcast as.data.table %<>%
#' @importFrom tidyr gather
#'
#' @noRd
du.reshape.generate.weekly.repeated <- function(data, dict_kind) {
# workaround to avoid glpobal variable warnings, check:
# https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
orig_var <- value <- age_weeks <- . <- NULL # Gestational age in weeks
message("* Generating: weekly-repeated measures")
variables_weekly_repeated_dict <- du.retrieve.dictionaries(du.enum.table.types()$WEEKLY, dict_kind)
matched_columns <- du.match.columns(colnames(data), variables_weekly_repeated_dict$name)
weekly_repeated_measures <- data[, matched_columns]
if (ncol(weekly_repeated_measures) <= 0 || nrow(du.data.frame.remove.all.na.rows(weekly_repeated_measures)) <= 0) {
message("[WARNING] No weekly-repeated measures found in this set")
return()
}
long_1 <- weekly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns !=
"child_id"], na.rm = TRUE)
# Create the age_years and age_months variables with the regular expression
# extraction of the year NB - these weekly dta are pregnancy related so child is NOT
# BORN YET ---
long_1$age_years <- as.integer(as.numeric(du.num.extract(long_1$orig_var)) / 52)
long_1$age_weeks <- as.integer(du.num.extract(long_1$orig_var))
# Here we remove the year indicator from the original variable name
long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var)
raw <- unique(gsub("[[:digit:]]+$", "", colnames(weekly_repeated_measures)))
du.check.nas(unique(long_1$variable_trunc), raw)
# Use the maditr package for spreading the data again, as tidyverse ruins into memory
# issues
long_2 <- dcast(long_1, child_id + age_years + age_weeks ~ variable_trunc, value.var = "value")
# As the data table is still too big for opal, remove those rows, that have only
# missing values, but keep all rows at age_years=0, so no child_id get's lost:
# Subset of data with age_months = 0
zero_weekly <- long_2 %>% filter(age_weeks %in% 0)
for (id in unique(weekly_repeated_measures$child_id)) {
if (!(id %in% zero_weekly$child_id)) {
zero_weekly %<>% summarise(child_id = id, age_weeks = 0) %>% bind_rows(
zero_weekly,
)
}
}
# Subset of data with age_months > 0
later_weekly <- long_2 %>% filter(age_weeks > 0)
# Bind the 0 year and older data sets together
long_2 <- rbind(zero_weekly, later_weekly)
# Create a row_id so there is a unique identifier for the rows
long_2$row_id <- c(1:length(long_2$child_id))
# Arrange the variable names based on the original order
long_weekly <- long_2[, c("row_id", "child_id", "age_years", "age_weeks", unique(long_1$variable_trunc))]
return(as.data.frame(long_weekly))
}
#' Generate the trimesterly repeated measures file and write it to your local workspace
#'
#' @param data data frame with all the data based upon the CSV file
#' @param dict_kind can be 'core' or 'outcome'
#'
#' @importFrom readr write_csv
#' @importFrom dplyr %>% filter summarise bind_rows
#' @importFrom maditr dcast as.data.table %<>%
#' @importFrom tidyr gather
#'
#' @noRd
du.reshape.generate.trimesterly.repeated <- function(data, dict_kind) {
# workaround to avoid glpobal variable warnings, check:
# https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
orig_var <- value <- age_trimester <- . <- NULL
message("* Generating: trimesterly-repeated measures")
variables_trimesterly_repeated_dict <- du.retrieve.dictionaries(
du.enum.table.types()$TRIMESTER,
dict_kind
)
matched_columns <- du.match.columns(colnames(data), variables_trimesterly_repeated_dict$name)
trimesterly_repeated_measures <- data[, matched_columns]
if (ncol(trimesterly_repeated_measures) <= 0 || nrow(du.data.frame.remove.all.na.rows(trimesterly_repeated_measures)) <= 0) {
message("[WARNING] No trimesterly-repeated measures found in this set")
return()
}
long_1 <- trimesterly_repeated_measures %>% gather(orig_var, value, matched_columns[matched_columns !=
"child_id"], na.rm = TRUE)
# Create the age_years and age_months variables with the regular expression
# extraction of the year
long_1$age_trimester <- as.numeric(du.num.extract(long_1$orig_var))
# Here we remove the year indicator from the original variable name
long_1$variable_trunc <- gsub("[[:digit:]]+$", "", long_1$orig_var)
raw <- unique(gsub("[[:digit:]]+$", "", colnames(trimesterly_repeated_measures)))
du.check.nas(unique(long_1$variable_trunc), raw)
# Use the maditr package for spreading the data again, as tidyverse ruins into memory
# issues
long_2 <- dcast(long_1, child_id + age_trimester ~ variable_trunc, value.var = "value")
# As the data table is still too big for opal, remove those rows, that have only
# missing values, but keep all rows at age_years=0, so no child_id get's lost:
# Subset of data with age_months = 0
one_trimesterly <- long_2 %>% filter(age_trimester %in% 1)
for (id in unique(trimesterly_repeated_measures$child_id)) {
if (!(id %in% one_trimesterly$child_id)) {
one_trimesterly %<>% summarise(child_id = id, age_trimester = 1) %>% bind_rows(
one_trimesterly,
.
)
}
}
# Subset of data with age_months > 0
later_trimesterly <- long_2 %>% filter(age_trimester > 1)
long_2 <- rbind(one_trimesterly, later_trimesterly)
# Create a row_id so there is a unique identifier for the rows
long_2$row_id <- c(1:length(long_2$child_id))
# Arrange the variable names based on the original order
long_trimesterly <- long_2[, c("row_id", "child_id", "age_trimester", unique(long_1$variable_trunc))]
return(as.data.frame(long_trimesterly))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.