#' This function will generate all result files and plots.
#'
#' @param study_settings Object that contains all study settings inputted by the user.
#' @param databaseName Name of the database that will appear in the results.
#' @param studyName Name for the study corresponding to the current settings.
#' @param outputFolder Name of local folder to place results; make sure to use forward slashes (/).
#'
#' @export
generateResults <- function(study_settings, databaseName, studyName, outputFolder) {
# For all different study settings
settings <- colnames(study_settings)[grepl("analysis", colnames(study_settings))]
for (s in settings) {
studyName <- study_settings[study_settings$param == "studyName",s]
ParallelLogger::logInfo(print(paste0("Creating output: ", studyName)))
# Select cohorts included
targetCohortId <- study_settings[study_settings$param == "targetCohortId",s]
eventCohortIds <- study_settings[study_settings$param == "eventCohortIds",s]
# Result settings
maxPathLength <- as.integer(study_settings[study_settings$param == "maxPathLength",s]) # Maximum number of steps included in treatment pathway (max 5)
minCellCount <- as.integer(study_settings[study_settings$param == "minCellCount",s]) # Minimum number of persons with a specific treatment pathway for the pathway to be included in analysis
minCellMethod <- study_settings[study_settings$param == "minCellMethod",s] # Select to completely remove / sequentially adjust (by removing last step as often as necessary) treatment pathways below minCellCount
groupCombinations <- study_settings[study_settings$param == "groupCombinations",s] # Select to group all non-fixed combinations in one category 'other’ in the sunburst plot
addNoPaths <- study_settings[study_settings$param == "addNoPaths",s] # Select to include untreated persons without treatment pathway in the sunburst plot
path <- paste0(outputFolder, "/",studyName, "/", databaseName, "_", studyName)
temp_path <- paste0(getwd(),"/temp/", databaseName, "/", studyName, "/", databaseName, "_", studyName)
# Transform results for output
transformed_data <- transformTreatmentSequence(studyName = studyName, path = path, temp_path = temp_path, maxPathLength = maxPathLength, minCellCount = minCellCount)
if (!is.null(transformed_data)) {
file_noyear <- as.data.table(transformed_data[[1]])
file_withyear <- as.data.table(transformed_data[[2]])
# Compute percentage of people treated with each event cohort separately and in the form of combination treatments
outputPercentageGroupTreated(data = file_noyear, studyName = studyName, eventCohortIds = eventCohortIds, groupCombinations = TRUE, outputFolder = outputFolder, outputFile = paste(path,"_percentage_groups_treated_noyear.csv",sep=''))
outputPercentageGroupTreated(data = file_withyear, studyName = studyName, eventCohortIds = eventCohortIds, groupCombinations = TRUE, outputFolder = outputFolder, outputFile = paste(path,"_percentage_groups_treated_withyear.csv",sep=''))
# Compute step-up/down of asthma/COPD drugs
outputStepUpDown(file_noyear = file_noyear, path = path, targetCohortId = targetCohortId)
# Duration of era's
transformDuration(outputFolder = outputFolder, path = path, temp_path = temp_path, maxPathLength = maxPathLength, groupCombinations = TRUE, minCellCount = minCellCount)
# Save (censored) results file_noyear and file_year
saveTreatmentSequence(file_noyear = file_noyear, file_withyear = file_withyear, path = path, groupCombinations = groupCombinations, minCellCount = minCellCount, minCellMethod = minCellMethod)
file_noyear <- as.data.table(readr::read_csv(paste(path,"_file_noyear.csv",sep=''), col_types = readr::cols()))
file_withyear <- as.data.table(readr::read_csv(paste(path,"_file_withyear.csv",sep=''), col_types = readr::cols()))
# Treatment pathways sankey diagram
createSankeyDiagram(data = file_noyear, outputFolder = outputFolder, databaseName = databaseName, studyName = studyName, groupCombinations = TRUE)
# Treatment pathways sunburst plot
outputSunburstPlot(data = file_noyear, databaseName = databaseName, eventCohortIds = eventCohortIds, studyName = studyName, outputFolder=outputFolder, path=path, addNoPaths=addNoPaths, maxPathLength=maxPathLength, createInput=TRUE, createPlot=TRUE)
outputSunburstPlot(data = file_withyear, databaseName = databaseName, eventCohortIds = eventCohortIds, studyName = studyName, outputFolder=outputFolder, path=path, addNoPaths=addNoPaths, maxPathLength=maxPathLength, createInput=TRUE, createPlot=TRUE)
}
}
}
# Transformed treatment pathways files to use in generating output.
#
# Input:
# studyName Name for the study corresponding to the current settings.
# path Path to the output folder including data specific file name.
# temp_path Path to aggregated treatment pathways file of the target cohort in different rows.
# maxPathLength Maximum number of steps included in treatment pathway (max 5).
# minCellCount Minimum number of persons with a specific treatment pathway for the pathway to be included in analysis.
#
# Output: List with two dataframes.
transformTreatmentSequence <- function(studyName, path, temp_path, maxPathLength, minCellCount) {
file <- try(data.table(readr::read_csv(paste(temp_path,"_paths.csv",sep=''), col_types = readr::cols())), silent = TRUE)
if (class(file) == "try-error") {
ParallelLogger::logInfo(warning(paste0("Data is empty for study settings ", studyName)))
return (NULL)
}
# Apply maxPathLength and remove unnecessary columns
layers <- as.vector(colnames(file)[!grepl("index_year|freq", colnames(file))])
layers <- layers[1:maxPathLength]
columns <- c(layers, "index_year", "freq")
file <- file[,..columns]
# Summarize which non-fixed combinations occurring
findCombinations <- apply(file, 2, function(x) grepl("+", x, fixed = TRUE))
combinations <- as.matrix(file)[findCombinations == TRUE]
num_columns <- sum(grepl("event_cohort_name", colnames(file)))
freqCombinations <- matrix(rep(file$freq, times = num_columns), ncol = num_columns)[findCombinations == TRUE]
summaryCombinations <- data.table(combination = combinations, freq = freqCombinations)
summaryCombinations <- summaryCombinations[,.(freq=sum(freq)), by=combination][order(-freq)]
summaryCombinations <- summaryCombinations[freq >= minCellCount,]
write.csv(summaryCombinations, file=paste(path,"_combinations.csv",sep=''), row.names = FALSE)
# Group the resulting treatment paths
file_noyear <- file[,.(freq=sum(freq)), by=layers]
file_withyear <- file[,.(freq=sum(freq)), by=c(layers, "index_year")]
ParallelLogger::logInfo("transformTreatmentSequence done")
return(list(file_noyear, file_withyear))
}
# Input:
# data Dataframe with event cohorts of the target cohort in different columns.
# eventCohortIds IDs to refer to event cohorts.
# studyName Name for the study corresponding to the current settings.
# groupCombinations Select to group all non-fixed combinations in one category 'other’ in the sunburst plot.
# path Path to the output folder including data specific file name.
# outputFolder Path to the output folder.
# outputFile Name of output file.
outputPercentageGroupTreated <- function(data, eventCohortIds, studyName, groupCombinations, path, outputFolder, outputFile) {
if (is.null(data$index_year)) {
# For file_noyear compute once
result <- computePercentageGroupTreated(data, eventCohortIds, groupCombinations, outputFolder)
} else {
# For file_withyear compute per year
years <- unlist(unique(data[,"index_year"]))
results <- lapply(years, function(y) {
subset_data <- data[index_year == as.character(y),]
if (nrow(subset_data) != 0) {
subset_result <- cbind(y, computePercentageGroupTreated(subset_data, eventCohortIds, groupCombinations, outputFolder))
} else {
ParallelLogger::logInfo(warning(paste0("Subset of data is empty for study settings ", studyName, " in year ", y)))
subset_result <- NULL
}
return(subset_result)
})
result <- rbindlist(results)
result$y <- as.character(result$y)
}
write.csv(result, file=outputFile, row.names = FALSE)
ParallelLogger::logInfo("outputPercentageGroupTreated done")
}
# Help function to compute percentage of treated patient with certain event cohorts for outputPercentageGroupTreated.
computePercentageGroupTreated <- function(data, eventCohortIds, groupCombinations, outputFolder) {
layers <- as.vector(colnames(data)[!grepl("index_year|freq", colnames(data))])
cohorts <- readr::read_csv(paste(outputFolder, "/cohort.csv",sep=''), col_types = list("i", "c", "c", "c"))
outcomes <- c(cohorts$cohortName[cohorts$cohortType == "event"], "Other")
# Group non-fixed combinations in one group according to groupCobinations
data <- groupInfrequentCombinations(data, groupCombinations)
percentGroupLayer <- sapply(layers, function(l) {
percentGroup <- sapply(outcomes, function(g) {
sumGroup <- sum(data$freq[data[,..l] == g], na.rm = TRUE)
sumAllNotNA <- sum(data$freq[!is.na(data[,..l])])
return(sumGroup * 100.0 / sumAllNotNA)
})
})
# Add outcome names
result <- data.frame(outcomes, percentGroupLayer, stringsAsFactors = FALSE)
colnames(result) <- c("outcomes", layers)
rownames(result) <- NULL
paths_all <- sum(data$freq)
result$ALL_LAYERS <- sapply(outcomes, function(o) {
paths_with_outcome <- sum(sapply(1:nrow(data), function(r) ifelse(o %in% data[r,], data[r,freq], 0)))
return(paths_with_outcome * 100.0 / paths_all)
})
# Add rows for total, fixed combinations, all combinations
result <- rbind(result, c("Fixed combinations",colSums(result[grepl("\\&", result$outcomes), layers]), NA), c("All combinations",colSums(result[grepl("Other|\\+|\\&", result$outcomes), layers]), NA), c("Monotherapy",colSums(result[!grepl("Other|\\+|\\&", result$outcomes), layers]), NA))
result$ALL_LAYERS[result$outcomes == "Fixed combinations"] <- sum(sapply(1:nrow(data), function(r) ifelse(any(grepl("\\&", data[r,])), data[r,freq], 0))) * 100.0 / paths_all
result$ALL_LAYERS[result$outcomes == "All combinations"] <- sum(sapply(1:nrow(data), function(r) ifelse(any(grepl("Other|\\+|\\&", data[r,])), data[r,freq], 0))) * 100.0 / paths_all
result$ALL_LAYERS[result$outcomes == "Monotherapy"] <- sum(sapply(1:nrow(data), function(r) ifelse(any(!grepl("Other|\\+|\\&", data[r,])), data[r,freq], 0))) * 100.0 / paths_all
return(result)
}
# Input:
# file_noyear Dataframe with aggregated treatment pathways of the target cohort in different rows (unique pathways, with frequency).
# path Path to the output folder including data specific file name.
# targetCohortId ID to refer to target cohort.
outputStepUpDown <- function(file_noyear, path, targetCohortId) {
# Replace & signs by + (so that definitions match both)
file_noyear <- data.table(apply(file_noyear, 2, function(x)
{
x <- gsub("&", "+", x, fixed = TRUE)
# Order the combinations (again)
concept_names <- strsplit(x, split="+", fixed=TRUE)
x <- sapply(concept_names, function(c) paste(sort(c), collapse = "+"))
# Fill gaps with NA
x[x==""] <- NA
return(x)
}))
file_noyear$freq <- as.numeric(file_noyear$freq)
stepUpDown(file_noyear, path, targetCohortId, input = "guidelines")
stepUpDown(file_noyear, path, targetCohortId, input = "generalized rules")
ParallelLogger::logInfo("outputStepUpDown done")
}
# Implementation of step up/down definitions for asthma and COPD.
stepUpDown <- function(file, path, targetCohortId, input) {
if (input == "guidelines") { # GINA guidelines (2020).
def_updown <- readr::read_csv("inst/Settings/guidelines.csv", col_types = list("c", "c", "c", "c"))
# Order the definition columns from/to
from <- strsplit(def_updown$from, split="+", fixed=TRUE)
def_updown$from <- sapply(from, function(c) paste(sort(c), collapse = "+"))
to <- strsplit(def_updown$to, split="+", fixed=TRUE)
def_updown$to <- sapply(to, function(c) paste(sort(c), collapse = "+"))
def_groups <- as.vector(unique(def_updown$targetCohortIds))
# Define set of rules
done <- FALSE
counter <- 1
while(!done & counter <= length(def_groups)) {
if(targetCohortId %in% strsplit(def_groups[counter], split=",")[[1]]) {
done <- TRUE
group <- def_groups[counter]
} else {
counter <- counter + 1
}
}
if (!done) {
warning(paste0("No guidelines for augment/switching for target cohort ", targetCohortId))
} else {
# Select definitions for this target cohort
def_target <- def_updown[def_updown$targetCohortIds == group,]
def_target$targetCohortIds <- NULL
all_results <- data.frame()
# For each treatment layer
for (l in 1:(ncol(file)-2)) {
cols <- c(l, l+1, ncol(file))
subfile <- file[, ..cols]
colnames(subfile) <- c("from", "to", "freq")
result <- merge(subfile, def_target, by.x=c("from","to"), by.y=c("from","to"), all.x = TRUE)
# Remove paths of inviduals who already stopped treatment
result <- result[!is.na(result$from),]
# Define stop treatment
result$group[is.na(result$to)] <- 'stopped'
# Fill NA's with 'undefined'
result$group[is.na(result$group)] <- 'undefined'
# Compute augment/switch
result$freq <- as.integer(result$freq)
total <- sum(result$freq)
result <- result[,.(count = sum(freq), perc = round(sum(freq)*100/total,4)), by = "group"]
result$layer <- l
all_results <- rbind(all_results, result)
}
write.csv(all_results, paste(path,"_augmentswitch_guidelines.csv",sep=''), row.names = FALSE)
}
} else if (input == "generalized rules") { # Broad definition clinical experts
# Drug class levels
if (targetCohortId %in% c(1,4,5,3)) { # Asthma, ACO
class_level <- list("SABA" = 0,
"SAMA" = 0,
"Systemic B2" = 0,
"ICS" = 1,
"LTRA" = 1,
"PDE4" = 1,
"Xanthines" = 1,
"LABA" = 2,
"LAMA" = 2,
"Systemic glucocorticoids (therapy)" = 3,
"Systemic glucocorticoids (acute)" = 3,
"Anti IL5(R)" = 3,
"Anti IgE" = 3,
"Anti IL4R" = 3)
} else if (targetCohortId %in% c(2)) { # COPD
class_level <- list("SABA" = 0,
"SAMA" = 0,
"Systemic B2" = 0,
"LABA" = 1,
"LAMA" = 1,
"Xanthines" = 1,
"ICS" = 2,
"LTRA" = 2,
"PDE4" = 2,
"Systemic glucocorticoids (therapy)" = 3,
"Systemic glucocorticoids (acute)" = 3,
"Anti IL5(R)" = 3,
"Anti IgE" = 3,
"Anti IL4R" = 3)
}
all_results <- data.frame()
# For each treatment layer
for (l in 1:(ncol(file)-2)) {
cols <- c(l, l+1, ncol(file))
subfile <- file[, ..cols]
colnames(subfile) <- c("from", "to", "freq")
subfile[is.na(subfile)] <- "End"
# Start with NA labels
subfile$group <- "NA"
# Compute patient treatment level (sum of drug class levels)
subfile$from_level <- sapply(subfile$from, function(r) sum(unlist(sapply(unlist(strsplit(r, split="+", fixed=TRUE)), function(r_s) class_level[[r_s]]))))
subfile$to_level <- sapply(subfile$to, function(r) sum(unlist(sapply(unlist(strsplit(r, split="+", fixed=TRUE)), function(r_s) class_level[[r_s]]))))
# Cap at level 4 (above everything is considered switching)
subfile$from_level[subfile$from_level > 6] <- 6
subfile$to_level[subfile$to_level > 6] <- 6
# Assign labels based on higher/lower/same patient treatment level
subfile$group[subfile$from_level < subfile$to_level] <- "step_up_broad"
subfile$group[subfile$from_level > subfile$to_level] <- "step_down_broad"
subfile$group[subfile$from_level == subfile$to_level] <- "switching_broad"
# Exceptions: short period of systemic glucocorticoids (acute)
# Start exacerbation (from NO acute, to YES acute)
selection <- !stringr::str_detect(subfile$from, stringr::fixed("Systemic glucocorticoids (acute)")) & stringr::str_detect(subfile$to, stringr::fixed("Systemic glucocorticoids (acute)"))
ParallelLogger::logInfo(paste0("Start exacerbation: ", sum(selection)))
subfile$group[selection] <- "acute_exacerbation"
# End exacerbation (from YES acute, to NO acute)
selection <- stringr::str_detect(subfile$from, stringr::fixed("Systemic glucocorticoids (acute)")) & !stringr::str_detect(subfile$to, stringr::fixed("Systemic glucocorticoids (acute)"))
ParallelLogger::logInfo(paste0("End exacerbation: ", sum(selection)))
subfile$group[selection] <- "end_of_acute_exacerbation"
# Exceptions: some off label use
if (targetCohortId %in% c(1,4,5)) { # Asthma
# Off label (PDE4)
selection <- stringr::str_detect(subfile$to, stringr::fixed("PDE4"))
ParallelLogger::logInfo(paste0("Off label (asthma): ", sum(selection)))
subfile$group[selection] <- "off_label"
} else if (targetCohortId %in% c(2)) { # COPD
# Off label (LTRA, Anti IL5, Anti IgE)
selection <- stringr::str_detect(subfile$to, stringr::fixed("Anti IL5")) |
stringr::str_detect(subfile$to, stringr::fixed("Anti IgE")) |
stringr::str_detect(subfile$to, stringr::fixed("Anti IL4R"))
ParallelLogger::logInfo(paste0("Off label (COPD): ", sum(selection)))
subfile$group[selection] <- "off_label"
}
# Exceptions: non conform treatment
if (targetCohortId %in% c(1,4,5)) { # Asthma
# Non conform (therapy LABA/LAMA without ICS)
selection <- (stringr::str_detect(subfile$to, stringr::fixed("LABA")) & !stringr::str_detect(subfile$to, stringr::fixed("ICS"))) |
(stringr::str_detect(subfile$to, stringr::fixed("LAMA")) & !stringr::str_detect(subfile$to, stringr::fixed("ICS")))
ParallelLogger::logInfo(paste0("Non conform: ", sum(selection)))
subfile$group[selection] <- "non_conform"
}
# Remove paths of inviduals who already stopped treatment
subfile <- subfile[subfile$from != "End",]
# Define stop treatment
subfile$group[subfile$to == "End"] <- 'stopped'
# Fill NA's with 'undefined'
subfile$group[is.na(subfile$group)] <- 'undefined'
# Compute augment/switch
total <- sum(subfile$freq)
subfile <- subfile[,.(count = sum(freq), perc = round(sum(freq)*100/total,4)), by = "group"]
subfile$layer <- l
all_results <- rbind(all_results, subfile)
}
write.csv(all_results, paste(path,"_augmentswitch_generalized.csv",sep=''), row.names = FALSE)
}
}
# Input:
# outputFolder Path to the output folder.
# path Path to the output folder including data specific file name.
# temp_path Path to aggregated treatment pathways file of the target cohort in different rows.
# maxPathLength Maximum number of steps included in treatment pathway (max 5).
# groupCombinations Select to group all non-fixed combinations in one category 'other’ in the sunburst plot.
# minCellCount Minimum number of persons with a specific treatment pathway for the pathway to be included in analysis.
transformDuration <- function(outputFolder, path, temp_path, maxPathLength, groupCombinations, minCellCount) {
file <- data.table(readr::read_csv(paste(temp_path,"_drug_seq_processed.csv",sep=''), col_types = list("c", "i", "i", "D", "D", "i", "i", "c")))
# Remove unnessary columns
columns <- c("duration_era", "drug_seq", "event_cohort_name")
file <- file[,..columns]
file$duration_era <- as.numeric(file$duration_era)
# Apply maxPathLength
file <- file[drug_seq <= maxPathLength,]
# Group non-fixed combinations in one group according to groupCobinations
file <- groupInfrequentCombinations(file, groupCombinations)
result <- file[,.(AVG_DURATION=round(mean(duration_era),3), COUNT = .N), by = c("event_cohort_name", "drug_seq")][order(event_cohort_name, drug_seq)]
# Add column for total treated, fixed combinations, all combinations
file$total <- 1
file$fixed_combinations[grepl("\\&", file$event_cohort_name)] <- 1
file$all_combinations[grepl("Other|\\+|\\&", file$event_cohort_name)] <- 1
file$monotherapy[!grepl("Other|\\+|\\&", file$event_cohort_name)] <- 1
result_total_seq <- file[,.(drug_seq = "Overall", AVG_DURATION= round(mean(duration_era),2), COUNT = .N), by = c("event_cohort_name", "total")]
result_total_seq$total <- NULL
result_total_concept <- file[,.(event_cohort_name = "Total treated", AVG_DURATION= round(mean(duration_era),2), COUNT = .N), by = c("drug_seq", "total")]
result_total_concept$total <- NULL
result_fixed_combinations <- file[,.(event_cohort_name = "Fixed combinations", AVG_DURATION= round(mean(duration_era),2), COUNT = .N), by = c("drug_seq", "fixed_combinations")]
result_fixed_combinations <- result_fixed_combinations[!is.na(fixed_combinations),]
result_fixed_combinations$fixed_combinations <- NULL
result_all_combinations <- file[,.(event_cohort_name = "All combinations", AVG_DURATION=round(mean(duration_era),2), COUNT = .N), by = c("drug_seq", "all_combinations")]
result_all_combinations <- result_all_combinations[!is.na(all_combinations),]
result_all_combinations$all_combinations <- NULL
result_monotherapy <- file[,.(event_cohort_name = "Monotherapy", AVG_DURATION=round(mean(duration_era),2), COUNT = .N), by = c("drug_seq", "monotherapy")]
result_monotherapy <- result_monotherapy[!is.na(monotherapy),]
result_monotherapy$monotherapy <- NULL
results <- rbind(result, result_total_seq, result_total_concept, result_fixed_combinations, result_all_combinations, result_monotherapy)
# Add missing groups
cohorts <- readr::read_csv(paste(outputFolder, "/cohort.csv",sep=''), col_types = list("i", "c", "c", "c"))
outcomes <- c(cohorts$cohortName[cohorts$cohortType == "event"], "Other")
for (o in outcomes[!(outcomes %in% results$event_cohort_name)]) {
results <- rbind(results, list(o, "Overall", 0.0, 0))
}
# Remove durations computed using less than minCellCount observations
results[COUNT < minCellCount, c("AVG_DURATION", "COUNT")] <- NA
write.csv(results, paste(path,"_duration.csv",sep=''), row.names = FALSE)
ParallelLogger::logInfo("transformDuration done")
}
# Censored treatment pathways files to share.
#
# Input:
# file_noyear Dataframe with aggregated treatment pathways of the target cohort in different rows (unique pathways, with frequency).
# file_withyear Idem, but split over index_year.
# path Path to the output folder including data specific file name.
# groupCombinations Select to group all non-fixed combinations in one category 'other’ in the sunburst plot.
# minCellCount Minimum number of persons with a specific treatment pathway for the pathway to be included in analysis.
# minCellMethod Select to completely remove / sequentially adjust (by removing last step as often as necessary) treatment pathways below minCellCount.
saveTreatmentSequence <- function(file_noyear, file_withyear, path, groupCombinations, minCellCount, minCellMethod) {
# Group non-fixed combinations in one group according to groupCobinations
file_noyear <- groupInfrequentCombinations(file_noyear, groupCombinations)
file_withyear <- groupInfrequentCombinations(file_withyear, groupCombinations)
layers <- as.vector(colnames(file_noyear)[!grepl("index_year|freq", colnames(file_noyear))])
# Apply minCellCount by adjusting to other most similar path (removing last treatment in path) or else remove complete path
if (minCellMethod == "Adjust") {
col <- ncol(file_noyear) - 1
while (sum(file_noyear$freq < minCellCount) > 0 & col !=0) {
ParallelLogger::logInfo(paste("Change col ", col, " to NA for ", sum(file_noyear$freq < minCellCount), " paths with too low frequency (without year)"))
file_noyear[freq < minCellCount, col] <- NA
file_noyear <- file_noyear[,.(freq=sum(freq)), by=layers]
col <- col - 1
}
col <- ncol(file_withyear) - 2
while (sum(file_withyear$freq < minCellCount) > 0 & col !=0) {
ParallelLogger::logInfo(paste("Change col ", col, " to NA for ", sum(file_withyear$freq < minCellCount), " paths with too low frequency (with year)"))
file_withyear[freq < minCellCount,col] <- NA
file_withyear <- file_withyear[,.(freq=sum(freq)), by=c(layers, "index_year")]
col <- col - 1
}
# If path becomes completely NA -> add to "Other" group to distinguish from non-treated
file_noyear$event_cohort_name1[is.na(file_noyear$event_cohort_name1)] <- "Other"
file_noyear <- file_noyear[,.(freq=sum(freq)), by=layers]
file_withyear$event_cohort_name1[is.na(file_withyear$event_cohort_name1)] <- "Other"
file_withyear <- file_withyear[,.(freq=sum(freq)), by=c(layers, "index_year")]
}
ParallelLogger::logInfo(paste("Remove ", sum(file_noyear$freq < minCellCount), " paths with too low frequency (without year)"))
file_noyear <- file_noyear[freq >= minCellCount,]
ParallelLogger::logInfo(paste("Remove ", sum(file_withyear$freq < minCellCount), " paths with too low frequency (with year)"))
file_withyear <- file_withyear[freq >= minCellCount,]
summary_counts <- readr::read_csv(paste(path,"_summary_cnt.csv",sep=''), col_types = list("c", "i"))
summary_counts <- rbind(summary_counts, c("Total number of pathways (after minCellCount)", sum(file_noyear$freq)))
for (y in unique(file_withyear$index_year)) {
summary_counts <- rbind(summary_counts, c(paste0("Number of pathways (after minCellCount) in ", y), sum(file_withyear$freq[file_withyear$index_year == y])))
}
write.table(summary_counts,file=paste(path,"_summary_cnt.csv",sep=''), sep = ",", row.names = FALSE, col.names = TRUE)
write.csv(file_noyear, paste(path,"_file_noyear.csv",sep=''), row.names = FALSE)
write.csv(file_withyear, paste(path,"_file_withyear.csv",sep=''), row.names = FALSE)
ParallelLogger::logInfo("saveTreatmentSequence done")
}
# Input:
# data Dataframe with event cohorts of the target cohort in different columns.
# databaseName Name of the database that will appear in the results.
# eventCohortIds IDs to refer to event cohorts.
# studyName Name for the study corresponding to the current settings.
# outputFolder Path to the output folder.
# path Path to the output folder including data specific file name.
# groupCombinations Select to group all non-fixed combinations in one category 'other’ in the sunburst plot.
# addNoPaths Select to include untreated persons without treatment pathway in the sunburst plot.
# maxPathLength Maximum number of steps included in treatment pathway (max 5).
# createInput Whether the input for sunburst plot should be created.
# createPlot Whether the html for sunburst plot should be created.
outputSunburstPlot <- function(data, databaseName, eventCohortIds, studyName, outputFolder, path, addNoPaths, maxPathLength, createInput, createPlot) {
if (is.null(data$index_year)) {
# For file_noyear compute once
createSunburstPlot(data, databaseName, eventCohortIds, studyName, outputFolder, path, addNoPaths, maxPathLength, index_year = 'all', createInput, createPlot)
} else {
# For file_withyear compute per year
years <- unlist(unique(data[,"index_year"]))
for (y in years) {
subset_data <- data[index_year == as.character(y),]
if (nrow(subset_data) != 0) {
createSunburstPlot(subset_data, databaseName, eventCohortIds, studyName, outputFolder, path, addNoPaths, maxPathLength, index_year = y, createInput, createPlot)
} else {
ParallelLogger::logInfo(warning(paste0("Subset of data is empty for study settings ", studyName, " in year ", y)))
}
}
}
ParallelLogger::logInfo("outputSunburstPlot done")
}
# Help function to create input data for sunburst plot and the sunburst plot in HTML.
createSunburstPlot <- function(data, databaseName, eventCohortIds, studyName, outputFolder, path, addNoPaths, maxPathLength, index_year, createInput, createPlot){
if (createInput) {
inputSunburstPlot(data, path, addNoPaths, index_year)
}
if (createPlot) {
transformCSVtoJSON(eventCohortIds, outputFolder, path, index_year, maxPathLength)
# Load template HTML file
html <- paste(readLines("shiny/sunburst/sunburst.html"), collapse="\n")
# Replace @insert_data
input_plot <- readLines(paste(path,"_inputsunburst_", index_year, ".txt",sep=''))
html <- sub("@insert_data", input_plot, html)
# Replace @name
html <- sub("@name", paste0("(", databaseName, " ", studyName," ", index_year, ")"), html)
# Save HTML file as sunburst_@studyName
write.table(html,
file=paste0(outputFolder, "/", studyName, "/", "sunburst_", databaseName, "_", studyName,"_", index_year,".html"),
quote = FALSE,
col.names = FALSE,
row.names = FALSE)
}
}
# Help function to create transform data to correct format for sunburst plot.
inputSunburstPlot <- function(data, path, addNoPaths, index_year) {
layers <- as.vector(colnames(data)[!grepl("index_year|freq", colnames(data))])
transformed_file <- apply(data[,..layers],1, paste, collapse = "-")
transformed_file <- stringr::str_replace_all(transformed_file, "-NA", "")
transformed_file <- paste0(transformed_file, "-End")
transformed_file <- data.frame(path=transformed_file, freq=data$freq, stringsAsFactors = FALSE)
if (addNoPaths) {
summary_counts <- readr::read_csv(paste(path,"_summary_cnt.csv",sep=''), col_types = list("c","i"))
if (index_year == "all") {
noPath <- as.integer(summary_counts[summary_counts$index_year == "Number of persons in target cohort NA", "N"]) - sum(transformed_file$freq)
} else {
noPath <- as.integer(summary_counts[summary_counts$index_year == paste0("Number of persons in target cohort ", index_year), "N"]) - sum(transformed_file$freq)
}
transformed_file <- rbind(transformed_file, c("End", noPath))
}
transformed_file$path <- as.factor(transformed_file$path)
transformed_file$freq <- as.integer(transformed_file$freq)
transformed_file <- transformed_file[order(-transformed_file$freq, transformed_file$path),]
write.table(transformed_file, file=paste(path,"_inputsunburst_", index_year, ".csv",sep=''), sep = ",", row.names = FALSE)
}
# Help function to transform data in csv format to required JSON format for HTML.
transformCSVtoJSON <- function(eventCohortIds, outputFolder, path, index_year, maxPathLength) {
data <- readr::read_csv(paste(path,"_inputsunburst_", index_year, ".csv",sep=''), col_types = list("c", "i"))
cohorts <- readr::read_csv(paste(outputFolder, "/cohort.csv",sep=''), col_types = list("i", "c", "c", "c"))
outcomes <- c(cohorts$cohortName[cohorts$cohortType == "event"], "Other")
# Add bitwise numbers to define combination treatments
bitwiseNumbers <- sapply(1:length(outcomes), function(o) {2^(o-1)})
linking <- data.frame(outcomes,bitwiseNumbers)
# Generate lookup file
series <- sapply(1:nrow(linking), function (row) {
paste0('{ "key": "', linking$bitwiseNumbers[row] ,'", "value": "', linking$outcomes[row],'"}')
})
series <- c(series, '{ "key": "End", "value": "End"}')
lookup <- paste0("[", paste(series, collapse = ","), "]")
# Order names from longest to shortest to adjust in the right order
linking <- linking[order(-sapply(linking$outcomes, function(x) stringr::str_length(x))),]
# Apply linking
# Change all outcomes to bitwise number
updated_path <- sapply(data$path, function(p) {
stringi::stri_replace_all_fixed(p, replacement = as.character(linking$bitwiseNumbers), pattern = as.character(linking$outcomes), vectorize = FALSE)
})
# Sum the bitwise numbers of combinations (indicated by +)
updated_path <- sapply(updated_path, function(p) {
while(!is.na(stringr::str_extract(p, "[[:digit:]]+[+][[:digit:]]+"))) {
pattern <- stringr::str_extract(p, "[[:digit:]]+[+][[:digit:]]+")
p <- sub("[[:digit:]]+[+][[:digit:]]+", eval(parse(text=pattern)), p)
}
return(p)
})
transformed_csv <- cbind(oath = updated_path, freq = data$freq)
transformed_json <- buildHierarchy(transformed_csv, maxPathLength)
result <- paste0("{ \"data\" : ", transformed_json, ", \"lookup\" : ", lookup, "}")
file <- file(paste(path,"_inputsunburst_", index_year, ".txt",sep=''))
writeLines(result, file)
close(file)
}
# Help function to create hierarchical data structure.
buildHierarchy <- function(csv, maxPathLength) {
if (maxPathLength > 5) {
stop(paste0("MaxPathLength exceeds 5, currently not supported in buildHierarchy function."))
}
root = list("name" = "root", "children" = list())
# Create nested structure of lists
for (i in 1:nrow(csv)) {
sequence = csv[i,1]
size = csv[i,2]
parts = unlist(stringr::str_split(sequence,pattern="-"))
currentNode = root
for (j in 1:length(parts)) {
children = currentNode[["children"]]
nodeName = parts[j]
if (j < length(parts)) {
# Not yet at the end of the sequence; move down the tree
foundChild = FALSE
if (length(children) != 0) {
for (k in 1:length(children)) {
if (children[[k]]$name == nodeName) {
childNode = children[[k]]
foundChild = TRUE
break
}
}
}
# If we don't already have a child node for this branch, create it
if (!foundChild) {
childNode = list("name" = nodeName, "children" = list())
children[[nodeName]] <- childNode
# Add to main root
if (j == 1) {
root[['children']] <- children
} else if (j == 2) {
root[['children']][[parts[1]]][['children']] <- children
} else if (j == 3) {
root[['children']][[parts[1]]][['children']][[parts[2]]][['children']] <- children
} else if (j == 4) {
root[['children']][[parts[1]]][['children']][[parts[2]]][['children']][[parts[3]]][['children']] <- children
} else if (j == 5) {
root[['children']][[parts[1]]][['children']][[parts[2]]][['children']][[parts[3]]][['children']][[parts[4]]][['children']] <- children
}
}
currentNode = childNode
} else {
# Reached the end of the sequence; create a leaf node
childNode = list("name" = nodeName, "size" = size)
children[[nodeName]] <- childNode
# Add to main root
if (j == 1) {
root[['children']] <- children
} else if (j == 2) {
root[['children']][[parts[1]]][['children']] <- children
} else if (j == 3) {
root[['children']][[parts[1]]][['children']][[parts[2]]][['children']] <- children
} else if (j == 4) {
root[['children']][[parts[1]]][['children']][[parts[2]]][['children']][[parts[3]]][['children']] <- children
} else if (j == 5) {
root[['children']][[parts[1]]][['children']][[parts[2]]][['children']][[parts[3]]][['children']][[parts[4]]][['children']] <- children
}
}
}
}
# Remove list names
root <- suppressWarnings(stripname(root, "children"))
# Convert nested list structure to json
json <- rjson::toJSON(root)
return(json)
}
# Input:
# data Dataframe with event cohorts of the target cohort in different columns.
# outputFolder Path to the output folder.
# databaseName Name of the database that will appear in the results.
# studyName Name for the study corresponding to the current settings.
# groupCombinations Select to group all non-fixed combinations in one category 'other’ in the sunburst plot.
createSankeyDiagram <- function(data, outputFolder, databaseName, studyName, groupCombinations) {
# Group non-fixed combinations in one group according to groupCobinations
data <- groupInfrequentCombinations(data, groupCombinations)
# Define stop treatment
data[is.na(data)] <- 'Stopped'
# Sankey diagram for first three treatment layers
data$event_cohort_name1 <- paste0("1. ",data$event_cohort_name1)
data$event_cohort_name2 <- paste0("2. ",data$event_cohort_name2)
data$event_cohort_name3 <- paste0("3. ",data$event_cohort_name3)
results1 <- data %>%
dplyr::group_by(event_cohort_name1,event_cohort_name2) %>%
dplyr::summarise(freq = sum(freq))
results2 <- data %>%
dplyr::group_by(event_cohort_name2,event_cohort_name3) %>%
dplyr::summarise(freq = sum(freq))
# Format in prep for sankey diagram
colnames(results1) <- c("source", "target", "value")
colnames(results2) <- c("source", "target", "value")
links <- as.data.frame(rbind(results1,results2))
# Draw sankey network
plot <- googleVis::gvisSankey(links, from = "source", to = "target", weight = "value", chartid = 1)
write.table(plot$html$chart,
file=paste0(outputFolder, "/", studyName, "/", "sankeydiagram_", databaseName, "_", studyName,"_all.html"),
quote = FALSE,
col.names = FALSE,
row.names = FALSE)
ParallelLogger::logInfo("createSankeyDiagram done")
}
# Help function to group combinations
groupInfrequentCombinations <- function(data, groupCombinations) {
# Find all non-fixed combinations occurring
findCombinations <- apply(data, 2, function(x) grepl("+", x, fixed = TRUE))
# Group all non-fixed combinations in one group if TRUE
if (groupCombinations == "TRUE") {
data[findCombinations] <- "Other"
} else {
# Otherwise: group infrequent treatments below groupCombinations as "other"
combinations <- as.matrix(data)[findCombinations == TRUE]
num_columns <- sum(grepl("cohort_name", colnames(data)))
freqCombinations <- matrix(rep(data$freq, times = num_columns), ncol = num_columns)[findCombinations == TRUE]
summaryCombinations <- data.table(combination = combinations, freq = freqCombinations)
summaryCombinations <- summaryCombinations[,.(freq=sum(freq)), by=combination][order(-freq)]
summarizeCombinations <- summaryCombinations$combination[summaryCombinations$freq <= groupCombinations]
selectedCombinations <- apply(data, 2, function(x) x %in% summarizeCombinations)
data[selectedCombinations] <- "Other"
}
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.