# Copyright 2018 Observational Health Data Sciences and Informatics
#
# This file is part of DistributedRegressionEval
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#' @export
combineData <- function(studyFolder, dropColumns = FALSE, sampleSize = 100000) {
# sampleSize = 10000
folders <- list.files(studyFolder, include.dirs = TRUE)
outcomeIds <- c(3, 4, 5, 6)
for (outcomeId in outcomeIds) {
data <- data.frame()
columnsToDrop <- c()
for (folder in folders) {
if (dir.exists(file.path(studyFolder, folder)) && file.exists(file.path(studyFolder, folder, "data_o3.rds"))) {
file <- file.path(studyFolder, folder, sprintf("data_o%s.rds", outcomeId))
dataDb <- readRDS(file)
if (nrow(dataDb) > sampleSize) {
dataDb <- dataDb[sample.int(nrow(dataDb), sampleSize), ]
}
for (i in 1:ncol(dataDb)) {
if (sum(dataDb[, i]) == 0 && colnames(dataDb)[i] != "y") {
columnsToDrop <- c(columnsToDrop, colnames(dataDb)[i])
}
}
dataDb$database <- folder
data <- plyr::rbind.fill(data, dataDb)
}
}
for (i in 1:ncol(data)) {
if (any(is.na(data[, i]))) {
columnsToDrop <- c(columnsToDrop, colnames(data)[i])
}
}
if (length(columnsToDrop) > 0) {
if (dropColumns) {
writeLines(paste("Dropping columns:", paste(columnsToDrop, collapse = ", ")))
data <- data[, -which(colnames(data) %in% columnsToDrop)]
} else {
writeLines(paste("Columns", paste(columnsToDrop, collapse = ", "), "are all zero in one or more DBs"))
for (i in 1:ncol(data)) {
data[is.na(data[, i]), i] <- 0
}
}
}
saveRDS(data, file.path(studyFolder, sprintf("data_o%s.rds", outcomeId)))
}
}
createSummary <- function(studyFolder) {
# outcomeIds <- c(3, 4, 5, 6)
outcomeIds <- c(3, 5)
pathToCsv <- system.file("settings", "CohortsToCreate.csv", package = "DistributedRegressionEval")
cohortsToCreate <- read.csv(pathToCsv)
first <- TRUE
summaryStats <- NULL
require(dplyr)
for (outcomeId in outcomeIds) {
data <- readRDS(file.path(studyFolder, sprintf("data_o%s.rds", outcomeId)))
means <- data %>% select(-y, -age_in_years, -time) %>% group_by(database) %>% summarize_all(mean)
counts <- data %>% group_by(database) %>% summarize(n = n(),
outcomes = sum(y),
ageMin = min(age_in_years),
age25 = quantile(age_in_years, 0.25),
ageMedian = median(age_in_years),
age75 = quantile(age_in_years, 0.25),
ageMax = min(age_in_years))
counts <- rename(counts, !!paste0("o_", outcomeId) := outcomes)
if (first) {
summaryStats <- inner_join(means, counts)
first <- FALSE
} else {
newCols <- names(means)[!(names(means) %in% names(summaryStats))]
summaryStats <- inner_join(summaryStats, means %>% select(database, newCols))
summaryStats <- inner_join(summaryStats, counts %>% select(-n))
}
}
# summaryStats <- summaryStats %>% select(database, n, o_3, o_4, o_5, o_6, everything())
summaryStats <- summaryStats %>% select(database, n, o_3, o_5, everything())
write.csv(t(summaryStats), file.path(studyFolder, "Summary.csv"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.