R/Simulation.R

Defines functions simulatePlpData

Documented in simulatePlpData

# @file simulation.R
#
# Copyright 2020 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Create simulation profile
#
# @description
# \code{createplpDataSimulationProfile} creates a profile based on the provided plpData object, which
# can be used to generate simulated data that has similar characteristics.
#
# @param plpData   An object of type \code{plpData} as generated using \code{getDbplpData}.
#
# @details
# The output of this function is an object that can be used by the \code{simulateplpData} function to
# generate a plpData object.
#
# @return
# An object of type \code{plpDataSimulationProfile}.
#
# @export
##createPlpSimulationProfile <- function(plpData) {
##  writeLines("Computing covariate prevalence")  # (Note: currently assuming binary covariates)
##  sums <- bySumFf(plpData$covariates$covariateValue, plpData$covariates$covariateId)
##  covariatePrevalence <- sums$sums/nrow(plpData$cohorts)
##  attr(covariatePrevalence, "names") <- sums$bins
##  
##  
##  writeLines("Fitting outcome model(s)")
##  outcomeModels <- vector("list", length(plpData$metaData$outcomeIds))
##  for (i in 1:length(plpData$metaData$outcomeIds)) {
##    outcomeId <- plpData$metaData$outcomeIds[i]
##    model <- fitPredictiveModel(plpData = plpData,
##                                outcomeId = outcomeId,
##                                modelType = "poisson",
##                                prior = Cyclops::createPrior("laplace",
##                                                             exclude = c(0),
##                                                             useCrossValidation = TRUE),
##                                control = Cyclops::createControl(noiseLevel = "quiet",
##                                                                 cvType = "auto",
##                                                                 startingVariance = 0.001,
##                                                                 threads = 10))
##    model$coefficients <- model$coefficients[model$coefficients != 0]
##    outcomeModels[[i]] <- model$coefficients
##  }
##  
##  writeLines("Computing time distribution")
##  timePrevalence <- table(ff::as.ram(plpData$cohorts$time))/nrow(plpData$cohorts)
##  
##  if (!is.null(plpData$exclude)) {
##    writeLines("Computing prevalence of exlusion")
##    exclusionPrevalence <- table(ff::as.ram(plpData$exclude$outcomeId))/nrow(plpData$cohorts)
##  } else {
##    exclusionPrevalence <- NULL
##  }
##  result <- list(covariatePrevalence = covariatePrevalence,
##                 outcomeModels = outcomeModels,
##                 metaData = plpData$metaData,
##                 covariateRef = ff::as.ram(plpData$covariateRef),
##                 timePrevalence = timePrevalence,
##                 exclusionPrevalence = exclusionPrevalence)
##  class(result) <- "plpDataSimulationProfile"
##  return(result)
##}

#' Generate simulated data
#'
#' @description
#' \code{simulateplpData} creates a plpData object with simulated data.
#'
#' @param plpDataSimulationProfile   An object of type \code{plpDataSimulationProfile} as generated
#'                                   using the \cr\code{createplpDataSimulationProfile} function.
#' @param n                          The size of the population to be generated.
#'
#' @details
#' This function generates simulated data that is in many ways similar to the original data on which
#' the simulation profile is based. The contains same outcome, comparator, and outcome concept IDs,
#' and the covariates and their 1st order statistics should be comparable.
#'
#' @return
#' An object of type \code{plpData}.
#'
#' @export
simulatePlpData <- function(plpDataSimulationProfile, n = 10000) {
  # Note: currently, simulation is done completely in-memory. Could easily do batch-wise
  writeLines("Generating covariates")
  covariatePrevalence <- plpDataSimulationProfile$covariatePrevalence
  
  personsPerCov <- rpois(n = length(covariatePrevalence), lambda = covariatePrevalence * n)
  personsPerCov[personsPerCov > n] <- n
  rowId <- sapply(personsPerCov, function(x, n) sample.int(size = x, n), n = n)
  rowId <- do.call("c", rowId)
  covariateIds <- as.numeric(names(covariatePrevalence))
  covariateId <- sapply(1:length(personsPerCov),
                        function(x, personsPerCov, covariateIds) rep(covariateIds[x],
                                                                     personsPerCov[x]),
                        personsPerCov = personsPerCov,
                        covariateIds = covariateIds)
  covariateId <- do.call("c", covariateId)
  covariateValue <- rep(1, length(covariateId))
  covariateData <- Andromeda::andromeda(covariates = data.frame(rowId = rowId,
                                                                covariateId = covariateId,
                                                                covariateValue = covariateValue),
                                        covariateRef = plpDataSimulationProfile$covariateRef,
                                        analysisRef  = data.frame(analysisId = 1))
  
  class(covariateData) <- "CovariateData"
  
  writeLines("Generating cohorts")
  cohorts <- data.frame(rowId = 1:n, subjectId = 2e+10 + (1:n), cohortId = 1)
  breaks <- cumsum(plpDataSimulationProfile$timePrevalence)
  r <- runif(n)
  cohorts$time <- as.numeric(as.character(cut(r, breaks = c(0, breaks), labels = names(breaks))))
  cohorts$cohortStartDate <- sample(-1000:1000,n,replace=TRUE) + as.Date("2010-01-01")
  cohorts$daysFromObsStart <- sample(1:1000,n,replace=TRUE)
  cohorts$daysToCohortEnd <- sample(1:1000,n,replace=TRUE)
  cohorts$daysToObsEnd <- cohorts$daysToCohortEnd + sample(1:1000,n,replace=TRUE)
  cohorts$ageYear <- sample(0:95,n,replace=TRUE)
  cohorts$gender <- 8532 #female
  cohorts$gender[sample((1:nrow(cohorts)), nrow(cohorts)/2)] <- 8507
  
  writeLines("Generating outcomes")
  allOutcomes <- data.frame()
  for (i in 1:length(plpDataSimulationProfile$metaData$outcomeIds)) {
    prediction <- predictAndromeda(plpDataSimulationProfile$outcomeModels[[i]],
                                   cohorts,
                                   covariateData,
                                   modelType = "poisson")
    outcomes <- merge(prediction, cohorts[, c("rowId", "time")])
    outcomes$value <- outcomes$value * outcomes$time  #Value is lambda
    outcomes$outcomeCount <- as.numeric(rpois(n, outcomes$value))
    outcomes <- outcomes[outcomes$outcomeCount != 0, ]
    outcomes$outcomeId <- plpDataSimulationProfile$metaData$outcomeIds[i]
    outcomes$daysToEvent <- round(runif(nrow(outcomes), 0, outcomes$time))
    outcomes <- outcomes[, c("rowId", "outcomeId", "outcomeCount", "daysToEvent")]
    allOutcomes <- rbind(allOutcomes, outcomes)
  }
  
 covariateData$coefficients <- NULL
 
 # add indexes for covariate summary
 RSQLite::dbExecute(covariateData, "CREATE INDEX covsum_rowId ON covariates(rowId)")
 RSQLite::dbExecute(covariateData, "CREATE INDEX covsum_covariateId ON covariates(covariateId)")
 
  
  # Remove rownames else they will be copied to the ffdf objects:
  metaData = list(cohortIds = 1,
                  outcomeIds = 2:3,
                  call = list())#plpDataSimulationProfile$metaData
  
  #remove details from profile
  metaData$call$cdmDatabaseSchema = 'Profile'
  metaData$call$outcomeDatabaseSchema = NULL
  metaData$call$cohortDatabaseSchema = NULL
  metaData$call$connectionDetails = NULL
  metaData$call$outcomeTable = NULL
  metaData$call$cohortTable = NULL
  metaData$call$cdmVersion = 5
  metaData$call$covariateSettings = NULL

  metaData$cohortId = 1
  metaData$outcomeIds = c(2,3)
  metaData$studyStartDate = NULL
  metaData$studyEndDate = NULL
  metaData$attrition= data.frame(outcomeId=2,description='Simulated data', 
                                      targetCount=nrow(cohorts), uniquePeople=nrow(cohorts), 
                                      outcomes=nrow(outcomes))
  attr(cohorts, "metaData") <- metaData
  
  attr(covariateData, "metaData") <- list(populationSize = n)
  
  result <- list(cohorts = cohorts,
                 outcomes = allOutcomes,
                 covariateData = covariateData,
                 timeRef = NULL,
                 metaData = metaData)
  
  class(result) <- "plpData"
  return(result)
}
hxia/plp-git-demo documentation built on March 19, 2021, 1:54 a.m.