R/mockPatientProfiles.R

Defines functions mockDisconnect addDate mockPatientProfiles

Documented in mockDisconnect mockPatientProfiles

# Copyright 2024 DARWIN EU (C)
#
# This file is part of PatientProfiles
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' It creates a mock database for testing PatientProfiles package
#'
#' @param con A DBI connection to create the cdm mock object.
#' @param writeSchema Name of an schema on the same connection with writing
#' permisions.
#' @param numberIndividuals Number of individuals to create in the cdm
#' reference.
#' @param ... User self defined tables to put in cdm, it can input as many
#' as the user want.
#' @param seed A number to set the seed. If NULL seed is not used.
#'
#' @return A mock cdm_reference object created following user's specifications.
#'
#' @export
#'
#' @examples
#' \donttest{
#' library(PatientProfiles)
#' library(CDMConnector)
#'
#' cdm <- mockPatientProfiles()
#'
#' mockDisconnect(cdm = cdm)
#' }
#'
mockPatientProfiles <- function(con = NULL,
                                writeSchema = NULL,
                                numberIndividuals = 10,
                                ...,
                                seed = NULL) {
  if (is.null(con)) {
    rlang::check_installed("duckdb")
    con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
  }
  if (!inherits(con, "DBIConnection")) {
    cli::cli_abort(c("!" = "`con` must be a DBI connection"))
  }
  if (is.null(writeSchema) & inherits(con, "duckdb_connection")) {
    writeSchema <- "main"
  }
  if (!is.null(seed)) {
    set.seed(seed = seed)
  }

  # Put ... into a list
  tables <- list(...)
  omopgenerics::assertList(tables, named = TRUE, class = "data.frame")

  # get persons
  if (length(tables) == 0) {
    persons <- seq_len(numberIndividuals)
  } else {
    persons <- numeric()
    for (k in seq_along(tables)) {
      x <- tables[[k]]
      if ("person_id" %in% colnames(x)) {
        persons <- c(persons, x[["person_id"]])
      } else if ("subject_id" %in% colnames(x)) {
        persons <- c(persons, x[["subject_id"]])
      }
    }
    persons <- unique(persons)
  }
  n <- length(persons)

  # create person table
  if (!"person" %in% names(tables)) {
    tables[["person"]] <- dplyr::tibble(
      "person_id" = persons,
      "gender_concept_id" = as.integer(sample(c(8532, 8507), n, TRUE)),
      "year_of_birth" = 1900L + sample.int(80, n, TRUE),
      "race_concept_id" = 0L,
      "ethnicity_concept_id" = 0L
    )
  }

  # get dates
  dates <- dplyr::tibble("person_id" = integer(), "date" = as.Date(character()))
  for (k in seq_along(tables)) {
    x <- tables[[k]]
    cols <- colnames(x)
    id <- c("person_id", "subject_id")
    id <- id[id %in% cols]
    if (length(id) == 1) {
      colDates <- cols[grepl("_date", cols)]
      for (i in seq_along(colDates)) {
        dates <- dates |>
          dplyr::union_all(dplyr::tibble(
            "person_id" = x[[id]], "date" = as.Date(x[[colDates[i]]])
          ))
      }
    }
  }

  # create observation_period
  if (!"observation_period" %in% names(tables)) {
    if (nrow(dates) == 0) {
      tables[["observation_period"]] <- tables[["person"]] |>
        dplyr::select("person_id", "year_of_birth") |>
        dplyr::mutate(
          "observation_period_start_date" = as.Date(NA),
          "observation_period_end_date" = as.Date(NA)
        )
    } else {
      tables[["observation_period"]] <- tables[["person"]] |>
        dplyr::select("person_id", "year_of_birth") |>
        dplyr::left_join(
          dates |>
            dplyr::group_by(.data$person_id) |>
            dplyr::summarise(
              "observation_period_start_date" = min(.data$date, na.rm = TRUE),
              "observation_period_end_date" = max(.data$date, na.rm = TRUE)
            ),
          by = "person_id"
        )
    }
    tables[["observation_period"]] <- tables[["observation_period"]] |>
      dplyr::mutate(
        "observation_period_start_date" = dplyr::if_else(
          is.na(.data$observation_period_start_date),
          as.Date(
            x = paste0(
              .data$year_of_birth + sample.int(34, n, TRUE), "-", sample(1:12, n, TRUE), "-",
              sample(1:28, n, TRUE)
            ),
            format = "%Y-%m-%d"
          ),
          .data$observation_period_start_date
        ),
        "observation_period_end_date" = dplyr::if_else(
          is.na(.data$observation_period_end_date),
          .data$observation_period_start_date + sample.int(1e4, n, TRUE),
          .data$observation_period_end_date
        ),
        "period_type_concept_id" = 0L,
        "observation_period_id" = dplyr::row_number(),
        "observation_period_start_date" = dplyr::if_else(
          as.integer(format(.data$observation_period_start_date, "%Y")) >=
            .data$year_of_birth,
          as.Date(paste0(.data$year_of_birth, "-01-01")),
          .data$observation_period_start_date
        )
      ) |>
      dplyr::select(-"year_of_birth")
  }

  # correct person
  tables[["person"]] <- tables[["person"]] |>
    dplyr::left_join(
      tables[["observation_period"]] |>
        dplyr::group_by(.data$person_id) |>
        dplyr::summarise(
          "start_year" = min(.data$observation_period_start_date) |>
            format("%Y") |>
            as.integer(),
          .groups = "drop"
        ),
      by = "person_id"
    ) |>
    dplyr::mutate("year_of_birth" = dplyr::if_else(
      .data$year_of_birth <= .data$start_year,
      .data$year_of_birth,
      .data$start_year
    )) |>
    dplyr::select(-"start_year")

  # create drug_exposure
  if (!"drug_exposure" %in% names(tables)) {
    nr <- sample.int(n * 2, 1)
    tables[["drug_exposure"]] <- dplyr::tibble(
      "person_id" = sample(tables$person$person_id, size = nr, TRUE)
    ) |>
      dplyr::mutate("id" = dplyr::row_number()) |>
      dplyr::inner_join(
        tables[["observation_period"]],
        by = "person_id",
        relationship = "many-to-many"
      ) |>
      dplyr::group_by(.data$id) |>
      dplyr::slice_sample(n = 1) |>
      dplyr::ungroup() |>
      addDate(c("drug_exposure_start_date", "drug_exposure_end_date")) |>
      dplyr::mutate(
        "drug_exposure_id" = dplyr::row_number(),
        "drug_concept_id" = sample.int(10, nr, T),
        "drug_type_concept_id" = 0L
      )
  }

  # create condition_occurrence
  if (!"condition_occurrence" %in% names(tables)) {
    nr <- sample.int(n * 2, 1)
    tables[["condition_occurrence"]] <- dplyr::tibble(
      "person_id" = sample(tables$person$person_id, size = nr, TRUE)
    ) |>
      dplyr::mutate("id" = dplyr::row_number()) |>
      dplyr::inner_join(
        tables[["observation_period"]],
        by = "person_id",
        relationship = "many-to-many"
      ) |>
      dplyr::group_by(.data$id) |>
      dplyr::slice_sample(n = 1) |>
      dplyr::ungroup() |>
      addDate(c("condition_start_date", "condition_end_date")) |>
      dplyr::mutate(
        "condition_occurrence_id" = seq_len(nr),
        "condition_concept_id" = sample.int(10, nr, T),
        "condition_type_concept_id" = 0L
      )
  }

  # create death
  if (!"death" %in% names(tables)) {
    nn <- round(n * 0.2)
    if (nn > 0) {
      nr <- sample.int(nn, 1)
    } else {
      nr <- 0
    }
    tables[["death"]] <- dplyr::tibble(
      "person_id" = sample(tables$person$person_id, size = nr, FALSE)
    ) |>
      dplyr::inner_join(
        tables[["observation_period"]] |>
          dplyr::filter(
            .data$observation_period_end_date ==
              max(.data$observation_period_end_date)
          ),
        by = "person_id"
      ) |>
      addDate(c("death_date"))
  }

  if (!"visit_occurrence" %in% names(tables)) {
    tables$visit_occurrence <- tables$condition_occurrence |>
      dplyr::select("person_id", "visit_start_date" = "condition_start_date") |>
      dplyr::union_all(
        tables$drug_exposure |>
          dplyr::select(
            "person_id",
            "visit_start_date" = "drug_exposure_start_date"
          )
      ) |>
      dplyr::mutate(
        "visit_occurrence_id" = dplyr::row_number(),
        "visit_concept_id" = 0L,
        "visit_end_date" = .data$visit_start_date,
        "visit_type_concept_id" = 0L
      )
  }

  # create cohort1
  if (!"cohort1" %in% names(tables)) {
    tables[["cohort1"]] <- dplyr::tibble(
      "person_id" = sample(tables$person$person_id)
    ) |>
      dplyr::mutate("id" = dplyr::row_number()) |>
      dplyr::inner_join(
        tables[["observation_period"]],
        by = "person_id",
        relationship = "many-to-many"
      ) |>
      dplyr::group_by(.data$id) |>
      dplyr::slice_sample(n = 1) |>
      dplyr::ungroup() |>
      addDate(c("cohort_start_date", "cohort_end_date")) |>
      dplyr::mutate("cohort_definition_id" = sample.int(3, n, T)) |>
      dplyr::rename("subject_id" = "person_id") |>
      dplyr::select("cohort_definition_id", "subject_id",
                    "cohort_start_date", "cohort_end_date")
  }

  # create cohort2
  if (!"cohort2" %in% names(tables)) {
    tables[["cohort2"]] <- dplyr::tibble(
      "person_id" = sample(tables$person$person_id)
    ) |>
      dplyr::mutate("id" = dplyr::row_number()) |>
      dplyr::inner_join(
        tables[["observation_period"]],
        by = "person_id",
        relationship = "many-to-many"
      ) |>
      dplyr::group_by(.data$id) |>
      dplyr::slice_sample(n = 1) |>
      dplyr::ungroup() |>
      addDate(c("cohort_start_date", "cohort_end_date")) |>
      dplyr::mutate("cohort_definition_id" = sample.int(3, n, T)) |>
      dplyr::rename("subject_id" = "person_id") |>
      dplyr::select("cohort_definition_id", "subject_id",
                    "cohort_start_date", "cohort_end_date")
  }

  # into database
  tablesToInsert <- names(tables)
  src <- CDMConnector::dbSource(con = con, writeSchema = writeSchema)

  for (tab in names(tables)) {
    x <- tables[[tab]]
    for (col in c("subject_id", "person_id", "cohort_definition_id")) {
      if (col %in% colnames(x)) {
        x <- x |>
          dplyr::mutate(!!col := as.integer(.data[[col]]))
      }
    }
    omopgenerics::insertTable(
      cdm = src, name = tab, table = x, overwrite = TRUE
    ) |>
      invisible()
  }

  # create the cdm object
  cdm <- CDMConnector::cdm_from_con(
    con = con,
    cdm_schema = writeSchema,
    write_schema = writeSchema,
    cohort_tables = names(tables)[
      !names(tables) %in% omopgenerics::omopTables()
    ],
    .soft_validation = TRUE,
    cdm_name = "PP_MOCK"
  )

  return(cdm)
}

addDate <- function(x, cols) {
  if (nrow(x) == 0) {
    x <- x |> dplyr::select("person_id")
    for (col in cols) {
      x <- x |> dplyr::mutate(!!col := as.Date(character()))
    }
    return(x)
  }
  x <- x |>
    dplyr::select(
      "person_id",
      "observation_period_start_date",
      "observation_period_end_date"
    ) |>
    dplyr::rowwise()
  for (col in cols) {
    x <- x |>
      dplyr::mutate(
        "diff" = as.integer(difftime(
          .data$observation_period_end_date,
          .data$observation_period_start_date
        )),
        "days" = sample.int(.data$diff + 1, 1) - 1,
        !!col := .data$observation_period_start_date + .data$days,
        "observation_period_start_date" = .data[[col]]
      )
  }
  x <- x |>
    dplyr::ungroup() |>
    dplyr::select(
      -"observation_period_start_date", -"observation_period_end_date", -"diff",
      -"days"
    )
  return(x)
}

#' Function to disconnect from the mock
#'
#' @param cdm A cdm_reference object.
#'
#' @export
#'
mockDisconnect <- function(cdm) {
  cdm <- omopgenerics::dropTable(cdm = cdm, name = dplyr::everything())
  if ("db_cdm" %in% class(omopgenerics::cdmSource(cdm))) {
    con <- CDMConnector::cdmCon(cdm = cdm)
    DBI::dbDisconnect(conn = con, shutdown = TRUE)
  }
  return(invisible(NULL))
}

Try the PatientProfiles package in your browser

Any scripts or data that you put into this service are public.

PatientProfiles documentation built on Oct. 30, 2024, 9:13 a.m.