R/summariseResult.R
In PatientProfiles: Identify Characteristics of Patients in the OMOP Common Data Model

Documented in summariseResult

# Copyright 2024 DARWIN EU (C)
#
# This file is part of PatientProfiles
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Summarise variables using a set of estimate functions. The output will be
#' a formatted summarised_result object.
#'
#' @param table Table with different records.
#' @param group List of groups to be considered.
#' @param includeOverallGroup TRUE or FALSE. If TRUE, results for an overall
#' group will be reported when a list of groups has been specified.
#' @param strata List of the stratifications within each group to be considered.
#' @param includeOverallStrata TRUE or FALSE. If TRUE, results for an overall
#' strata will be reported when a list of strata has been specified.
#' @param variables Variables to summarise, it can be a list to point to different
#' set of estimate names.
#' @param estimates Estimates to obtain, it can be a list to point to different
#' set of variables.
#' @param counts Whether to compute number of records and number of subjects.
#'
#' @return A summarised_result object with the summarised data of interest.
#'
#' @export
#'
#' @examples
#' \donttest{
#' library(PatientProfiles)
#' library(dplyr)
#'
#' cdm <- mockPatientProfiles()
#' x <- cdm$cohort1 %>%
#'   addDemographics() %>%
#'   collect()
#' result <- summariseResult(x)
#' mockDisconnect(cdm = cdm)
#' }
#'
summariseResult <- function(table,
                            group = list(),
                            includeOverallGroup = FALSE,
                            strata = list(),
                            includeOverallStrata = TRUE,
                            variables = NULL,
                            estimates = c("min", "q25", "median", "q75", "max", "count", "percentage"),
                            counts = TRUE) {
  # initial checks
  checkTable(table)
  if (length(variables) == 0 & length(estimates) == 0 & counts == FALSE) {
    cli::cli_inform("No analyses were selected.")
    return(omopgenerics::emptySummarisedResult())
  }

  if (is.null(variables)) {
    variables <- colnames(table)
    variables <- variables[!grepl("_id", variables)]
  }

  if (inherits(table, "cdm_table")) {
    cdm_name <- omopgenerics::cdmName(omopgenerics::cdmReference(table))
  } else {
    cdm_name <- "unknown"
  }

  # create the summary for overall
  if (table %>%
    dplyr::count() %>%
    dplyr::pull() == 0) {
    if (counts) {
      result <- dplyr::tibble(
        "group_name" = "overall", "group_level" = "overall",
        "strata_name" = "overall", "strata_level" = "overall",
        "variable_name" = c("number_records", "number_subjects"),
        "variable_level" = NA_character_, "estimate_name" = "count",
        "estimate_type" = "integer", "estimate_value" = "0"
      )
    } else {
      result <- omopgenerics::emptySummarisedResult()
      return(result)
    }
  } else {
    if (!is.list(variables)) {
      variables <- list(variables)
    }
    if (!is.list(estimates)) {
      estimates <- list(estimates)
    }
    if (!is.list(group)) {
      group <- list(group)
    }
    if (!is.list(strata)) {
      strata <- list(strata)
    }
    checkStrata(group, table, type = "group")
    checkStrata(strata, table)
    functions <- checkVariablesFunctions(variables, estimates, table)

    mes <- c("i" = "The following estimates will be computed:")
    variables <- functions$variable_name |> unique()
    for (vark in variables) {
      mes <- c(mes, "*" = paste0(
        vark, ": ", paste0(functions$estimate_name[functions$variable_name == vark], collapse = ", ")
      ))
    }
    cli::cli_inform(message = mes)

    # only required variables
    colOrder <- colnames(table)
    table <- table |>
      dplyr::select(dplyr::any_of(unique(c(
        unlist(strata), unlist(group), functions$variable_name, "person_id",
        "subject_id"
      ))))

    # collect if necessary
    collectFlag <- functions %>%
      dplyr::filter(grepl("q", .data$estimate_name)) %>%
      nrow() > 0
    if (collectFlag) {
      cli::cli_inform(c(
        "!" = "Table is collected to memory as not all requested estimates are
        supported on the database side"
      ))
      table <- table %>% dplyr::collect()
    }

    # correct dates and logicals
    dates <- functions |>
      dplyr::filter(.data$variable_type %in% c("date", "logical")) |>
      dplyr::distinct(.data$variable_name) |>
      dplyr::pull()
    table <- table |>
      dplyr::mutate(dplyr::across(
        .cols = dplyr::all_of(dates),
        .fns = as.integer
      ))

    # correct strata and group
    group <- correctStrata(group, includeOverallGroup)
    strata <- correctStrata(strata, includeOverallStrata)

    cli::cli_alert("Start summary of data, at {Sys.time()}")
    nt <- length(group) * length(strata)
    k <- 0
    cli::cli_progress_bar(
      total = nt,
      format = "{cli::pb_bar}{k}/{nt} group-strata combinations @ {Sys.time()}"
    )

    personVariable <- NULL
    if (counts) {
      i <- "person_id" %in% colnames(table)
      j <- "subject_id" %in% colnames(table)
      if (i) {
        if (j) {
          cli::cli_warn(
            "person_id and subject_id present in table, `person_id` used as
            person identifier"
          )
        }
        personVariable <- "person_id"
      } else if (j) {
        personVariable <- "subject_id"
      }
    }

    resultk <- 1
    result <- list()
    for (groupk in group) {
      for (stratak in strata) {
        result[[resultk]] <- summariseInternal(
          table, groupk, stratak, functions, counts, personVariable
        ) |>
          # order variables
          orderVariables(colOrder, unique(unlist(estimates)))
        resultk <- resultk + 1
        k <- k + 1
        cli::cli_progress_update()
      }
    }
    result <- result |> dplyr::bind_rows()
    cli::cli_inform(c("v" = "Summary finished, at {Sys.time()}"))
  }

  # TO REMOVE
  result$variable_name[result$variable_name == "number_subjects"] <- "number subjects"
  result$variable_name[result$variable_name == "number_records"] <- "number records"

  # format summarised_result
  result <- result |>
    dplyr::mutate(
      "result_id" = as.integer(1),
      "cdm_name" = .env$cdm_name,
      "additional_name" = "overall",
      "additional_level" = "overall"
    ) |>
    omopgenerics::newSummarisedResult(
      settings = dplyr::tibble(
        "result_id" = as.integer(1),
        "result_type" = "summarise_table",
        "package_name" = "PatientProfiles",
        "package_version" = as.character(utils::packageVersion("PatientProfiles"))
      )
    )

  return(result)
}

summariseInternal <- function(table, groupk, stratak, functions, counts, personVariable) {
  result <- list()

  # group by relevant variables
  strataGroupk <- unique(c(groupk, stratak))

  if (length(strataGroupk) == 0) {
    table <- table |>
      dplyr::mutate("strata_id" = 1L)
    strataGroup <- dplyr::tibble(
      "strata_id" = 1L,
      "group_name" = "overall",
      "group_level" = "overall",
      "strata_name" = "overall",
      "strata_level" = "overall"
    )
  } else {
    strataGroup <- table |>
      dplyr::select(dplyr::all_of(strataGroupk)) |>
      dplyr::distinct() |>
      dplyr::mutate("strata_id" = dplyr::row_number()) |>
      dplyr::compute()
    if (strataGroup |> dplyr::ungroup() |> dplyr::tally() |> dplyr::pull() == 1) {
      table <- table |>
        dplyr::mutate("strata_id" = 1L)
    } else {
      table <- table |>
        dplyr::inner_join(strataGroup, by = strataGroupk)
    }
    # format group strata
    strataGroup <- strataGroup |>
      dplyr::collect() |>
      visOmopResults::uniteGroup(
        cols = groupk, keep = TRUE, ignore = character()
      ) |>
      visOmopResults::uniteStrata(
        cols = stratak, keep = TRUE, ignore = character()
      ) |>
      dplyr::select(
        "strata_id", "group_name", "group_level", "strata_name", "strata_level"
      )
  }
  table <- table |>
    dplyr::select(dplyr::any_of(c(
      "strata_id", "person_id", "subject_id", unique(functions$variable_name)
    ))) |>
    dplyr::group_by(.data$strata_id)

  # count subjects and records
  if (counts) {
    result$counts <- countSubjects(table, personVariable)
  }

  # summariseNumeric
  result$numeric <- summariseNumeric(table, functions)

  # summariseBinary
  result$binary <- summariseBinary(table, functions)

  # summariseCategories
  result$categories <- summariseCategories(table, functions)

  # summariseMissings
  result$missings <- summariseMissings(table, functions)

  result <- result |>
    dplyr::bind_rows() |>
    dplyr::inner_join(strataGroup, by = "strata_id") |>
    dplyr::select(-"strata_id") |>
    dplyr::arrange(.data$strata_level)

  return(result)
}

countSubjects <- function(x, personVariable) {
  result <- list()
  result$record <- x %>%
    dplyr::summarise(
      "estimate_value" = dplyr::n(),
      .groups = "drop"
    ) %>%
    dplyr::collect() |>
    dplyr::mutate(
      "variable_name" = "number_records"
    )
  if (!is.null(personVariable)) {
    result$subject <- x %>%
      dplyr::summarise(
        "estimate_value" = dplyr::n_distinct(.data[[personVariable]]),
        .groups = "drop"
      ) %>%
      dplyr::collect() |>
      dplyr::mutate(
        "variable_name" = "number_subjects"
      )
  }
  result <- dplyr::bind_rows(result) |>
    dplyr::mutate(
      "estimate_type" = "integer",
      "estimate_name" = "count",
      "variable_level" = NA_character_,
      "estimate_value" = as.character(.data$estimate_value)
    )
  return(result)
}

summariseNumeric <- function(table, functions) {
  functions <- functions |>
    dplyr::filter(
      .data$variable_type %in% c("date", "numeric", "integer") &
        !grepl("count|percentage", .data$estimate_name)
    )

  if (nrow(functions) == 0) {
    return(NULL)
  }

  funs <- functions |>
    dplyr::filter(.data$estimate_name != "density")

  if (nrow(funs) > 0) {
    funs <- funs |>
      dplyr::mutate(fun = estimatesFunc[.data$estimate_name]) |>
      dplyr::rowwise() |>
      dplyr::mutate(fun = gsub("x, ", paste0(".data[['", .data$variable_name, "']], "), .data$fun)) |>
      dplyr::ungroup() |>
      dplyr::mutate(id = paste0("variable_", stringr::str_pad(dplyr::row_number(), 6, pad = "0")))
    numericSummary <- funs$fun |>
      rlang::parse_exprs() |>
      rlang::set_names(funs$id)
    res <- table |>
      dplyr::group_by(.data$strata_id) |>
      dplyr::summarise(!!!numericSummary, .groups = "drop") |>
      suppressWarnings() |>
      dplyr::collect() |>
      dplyr::mutate(dplyr::across(.cols = !"strata_id", .fns = as.numeric)) |>
      tidyr::pivot_longer(
        cols = !"strata_id", names_to = "id", values_to = "estimate_value"
      ) |>
      dplyr::inner_join(
        funs |>
          dplyr::select(c("id", "variable_name", "estimate_name", "estimate_type")),
        by = "id"
      ) |>
      dplyr::select(-"id") |>
      dplyr::mutate("variable_level" = NA_character_) |>
      correctTypes()
  } else {
    res <- NULL
  }

  functions <- functions |>
    dplyr::filter(.data$estimate_name == "density")

  if (nrow(functions) > 0) {
    res <- res |>
      dplyr::union_all(
        table |>
          dplyr::select(dplyr::all_of(c("strata_id", functions$variable_name))) |>
          dplyr::collect() |>
          dplyr::group_by(.data$strata_id) |>
          dplyr::group_split() |>
          as.list() |>
          purrr::map_df(getDensityResult) |>
          dplyr::inner_join(
            functions |>
              dplyr::select("variable_name", "estimate_type" = "variable_type") |>
              dplyr::mutate(estimate_type = dplyr::if_else(
                .data$estimate_type == "integer", "numeric", .data$estimate_type
              )),
            by = "variable_name"
          ) |>
          dplyr::mutate(estimate_type = dplyr::if_else(
            .data$estimate_name == "density_y", "numeric", .data$estimate_type
          )) |>
          correctTypes()
      )
  }

  return(res)
}

correctTypes <- function(x) {
  x |>
    dplyr::mutate(estimate_value = dplyr::case_when(
      # Inf and Nan generated due to missing values
      is.infinite(.data$estimate_value) | is.nan(.data$estimate_value) ~ NA_character_,
      # correct dates
      .data$estimate_type == "date" ~
        as.character(as.Date(round(.data$estimate_value), origin = "1970-01-01")),
      # round integers
      .data$estimate_type == "integer" ~
        as.character(round(.data$estimate_value)),
      # numeric to characters
      .data$estimate_type == "numeric" ~ as.character(.data$estimate_value)
    ))
}

getDensityResult <- function(x) {
  x |>
    dplyr::select(!"strata_id") |>
    as.list() |>
    purrr::map(densityResult) |>
    dplyr::bind_rows(.id = "variable_name") |>
    dplyr::mutate(strata_id = x$strata_id[1])
}
densityResult <- function(x) {
  nPoints <- 512
  nDigits <- ceiling(log(nPoints)/log(10))
  x <- as.numeric(x[!is.na(x)])
  if (length(x) == 0) {
    return(NULL)
  } else if (length(x) == 1) {
    den <- list(x = c(x - 1, x, x + 1), y = c(0, 1, 0)) # NEEDS DISCUSSION
  } else {
    den <- stats::density(x, n = nPoints)
  }
  lev <- paste0("density_", stringr::str_pad(
    seq_along(den$x), width = nDigits, side = "left", pad = "0"))
  dplyr::tibble(
    variable_level = lev,
    estimate_name = "density_x",
    estimate_value = den$x
  ) |>
    dplyr::union_all(dplyr::tibble(
      variable_level = lev,
      estimate_name = "density_y",
      estimate_value = den$y
    )) |>
    dplyr::arrange(.data$variable_level, .data$estimate_name)
}

summariseBinary <- function(table, functions) {
  binFuns <- functions |>
    dplyr::filter(
      .data$variable_type != "categorical" &
        .data$estimate_name %in% c("count", "percentage")
    )
  binNum <- binFuns |>
    dplyr::pull("variable_name") |>
    unique()
  if (length(binNum) > 0) {
    num <- table |>
      dplyr::summarise(dplyr::across(
        .cols = dplyr::all_of(binNum),
        ~ sum(.x, na.rm = TRUE),
        .names = "counts_{.col}"
      )) |>
      dplyr::collect() |>
      dplyr::mutate(dplyr::across(
        .cols = dplyr::all_of(paste0("counts_", binNum)),
        .fns = as.numeric
      ))
    binDen <- binFuns |>
      dplyr::filter(.data$estimate_name == "percentage") |>
      dplyr::pull("variable_name")
    res <- num |>
      tidyr::pivot_longer(
        cols = dplyr::all_of(paste0("counts_", binNum)),
        names_to = "variable_name",
        values_to = "estimate_value"
      ) |>
      dplyr::mutate(
        "variable_name" = substr(.data$variable_name, 8, nchar(.data$variable_name)),
        "estimate_name" = "count",
        "estimate_type" = "integer"
      )
    if (length(binDen) > 0) {
      den <- table |>
        dplyr::summarise(dplyr::across(
          .cols = dplyr::all_of(binDen),
          ~ sum(as.integer(!is.na(.x)), na.rm = TRUE),
          .names = "den_{.col}"
        )) |>
        dplyr::collect() |>
        dplyr::mutate(dplyr::across(
          .cols = dplyr::all_of(paste0("den_", binDen)),
          .fns = as.numeric
        ))
      percentages <- num |>
        tidyr::pivot_longer(
          cols = dplyr::all_of(paste0("counts_", binNum)),
          names_to = "variable_name",
          values_to = "numerator"
        ) |>
        dplyr::mutate(
          "variable_name" = substr(.data$variable_name, 8, nchar(.data$variable_name))
        ) |>
        dplyr::inner_join(
          den |>
            tidyr::pivot_longer(
              cols = dplyr::all_of(paste0("den_", binDen)),
              names_to = "variable_name",
              values_to = "denominator"
            ) |>
            dplyr::mutate(
              "variable_name" = substr(.data$variable_name, 5, nchar(.data$variable_name))
            ),
          by = c("strata_id", "variable_name")
        ) |>
        dplyr::mutate(
          "estimate_value" = 100 * .data$numerator / .data$denominator,
          "estimate_name" = "percentage",
          "estimate_type" = "percentage"
        ) |>
        dplyr::select(-c("numerator", "denominator"))
      res <- res |> dplyr::union_all(percentages)
    }

    res <- res |>
      dplyr::mutate(
        "estimate_value" = dplyr::if_else(
          is.infinite(.data$estimate_value) | is.nan(.data$estimate_value),
          NA_character_, as.character(.data$estimate_value)
        ),
        "variable_level" = NA_character_
      )
  } else {
    res <- NULL
  }

  return(res)
}

summariseCategories <- function(table, functions) {
  catFuns <- functions |>
    dplyr::filter(.data$variable_type == "categorical")
  result <- list()
  catVars <- unique(catFuns$variable_name)
  if (length(catVars) > 0) {
    den <- table |>
      dplyr::tally(name = "denominator") |>
      dplyr::collect() |>
      dplyr::ungroup()
    for (catVar in catVars) {
      est <- catFuns |>
        dplyr::filter(.data$variable_name == .env$catVar) |>
        dplyr::pull("estimate_name")
      result[[catVar]] <- table |>
        dplyr::group_by(.data$strata_id, .data[[catVar]]) |>
        dplyr::tally(name = "count") |>
        dplyr::collect() |>
        dplyr::ungroup() |>
        dplyr::inner_join(den, by = "strata_id") |>
        dplyr::mutate(
          "percentage" = as.character(100 * .data$count / .data$denominator),
          "count" = as.character(.data$count)
        ) |>
        dplyr::select(!"denominator") |>
        tidyr::pivot_longer(
          cols = c("count", "percentage"),
          names_to = "estimate_name",
          values_to = "estimate_value"
        ) |>
        dplyr::mutate(
          "variable_name" = .env$catVar,
          "estimate_type" = dplyr::if_else(
            .data$estimate_name == "count", "integer", "percentage"
          )
        ) |>
        dplyr::select(
          "strata_id", "variable_name",
          "variable_level" = dplyr::all_of(catVar), "estimate_name",
          "estimate_type", "estimate_value"
        ) |>
        dplyr::filter(.data$estimate_name %in% .env$est)
    }
  }
  return(dplyr::bind_rows(result))
}

summariseMissings <- function(table, functions) {
  result <- list()

  # counts
  mVars <- functions |>
    dplyr::filter(.data$estimate_name %in% c("count_missing", "percentage_missing")) |>
    dplyr::pull("variable_name") |>
    unique()
  if (length(mVars) > 0) {
    result <- table |>
      dplyr::summarise(
        dplyr::across(
          .cols = dplyr::all_of(mVars),
          ~ sum(as.integer(is.na(.x)), na.rm = TRUE),
          .names = "cm_{.col}"
        ),
        "den" = dplyr::n()
      ) |>
      dplyr::collect() |>
      dplyr::mutate(dplyr::across(
        .cols = dplyr::all_of(c("den", paste0("cm_", mVars))),
        .fns = as.numeric
      )) |>
      tidyr::pivot_longer(
        cols = dplyr::all_of(paste0("cm_", mVars)),
        names_to = "variable_name",
        values_to = "count_missing"
      ) |>
      dplyr::mutate("percentage_missing" = 100 * .data$count_missing / .data$den) |>
      dplyr::select(-"den") |>
      tidyr::pivot_longer(
        cols = c("count_missing", "percentage_missing"),
        names_to = "estimate_name",
        values_to = "estimate_value"
      ) |>
      dplyr::mutate(
        "variable_name" = substr(.data$variable_name, 4, nchar(.data$variable_name)),
        "variable_level" = NA_character_,
        "estimate_value" = dplyr::if_else(
          is.infinite(.data$estimate_value) | is.nan(.data$estimate_value),
          NA_character_, as.character(.data$estimate_value)
        )
      ) |>
      dplyr::inner_join(
        functions |>
          dplyr::filter(.data$estimate_name %in% c("count_missing", "percentage_missing")) |>
          dplyr::select("variable_name", "estimate_name", "estimate_type"),
        by = c("variable_name", "estimate_name")
      )
  } else {
    result <- NULL
  }
  return(result)
}

orderVariables <- function(res, cols, est) {
  if (length(est) == 0) {
    return(res)
  }
  orderVars <- dplyr::tibble("variable_name" = c(
    "number_records", "number_subjects", cols
  )) |>
    dplyr::mutate("id_variable" = dplyr::row_number())
  orderEst <- dplyr::tibble("estimate_name" = est) |>
    dplyr::mutate("id_estimate" = dplyr::row_number())
  res <- res |>
    dplyr::left_join(orderVars, by = c("variable_name")) |>
    dplyr::left_join(orderEst, by = c("estimate_name")) |>
    dplyr::arrange(.data$id_variable, .data$id_estimate, .data$variable_level) |>
    dplyr::select(-c("id_variable", "id_estimate"))
  return(res)
}
Any scripts or data that you put into this service are public.
PatientProfiles documentation built on Oct. 30, 2024, 9:13 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PatientProfiles
Identify Characteristics of Patients in the OMOP Common Data Model

R/summariseResult.R
In PatientProfiles: Identify Characteristics of Patients in the OMOP Common Data Model

Defines functions orderVariables summariseMissings summariseCategories summariseBinary densityResult getDensityResult correctTypes summariseNumeric countSubjects summariseInternal summariseResult

Documented in summariseResult

Try the PatientProfiles package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PatientProfiles Identify Characteristics of Patients in the OMOP Common Data Model

R/summariseResult.R In PatientProfiles: Identify Characteristics of Patients in the OMOP Common Data Model

Defines functions orderVariables summariseMissings summariseCategories summariseBinary densityResult getDensityResult correctTypes summariseNumeric countSubjects summariseInternal summariseResult

Documented in summariseResult

Try the PatientProfiles package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PatientProfiles
Identify Characteristics of Patients in the OMOP Common Data Model

R/summariseResult.R
In PatientProfiles: Identify Characteristics of Patients in the OMOP Common Data Model