R/summariseResult.R

Defines functions orderVariables summariseMissings summariseCategories summariseBinary densityResult getDensityResult correctTypes summariseNumeric countSubjects summariseInternal summariseResult

Documented in summariseResult

# Copyright 2024 DARWIN EU (C)
#
# This file is part of PatientProfiles
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Summarise variables using a set of estimate functions. The output will be
#' a formatted summarised_result object.
#'
#' @param table Table with different records.
#' @param group List of groups to be considered.
#' @param includeOverallGroup TRUE or FALSE. If TRUE, results for an overall
#' group will be reported when a list of groups has been specified.
#' @param strata List of the stratifications within each group to be considered.
#' @param includeOverallStrata TRUE or FALSE. If TRUE, results for an overall
#' strata will be reported when a list of strata has been specified.
#' @param variables Variables to summarise, it can be a list to point to different
#' set of estimate names.
#' @param estimates Estimates to obtain, it can be a list to point to different
#' set of variables.
#' @param counts Whether to compute number of records and number of subjects.
#'
#' @return A summarised_result object with the summarised data of interest.
#'
#' @export
#'
#' @examples
#' \donttest{
#' library(PatientProfiles)
#' library(dplyr)
#'
#' cdm <- mockPatientProfiles()
#' x <- cdm$cohort1 %>%
#'   addDemographics() %>%
#'   collect()
#' result <- summariseResult(x)
#' mockDisconnect(cdm = cdm)
#' }
#'
summariseResult <- function(table,
                            group = list(),
                            includeOverallGroup = FALSE,
                            strata = list(),
                            includeOverallStrata = TRUE,
                            variables = NULL,
                            estimates = c("min", "q25", "median", "q75", "max", "count", "percentage"),
                            counts = TRUE) {
  # initial checks
  checkTable(table)
  if (length(variables) == 0 & length(estimates) == 0 & counts == FALSE) {
    cli::cli_inform("No analyses were selected.")
    return(omopgenerics::emptySummarisedResult())
  }

  if (is.null(variables)) {
    variables <- colnames(table)
    variables <- variables[!grepl("_id", variables)]
  }

  if (inherits(table, "cdm_table")) {
    cdm_name <- omopgenerics::cdmName(omopgenerics::cdmReference(table))
  } else {
    cdm_name <- "unknown"
  }

  # create the summary for overall
  if (table %>%
    dplyr::count() %>%
    dplyr::pull() == 0) {
    if (counts) {
      result <- dplyr::tibble(
        "group_name" = "overall", "group_level" = "overall",
        "strata_name" = "overall", "strata_level" = "overall",
        "variable_name" = c("number_records", "number_subjects"),
        "variable_level" = NA_character_, "estimate_name" = "count",
        "estimate_type" = "integer", "estimate_value" = "0"
      )
    } else {
      result <- omopgenerics::emptySummarisedResult()
      return(result)
    }
  } else {
    if (!is.list(variables)) {
      variables <- list(variables)
    }
    if (!is.list(estimates)) {
      estimates <- list(estimates)
    }
    if (!is.list(group)) {
      group <- list(group)
    }
    if (!is.list(strata)) {
      strata <- list(strata)
    }
    checkStrata(group, table, type = "group")
    checkStrata(strata, table)
    functions <- checkVariablesFunctions(variables, estimates, table)

    mes <- c("i" = "The following estimates will be computed:")
    variables <- functions$variable_name |> unique()
    for (vark in variables) {
      mes <- c(mes, "*" = paste0(
        vark, ": ", paste0(functions$estimate_name[functions$variable_name == vark], collapse = ", ")
      ))
    }
    cli::cli_inform(message = mes)

    # only required variables
    colOrder <- colnames(table)
    table <- table |>
      dplyr::select(dplyr::any_of(unique(c(
        unlist(strata), unlist(group), functions$variable_name, "person_id",
        "subject_id"
      ))))

    # collect if necessary
    collectFlag <- functions %>%
      dplyr::filter(grepl("q", .data$estimate_name)) %>%
      nrow() > 0
    if (collectFlag) {
      cli::cli_inform(c(
        "!" = "Table is collected to memory as not all requested estimates are
        supported on the database side"
      ))
      table <- table %>% dplyr::collect()
    }

    # correct dates and logicals
    dates <- functions |>
      dplyr::filter(.data$variable_type %in% c("date", "logical")) |>
      dplyr::distinct(.data$variable_name) |>
      dplyr::pull()
    table <- table |>
      dplyr::mutate(dplyr::across(
        .cols = dplyr::all_of(dates),
        .fns = as.integer
      ))

    # correct strata and group
    group <- correctStrata(group, includeOverallGroup)
    strata <- correctStrata(strata, includeOverallStrata)

    cli::cli_alert("Start summary of data, at {Sys.time()}")
    nt <- length(group) * length(strata)
    k <- 0
    cli::cli_progress_bar(
      total = nt,
      format = "{cli::pb_bar}{k}/{nt} group-strata combinations @ {Sys.time()}"
    )

    personVariable <- NULL
    if (counts) {
      i <- "person_id" %in% colnames(table)
      j <- "subject_id" %in% colnames(table)
      if (i) {
        if (j) {
          cli::cli_warn(
            "person_id and subject_id present in table, `person_id` used as
            person identifier"
          )
        }
        personVariable <- "person_id"
      } else if (j) {
        personVariable <- "subject_id"
      }
    }

    resultk <- 1
    result <- list()
    for (groupk in group) {
      for (stratak in strata) {
        result[[resultk]] <- summariseInternal(
          table, groupk, stratak, functions, counts, personVariable
        ) |>
          # order variables
          orderVariables(colOrder, unique(unlist(estimates)))
        resultk <- resultk + 1
        k <- k + 1
        cli::cli_progress_update()
      }
    }
    result <- result |> dplyr::bind_rows()
    cli::cli_inform(c("v" = "Summary finished, at {Sys.time()}"))
  }

  # TO REMOVE
  result$variable_name[result$variable_name == "number_subjects"] <- "number subjects"
  result$variable_name[result$variable_name == "number_records"] <- "number records"

  # format summarised_result
  result <- result |>
    dplyr::mutate(
      "result_id" = as.integer(1),
      "cdm_name" = .env$cdm_name,
      "additional_name" = "overall",
      "additional_level" = "overall"
    ) |>
    omopgenerics::newSummarisedResult(
      settings = dplyr::tibble(
        "result_id" = as.integer(1),
        "result_type" = "summarise_table",
        "package_name" = "PatientProfiles",
        "package_version" = as.character(utils::packageVersion("PatientProfiles"))
      )
    )

  return(result)
}

summariseInternal <- function(table, groupk, stratak, functions, counts, personVariable) {
  result <- list()

  # group by relevant variables
  strataGroupk <- unique(c(groupk, stratak))

  if (length(strataGroupk) == 0) {
    table <- table |>
      dplyr::mutate("strata_id" = 1L)
    strataGroup <- dplyr::tibble(
      "strata_id" = 1L,
      "group_name" = "overall",
      "group_level" = "overall",
      "strata_name" = "overall",
      "strata_level" = "overall"
    )
  } else {
    strataGroup <- table |>
      dplyr::select(dplyr::all_of(strataGroupk)) |>
      dplyr::distinct() |>
      dplyr::mutate("strata_id" = dplyr::row_number()) |>
      dplyr::compute()
    if (strataGroup |> dplyr::ungroup() |> dplyr::tally() |> dplyr::pull() == 1) {
      table <- table |>
        dplyr::mutate("strata_id" = 1L)
    } else {
      table <- table |>
        dplyr::inner_join(strataGroup, by = strataGroupk)
    }
    # format group strata
    strataGroup <- strataGroup |>
      dplyr::collect() |>
      visOmopResults::uniteGroup(
        cols = groupk, keep = TRUE, ignore = character()
      ) |>
      visOmopResults::uniteStrata(
        cols = stratak, keep = TRUE, ignore = character()
      ) |>
      dplyr::select(
        "strata_id", "group_name", "group_level", "strata_name", "strata_level"
      )
  }
  table <- table |>
    dplyr::select(dplyr::any_of(c(
      "strata_id", "person_id", "subject_id", unique(functions$variable_name)
    ))) |>
    dplyr::group_by(.data$strata_id)

  # count subjects and records
  if (counts) {
    result$counts <- countSubjects(table, personVariable)
  }

  # summariseNumeric
  result$numeric <- summariseNumeric(table, functions)

  # summariseBinary
  result$binary <- summariseBinary(table, functions)

  # summariseCategories
  result$categories <- summariseCategories(table, functions)

  # summariseMissings
  result$missings <- summariseMissings(table, functions)

  result <- result |>
    dplyr::bind_rows() |>
    dplyr::inner_join(strataGroup, by = "strata_id") |>
    dplyr::select(-"strata_id") |>
    dplyr::arrange(.data$strata_level)

  return(result)
}

countSubjects <- function(x, personVariable) {
  result <- list()
  result$record <- x %>%
    dplyr::summarise(
      "estimate_value" = dplyr::n(),
      .groups = "drop"
    ) %>%
    dplyr::collect() |>
    dplyr::mutate(
      "variable_name" = "number_records"
    )
  if (!is.null(personVariable)) {
    result$subject <- x %>%
      dplyr::summarise(
        "estimate_value" = dplyr::n_distinct(.data[[personVariable]]),
        .groups = "drop"
      ) %>%
      dplyr::collect() |>
      dplyr::mutate(
        "variable_name" = "number_subjects"
      )
  }
  result <- dplyr::bind_rows(result) |>
    dplyr::mutate(
      "estimate_type" = "integer",
      "estimate_name" = "count",
      "variable_level" = NA_character_,
      "estimate_value" = as.character(.data$estimate_value)
    )
  return(result)
}

summariseNumeric <- function(table, functions) {
  functions <- functions |>
    dplyr::filter(
      .data$variable_type %in% c("date", "numeric", "integer") &
        !grepl("count|percentage", .data$estimate_name)
    )

  if (nrow(functions) == 0) {
    return(NULL)
  }

  funs <- functions |>
    dplyr::filter(.data$estimate_name != "density")

  if (nrow(funs) > 0) {
    funs <- funs |>
      dplyr::mutate(fun = estimatesFunc[.data$estimate_name]) |>
      dplyr::rowwise() |>
      dplyr::mutate(fun = gsub("x, ", paste0(".data[['", .data$variable_name, "']], "), .data$fun)) |>
      dplyr::ungroup() |>
      dplyr::mutate(id = paste0("variable_", stringr::str_pad(dplyr::row_number(), 6, pad = "0")))
    numericSummary <- funs$fun |>
      rlang::parse_exprs() |>
      rlang::set_names(funs$id)
    res <- table |>
      dplyr::group_by(.data$strata_id) |>
      dplyr::summarise(!!!numericSummary, .groups = "drop") |>
      suppressWarnings() |>
      dplyr::collect() |>
      dplyr::mutate(dplyr::across(.cols = !"strata_id", .fns = as.numeric)) |>
      tidyr::pivot_longer(
        cols = !"strata_id", names_to = "id", values_to = "estimate_value"
      ) |>
      dplyr::inner_join(
        funs |>
          dplyr::select(c("id", "variable_name", "estimate_name", "estimate_type")),
        by = "id"
      ) |>
      dplyr::select(-"id") |>
      dplyr::mutate("variable_level" = NA_character_) |>
      correctTypes()
  } else {
    res <- NULL
  }

  functions <- functions |>
    dplyr::filter(.data$estimate_name == "density")

  if (nrow(functions) > 0) {
    res <- res |>
      dplyr::union_all(
        table |>
          dplyr::select(dplyr::all_of(c("strata_id", functions$variable_name))) |>
          dplyr::collect() |>
          dplyr::group_by(.data$strata_id) |>
          dplyr::group_split() |>
          as.list() |>
          purrr::map_df(getDensityResult) |>
          dplyr::inner_join(
            functions |>
              dplyr::select("variable_name", "estimate_type" = "variable_type") |>
              dplyr::mutate(estimate_type = dplyr::if_else(
                .data$estimate_type == "integer", "numeric", .data$estimate_type
              )),
            by = "variable_name"
          ) |>
          dplyr::mutate(estimate_type = dplyr::if_else(
            .data$estimate_name == "density_y", "numeric", .data$estimate_type
          )) |>
          correctTypes()
      )
  }

  return(res)
}

correctTypes <- function(x) {
  x |>
    dplyr::mutate(estimate_value = dplyr::case_when(
      # Inf and Nan generated due to missing values
      is.infinite(.data$estimate_value) | is.nan(.data$estimate_value) ~ NA_character_,
      # correct dates
      .data$estimate_type == "date" ~
        as.character(as.Date(round(.data$estimate_value), origin = "1970-01-01")),
      # round integers
      .data$estimate_type == "integer" ~
        as.character(round(.data$estimate_value)),
      # numeric to characters
      .data$estimate_type == "numeric" ~ as.character(.data$estimate_value)
    ))
}

getDensityResult <- function(x) {
  x |>
    dplyr::select(!"strata_id") |>
    as.list() |>
    purrr::map(densityResult) |>
    dplyr::bind_rows(.id = "variable_name") |>
    dplyr::mutate(strata_id = x$strata_id[1])
}
densityResult <- function(x) {
  nPoints <- 512
  nDigits <- ceiling(log(nPoints)/log(10))
  x <- as.numeric(x[!is.na(x)])
  if (length(x) == 0) {
    return(NULL)
  } else if (length(x) == 1) {
    den <- list(x = c(x - 1, x, x + 1), y = c(0, 1, 0)) # NEEDS DISCUSSION
  } else {
    den <- stats::density(x, n = nPoints)
  }
  lev <- paste0("density_", stringr::str_pad(
    seq_along(den$x), width = nDigits, side = "left", pad = "0"))
  dplyr::tibble(
    variable_level = lev,
    estimate_name = "density_x",
    estimate_value = den$x
  ) |>
    dplyr::union_all(dplyr::tibble(
      variable_level = lev,
      estimate_name = "density_y",
      estimate_value = den$y
    )) |>
    dplyr::arrange(.data$variable_level, .data$estimate_name)
}

summariseBinary <- function(table, functions) {
  binFuns <- functions |>
    dplyr::filter(
      .data$variable_type != "categorical" &
        .data$estimate_name %in% c("count", "percentage")
    )
  binNum <- binFuns |>
    dplyr::pull("variable_name") |>
    unique()
  if (length(binNum) > 0) {
    num <- table |>
      dplyr::summarise(dplyr::across(
        .cols = dplyr::all_of(binNum),
        ~ sum(.x, na.rm = TRUE),
        .names = "counts_{.col}"
      )) |>
      dplyr::collect() |>
      dplyr::mutate(dplyr::across(
        .cols = dplyr::all_of(paste0("counts_", binNum)),
        .fns = as.numeric
      ))
    binDen <- binFuns |>
      dplyr::filter(.data$estimate_name == "percentage") |>
      dplyr::pull("variable_name")
    res <- num |>
      tidyr::pivot_longer(
        cols = dplyr::all_of(paste0("counts_", binNum)),
        names_to = "variable_name",
        values_to = "estimate_value"
      ) |>
      dplyr::mutate(
        "variable_name" = substr(.data$variable_name, 8, nchar(.data$variable_name)),
        "estimate_name" = "count",
        "estimate_type" = "integer"
      )
    if (length(binDen) > 0) {
      den <- table |>
        dplyr::summarise(dplyr::across(
          .cols = dplyr::all_of(binDen),
          ~ sum(as.integer(!is.na(.x)), na.rm = TRUE),
          .names = "den_{.col}"
        )) |>
        dplyr::collect() |>
        dplyr::mutate(dplyr::across(
          .cols = dplyr::all_of(paste0("den_", binDen)),
          .fns = as.numeric
        ))
      percentages <- num |>
        tidyr::pivot_longer(
          cols = dplyr::all_of(paste0("counts_", binNum)),
          names_to = "variable_name",
          values_to = "numerator"
        ) |>
        dplyr::mutate(
          "variable_name" = substr(.data$variable_name, 8, nchar(.data$variable_name))
        ) |>
        dplyr::inner_join(
          den |>
            tidyr::pivot_longer(
              cols = dplyr::all_of(paste0("den_", binDen)),
              names_to = "variable_name",
              values_to = "denominator"
            ) |>
            dplyr::mutate(
              "variable_name" = substr(.data$variable_name, 5, nchar(.data$variable_name))
            ),
          by = c("strata_id", "variable_name")
        ) |>
        dplyr::mutate(
          "estimate_value" = 100 * .data$numerator / .data$denominator,
          "estimate_name" = "percentage",
          "estimate_type" = "percentage"
        ) |>
        dplyr::select(-c("numerator", "denominator"))
      res <- res |> dplyr::union_all(percentages)
    }

    res <- res |>
      dplyr::mutate(
        "estimate_value" = dplyr::if_else(
          is.infinite(.data$estimate_value) | is.nan(.data$estimate_value),
          NA_character_, as.character(.data$estimate_value)
        ),
        "variable_level" = NA_character_
      )
  } else {
    res <- NULL
  }

  return(res)
}

summariseCategories <- function(table, functions) {
  catFuns <- functions |>
    dplyr::filter(.data$variable_type == "categorical")
  result <- list()
  catVars <- unique(catFuns$variable_name)
  if (length(catVars) > 0) {
    den <- table |>
      dplyr::tally(name = "denominator") |>
      dplyr::collect() |>
      dplyr::ungroup()
    for (catVar in catVars) {
      est <- catFuns |>
        dplyr::filter(.data$variable_name == .env$catVar) |>
        dplyr::pull("estimate_name")
      result[[catVar]] <- table |>
        dplyr::group_by(.data$strata_id, .data[[catVar]]) |>
        dplyr::tally(name = "count") |>
        dplyr::collect() |>
        dplyr::ungroup() |>
        dplyr::inner_join(den, by = "strata_id") |>
        dplyr::mutate(
          "percentage" = as.character(100 * .data$count / .data$denominator),
          "count" = as.character(.data$count)
        ) |>
        dplyr::select(!"denominator") |>
        tidyr::pivot_longer(
          cols = c("count", "percentage"),
          names_to = "estimate_name",
          values_to = "estimate_value"
        ) |>
        dplyr::mutate(
          "variable_name" = .env$catVar,
          "estimate_type" = dplyr::if_else(
            .data$estimate_name == "count", "integer", "percentage"
          )
        ) |>
        dplyr::select(
          "strata_id", "variable_name",
          "variable_level" = dplyr::all_of(catVar), "estimate_name",
          "estimate_type", "estimate_value"
        ) |>
        dplyr::filter(.data$estimate_name %in% .env$est)
    }
  }
  return(dplyr::bind_rows(result))
}

summariseMissings <- function(table, functions) {
  result <- list()

  # counts
  mVars <- functions |>
    dplyr::filter(.data$estimate_name %in% c("count_missing", "percentage_missing")) |>
    dplyr::pull("variable_name") |>
    unique()
  if (length(mVars) > 0) {
    result <- table |>
      dplyr::summarise(
        dplyr::across(
          .cols = dplyr::all_of(mVars),
          ~ sum(as.integer(is.na(.x)), na.rm = TRUE),
          .names = "cm_{.col}"
        ),
        "den" = dplyr::n()
      ) |>
      dplyr::collect() |>
      dplyr::mutate(dplyr::across(
        .cols = dplyr::all_of(c("den", paste0("cm_", mVars))),
        .fns = as.numeric
      )) |>
      tidyr::pivot_longer(
        cols = dplyr::all_of(paste0("cm_", mVars)),
        names_to = "variable_name",
        values_to = "count_missing"
      ) |>
      dplyr::mutate("percentage_missing" = 100 * .data$count_missing / .data$den) |>
      dplyr::select(-"den") |>
      tidyr::pivot_longer(
        cols = c("count_missing", "percentage_missing"),
        names_to = "estimate_name",
        values_to = "estimate_value"
      ) |>
      dplyr::mutate(
        "variable_name" = substr(.data$variable_name, 4, nchar(.data$variable_name)),
        "variable_level" = NA_character_,
        "estimate_value" = dplyr::if_else(
          is.infinite(.data$estimate_value) | is.nan(.data$estimate_value),
          NA_character_, as.character(.data$estimate_value)
        )
      ) |>
      dplyr::inner_join(
        functions |>
          dplyr::filter(.data$estimate_name %in% c("count_missing", "percentage_missing")) |>
          dplyr::select("variable_name", "estimate_name", "estimate_type"),
        by = c("variable_name", "estimate_name")
      )
  } else {
    result <- NULL
  }
  return(result)
}

orderVariables <- function(res, cols, est) {
  if (length(est) == 0) {
    return(res)
  }
  orderVars <- dplyr::tibble("variable_name" = c(
    "number_records", "number_subjects", cols
  )) |>
    dplyr::mutate("id_variable" = dplyr::row_number())
  orderEst <- dplyr::tibble("estimate_name" = est) |>
    dplyr::mutate("id_estimate" = dplyr::row_number())
  res <- res |>
    dplyr::left_join(orderVars, by = c("variable_name")) |>
    dplyr::left_join(orderEst, by = c("estimate_name")) |>
    dplyr::arrange(.data$id_variable, .data$id_estimate, .data$variable_level) |>
    dplyr::select(-c("id_variable", "id_estimate"))
  return(res)
}

Try the PatientProfiles package in your browser

Any scripts or data that you put into this service are public.

PatientProfiles documentation built on Oct. 30, 2024, 9:13 a.m.