R/calculateObsSummary.R

#' Calculate Observed Data Summary
#'
#' Calculate a summary of an "observed" dataset (typically used to overlay
#' observed data in a simulated plot or table)
#'
#' The inputs are checked, and the alpha value is parsed using
#' \link{checkSimAlpha} Subsets are applied to the data if "subset" is
#' specified and differences from baseline are calculated using
#' \link{calculateDiffsFromBase} if required
#'
#' If respType is "Continuous": * Mean responses are calculated by Subject
#' (\code{idCol}), Dose (\code{doseCol}), Time (\code{timeCol}), and any by
#' variables ((\code{bVar}) * The following summaries are then created (based
#' on any by variables (\code{bVar})) - Mean - Median - Minimum - Maximum -
#' Number of non-missing values - Lower alpha\% percent interval - Upper
#' alpha\% percent interval
#'
#' When dealing with categorical responses, it is possible that the unique set
#' of responses could be (say): 1, 2, 4, 5 In this case, it is unclear as to
#' whether the value "3" should be included in a summary.  If "fillRespRange"
#' is TRUE, it would be included (although would have a count/proportion of
#' zero) Frequencies are calculated by "Response level" (\code{respCol}) and
#' any by variables Frequences are converted to proportions (within by variable
#' level) if required (\code{catType})
#'
#' @param data Observed data frame
#' @param respCol Response column names (given by \link{getEctdColName} by
#' default)
#' @param bVar Variables by which summary should be produced (none by default)
#' @param subset Subsets to be applied to the observed data before calculating
#' the summary
#' @param alpha Alpha value for calculation of lower and upper intervals
#' @param digits Number of digits to round summary data
#' @param diffBase Logical: Should differences from baseline be summarised
#' instead of raw data?
#' @param doseCol Dose column names (given by \link{getEctdColName} by default)
#' @param timeCol Time column names (given by \link{getEctdColName} by default)
#' @param idCol Subject column names (given by \link{getEctdColName} by
#' default)
#' @param respType Response type: Continuous (default) or Categorical
#' @param catType For Categorical response, should "Count" (default) or
#' "Proportion" summary be returned?
#' @param fillRespRange For Categorical response, should we "fill" the range of
#' responses (see below)
#' @return A data frame of response summaries
#' @author Mike K Smith \email{mstoolkit@@googlemail.com}
#' @keywords Observed
#'
#' @export
"calculateObsSummary" <- function(
		data,
		respCol = getEctdColName("Response"),
		bVar = NULL,
		subset = NULL,
		alpha = 95,
		digits = 3,
		diffBase = FALSE,
		doseCol = getEctdColName("Dose"),
		timeCol = getEctdColName("Time"),
		idCol = getEctdColName("Subject"),
		respType = c("Continuous", "Categorical"),
		catType = c("Count", "Proportion"),
		fillRespRange = TRUE
)
{
	###############################################################################
	# Mango Solutions, Chippenham SN15 1BN 2009
	# createObsSummary.R 19NOV09
	#
	# Author: Rich
	###############################################################################
	# DESCRIPTION: Create summary of observed variables from a NONMEM object
	###############################################################################
	# Basic input checks
	respType <- match.arg(respType); catType <- match.arg(catType)
	.checkLogical(diffBase, fillRespRange)							# Check single logicals
	.checkCharacter(respCol, idCol, respType, catType)				# Check single characters
	.checkNumeric(digits)											# Check single numerics

	# Parse the "alpha" input
	alpha <- checkSimAlpha(alpha)
	alpha <- stats::qnorm((1 + alpha)/2)

	# Get the data
	data <- switch(class(data),
			"NMRun" = data,   ## TODO: Call correct data method here
			"data.frame" = data,
			ectdStop("Observed data must be either a data frame or NONMEM run object"))

	# Check inputs
	if (!(timeCol %in% names(data))) timeCol <- NULL
	if (!(doseCol %in% names(data))) doseCol <- NULL
	if (!length(bVar)) ectdStop("Need at least 1 by variable to calculate summaries")
	needCols <- unique(c(respCol, bVar, idCol))
	checkColNames(names(data), needCols)

	# Subset if required
	if (!is.null(subset)) data <- .applyDataSubset(data, subset)
	if (diffBase & length(timeCol)) data <- calculateDiffsFromBase(data, respCol = respCol, idCol = idCol, timeCol = timeCol, replicateCol = NULL)

	# Calculate "means" based on response variable type
	if (respType == "Continuous") {
		# Calculate means by subject, time and by variables
		meanData <- .dataAggregate(data[respCol], data[unique(c(idCol, doseCol, timeCol, bVar))], mean, na.rm = TRUE)

		#
		# Set up functions to use in the aggregate calls
		#
		loFun <- function(x, alp, dG) round(mean(x, na.rm = T) - (alp * stats::sd(x, na.rm = T))/sqrt(sum(!is.na(x))), dG)
		upFun <- function(x, alp, dG) round(mean(x, na.rm = T) + (alp * stats::sd(x, na.rm = T))/sqrt(sum(!is.na(x))), dG)
		meanFun <- function(x, dG) round(mean(x, na.rm = T), dG)
		medianFun <- function(x, dG) round(stats::median(x, na.rm = T), dG)
		minFun <- function(x, dG) round(min(x, na.rm = T), dG)
		maxFun <- function(x, dG) round(max(x, na.rm = T), dG)

		#
		# Create summaries
		#
		myMeds <- .dataAggregate(list(Median = meanData[[respCol]]), meanData[bVar], medianFun, dG = digits)
		myMeans <- .dataAggregate(list(Mean = meanData[[respCol]]), meanData[bVar], meanFun, dG = digits)
		myLower <- .dataAggregate(list(Lower = meanData[[respCol]]), meanData[bVar], loFun, dG = digits, alp = alpha)
		myUpper <- .dataAggregate(list(Upper = meanData[[respCol]]), meanData[bVar], upFun, dG = digits, alp = alpha)
		myN <- .dataAggregate(list(N = meanData[[respCol]]), meanData[bVar], function(x) sum(!is.na(x)))
		myMin <- .dataAggregate(list(Min = meanData[[respCol]]), meanData[bVar], minFun, dG = digits)
		myMax <- .dataAggregate(list(Max = meanData[[respCol]]), meanData[bVar], maxFun, dG = digits)

		# Merge summaries
		outData <- merge(myMeds, merge(myMeans, merge(myLower, merge(myUpper, merge(myMin, merge(myMax, myN))))))
	}
	else {
		# Calculate counts by subject, time, by variables AND response
		meanData <- .dataAggregate(list(COUNT = data[[respCol]]), data[unique(c(bVar, respCol))], function(x) sum(!is.na(x)))
		# "Expand" this to a full set of responses
		expData <- unique(data[bVar])
		uniResp <- unique(data[[respCol]])
		if (fillRespRange & all(is.integer(uniResp))) uniResp <- min(uniResp):max(uniResp)
		fullData <- expData[ rep(1:nrow(expData), length(uniResp)), , drop = FALSE]
		fullData[[respCol]] <- rep(uniResp, each = nrow(expData))
		meanData <- merge(meanData, fullData, all = TRUE)
		outData <- meanData [ do.call("order", meanData[unique(c(bVar, respCol))]), , drop = FALSE]
		outData$COUNT <- replace(outData$COUNT, is.na(outData$COUNT), 0)
		if (catType == "Proportion") {
			totalData <- .dataAggregate(list(TOTAL = outData$COUNT), outData[bVar], sum, na.rm = TRUE)
			outData <- merge(outData, totalData)
			outData$PROPORTION <- round(100 * outData$COUNT / outData$TOTAL, digits = digits)
			outData <- outData[setdiff(names(outData), c("COUNT", "TOTAL"))]
		}
	}
	return(outData)
}
MikeKSmith/MSToolkit documentation built on Feb. 15, 2024, 5:32 p.m.