#' Calculate ID-level longitudinal summaries (linear slope, ITGR, or AUC) from raw data.
#'
#' Computes a longitudinal summary measure for the (x,y) values
#' from each subject_ID, without any explicit model fit. The
#' AUC is calculated using `DescTools::AUC(...)`, which takes
#' one of {'trapezoid', 'step', 'spline'} as its `method`.
#'
#' If VST == TRUE, each ID-level vector of endpoint values
#' (specified in `endpoint_name`) is transformed for variance
#' stabilization by adding a constant and then applying the transformation
#' (e.g., "y = log( 1 + [[endpoint_name]] )").
#' If subtract_starting_value = TRUE, the ID-level vector is centered to
#' its first (in time-order) value, i.e., each y-value has the first
#' y-value subtracted from it. The AUC of these transformed & centered values,
#' relative to the line y = 0, is then computed by the DescTools::AUC(...)
#' function with the given `method`. ITGR and AUC summaries are x-range normalized
#' according to the value of 'xrange_norm_method'. Under the default settings, each
#' should be close to the linear slope summary when the longitudinal
#' trace is fairly linear. With the default settings, the "AUC" and "ITGR"
#' statistics are empirical, subject-specific versions of the "eGaIT" and "eDOT"
#' summary statistics, while the "linear" statistic is the least-squares slope
#' of "y" regressed on "x" after any transformations.
#'
#' If a subject_ID has only one observation, its summary is ambiguous, since
#' its literal AUC or vertical shift must be zero, but so is the x-range over
#' which it is computed. By default, such cases receive NA, but they are set to
#' zero if `singleton_to_NA == FALSE`.
#'
#' @param study_data_frame data.frame with factors for group_name and subject_ID, numeric columns for x_name (i.e., time) and endpoint_name (i.e., response).
#' @param metric character: name(s) of the longitudinal summary method, one or more of 'linear', 'ITGR', or 'AUC'.
#' @param method character name of the method for computing the AUC. Passed directly to DescTools::AUC( ..., method = ... ).
#' @param check_data logical whether to pass "study_data_frame" through maeve::check_study_data_frame() to check for problems.
#' @param VST logical whether to apply a variance stabilizing transform to endpoint_name values.
#' @param xmin numeric left / lower end of the interval.
#' @param xmax numeric right / upper end of the interval.
#' @param subtract_starting_value logical whether to subtract, for each subject_ID, the first value in the time series so that each series starts at zero.
#' @param na.rm logical whether to remove NA values when computing summaries.
#' @param singleton_to_NA logical whether to assign subject_ID cases with a single observation NA or zero.
#' @param group_name character column name for the group name factor
#' @param subject_ID character column name for the subject name factor
#' @param x_name character column name for the x-axis / time field
#' @param endpoint_name character column name for the endpoint / response field
#' @param add_to_endpoint numeric offset value added to the endpoint before transformation
#' @param trans_func_char character function name for transformation of (endpoint_name + add_to_endpoint)
#' @param inv_func_char character inverse function for transformation of (endpoint_name + add_to_endpoint)
#' @param test_func_x numeric values with which to test that trans_func and inv_func are inverse functions.
#' @param xrange_norm_method character method for x-range normalization of the ITGR and AUC summary statistics.
#'
#' @return An R data.frame with one row per ID.
#'
#' @examples
#' vismo_summary_by_id <- summarize_by_id( vismodegib, xmin = 0, xmax = 21 )
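#'
#' ## Additional illustrative calls (a sketch; these assume the bundled
#' ## 'vismodegib' data.frame uses the default maeve column names):
#' \dontrun{
#'   ## Request all three summaries at once; the per-metric results are merged by ID:
#'   vismo_multi_summary <-
#'     summarize_by_id( vismodegib, metric = c( 'linear', 'ITGR', 'AUC' ),
#'                      xmin = 0, xmax = 21 )
#'
#'   ## Assign zero (rather than NA) to IDs with a single observation:
#'   vismo_singleton_zero <-
#'     summarize_by_id( vismodegib, metric = 'AUC', singleton_to_NA = FALSE )
#' }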
#'
#' @author Bill Forrest \email{forrest@@gene.com}
#'
#' @references \url{https://www.r-project.org}
#'
#' @keywords data.frame
#' @export
#'
summarize_by_id <-
function( study_data_frame,
metric = c( 'linear', 'ITGR', 'AUC' ), ## one or more of c( 'linear', 'ITGR', 'AUC' )
method = c( 'trapezoid', 'step', 'spline' ),
check_data = FALSE,
VST = TRUE, # If TRUE, add to endpoint & apply variance-stabilizing transform before AUC
xmin = -Inf,
xmax = Inf,
subtract_starting_value = TRUE, # If TRUE, subtract 1st value from all others, for each ID.
na.rm = TRUE, # If TRUE, discard rows with NA for endpoint_name before calculations.
singleton_to_NA = TRUE, # If TRUE, singleton AUC = NA. If FALSE, singleton AUC = 0.
##
group_name = maeve_options("group_name"),
subject_ID = maeve_options("subject_ID"),
x_name = maeve_options("x_name"),
endpoint_name = maeve_options("endpoint_name"),
add_to_endpoint = maeve_options("add_to_endpoint"),
trans_func_char = maeve_options("trans_func_char"),
inv_func_char = maeve_options("inv_func_char"),
test_func_x = maeve_options("test_func_x"),
xrange_norm_method = maeve_options("xrange_norm_method")
){
metrics_allowed <- c( 'linear', 'ITGR', 'AUC' )
metric <- unique( metric ) ## ignore any repeats.
mismatched_metric <- metric[ which( is.na( match( metric, metrics_allowed ) ) ) ] ## setdiff( metric, metrics_allowed )
if( length( mismatched_metric ) > 0 ){
stop_msg_string_01 <- paste( mismatched_metric, collapse = ' ' )
stop_msg_string_02 <- paste( metrics_allowed, collapse = ' ')
stop( 'error in maeve::summarize_by_id(): the following metric value(s):\n',
stop_msg_string_01,
'\n',
'fail(s) to match perfectly with any of the allowed metric values:\n',
stop_msg_string_02,
'\n'
) # end of 'stop(...'
} # end of 'if( length( mismatched_metric ) > 0 ){...'
metric = match.arg( metric, choices = metrics_allowed, several.ok = TRUE )
## If two or more metrics are requested (e.g., "linear" and "AUC"), call
## this function recursively to compute each separately, then merge the
## results and return them.
if( length( metric ) > 1 ){
## if 2+ metrics are requested, use recursion:
out_list <- vector( "list", length = length( metric ) )
names( out_list ) <- metric
for( metric_name in metric ){
out_list[[ metric_name ]] <-
## call this function (i.e., recursively) but with only 1 "metric" value:
summarize_by_id( study_data_frame,
check_data = check_data,
metric = metric_name,
method = method,
VST = VST,
xmin = xmin,
xmax = xmax,
subtract_starting_value = subtract_starting_value,
na.rm = na.rm,
singleton_to_NA = singleton_to_NA,
##
group_name = group_name,
subject_ID = subject_ID,
x_name = x_name,
endpoint_name = endpoint_name,
add_to_endpoint = add_to_endpoint,
trans_func_char = trans_func_char,
inv_func_char = inv_func_char,
test_func_x = test_func_x,
xrange_norm_method = xrange_norm_method
)
} ## end of 'for( metric_name in metric ){...'
merge_column_names <- c( group_name, subject_ID, 'first_day', 'last_day' )
## The data frames in the list differ only by their last (i.e., 5th) column.
## Create a single data frame that has the first four columns in common, and
## adds the 5th column from each of the others to make a multi-metric summary:
out_multiple_summaries <-
base::Reduce( function(dtf1, dtf2) merge(dtf1, dtf2, by = merge_column_names, all.x = TRUE ), out_list )
out_multiple_summaries[, group_name] <- factor( out_multiple_summaries[, group_name ],
levels = levels( study_data_frame[, group_name ] )
)
out_multiple_summaries[, subject_ID] <- factor( out_multiple_summaries[, subject_ID ],
levels = levels( study_data_frame[, subject_ID ] )
)
out_multiple_summaries %<>% dplyr::arrange( !!dplyr::sym( group_name ), !!dplyr::sym( subject_ID ) )
return( out_multiple_summaries )
} ## end of 'if( length( metric ) > 1 ){...'
## Which method is used to interpolate between the points?
method = match.arg( method, several.ok = FALSE )
dat <-
study_data_frame %>%
dplyr::select( group_name, subject_ID, x_name, endpoint_name )
if( check_data ){
dat %<>% maeve::check_study_data_frame( group_name = group_name,
subject_ID = subject_ID,
x_name = x_name,
endpoint_name = endpoint_name,
##
print_warnings = maeve_options('progress')
)
}
## "Window" the transformed data, interpolating to endpoints as needed.
## Note that by default, xmin = -Inf & xmax = Inf, in which case
## no truncation or interpolation will happen.
dat %<>%
maeve::window_by_id( group_name = group_name,
subject_ID = subject_ID,
x_name = x_name,
endpoint_name = endpoint_name,
##
VST = VST,
add_to_endpoint = add_to_endpoint,
trans_func_char = trans_func_char,
inv_func_char = inv_func_char,
test_func_x = test_func_x,
##
method = method,
xmin = xmin,
xmax = xmax,
##
return_all_columns = TRUE,
unwindowed_to_NA = TRUE,
check_data = FALSE # already checked above
)
## Find a transformed endpoint?
if( VST == TRUE ){ # Apply a Variance Stabilizing Transformation:
## Check the transformation function & assign it & its inverse:
func_list <- check_trans_func( trans_func_char, inv_func_char, test_func_x )
trans_func <- func_list$trans_func;
inv_func <- func_list$inv_func;
## Define 'y' as the transformed endpoint:
dat %<>% dplyr::mutate( y = trans_func( get( endpoint_name ) + add_to_endpoint ) )
} else{
dat %<>% dplyr::mutate( y = get( endpoint_name ) )
}
## find 'first_day' and 'last_day' for each ID.
dat %<>%
dplyr::mutate( x = get( x_name ) ) %>%
plyr::ddply(
.var = c( eval( group_name ), eval( subject_ID ) ),
.fun = function(XX, na_rm = na.rm ){
if( na_rm ){
XX %<>% dplyr::filter( !is.na( .data$x ) & !is.na( .data$y ) )
}
XX %<>%
dplyr::mutate( first_day = min( XX[,x_name], na.rm = na_rm ),
last_day = max( XX[,x_name], na.rm = na_rm ),
x_range_of_ID = .data$last_day - .data$first_day
) %>%
maeve::round_numerics()
return(XX)
}
) # end of 'plyr::ddply(...'
## Subtract out starting value for each ID?
if( subtract_starting_value == TRUE ){
dat %<>%
plyr::ddply(
.var = c( eval( group_name ), eval( subject_ID ), 'first_day', 'last_day', 'x_range_of_ID' ),
.fun = function( XX, na_rm = na.rm ){
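               ## Baseline = the (transformed) y-value(s) at the first recorded x for this ID;
               ## subtracting it re-centers the series so that it starts at zero.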
baseline_y <- XX %>% dplyr::filter( .data$x <= .data$first_day ) %>% dplyr::select( .data$y ) %>% unlist
XX %<>% dplyr::mutate( y = .data$y - baseline_y ) %>% maeve::round_numerics()
return(XX)
}
) # end of 'plyr::ddply(...'
}
## Call DescTools::AUC() for each ID.
## If the x-range is zero, return NA.
## If the x-range is positive, divide the AUC for each ID by
## half the square of the x-range, by default. Other denominators
## are determined by the "norm_method" parameter.
dat_ID <-
dat %>%
plyr::ddply(
.var = c( eval( group_name ), eval( subject_ID ) ),
.fun = function(XX, na_rm = na.rm, AUC_method = method, norm_method = xrange_norm_method ){
##
x_range_of_ID = unique( XX[,'x_range_of_ID'] )
if( length( x_range_of_ID ) > 1 ){
stop('error in summarize_by_id(): there should be just one value for ID-level x-range')
}
##
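               ## An x-range that is numerically zero (e.g., a single observation, or all
               ## observations at one time point) cannot support an AUC / ITGR / slope, so
               ## assign NA or 0 according to 'singleton_to_NA'.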
if( x_range_of_ID <= 10^(-12) ){
if( singleton_to_NA ){
value = NA # set cases with 1 observation to NA value
} else{
value = 0 # set cases with 1 observation to zero value
}
} else{
if( metric == 'AUC' ){
value = DescTools::AUC( x = XX$x, y = XX$y, method = AUC_method )
norm_value =
switch( norm_method,
'none' = 1,
'xrange' = x_range_of_ID,
'slope_equivalent' = ( 0.5 * x_range_of_ID^2 ), ## empirical eGaIT
stop('error in maeve::summarize_by_id(): xrange_norm_method must match one of {"none","xrange","slope_equivalent"}')
)
value = value / norm_value
} ## end of 'if( metric == 'AUC' ){...'
if( metric == 'linear' ){
value = stats::lm( y ~ x, data = XX )$coef[['x']]
} ## end of 'if( metric == 'linear' ){...'
if( metric == 'ITGR' ){
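               ## ITGR numerator: total change in (transformed) y from the first to the last observed x.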
value = XX$y[ XX$x == max( XX$x ) ] - XX$y[ XX$x == min( XX$x ) ]
##norm_value = max( XX$x ) - min( XX$x )
norm_value =
switch( norm_method,
'none' = 1,
'xrange' = x_range_of_ID,
'slope_equivalent' = x_range_of_ID, ## for ITGR, "slope_equivalent" scaling just divides by the x-range --> "empirical eDOT"
stop('error in maeve::summarize_by_id(): xrange_norm_method must match one of {"none","xrange","slope_equivalent"}')
)
value = value / norm_value
} ## end of 'if( metric == 'ITGR' ){...'
}
out_DF <- data.frame( XX[ 1, c( group_name, subject_ID, 'first_day', 'last_day' ) ], value = value )
colnames( out_DF )[5] <- metric
return( out_DF )
} ## end of '.fun = function(XX, ...'
) ## end of 'plyr::ddply(...'
return( dat_ID )
} ## end of 'summarize_by_id <- function(...'