R/validate_datalong.R

Defines functions validate_dataice validate_datalong_unifromStrata validate_datalong_complete validate_datalong_notMissing validate_datalong_types validate_datalong_varExists validate_datalong

Documented in validate_dataice validate_datalong validate_datalong_complete validate_datalong_notMissing validate_datalong_types validate_datalong_unifromStrata validate_datalong_varExists

#' Validate a longdata object
#'
#' @name validate_datalong
#'
#' @param data a `data.frame` containing the longitudinal outcome data + covariates
#' for multiple subjects
#'
#' @param vars a `vars` object as created by [set_vars()]
#'
#' @param data_ice a `data.frame` containing the subjects ICE data. See [draws()] for details.
#'
#' @param update logical, indicates if the ICE data is being set for the first time or if an update
#' is being applied
#'
#' @details
#' These functions are used to validate various different parts of the longdata object
#' to be used in [draws()], [impute()], [analyse()] and [pool()]. In particular:
#'
#' - validate_datalong_varExists - Checks that each variable listed in `vars` actually exists
#' in the `data`
#'
#' - validate_datalong_types - Checks that the types of each key variable is as expected
#' i.e. that visit is a factor variable
#'
#' - validate_datalong_notMissing - Checks that none of the key variables (except the outcome variable)
#' contain any missing values
#'
#' - validate_datalong_complete - Checks that `data` is complete i.e. there is 1 row for each subject *
#' visit combination. e.g. that `nrow(data) == length(unique(subjects)) * length(unique(visits))`
#'
#' - validate_datalong_unifromStrata - Checks to make sure that any variables listed as stratification
#' variables do not vary over time. e.g. that subjects don't switch between stratification groups.
#'
validate_datalong <- function(data, vars) {
    validate_datalong_varExists(data, vars)
    validate_datalong_types(data, vars)
    validate_datalong_notMissing(data, vars)
    validate_datalong_complete(data, vars)
    validate_datalong_unifromStrata(data, vars)
    return(invisible(TRUE))
}





#' @rdname validate_datalong
validate_datalong_varExists <- function(data, vars) {


    assert_that(
        vars$outcome %in% names(data),
        msg = sprintf("Cannot find %s in `data`", vars$outcome)
    )

    assert_that(
        vars$group %in% names(data),
        msg = sprintf("Cannot find %s in `data`", vars$group)
    )

    assert_that(
        vars$visit %in% names(data),
        msg = sprintf("Cannot find %s in `data`", vars$visit)
    )

    assert_that(
        vars$subjid %in% names(data),
        msg = sprintf("Cannot find %s in `data`", vars$subjid)
    )

    assert_that(
        all(vars$strata %in% names(data)) | is.null(vars$strata),
        msg = "One of more variables listed in `vars$strata` do not exist in `data`"
    )

    covars <- extract_covariates(vars$covariates)
    assert_that(
        all(covars %in% names(data)) | is.null(covars),
        msg = "One of more variables listed in `vars$covariates` do not exist in `data`"
    )

    return(invisible(TRUE))
}

#' @rdname validate_datalong
validate_datalong_types <- function(data, vars) {
    covars <- extract_covariates(vars$covariates)

    assert_that(
        is.factor(data[[vars$subjid]]),
        msg = sprintf("Variable `%s` should be of type factor", vars$subjid)
    )

    assert_that(
        is.factor(data[[vars$group]]),
        msg = sprintf("Variable `%s` should be of type factor", vars$group)
    )

    assert_that(
        length(unique(data[[vars$group]])) == nlevels(data[[vars$group]]),
        msg = sprintf(
            "The number of levels in variable `%s` is different than the number of observed levels",
            vars$group
        )
    )

    assert_that(
        is.factor(data[[vars$visit]]),
        msg = sprintf("Variable `%s` should be of type factor", vars$visit)
    )

    assert_that(
        length(unique(data[[vars$visit]])) == nlevels(data[[vars$visit]]),
        msg = sprintf(
            "The number of levels in variable `%s` is different than the number of observed levels",
            vars$visit
        )
    )

    assert_that(
        is.numeric(data[[vars$outcome]]),
        msg = sprintf("Variable `%s` should be of type numeric", vars$outcome)
    )

    additional_vars <- c(covars, vars$strata)

    if (!is.null(additional_vars)) {
        for (var in additional_vars) {
            assert_that(
                is_num_char_fact(data[[var]]),
                msg = sprintf("Variable `%s` should be of type numeric, factor or character", var)
            )
        }
        for (var in covars) {
            if (is_char_fact(data[[var]])) {
                assert_that(
                    length(unique(data[[var]])) >= 2,
                    msg = sprintf(
                        "Covariate `%s` only has 1 level, categorical covariates must have at least 2 levels",
                        var
                    )
                )
            }
        }
    }
    return(invisible(TRUE))
}

#' @rdname validate_datalong
validate_datalong_notMissing <- function(data, vars) {
    non_missing_variables <- c(
        vars$group,
        vars$visit,
        vars$subjid,
        vars$strata,
        extract_covariates(vars$covariates)
    )
    for (var in non_missing_variables) {
        if (any(is.na(data[[var]]))) {
            stop(paste0("Variable ", var, " contains missing data"))
        }
    }
    return(invisible(TRUE))
}

#' @rdname validate_datalong
validate_datalong_complete <- function(data, vars) {
    unique_subjects <- unique(data[[vars$subjid]])
    unique_visits <- levels(data[[vars$visit]])

    data_dedup <- unique(data[, c(vars$subjid, vars$visit)])

    assert_that(
        nrow(data) == length(unique_subjects) * length(unique_visits),
        nrow(data_dedup) == nrow(data),
        msg = "At least one subject has either incomplete or duplicated data"
    )
    return(invisible(TRUE))
}

#' @rdname validate_datalong
validate_datalong_unifromStrata <- function(data, vars) {
    for (var in vars$strata) {
        x <- tapply(
            data[[var]],
            data[[vars$subjid]],
            function(x) length(unique(x))
        )
        if (!all(x == 1)) {
            stop(
                "Stratification variable '", var,
                "' is not constant within at least one subject"
            )
        }
    }
    return(invisible(TRUE))
}




#' @rdname validate_datalong
validate_dataice <- function(data, data_ice, vars, update = FALSE) {

    validate(vars)

    strategy <- vars$strategy
    visit <- vars$visit
    subjid <- vars$subjid

    assert_that(
        is.character(data_ice[[strategy]]),
        all(!is.na(data_ice[[strategy]])),
        msg = "`data_ice[[vars$strategy]]` must be a non-missing character vector"
    )

    assert_that(
        is_char_fact(data_ice[[subjid]]),
        all(!is.na(data_ice[[subjid]])),
        msg = "`data_ice[[vars$subjid]]` must be a non-missing character or factor vector"
    )

    assert_that(
        all(as.character(data_ice[[subjid]]) %in% as.character(data[[subjid]])),
        msg = "`data_ice[[vars$subjid]]` contains values that aren't in `data[[vars$subjid]]`"
    )

    if (!update) {
        valid_visits <- unique(as.character(data[[visit]]))

        assert_that(
            is.character(data_ice[[visit]]) | is.factor(data_ice[[visit]]),
            all(!is.na(data_ice[[visit]])),
            msg = "`data_ice[[vars$visit]]` must be a non-missing character or factor vector"
        )

        assert_that(
            all(as.character(data_ice[[visit]]) %in% valid_visits),
            msg = "`data_ice[[vars$visit]]` contains values that are not in `data[[vars$visit]]`"
        )
    }

    assert_that(
        length(data_ice[[subjid]]) == length(unique(data_ice[[subjid]])),
        msg = paste(
            "`data_ice` must contain at most 1 row per subjects.",
            "If you have multiple ICEs please use the first Non-MAR ICE"
        )
    )

    return(TRUE)
}

Try the rbmi package in your browser

Any scripts or data that you put into this service are public.

rbmi documentation built on Nov. 24, 2023, 5:11 p.m.