R/modules-smd.R
In tableone: Create 'Table 1' to Describe Baseline Characteristics with or without Propensity Score Weights

################################################################################
### Modules for standardized mean difference support
##
## Created on: 2015-07-29
## Author: Kazuki Yoshida
################################################################################

###
### References
################################################################################
## A unified approach to measuring the effect size between two groups using SAS
## https://support.sas.com/resources/papers/proceedings12/335-2012.pdf
## R multinomial distribution variance
## https://stackoverflow.com/questions/19960605/r-multinomial-distribution-variance


###
### Helpers common to both unweighted and weighed functions
################################################################################

## Dummy matrix to means (proportions for each dummy)
MultinomialMeans <- function(dummyMat, na.rm = TRUE) {
    colMeans(dummyMat, na.rm = na.rm)
}

## Proportions to variance-covariance matrix
## https://stackoverflow.com/questions/19960605/r-multinomial-distribution-variance
## diagonal p_k(1-p_k); off-diagonal -1 * p_k * p_l
MultinomialVar <- function(multinomialMeans) {
    ## Mean for each
    p <- multinomialMeans
    ## Diagonal elements are p_i(1-p_i)
    vars <- p * (1 - p)
    ## Off-diagonal elements are - p_i p_j where i != j
    covs <- - outer(p, p)
    ## Put diagonal elements
    diag(covs) <- vars
    ## Reduce to a scalar if 1x1
    drop(covs)
}

## List of mean vector of a multinomial variable to categorical SMD (Yang & Dalton 2012)
StdDiffFromLstMeans <- function(lstMeans) {

    ## list of variance-covariance matrices
    ## https://stackoverflow.com/questions/19960605/r-multinomial-distribution-variance
    ## diagonal p_k(1-p_k); off-diagonal -1 * p_k * p_l
    lstCovs <- lapply(lstMeans, MultinomialVar)

    ## All possible mean vector differences
    lstMeanDiffs <- lapply(lstMeans, function(x) {
        lapply(lstMeans, function(y) {
            x - y
        })
    })

    ## All possible covariance matrix means
    lstCovMeans <- lapply(lstCovs, function(x) {
        lapply(lstCovs, function(y) {
            (x + y) / 2
        })
    })

    ## Initialize a numeric vector object for capturing values
    sqSmds <- vector(mode = "numeric")

    ## Add upper triangle elements (i < j) to output list
    for (i in seq_along(lstMeans)) {
        for (j in seq_along(lstMeans)) {
            if (i < j) {
                ## For upper triangle elements only
                ## Squared Mahalanobis distance
                ## meanDiffs^T (pooled vcov)^-1 meanDiffs
                ## Generalized inverse for protection against singularity
                ## Reduces to true inverse if non-singular

                ## Column mean difference vector
                T_C <- t(t(lstMeanDiffs[[i]][[j]]))
                ## Pooled vcov
                S   <- lstCovMeans[[i]][[j]]

                if (all(S[!is.na(S)] %in% 0)) {
                    ## If S is a zero matrix, ginv is a zero matrix
                    ## which gives a zero SMD regardless of mean
                    ## difference. Such a case should be NaN.
                    ## NOTE: NA's are dropped first. Non-NA elements
                    ## are assessed for 0's. If all remaining
                    ## are zeros or no element remained (all NA),
                    ## all() returns TRUE, and sqMD is forced to NaN.

                    if (all(!is.na(T_C) & (T_C == 0))) {
                        ## If the mean difference vector is a zero vector,
                        ## the distance can be defined 0.
                        sqMD <- 0
                    } else {
                        sqMD <- NaN
                    }

                } else {
                    ## Squared Mahalanobis distance
                    sqMD <- t(T_C) %*% MASS::ginv(S) %*% T_C
                }

                ## Add sqrt of MD to output
                ## Not efficient; room for improvement
                sqSmds <- c(sqSmds, sqMD)
            }
        }
    }
    ## We want it on the original scale
    sqrt(sqSmds)
}


## Given strata x levels of variable table
## Return list of proportions of levels dropping redundant first level
LstMeansFromFullTable <- function(strataByLevels) {
    ## Proportion within each stratum
    ## Equivalent to mean of dummy variables
    propTables <- prop.table(strataByLevels, margin = 1)

    ## Drop first level to eliminate dependence if more than two levels
    ## Avoids errors with constant variable
    if (ncol(propTables) > 1) {
        propTables <- propTables[, -1, drop = FALSE]
    }

    ## list of mean vectors (Missing values are discarded)
    lstMeans <- lapply(seq_len(nrow(propTables)), function(i) {

        propTables[i,]
    })
    lstMeans
}


## Create a matrix form
FormatLstSmds <- function(lstSmds, nStrata) {

    if (nStrata < 2) {
        warning("nStrata has to be >= 2.")
    }

    ## matrix form
    matSmds <- do.call(rbind, lstSmds)

    ## Create contrast names
    allCombns <- t(combn(nStrata, 2))
    combnNames <- apply(allCombns, MARGIN = 1, FUN = paste, collapse = " vs ")
    colnames(matSmds) <- combnNames

    ## Add a mean column if more than two columns
    if (ncol(matSmds) > 1) {
        matSmds <- cbind(average = rowMeans(matSmds),
                         matSmds)
    }

    matSmds
}


###
### Functions for unweighted data only
################################################################################

### Check strata for NA-only strata
CheckNaOnlyStrata <- function(variable, group) {

    unlist(lapply(split(variable, group),
                  function(var) {
                      ## TRUE if only NA's within stratum
                      all(is.na(var))
                  }))
}


### Continuous/binary standardized mean differences
## Expects continuous or 0,1 binary variable
StdDiff <- function(variable, group, binary = FALSE, na.rm = TRUE) {

    ## Check strata for all NA strata
    logiAllNaStrata <- CheckNaOnlyStrata(variable, group)
    ## If ANY strata have only NA's do not remove NA's
    if (any(logiAllNaStrata)) {
        warning("Variable has only NA's in at least one stratum. na.rm turned off.")
        na.rm = FALSE
    }

    ## Proportion of 1 is the mean of variable
    means <- tapply(variable, group, mean, na.rm = na.rm)

    ## Variance is p(1-p)
    if (binary) {
        vars  <- means * (1 - means)
    } else {
        vars  <- tapply(variable, group, var, na.rm = na.rm)
    }

    ## Outer to obtain all pairwise differences
    meanDiffs  <- outer(X = means, Y = means, FUN = "-")
    ## Outer to obtain all pairwise variance mean
    varMeans   <- outer(X = vars, Y = vars, FUN = "+") / 2

    out <- meanDiffs / sqrt(varMeans)

    ## If mean difference is zero and variance is zero,
    ## only one constant exists across two groups.
    ## In this case, the SMD can be defined zero, rather than NaN from 0/0.
    out[is.na(out) & !is.na(meanDiffs) & (meanDiffs == 0) & !is.na(varMeans) & (varMeans == 0)] <- 0

    ## This lower.tri() approach is actually giving 2vs1, 3vs1, etc
    ## opposite of stated 1vs2, 1vs3. Only correct if abs() is used.
    abs(out[lower.tri(out)])
}

### Categorical (including binary) standardizzed mean difference
StdDiffMulti <- function(variable, group) {

    ## Check strata for all NA strata
    logiAllNaStrata <- CheckNaOnlyStrata(variable, group)

    ## If ALL strata have only NA's convert to a level
    ## If any strata have valid levels, table is ok
    ## Empty table breaks the function
    if (all(logiAllNaStrata)) {

        warning("Variable has only NA's in all strata. Regarding NA as a level.")

        variable <- factor(variable, exclude = NULL)
    }

    ## strata x variable table
    strataByLevels <- table(group, variable)
    lstMeans       <- LstMeansFromFullTable(strataByLevels)
    ## Return vector of SMDs
    StdDiffFromLstMeans(lstMeans)
}


### Standardized mean differences for multiple variables
## Continuous or binary only
StdDiffs <- function(data, vars, groupVar, binaryVars) {

    lapply(vars, function(var) {

        StdDiff(variable = data[,var], group = data[,groupVar],
                binary = (var %in% binaryVars))
    })
}


###
### Functions for weighted data only
################################################################################

### Check strata for NA-only strata
svyCheckNaOnlyStrata <- function(varName, groupName, design) {

    unlist(lapply(split(design$variables[,varName],
                        design$variables[,groupName]),
                  function(var) {
                      ## TRUE if only NA's within stratum
                      all(is.na(var))
                  }))
}

### Continuous/binary standardized mean differences
## Expects continuous or 0,1 binary variable
svyStdDiff <- function(varName, groupName, design, binary = FALSE, na.rm = TRUE) {

    ## Check strata for all NA strata
    logiAllNaStrata <- svyCheckNaOnlyStrata(varName, groupName, design)
    ## If ANY strata have only NA's do not remove NA's
    if (any(logiAllNaStrata)) {
        warning(varName, " has only NA's in at least one stratum. na.rm turned off.")
        na.rm = FALSE
    }

    varFormula   <- as.formula(paste("~", varName))
    groupFormula <- as.formula(paste("~", groupName))

    means <- svyby(formula = varFormula, by = groupFormula,
                   FUN = svymean, design = design, na.rm = na.rm)[,2]

    if (binary) {
        vars  <- means * (1 - means)
    } else {
        vars  <- svyby(formula = varFormula, by = groupFormula,
                       FUN = svyvar,  design = design, na.rm = na.rm)[,2]
    }

    ## Outer to obtain all pairwise differences
    meanDiffs <- outer(X = means, Y = means, FUN = "-")
    ## Outer to obtain all pairwise variance mean
    varMeans  <- outer(X = vars, Y = vars, FUN = "+") / 2

    out <- meanDiffs / sqrt(varMeans)

    ## If mean difference is zero and variance is zero,
    ## only one constant exists across two groups.
    ## In this case, the SMD can be defined zero, rather than NaN from 0/0.
    out[is.na(out) & !is.na(meanDiffs) & (meanDiffs == 0) & !is.na(varMeans) & (varMeans == 0)] <- 0

    ## This lower.tri() approach is actually giving 2vs1, 3vs1, etc
    ## opposite of stated 1vs2, 1vs3. Only correct if abs() is used.
    abs(out[lower.tri(out)])
}


### Categorical (including binary) standardizzed mean difference
svyStdDiffMulti <- function(varName, groupName, design) {

    ## Check strata for all NA strata
    logiAllNaStrata <- svyCheckNaOnlyStrata(varName, groupName, design)

    ## If ALL strata have only NA's convert to a level
    ## If any strata have valid levels, table is ok
    ## Empty table breaks the function
    if (all(logiAllNaStrata)) {

        warning(varName, " has only NA's in all strata. Regarding NA as a level.")

        design$variables[,varName] <- factor(design$variables[,varName],
                                             exclude = NULL)
    }

    ## If same variable is used as a group, factor() to allow tabling
    if (varName == groupName) {
        groupName <- sprintf("factor(%s)", groupName)
    }

    tabFormula   <- as.formula(sprintf("~ %s + %s", groupName, varName))

    ## strata x variable table
    strataByLevels <- svytable(formula = tabFormula, design = design)
    lstMeans       <- LstMeansFromFullTable(strataByLevels)
    ## Return vector of SMDs
    StdDiffFromLstMeans(lstMeans)
}


### Standardized mean differences for multiple variables
## Continuous or binary only
svyStdDiffs <- function(data, vars, groupVar, binaryVars) {

    lapply(vars, function(var) {

        svyStdDiff(varName = var, groupName = groupVar, design = data,
                   binary = (var %in% binaryVars))
    })
}

Any scripts or data that you put into this service are public.

tableone documentation built on April 15, 2022, 5:06 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

tableone
Create 'Table 1' to Describe Baseline Characteristics with or without Propensity Score Weights

R/modules-smd.R
In tableone: Create 'Table 1' to Describe Baseline Characteristics with or without Propensity Score Weights

Defines functions svyStdDiffs svyStdDiffMulti svyStdDiff svyCheckNaOnlyStrata StdDiffs StdDiffMulti StdDiff CheckNaOnlyStrata FormatLstSmds LstMeansFromFullTable StdDiffFromLstMeans MultinomialVar MultinomialMeans

Try the tableone package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tableone Create 'Table 1' to Describe Baseline Characteristics with or without Propensity Score Weights

R/modules-smd.R In tableone: Create 'Table 1' to Describe Baseline Characteristics with or without Propensity Score Weights

Defines functions svyStdDiffs svyStdDiffMulti svyStdDiff svyCheckNaOnlyStrata StdDiffs StdDiffMulti StdDiff CheckNaOnlyStrata FormatLstSmds LstMeansFromFullTable StdDiffFromLstMeans MultinomialVar MultinomialMeans

Try the tableone package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

tableone
Create 'Table 1' to Describe Baseline Characteristics with or without Propensity Score Weights

R/modules-smd.R
In tableone: Create 'Table 1' to Describe Baseline Characteristics with or without Propensity Score Weights