# R/calcpvalue.R


#' @importFrom stats pt pchisq pf kruskal.test
#' @importFrom survey svydesign svyranktest
#' @importFrom verbs Sum
calcPvaluesForOneVariable <- function(x, group, weights, is.binary = FALSE, non.parametric = FALSE)
{
    if (!is.factor(group))
        group <- factor(group)
    n.levels <- nlevels(group)
    levs <- levels(group)

    pval <- rep(NA, n.levels)
    if (n.levels < 2)
        return(pval)
    for (i in 1:n.levels)
    {
        y <- group == levs[i]
        if (non.parametric && !is.binary)
        {
            if (is.null(weights))
                pval[i] <- kruskal.test(x, y)$p.value
            else
            {
                df <- data.frame(x = x, y = y, w = weights)
                pval[i] <- svyranktest(x ~ y, svydesign(ids = ~1, weights = ~w, data = df))$p.value
            }
        }
        else
            pval[i] <- calcPvaluesForOneVarOneLevel(x, x.is.binary = is.binary, y = y, w = weights)
    }
    return(pval)
}
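
# A minimal usage sketch with toy data, guarded by if (FALSE) so it is never
# run when the package is loaded. Each returned p-value compares one level of
# the group against the complement of that level.
if (FALSE)
{
    set.seed(123)
    x <- rnorm(90)
    group <- rep(c("A", "B", "C"), each = 30)
    # Unweighted t-tests of each group against everyone else
    calcPvaluesForOneVariable(x, group, weights = NULL)
    # Weighted non-parametric version (uses survey::svyranktest)
    w <- runif(90, 0.5, 1.5)
    calcPvaluesForOneVariable(x, group, weights = w, non.parametric = TRUE)
}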

calcPvaluesForOneVarOneLevel <- function(x, # A binary or numeric variable
                      x.is.binary = TRUE,   # TRUE if x is a binary variable
                      y,                    # A binary variable
                      w = rep(1, length(x))) # Weight variable (same length as x)
{
    if (length(w) <= 1)
        w <- rep(1, length(x))

    # Local helper: subsets x by the filter f, passing NULL through unchanged
    # (this intentionally shadows base::Filter within this function)
    Filter = function(x, f)
    {
        if (is.null(f))
            return(NULL)
        x[f]
    }

    # Identify missing values, which are values that are:
    # - Missing in x (e.g., if x is Pick Any and x = Coke | Pepsi, then x is
    #   missing whenever either Coke or Pepsi has a missing value).
    # - Missing in y.
    # - Missing or <= 0 in w.
    # The parentheses around the if () else are needed so that missing or
    # non-positive weights are still flagged when y is NULL.
    m <- is.na(x) | (if (is.null(y)) FALSE else is.na(y)) | is.na(w) | w <= 0
    x <- Filter(x, !m)
    y <- Filter(y, !m)
    w <- Filter(w, !m)

    filters = list(which(y == 1), which(y == 0))
    a = computeNumericVarStats(Filter(x, filters[[1]]), Filter(w, filters[[1]]))
    b = computeNumericVarStats(Filter(x, filters[[2]]), Filter(w, filters[[2]]))

    # The test type is currently determined from the variable type (as in
    # SegmentComparisonTable); it should eventually come from the statistical
    # assumptions in QSettings, which will require updating the function
    # signature and tests.
    test.type <- if (!x.is.binary) "tTest" else "Nonparametric"
    return(compareTwoSamples(test.type, a, b, x.is.binary,
           is.weighted = length(unique(w)) > 1, bessel = 0))
}
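
# Sketch of a direct call for a single level (toy data, guarded): a weighted
# binary variable goes down the Rao-Scott path, exercising the summary
# statistics produced by computeNumericVarStats below.
if (FALSE)
{
    set.seed(42)
    x <- rbinom(100, 1, 0.4) # A binary variable
    y <- rbinom(100, 1, 0.5) # In-group indicator
    w <- runif(100, 0.5, 1.5)
    calcPvaluesForOneVarOneLevel(x, x.is.binary = TRUE, y = y, w = w)
}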

# a and b are lists which contain the summary statistics of each sample
compareTwoSamples <- function(test.type, a, b,
    is.binary = TRUE, is.weighted = FALSE, bessel = 0, dEff = 1)
{
    if (test.type == "tTest")
        return(tTest(a[["Average"]], b[["Average"]], a[["Standard Error"]],
            b[["Standard Error"]], a[["Base n"]], b[["Base n"]],
            is.binary, is.weighted, bessel, dEff))
    else if (test.type == "zTest")
        return(zTest(a[["Average"]], b[["Average"]], a[["Standard Error"]],
            b[["Standard Error"]], a[["Base n"]], b[["Base n"]],
            is.binary, is.weighted, bessel))
    else # Nonparametric or Chisquare; non-parametric tests for numeric
         # variables are handled before reaching this function
        return(raoScottSecondOrderChiSquareTest(a, b, is.weighted))
}


# The functions below are ported from the C# SamplingVariance class (albeit in slightly different forms)
# https://github.com/Displayr/q/blob/master/Flip/DataAnalysis/Inference/StandardErrors/SamplingVariance.cs
computeVariances <- function(mean, is.binary, sum.w, sum.ww, sum.xw, sum.xww, sum.xxw, sum.xxww, n.observations)
{
    if (is.binary) # Numerical precision
    {
        if (mean < 0.00000001)
            mean = 0
        else if (mean > 0.99999999)
            mean = 1
    }
    bessel.correction = n.observations / (n.observations - 1)
    mean2 = mean * mean
    sum_of_squares = sum.xxw - 2 * mean * sum.xw + mean2 * sum.w
    sum_of_squares.w = sum.xxww - 2 * mean * sum.xww + mean2 * sum.ww

    taylor = sum_of_squares.w / (sum.w * sum.w) * bessel.correction

    naive = if (is.binary) mean * (1 - mean) else sum_of_squares / sum.w
    naive = naive * bessel.correction / n.observations
    ess = if (!is.na(taylor) && taylor < 0.000001) # Due to numeric precision issues
              sum.w * sum.w / sum.ww
          else
              n.observations * naive / taylor

    list(taylor = taylor,
         naive = naive,
         ess = ess,
         se = sqrt(taylor))
}
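
# Sanity-check sketch (toy data, guarded): with equal weights the
# Taylor-series variance collapses to the familiar var(x) / n, and the
# effective sample size equals the actual sample size.
if (FALSE)
{
    x <- c(2, 4, 4, 4, 5, 5, 7, 9)
    n <- length(x)
    v <- computeVariances(mean(x), is.binary = FALSE,
                          sum.w = n, sum.ww = n,
                          sum.xw = sum(x), sum.xww = sum(x),
                          sum.xxw = sum(x * x), sum.xxww = sum(x * x),
                          n.observations = n)
    all.equal(v$taylor, var(x) / n)   # TRUE
    all.equal(v$ess, n)               # TRUE
    all.equal(v$se, sd(x) / sqrt(n))  # TRUE
}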

# A simplification of RaoScottSecondOrder2b2 from Q's C#
# https://github.com/Displayr/q/blob/master/Flip/DataAnalysis/Inference/Tests/ChiSquareTests.cs
# aa and bb contain summary statistics for each sample
#' @importFrom verbs Sum SumEachRow SumEachColumn
raoScottSecondOrderChiSquareTest <- function(aa, bb, is.weighted)
{
    if (!is.null(dim(aa[["Average"]])))
        pvals <- matrix(NA, nrow = NROW(aa[["Average"]]), ncol = NCOL(aa[["Average"]]))
    else
        pvals <- rep(NA, length = length(aa[["Average"]]))
    if (length(aa[["Average"]]) == 0)
        return(pvals)

    for (i in 1:length(aa[["Average"]]))
    {
        n.observations <- aa[["Base n"]][i] + bb[["Base n"]][i]
        if (is.weighted)
        {
            sum.w <- aa[["sumW"]][i] + bb[["sumW"]][i]
            sum.ww <- aa[["sumWW"]][i] + bb[["sumWW"]][i]
            p.a <- aa[["sumW"]][i] / sum.w
        } else
            p.a <- aa[["Base n"]][i] / n.observations
        p.b <- 1 - p.a

        mean.a <- aa[["Average"]][i]
        if (is.na(mean.a))
            mean.a <- 0
        mean.b <- bb[["Average"]][i]
        if (is.na(mean.b))
            mean.b <- 0

        sums.ww <- c(bb[["sumWW"]][i] - bb[["sumXWW"]][i], bb[["sumXWW"]][i],
                     aa[["sumWW"]][i] - aa[["sumXWW"]][i], aa[["sumXWW"]][i])
        proportions <- c(p.b * (1-mean.b), p.b * mean.b,
                         p.a * (1-mean.a), p.a * mean.a)
        counts = matrix(proportions * n.observations, 2)

        # If not weighted, this reduces to a chi-square test
        group_sizes = SumEachColumn(counts, remove.missing = FALSE)
        row.totals = SumEachRow(counts, remove.missing = FALSE)
        total = Sum(row.totals, remove.missing = FALSE)

        expected = matrix(c(group_sizes[1]*row.totals[1]/total,
                            group_sizes[1]*row.totals[2]/total,
                            group_sizes[2]*row.totals[1]/total,
                            group_sizes[2]*row.totals[2]/total), 2)

        pearson.statistic = Sum((counts - expected)^2/expected, remove.missing = FALSE)
        if (!is.weighted)
            pvals[i] <- pchisq(pearson.statistic, 1, lower.tail = FALSE)
        else
        {
            variance = multinomialCovarianceMatrix(proportions,
                    sums.ww, sum.ww, sum.w, n.observations)
            if (!is.na(pearson.statistic))
            {
                # Second-order Rao-Scott correction: scale the Pearson
                # statistic by a design effect (delta) estimated from the
                # weighted multinomial covariance matrix, giving an F statistic
                a = matrix(0, 4, 1)
                denominator = 0.0
                for (j in 1:4)
                {
                    prop = proportions[j]
                    j_prop = if (prop < 1e-12) 0 else 1.0 / prop
                    a[j, 1] = j_prop / 4.0
                    denominator = denominator + j_prop
                }
                a[2, 1] = -a[2, 1]
                a[3, 1] = -a[3, 1]
                denominator = denominator * 0.0625 / n.observations
                numerator = t(a) %*% variance %*% a
                delta = numerator / denominator
                f = pearson.statistic / delta
            } else
                f <- NA
            pvals[i] <- pf(f, 1, n.observations - 1, lower.tail = FALSE)
        }
    }
    return(pvals)
}
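
# Hedged check (toy counts, guarded): in the unweighted case the test reduces
# to Pearson's chi-square test without continuity correction.
if (FALSE)
{
    # Group a: 20 of 40 have the attribute; group b: 18 of 60
    aa <- list("Average" = 0.5, "Base n" = 40, sumW = 40, sumWW = 40, sumXWW = 20)
    bb <- list("Average" = 0.3, "Base n" = 60, sumW = 60, sumWW = 60, sumXWW = 18)
    raoScottSecondOrderChiSquareTest(aa, bb, is.weighted = FALSE)
    chisq.test(matrix(c(42, 18, 20, 20), 2), correct = FALSE)$p.value # Same
}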


# Code modified from https://github.com/Displayr/q/blob/master/Flip/DataAnalysis/Inference/Tests/TTests.cs and
# https://github.com/Displayr/q/blob/master/Flip/DataAnalysis/Inference/Tests/CombinedZandTTest.cs
tTest <- function(mean1, mean2, se1, se2, n1, n2,
                  is.binary, is.weighted, bessel = 0, dEff = 1)
{
    if (!is.binary)
    {
        v1 <- se1 * se1
        v2 <- se2 * se2
        v <- v1 + v2
        se <- sqrt(v)
        df <- v * v / (v1 * v1 / (n1 - 1) + v2 * v2 / (n2 - 1))

    } else if (is.binary && !is.weighted)
    {
        m12 <- (n1 * mean1 + n2 * mean2)/(n1 + n2)
        se <- sqrt(m12 * (1 - m12) * (1/n1 + 1/n2))
        df <- n1 + n2 - 2

    } else if (is.binary && is.weighted)
    {
        se <- sqrt(dEff * (se1 * se1 + se2 * se2))
        df <- (se1^2 + se2^2)^2 /
            ((se1^4)/(n1-bessel) + (se2^4)/(n2-bessel))
    }
    t = (mean1 - mean2)/se
    p = pt(-abs(t), df) * 2
    return(p)
}
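
# Hedged check (toy data, guarded): for a numeric, unweighted variable the
# degrees of freedom above are the Welch-Satterthwaite approximation, so the
# p-value matches stats::t.test.
if (FALSE)
{
    set.seed(1)
    x1 <- rnorm(30)
    x2 <- rnorm(40, mean = 0.5)
    p <- tTest(mean(x1), mean(x2),
               se1 = sd(x1) / sqrt(30), se2 = sd(x2) / sqrt(40),
               n1 = 30, n2 = 40, is.binary = FALSE, is.weighted = FALSE)
    all.equal(p, t.test(x1, x2)$p.value) # TRUE
}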

#' @importFrom stats pnorm
zTest <- function(mean1, mean2, se1, se2, n1, n2, is.binary, is.weighted, bessel = 0)
{
    # A pooled standard error is only used for unweighted binary variables;
    # in all other cases the two samples' standard errors combine directly.
    if (is.binary && !is.weighted)
    {
        m12 <- (n1 * mean1 + n2 * mean2) / (n1 + n2)
        se <- sqrt(m12 * (1 - m12) * (n1 + n2) / (n1 + n2 - 2 * bessel) * (1/n1 + 1/n2))
    }
    else
        se <- sqrt(se1 * se1 + se2 * se2)
    z = (mean1 - mean2)/se
    p = pnorm(-abs(z)) * 2
    return(p)
}
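
# Hedged check (toy counts, guarded): for an unweighted binary variable with
# bessel = 0 this is the textbook two-proportion z-test, whose p-value equals
# the continuity-uncorrected chi-square test.
if (FALSE)
{
    # 20 of 40 successes versus 18 of 60; se1 and se2 are unused here
    p <- zTest(0.5, 0.3, se1 = NA, se2 = NA, n1 = 40, n2 = 60,
               is.binary = TRUE, is.weighted = FALSE)
    all.equal(p, prop.test(c(20, 18), c(40, 60), correct = FALSE)$p.value) # TRUE
}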


multinomialCovarianceMatrix <- function(proportions, ww, ww_total, w_total, n)
{
    k = length(proportions)
    covariance = matrix(0, k, k)
    for (r in 1:k)
    {
        for (c in 1:k)
        {
            p1 = proportions[r]
            p2 = proportions[c]
            ww1 = ww[r]
            ww2 = ww[c]
            sc = if (r == c) computeSamplingVarianceForProportion(p1, ww1, ww_total, w_total, n)
                 else        samplingCovariance(p1, p2, ww1, ww2, ww_total, w_total, n)
            covariance[c, r] = covariance[r, c] = sc
        }
    }
    return(covariance)
}

computeSamplingVarianceForProportion <- function(input_proportion, ww, ww_total, w_total, sample_size)
{
    # Clamp the proportion to [0, 1] to avoid numerical precision problems
    proportion = input_proportion
    if (!is.na(proportion))
    {
        if (proportion < 1e-8)
            proportion = 0.0
        else if (proportion > 1 - 1e-8)
            proportion = 1.0
    }
    complement = 1.0 - proportion
    ww_sums_of_squares = complement * complement * ww + proportion * proportion * (ww_total - ww)
    return(ww_sums_of_squares / (w_total * w_total) * sample_size / (sample_size - 1.0))
}

# From the C# SamplingVariance(double proportion1, double proportion2, double ww1, double ww2, double ww_total, double w_total, int n, StatisticalAssumptions statistical_assumptions)
samplingCovariance <- function(proportion1, proportion2, ww1, ww2, ww_total, w_total, n)
{
    ww_sums_of_squares = proportion1 * proportion2 * (ww_total - ww1 - ww2) -
        proportion1 * (1 - proportion2) * ww2 -
        proportion2 * (1 - proportion1) * ww1
    return(ww_sums_of_squares / (w_total * w_total) * (n / (n - 1)))
}
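
# Sketch (equal weights, guarded): with unit weights the covariance matrix
# reduces to the textbook multinomial covariance, with an n - 1 denominator
# from the Bessel correction.
if (FALSE)
{
    p <- c(0.42, 0.18, 0.2, 0.2)
    n <- 100
    V <- multinomialCovarianceMatrix(p, ww = p * n, ww_total = n,
                                     w_total = n, n = n)
    all.equal(diag(V), p * (1 - p) / (n - 1))  # TRUE
    all.equal(V[1, 2], -p[1] * p[2] / (n - 1)) # TRUE
}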

#' @importFrom verbs Sum
computeNumericVarStats <- function(x, w)
{
    n.observations <- length(x)
    ww = w * w
    xw = x * w
    xxw = x * xw
    xww = xw * w
    xxww = xxw * w
    sum.w = Sum(w, remove.missing = FALSE)
    sum.xw = Sum(xw, remove.missing = FALSE)
    sum.ww = Sum(ww, remove.missing = FALSE)
    sum.xxw = Sum(xxw, remove.missing = FALSE)
    sum.xww = Sum(xww, remove.missing = FALSE)
    sum.xxww = Sum(xxww, remove.missing = FALSE)
    mean.x = sum.xw / sum.w

    n.used.in.bessel.correction = n.observations
    var = computeVariances(mean.x, FALSE, sum.w, sum.ww, sum.xw, sum.xww, sum.xxw, sum.xxww, n.used.in.bessel.correction)

    return(c("Average" = mean.x, "Base n" = n.observations,
        sumW = sum.w, sumWW = sum.ww, sumXWW = sum.xww,
        "Standard Error" = var$se))
}
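
# Usage sketch (toy data, guarded): "Average" is the weighted mean and
# "Standard Error" is the Taylor-series standard error of that mean.
if (FALSE)
{
    x <- c(1, 2, 3, 4, 5)
    w <- c(1, 1, 2, 2, 1)
    stats <- computeNumericVarStats(x, w)
    all.equal(stats[["Average"]], weighted.mean(x, w)) # TRUE
    stats[["Standard Error"]]
}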