R/utilities.R

Defines functions conformParameters computeMeanAndSD generateNormalData generateDiscreteData colNamesForStage computeMHPBoundaryITT computeMHPBoundaries den.vs groupSampleSize wilcoxon

Documented in colNamesForStage computeMeanAndSD computeMHPBoundaries computeMHPBoundaryITT conformParameters den.vs generateDiscreteData generateNormalData groupSampleSize wilcoxon

## Global constant, the number of stages for this design
NUM_STAGES <- 3L

## Global constant, the names of columns in trial history relating to Ihat, the subgroup chosen
IHAT_COL_NAMES = c("decision_Ihat", "wcx_Ihat", "wcx.fut_Ihat", "Nl_Ihat", "Ihat", "stage_Ihat", "lost")

## Global constant, the name of the CI column
CI_COL_NAME = c("bounds")

## Global constant, the name of the column for stage at which the trial exits
STAGE_COL_NAME = "exitStage"


#' Compute the standardized Wilcoxon test statistic for two samples
#'
#' We compute the standardized Wilcoxon test statistic with mean 0 and
#' and standard deviation 1 for samples \eqn{x} and \eqn{y}.  The R function
#' [stats::wilcox.test()] returns the statistic
#'
#' \deqn{
#' U = \sum_i R_i - \frac{m(m + 1)}{2}
#' }{
#' U = (sum over i) R_i - m(m + 1) / 2
#' }
#'
#' where \eqn{R_i} are the ranks of the first sample \eqn{x} of size
#' \eqn{m}. We compute
#'
#' \deqn{
#' \frac{(U - mn(1/2 + \theta))}{\sqrt{mn(m + n + 1) / 12}}
#' }{
#' (U - mn(1/2 + theta)) / (mn(m + n + 1) / 12)^(1/2)
#' }
#'
#' where \eqn{\theta} is the alternative hypothesis shift on the
#'     probability scale, i.e. \eqn{P(X > Y) = 1/2 + \theta}.
#'
#' @param x a sample numeric vector
#' @param y a sample numeric vector
#' @param theta a value > 0 but < 1/2.
#' @return the standardized Wilcoxon statistic
#'
#' @importFrom stats wilcox.test
#' @export
wilcoxon <- function(x, y, theta = 0) {
    r <- rank(c(x, y))
    n.x <- as.double(length(x))
    n.y <- as.double(length(y))
    STATISTIC <- c(W = sum(r[seq_along(x)]) - n.x * (n.x +
                                                     1)/2)
    TIES <- (length(r) != length(unique(r)))
    NTIES <- table(r)
    z <- STATISTIC - n.x * n.y * (1/2 + theta)
    SIGMA <- sqrt((n.x * n.y/12) * ((n.x + n.y + 1) -
                                    sum(NTIES^3 - NTIES)/((n.x + n.y) * (n.x + n.y -
                                                                         1))))
    ##CORRECTION <- sign(z) * 0.5
    z / SIGMA
}


#' Compute the sample size for any group at a stage assuming a nested
#' structure as in the paper.
#'
#' In the three stage design under consideration, the groups are
#' nested with assumed prevalences and fixed total sample size at each
#' stage. This function returns the sample size for a specified group
#' at a given stage, where the futility stage for the overall group
#' test may be specified along with the chosen subgroup.
#'
#' @param prevalence the vector of prevalence, will be normalized if
#'     not already so. The length of this vector implicitly indicates
#'     the number of groups J.
#' @param N an integer vector of length 3 indicating total sample size
#'     at each of the three stages
#' @param stage the stage of the trial
#' @param group the group whose sample size is desired
#' @param HJFutileAtStage is the stage at which overall futility
#'     occured. Default `NA` indicating it did not occur. Also
#'     ignored if stage is 1.
#' @param chosenGroup the selected group if HJFutilityAtStage is not
#'     `NA`. Ignored if stage is 1.
#' @return the sample size for group
#'
#' @export
groupSampleSize <- function(prevalence, N, stage, group, HJFutileAtStage = NA, chosenGroup = NA) {
    if (!integerInRange(N, low = 1) || length(N) != NUM_STAGES) {
        stop("Improper values for sample size N")
    }
    if (!identical(order(N), seq_along(N))) {
        stop("Sample size vector N is not monotone increasing sequence")
    }

    J <- length(prevalences)

    if (!scalarInRange(J, low = 2, high = 10)) {
        stop("Improper number of subgroups; need at least 2; max 10")
    }

    if (any(prevalence <= 0)) {
        stop("Improper prevalence specified")
    }
    prevalences <- prevalence / sum(prevalence)
    q <- cumsum(prevalence)

    if (stage == 1 || is.na(HJFutileAtStage) || stage == HJFutileAtStage) {
        ## catches stage = 1 and all cases where stage == HJFutileAtStage
        N[stage] * q[group]
    } else {
        ## stage > 1 && stage > HJFutileAtStage
        stopifnot(stage <= 3 && 1 <= HJFutileAtStage && HJFutileAtStage < stage)
        if (stage == 2) {
            ## HJFutileAtStage = 1 for sure
            if (group <= chosenGroup) {
                qq <- prevalence[seq_len(chosenGroup)]
                qq <- cumsum(qq / sum(qq))
                s1 <- N[1] * q[chosenGroup]
                (N[2] - s1) * qq[group] + N[1] * q[group]
            } else {
                N[1] * q[group] + (N[2] - N[1] * q[chosenGroup])
            }
        } else {
            ## stage == 3 here
            if (HJFutileAtStage == 2) {
                if (group <= chosenGroup) {
                    qq <- prevalence[seq_len(chosenGroup)]
                    qq <- cumsum(qq / sum(qq))
                    s2 <- N[2] * q[chosenGroup]
                    (N[3] - s2) * qq[group] + N[2] * q[group]
                } else {
                    N[2] * q[group] + (N[3] - N[2]* q[chosenGroup])
                }
            } else {
                ## HJFutileAtStage = 1
                if (group <= chosenGroup) {
                    qq <- prevalence[seq_len(chosenGroup)]
                    qq <- cumsum(qq / sum(qq))
                    s1 <- N[1] * q[chosenGroup]
                    (N[3] - s1) * qq[group] + N[1] * q[group]
                } else {
                    N[1] * q[group] + (N[3] - N[1] * q[chosenGroup])
                }
            }
        }
    }
}

#' Conditional probability of \eqn{i}-th subgroup statistic being
#' chosen given the appropriate mean, covariance matrix and futility
#' boundary \eqn{\tilde{b}}{btilde} at \eqn{v}.
#'
#' The computation involves a \eqn{J-1} multivariate normal integral
#' of the conditional density of the \eqn{i}-th subgroup statistic
#' given that it was maximal among all subgroups:
#'
#'\deqn{
#' \phi_i(v)(\int_0^v\int_0^v\ldots
#' \int_0^{\tilde{b}} \phi_v(z_{-i})dz_{-i})
#' }
#'
#' where \eqn{z_{-i}} denotes all subgroups other than \eqn{i}.
#'
#' @param v the value of the statistic
#' @param i the subgroup
#' @param mu.prime the conditional mean vector of the distribution of
#'     length \eqn{J - 1}; needs to be multipled by the conditional
#'     value, the parameter `v`.
#' @param Sigma.prime the conditional covariance matrix of dimension
#'     \eqn{J-1} by \eqn{J-1}
#' @param fut the futility boundary, which is \eqn{\tilde{b}}{btilde}
#'     for stages 1 and 2, but \eqn{c} for stage 3
#' @return the conditional probability
#'
#' @rdname ASSISTant-internal
#'
#' @importFrom mvtnorm pmvnorm Miwa
#' @importFrom stats dnorm
den.vs <- function(v, i, mu.prime, Sigma.prime, fut ) {
    ## Density function used in integration
    mu.prime <- mu.prime * v
    mvtnorm::pmvnorm(upper = c(rep(v, nrow(mu.prime) - 1), fut), mean = mu.prime[, i],
                     sigma = Sigma.prime[[i]], algorithm = mvtnorm::Miwa()) * stats::dnorm(v)
}

#' Compute the futility boundary (modified Haybittle-Peto) for the
#' first two stages
#'
#' The futility boundary \eqn{\tilde{b}}{btilde} is computed by
#' solving (under the alternative)
#'
#' \deqn{
#' P(\tilde{Z}_J^1\le\tilde{b} or \tilde{Z}_J^2\le\tilde{b}) = \epsilon\beta }
#'
#' where the superscripts denote the stage and \eqn{\epsilon} is the
#' fraction of the type I error (\eqn{\alpha}) spent and \eqn{\beta}
#' is the type II error. We make use of the joint normal density of
#' \eqn{Z_{J}} (the overall group) at each of the three stages and the
#' fact that the \eqn{\tilde{Z_J}} is merely a translation of
#' \eqn{Z_J}. So here the calculation is based on a mean of zero and
#' has to be translated during use!
#'
#' @param beta the type II error
#' @param cov.J the 3 x 3 covariance matrix
#'
#' @importFrom stats uniroot qnorm
#' @importFrom mvtnorm pmvnorm Miwa
#' @export
mHP.btilde <- function (beta, cov.J) {
    sigma <- cov.J[-NUM_STAGES, -NUM_STAGES]
    btilde <- stats::uniroot(f = function(btilde) {
        1 - mvtnorm::pmvnorm(lower = rep(btilde, NUM_STAGES - 1),
                             upper = rep(Inf, NUM_STAGES - 1),
                             sigma = sigma,
                             algorithm = Miwa()) -
            beta },
        lower = stats::qnorm(beta) - 1,
        upper = stats::qnorm(beta^(1 / (NUM_STAGES - 1))) + 1)
    btilde$root
}

#' Compute the efficacy boundary (modified Haybittle-Peto) for the
#' first two stages
#'
#' @param prevalence the vector of prevalences between 0 and 1 summing
#'     to 1. \eqn{J}, the number of groups, is implicitly the length
#'     of this vector and should be at least 2.
#' @param N a three-vector of total sample size at each stage
#' @param cov.J the 3 x 3 covariance matrix for Z_J at each of the
#'     three stages
#' @param mu.prime a list of \eqn{J} mean vectors, each of length
#'     \eqn{J-1} representing the conditional means of all the other
#'     \eqn{Z_j} given \eqn{Z_i}. This mean does not account for the
#'     conditioned value of \eqn{Z_i} and so has to be multiplied by
#'     that during use!
#' @param Sigma.prime a list of \eqn{J} covariance matrices, each
#'     \eqn{J-1} by \eqn{J-1} representing the conditional covariances
#'     all the other \eqn{Z_j} given \eqn{Z_i}
#' @param alpha the amount of type I error to spend
#' @param btilde the futility boundary
#' @param theta the effect size on the probability scale
#' @importFrom stats integrate pnorm uniroot
#' @importFrom mvtnorm pmvnorm Miwa
#' @export
mHP.b <- function (prevalence, N, cov.J, mu.prime, Sigma.prime, alpha, btilde, theta) {
    J <- length(prevalence)
    q <- cumsum(prevalence / sum(prevalence))

    crossingProb <- function(b) {
        ##
        ## Function to compute conditional probability of rejecting
        ## the subgroup hypothesis for group i at stage (should be 1
        ## or 2), given that the trial was futile at stage.accept.J.
        ##
        f <- function(stage, stage.accept.J, i) {
            ## Translate btilde appropriately from the theta
            ## (probability) scale to the standard scale; see writeup.
            btilde <- btilde + theta * sqrt(3 * N[stage.accept.J])
            ## Adjust the sample size to account for the loss, once
            ## HJ is accepted
            ssi <- replace(N, stage.accept.J, N[stage.accept.J] * q[i])
            if (stage == stage.accept.J) {
                ## Rejection of subgroup at same stage as futility stage
                ## So this is just an integral of the conditional joint
                ## distribution
                ##
                stats::integrate(
                           function(x) {
                               sapply(x, function(x) den.vs(x, i, mu.prime,
                                                            Sigma.prime, fut = btilde))
                           },
                           lower = b, upper = Inf)$value
            } else {
                ## Rejection of subgroup at the subsequent stage,
                ## that is, stage === stage.accept.J + 1
                ## For efficiency, we don't do explicit checking of such
                ## conditions, but the invocation below is implicitly expected
                ## to respect this fact.
                ##
                ## So here we have to integrate the product of the
                ## conditional joint distribution and the probability
                ## of the i-th group statistic exceeding the at the
                ## next stage.
                ##
                sigma <- sqrt(ssi[stage - 1] / ssi[stage])
                integrand <- function(u) {
                    den.vs(u, i, mu.prime, Sigma.prime, fut = btilde) *
                        stats::pnorm(b, mean = u * sigma, sd = sqrt(1 - sigma^2),
                              lower.tail = FALSE)
                }
                stats::integrate(
                           function(u) sapply(u, integrand),
                           lower = -Inf, upper = b)$value
            }
        }
        ## Type I error at interim stages 1 and 2 =
        ## P(accept H_J at stage 1, reject H_I at stage 1) +
        ## P(accept H_J at stage 1, reject H_I at stage 2) +
        ## P(accept H_J at stage 2, reject H_I at stage 2)
        ##          for I in 1:(J-1) +
        ## P(reject H_J at stage 1 or 2)
        ##
        sum(sapply(
            seq_len(J - 1), function(i)  f(1, 1, i) + f(2, 1, i) + f(2, 2, i))) +
            ##
            1 - mvtnorm::pmvnorm(lower = -Inf, upper = b, mean = rep(0, NUM_STAGES - 1),
                                 sigma = cov.J[-NUM_STAGES, -NUM_STAGES],
                                 algorithm = mvtnorm::Miwa()) -
            ##
            alpha
    }
    stats::uniroot(f = crossingProb, lower = 1.0, upper = 4.0)$root
}


#' Compute the efficacy boundary (modified Haybittle-Peto) for the
#' final (third) stage
#'
#' @param prevalence the vector of prevalences between 0 and 1 summing
#'     to 1. \eqn{J}, the number of groups, is implicitly the length
#'     of this vector and should be at least 2.
#' @param N a three-vector of total sample size at each stage
#' @param cov.J the 3 x 3 covariance matrix for Z_J at each of the
#'     three stages
#' @param mu.prime a list of \eqn{J} mean vectors, each of length
#'     \eqn{J-1} representing the conditional means of all the other
#'     \eqn{Z_j} given \eqn{Z_i}. This mean does not account for the
#'     conditioned value of \eqn{Z_i} and so has to be multiplied by
#'     that during use!
#' @param Sigma.prime a list of \eqn{J} covariance matrices, each
#'     \eqn{J-1} by \eqn{J-1} representing the conditional covariances
#'     all the other \eqn{Z_j} given \eqn{Z_i}
#' @param alpha the amount of type I error to spend
#' @param btilde the futility boundary
#' @param b the efficacy boundary for the first two stages
#' @param theta the effect size on the probability scale
#'
#' @importFrom stats uniroot qnorm integrate
#' @importFrom mvtnorm pmvnorm Miwa
#' @export
mHP.c <- function (prevalence, N, cov.J, mu.prime, Sigma.prime, alpha, btilde, b, theta) {
    ## Function for computing final boundary c
    J <- length(prevalence)
    q <- cumsum(prevalence / sum(prevalence))

    crossingProb <- function(c) {
        ##
        ## Function for bounding the probability of rejecting either
        ## H_J or H_I at the third stage.
        ##
        f <- function(stage.accept.J, i) {
            ## i is sub-population selected
            ##
            ## Translate btilde appropriately from the theta
            ## (probability) scale to the standard scale; see writeup.
            ##
            btilde <- btilde + theta * sqrt(3 * N[stage.accept.J])
            ## Adjust the sample size to account for the loss, once
            ## HJ is accepted
            ssi <- replace(N, stage.accept.J, N[stage.accept.J] * q[i])

            if (stage.accept.J == 3 ) {
                ## Rejection of subgroup at same stage as futility
                ## stage, that is, stage = 3.  So this is just an
                ## integral of the conditional joint distribution,
                ## except that at the third stage, the critical
                ## boundary is c.
                ##
                stats::integrate(
                           function(x) {
                               sapply(x, function(x)
                                   den.vs(x, i, mu.prime, Sigma.prime, fut = c))
                           },
                           lower = c,
                           upper = Inf)$value
            } else if (stage.accept.J == 2 ) {
                ## Rejection of subgroup at the subsequent stage,
                ## that is, H_J was futile at stage 2, and H_I is rejected
                ## at stage 3.
                ##
                ## So here we have to integrate the product of the
                ## conditional joint distribution at stage 2 and the
                ## probability of the i-th group statistic exceeding
                ## the critical value stage 3. The latter is a
                ## one-dimensional integral in this case, with
                ## appropriate standard deviation. Note that the
                ## critical boundary is c in the last stage!
                ##
                sigma <- sqrt(ssi[2] / ssi[3])
                integrand <- function(u) {
                    den.vs(u, i, mu.prime, Sigma.prime, btilde) *
                        stats::pnorm(c, mean = u * sigma, sd = sqrt(1 - sigma^2), lower.tail = FALSE)
                }
                stats::integrate(function(u) sapply(u, integrand),
                                 lower = -Inf,
                                 upper = b)$value
            } else {
                ## Rejection of subgroup at the third stage, while
                ## H_J was futile at stage 1, and H_I is rejected
                ## at stage 3.
                ##
                ## So here we have to integrate the product of the
                ## conditional joint distribution at stage 1 and the
                ## probability of the i-th group statistic exceeding
                ## the critical value at stage 3, but not stage 2. The
                ## latter probability is a 2-dimensional integral with
                ## an appropriate covariance structure. Once again,
                ## note that the critical boundary at stage 3 is c.
                ##
                v23 <- sqrt(c(ssi[1] / ssi[2], ssi[1] / ssi[3]))
                sigma23 <- matrix(sqrt(ssi[2] / ssi[3]), 2, 2)
                diag(sigma23) <- 1
                sigma <- sigma23 - v23 %*% t(v23)
                integrand <- function(u) {
                    den.vs(u, i, mu.prime, Sigma.prime, btilde) *
                        mvtnorm::pmvnorm(lower = c(-Inf, c),
                                         upper = c(b, Inf),
                                         mean = u * v23,
                                         sigma = sigma,
                                         algorithm = mvtnorm::Miwa())
                }
                stats::integrate(function(u) sapply(u, integrand),
                                 lower = -Inf,
                                 upper = b)$value
            }
        }
        ##
        ## Type I error at final stage =
        ## P(accept H_J at stage 1, reject H_I at stage 3) +
        ## P(accept H_J at stage 2, reject H_I at stage 3) +
        ## P(accept H_J at stage 3, reject H_I at stage 3) or
        ##             for I in 1:(J-1) +
        ## P(reject H_J at stage 3)
        ##
        sum(sapply(seq_len(J - 1), function(i) f(1, i) + f(2, i) + f(3, i))) +
            ##
            mvtnorm::pmvnorm(lower = c(rep(-Inf, NUM_STAGES - 1), c),
                             upper = c(rep(b, NUM_STAGES - 1), Inf), sigma = cov.J,
                             mean = rep(0, NUM_STAGES),
                             algorithm = mvtnorm::Miwa()) -
            ##
            alpha
    }
    stats::uniroot(f = crossingProb,
                   lower = min(0.0, b - 2.0),
                   upper = max(b + 2.0, 4.0))$root
}

#' Compute the three modified Haybittle-Peto boundaries
#'
#' @param prevalence the vector of prevalences between 0 and 1 summing
#'     to 1. \eqn{J}, the number of groups, is implicitly the length
#'     of this vector and should be at least 2.
#' @param N a three-vector of total sample size at each stage
#' @param alpha the type I error
#' @param beta the type II error
#' @param eps the fraction (between 0 and 1) of the type 1 error to
#'     spend in the interim stages 1 and 2
#' @param futilityOnly a logical value indicating only the futility
#'     boundary is to be computed; default `FALSE`
#' @return a named vector of three values containing
#'     \eqn{\tilde{b}}{btilde}, b, c
#'
#' @importFrom stats integrate pnorm uniroot
#' @export
computeMHPBoundaries <- function(prevalence, N, alpha, beta, eps, futilityOnly = FALSE) {
    J <- length(prevalence)
    q <- cumsum(prevalence / sum(prevalence))

    theta <- (qnorm(1 - alpha) + qnorm(1 - beta)) / sqrt(3 * N[3])

    ## Sigma = covariance matrix between subgroup,
    ## which is roughly stage independent

    Sigma <- matrix(0, J, J)
    for (i in seq_len(J - 1)) {
        for (j in (i + 1):J) {
            Sigma[i, j] <- sqrt(q[i] / q[j])
        }
    }
    Sigma <- Sigma + t(Sigma)
    diag(Sigma) <- 1

    mu.prime <- matrix(0, (J - 1), J)
    Sigma.prime <- vector("list", J)
    for (i in seq_len(J)) {
        mu.prime[, i] <- Sigma[-i, i]
        Sigma.prime[[i]] <- Sigma[-i, -i] - Sigma[-i, i] %*% t(Sigma[i, -i])
    }


    cov.J <- matrix(0, NUM_STAGES, NUM_STAGES)
    for (i in seq_len(NUM_STAGES - 1)) {
        for (j in (i + 1):NUM_STAGES) {
            cov.J[i, j] <- sqrt((N[i]^2 * (N[j] + 1)) / (N[j]^2 * (N[i] + 1)))
        }
    }
    cov.J <- cov.J + t(cov.J)
    diag(cov.J) <- 1

    btilde <- mHP.btilde(beta = beta * eps, cov.J = cov.J)
    if (futilityOnly) {
        b <- c <- NA
    } else {
        b <- mHP.b(prevalence = prevalence,
                   N = N,
                   cov.J = cov.J,
                   mu.prime = mu.prime,
                   Sigma.prime = Sigma.prime,
                   alpha = alpha * eps,
                   btilde = btilde,
                   theta = theta)
        c <- mHP.c(prevalence = prevalence,
                   N = N,
                   cov.J = cov.J,
                   mu.prime = mu.prime,
                   Sigma.prime = Sigma.prime,
                   alpha = alpha * (1 - eps),
                   btilde = btilde,
                   b = b,
                   theta = theta)
    }
    c(btilde = btilde, b = b, c = c)
}

#' Compute the three modified Haybittle-Peto boundaries and effect size
#'
#' @param prevalence the vector of prevalences between 0 and 1 summing
#'     to 1. \eqn{J}, the number of groups, is implicitly the length
#'     of this vector and should be at least 2.
#' @param alpha the type I error
#' @return a named vector of a single value containing the value for `c`
#' @export
computeMHPBoundaryITT <- function(prevalence, alpha) {
    J <- length(prevalence)
    q <- cumsum(prevalence / sum(prevalence))

    ## Sigma = covariance matrix between subgroup,
    ## which is roughly stage independent

    Sigma <- matrix(0, J, J)
    for (i in seq_len(J - 1)) {
        for (j in (i + 1):J) {
            Sigma[i, j] <- sqrt(q[i] / q[j])
        }
    }
    Sigma <- Sigma + t(Sigma)
    diag(Sigma) <- 1

    mu.prime <- matrix(0, (J - 1), J)
    Sigma.prime <- vector("list", J)
    for (i in seq_len(J)) {
        mu.prime[, i] <- Sigma[-i, i]
        Sigma.prime[[i]] <- Sigma[-i, -i] - Sigma[-i, i] %*% t(Sigma[i, -i])
    }

    ## Derive interim eff boundary b.I for subgp
    crossingProb <- function(c) {
        f <- function(i) {
            ##i=sub-population selected
            stats::integrate(
                       function(x) {
                           sapply(x, function(x)
                               den.vs(x, i, mu.prime, Sigma.prime, fut = c))
                       },
                       lower = c,
                       upper = Inf)$value
        }
        sum(sapply(seq_len(J - 1), function(i) f(i))) +
            stats::pnorm(c, lower.tail = FALSE) - alpha
    }
    c(cAlpha = stats::uniroot(f = crossingProb, lower = 1, upper = 4)$root)
}


#' Return a vector of column names for statistics for a given stage
#'
#' @param stage the trial stage (1 to 3 inclusive).
#' @param J the number of subgroups
#' @return a character vector of the column names
#' @export
colNamesForStage <- function(stage, J) {
    seqJ <- seq_len(J)
    c(paste(c("decision", "wcx", "wcx.fut", "Nl"), stage, sep = "_"),
      sapply(seqJ, function(group) {
          c(sprintf("wcx_%d_%d", stage, group),
            sprintf("nc_%d_%d", stage, group),
            sprintf("nt_%d_%d", stage, group),
            sprintf("muc_%d_%d", stage, group),
            sprintf("mut_%d_%d", stage, group),
            sprintf("sdc_%d_%d", stage, group),
            sprintf("sdt_%d_%d", stage, group))
      }))
}

#' A data generation function using a discrete distribution for Rankin
#' score rather than a normal distribution
#'
#' @param prevalence a vector of group prevalences (length denoted by J below)
#' @param N the sample size to generate
#' @param support the support values of the discrete distribution (length K), default 0:6
#' @param ctlDist a probability vector of length K denoting the Rankin score distribution for control.
#' @param trtDist an K x J probability matrix with each column is the Rankin distribution for the associated group
#' @return a three-column data frame of `subGroup`, `trt` (0 or 1), and `score`
#' @examples
#' # Simulate data from a discrete distribution for the Rankin scores,
#' # which are typically ordinal integers from 0 to 6 in the following
#' # simulations. So we define a few scenarios.
#' library(ASSISTant)
#' null.uniform <- rep(1, 7L) ## uniform on 7 support points
#' hourglass <- c(1, 2, 2, 1, 2, 2, 1)
#' inverted.hourglass <- c(2, 1, 1, 2, 1, 1, 2)
#' bottom.heavy <- c(2, 2, 2, 1, 1, 1, 1)
#' bottom.heavier <- c(3, 3, 2, 2, 1, 1, 1)
#' top.heavy <- c(1, 1, 1, 1, 2, 2, 2)
#' top.heavier <- c(1, 1, 1, 2, 2, 3, 3)
#' ctlDist <- null.uniform
#' trtDist <- cbind(null.uniform, null.uniform, hourglass, hourglass) ## 4 groups
#' generateDiscreteData(prevalence = rep(1, 4), N = 10, ctlDist = ctlDist,
#'                      trtDist = trtDist) ## default support is 0:6
#' trtDist <- cbind(bottom.heavy, bottom.heavy, top.heavy, top.heavy)
#' generateDiscreteData(prevalence = rep(1, 4), N = 10, ctlDist = ctlDist,
#'                      trtDist = trtDist)
#' support <- c(-2, -1, 0, 1, 2) ## Support of distribution
#' top.loaded <- c(1, 1, 1, 3, 3) ## Top is heavier
#' ctl.dist <- c(1, 1, 1, 1, 1) ## null on 5 support points
#' trt.dist <- cbind(ctl.dist, ctl.dist, top.loaded) ## 3 groups
#' generateDiscreteData(prevalence = rep(1, 3), N = 10, support = support,
#'                      ctlDist = ctl.dist, trtDist = trt.dist)
#' ## ctl.dist can also be a matrix with different nulls for each subgroup
#' uniform <- rep(1, 5)
#' bot.loaded <- c(3, 3, 1, 1, 1)
#' ctl.dist <- matrix(c(uniform, bot.loaded, top.loaded), nrow = 5)
#' generateDiscreteData(prevalence = rep(1, 3), N = 10, support = support,
#'                      ctlDist = ctl.dist, trtDist = trt.dist)
#' @export
generateDiscreteData <- function(prevalence, N, support = 0L:6L, ctlDist, trtDist) {
    nR <- length(support)
    J <- length(prevalence)
    
    if (is.matrix(ctlDist)) {
      null <- ctlDist
    } else {
      null <- matrix(rep(ctlDist, J), ncol = J)
    }
    if(nrow(null) != nR || nrow(trtDist) != nR || ncol(trtDist) != J) {
        stop("generateDiscreteData: wrong dimensions for ctrlDist/trtDist")
    }
    dists <- cbind(null, trtDist)    

    if (N == 0) {
        data.frame(subGroup = integer(0), trt = integer(0),
                   score = integer(0))
    } else {
        subGroup <- sample.int(n = J, size = N, replace = TRUE,
                               prob = prevalence)
        ## Scale trt to 0:1 from 1:2.
        trt <- sample.int(n = 2L, size = N, replace = TRUE) - 1L

        rankin <- sapply(J * trt + subGroup,
                         function(k) sample(support, size = 1, prob = dists[, k] ))
        data.frame(subGroup = subGroup, trt = trt, score = rankin)
    }
}

#' A data generation function along the lines of what was used in the Lai, Lavori, Liao paper.
#' score rather than a normal distribution
#'
#' @param prevalence a vector of group prevalences (length denoted by J below)
#' @param N the sample size to generate
#' @param mean a 2 x J matrix of means under the null (first row) and alternative for each group
#' @param sd a 2 x J matrix of standard deviations under the null (first row) and alternative for each group
#' @return a three-column data frame of `subGroup`, `trt` (0 or 1), and `score`
#' @export
generateNormalData <- function(prevalence, N, mean, sd) {
    J <- length(prevalence)
    stopifnot((J == ncol(mean)) && (J == ncol(sd)))
    if (N == 0) {
        data.frame(subGroup = integer(0), trt = integer(0),
                   score = numeric(0))
    } else {
        subGroup <- sample.int(n = J, size = N, replace = TRUE,
                               prob = prevalence)
        trt <- sample.int(n = 2L, size = N, replace = TRUE) - 1L
        rankin <- unlist(
            Map(function(i, j)
                rnorm(n = 1, mean = mean[i, j], sd = sd[i, j]),
                trt + 1, subGroup))
        data.frame(subGroup = subGroup, trt = trt, score = rankin)
    }
}

#' Compute the mean and sd of a discrete Rankin distribution
#' @param probVec a probability vector of length equal to length of support,
#'        default is uniform
#' @param support a vector of support values (default 0:6 for Rankin Scores)
#' @return a named vector of `mean` and `sd`
#' @export
computeMeanAndSD <- function(probVec = rep(1, 7L), support = 0L:6L) {
    stopifnot(all(probVec >= 0))
    probVec <- probVec / sum(probVec)
    mean <- sum(support * probVec)
    sd <- sqrt(sum(probVec * support^2) - mean^2)
    c(mean = mean, sd = sd)
}


#' Conform designParameters so that weights are turned in to probabilities, the null and control distributions are proper matrices etc.
#' @param plist the parameter list
#' @param discreteData flag if data is discrete
#' @return the modified parameter list
conformParameters  <- function(plist, discreteData = FALSE) {
    prevalence  <- plist$prevalence
    J  <- length(prevalence)
    plist$J <- J
    prevalence <- prevalence / sum(prevalence)
    names(prevalence) <- paste0("Group", seq_len(J))
    plist$prevalence <- prevalence
    if (discreteData) {
        support <- plist$distSupport
        ## Assume Rankin is 0:6 unless specified in designParameters
        if (is.null(support)) {
            support <- 0L:6L
        }
        plist$distSupport  <- support

        ctlDist <- plist$ctlDist
        if (!is.matrix(ctlDist)) {
            ctlDist  <- matrix(c(rep(ctlDist, J)), ncol = J)
        }
        ctlDist <- apply(ctlDist, 2, function(x) x/sum(x))
        rownames(ctlDist) <- support
        colnames(ctlDist)  <- names(prevalence)
        plist$ctlDist <- ctlDist

        trtDist <- plist$trtDist
        trtDist <- apply(trtDist, 2, function(x) x/sum(x))
        rownames(trtDist) <- support
        colnames(trtDist) <- names(prevalence)
        plist$trtDist <- trtDist
    } else {
        mean <- plist$mean
        sd <- plist$sd
        rownames(mean) <- rownames(sd) <- c("Null", "Alt")
        colnames(mean) <- colnames(sd) <- names(prevalence)
        plist$mean <- mean
        plist$sd <- sd
    }
    plist
}

Try the ASSISTant package in your browser

Any scripts or data that you put into this service are public.

ASSISTant documentation built on Dec. 2, 2022, 5:12 p.m.