MSToolkit: The MSToolkit Library For Clinical Trial Design

Documented in createContinuousCovariates

#' Create a set of continuous covariates
#'
#' Creates a set of continuous covariates from a multivariate normal
#' distribution and (optionally) a set of constraints.
#'
#'
#' @param subjects (Required) Subjects for which to create covariates
#' @param names (Required) Names for the continuous covariates.  They should be
#' valid R names (See \code{\link{validNames}}) and no duplicate name should be
#' given
#' @param mean (Required) Vector of means. Must be of same length than
#' \code{names}
#' @param covariance (Optional) Lower triangle of covariance matrix. See
#' \code{\link{parseCovMatrix}} for details.  1 by default
#' @param range (Optional) Ranges of acceptable values for each covariates. See
#' \code{\link{parseRangeCode}} for details.  This is missing by default,
#' resulting in no "range" limitation being applied
#' @param digits (Optional) Number of digits used to round the values.  This
#' argument can be either missing (the default), so no rounding is done, of
#' length one and all variables are rounded at the same digits, of same length
#' than the number of covariates so that each covariate is rounded according to
#' its value. This argument is first parsed by \code{\link{parseCharInput}} so
#' it can either be a character vector or a numeric vector.  See
#' \code{\link{parseCharInput}}.  If the parsed \code{digits} vector does not
#' have length one or length equal to the number of covariates, an error is
#' generated by the \code{\link{ectdStop}} function.  This is missing by
#' default, resulting in no rounding being performed
#' @param maxDraws (Optional) Maximum number of attempts allowed if initial
#' data not in range (100 by default)
#' @param seed (Optional) Random seed to use.  By default, this is derived from
#' the master random seed
#' @param idCol (Optional) Name of the subject column. Must be a valid R name
#' (See \code{\link{validNames}}) and not equal to one entry of \code{names}.
#' "SUBJ" by default
#' @param includeIDCol (Optional) Should the subject column be included.
# Typically only set to FALSE when called from \code{\link{createCovariates}}.
# Defaults to TRUE
#' @author Romain Francois
#' @seealso \code{\link{createDiscreteCovariates}} to create covariates for a
#' discrete distribution.
#'
# \code{\link{createExternalCovariates}} to create covariates by
# \emph{sampling} data from an external file.
#'
# \code{\link{createTimeVaryingCovariates}} to create time-varying covariates.
#'
# \code{\link{createCovariates}} that wraps \code{createContinuousCovariates}
# and the two other described above.
#' @keywords datagen
#' @examples
#'
#'   # 30 samples from a :      [ 0 ]   [ 1, 0, 0 ]
#'   #                      N ( [ 0 ] , [ 0, 1, 0 ] )
#'   #                          [ 1 ]   [ 0, 0, 1 ]
#'   dat <- createContinuousCovariates( 30,
#'                                      mean = "0,0,1",
#'                                      names = c("X", "Y", "Z")  )
#'
#'
#'   # 30 samples from a :      [ 0 ]   [ 1, 0, 0 ]
#'   #                      N ( [ 0 ] , [ 0, 1, 0 ] )
#'   #                          [ 1 ]   [ 0, 0, 1 ]
#'   # truncated at X > 0
#'   dat <- createContinuousCovariates( 30,
#'                                      mean = "0,0,1",
#'                                      names = c("X", "Y", "Z"),
#'      range= "X > 0"  )
#'
#'   # 30 samples from a :      [ 0 ]   [ 1, 0, 0 ]
#'   #                      N ( [ 0 ] , [ 0, 1, 0 ] )
#'   #                          [ 1 ]   [ 0, 0, 1 ]
#'   # truncated at X = 0, and X < Y  < 1
#'   dat <- createContinuousCovariates( 30,
#'                                      mean = "0,0,1",
#'                                      names = c("X", "Y", "Z"),
#'                                      range= c("X > 0", "X< Y<1")  )
#'   stopifnot( all( dat$X < dat$Y ) )
#'   stopifnot( all( dat$X > 0 ) )
#'
#'   # 30 samples from a :      [ 0 ]   [ 1 , .5, 0 ]
#'   #                      N ( [ 0 ] , [ .5, 1 , 0 ] )
#'   #                          [ 1 ]   [ 0 , 0 , 1 ]
#'   dat1 <- createContinuousCovariates( 30,
#'                                       mean = "0,0,1",
#'                                       names = c("X", "Y", "Z"),
#'                                       covariance = "1,.5,1,0,0,1",
#'                                       seed = 30  )
#'
#'   # same
#'   dat2 <- createContinuousCovariates( 30,
#'                                       mean = "0,0,1",
#'                                       names = c("X", "Y", "Z"),
#'                                       covariance = cbind(c(1,.5,0),
#'                                                          c(.5,1,0),
#'                                                          c(0,0,1)) ,
#'                                       seed = 30  )
#'
#'   stopifnot( all(dat1 == dat2 ))

#'   # use of the digits argument
#'   # X will be rounded at 2 digits
#'   # Y will be rounded at 3 digits
#'   # Z will be rounded at 2 digits
#'   createContinuousCovariates( 10,
#'                               mean = "100,100,100",
#'                               names = c("X", "Y", "Z"),
#'                               digits = "2,3,2"  )
#'
#' @export
createContinuousCovariates <- function(subjects,
                                       names,
                                       mean,
                                       covariance = 1,
                                       range = NULL,
                                       digits,
                                       maxDraws = 100 ,
                                       seed = .deriveFromMasterSeed(),
                                       idCol = getEctdColName("Subject"),
                                       includeIDCol = TRUE) {
  ##############################################################################
  # Mango Solutions, Chippenham SN14 0SQ 2006
  # createContinuousCovariates.R
  # Fri Jun 01 10:41:35 BST 2007 @445 /Internet Time/
  #
  # Author: Romain Francois
  ##############################################################################
  # DESCRIPTION: create a set of continuous covariates, from a (truncated)
  #              multivariate nomal distribution
  # KEYWORDS: datagen, component:covariate
  ##############################################################################

  set.seed(seed)

  ## sanity checks on the inputs
  if( missing(mean) )
    ectdStop("`mean` is needed in `createContinuousCovariates`")
  subjects <- .expandSubjects( subjects )
  nSubjects <- get("nSubjects")

  mean  <- parseCharInput( mean  )
  nCov <- length( mean )
  names <- if(missing(names)) {
    "X" %.% 1:nCov
  } else {
    parseCharInput( names , checkdup = TRUE, convertToNumeric = FALSE)
  }

  if (length(names) != length(mean))
    ectdStop(
      "Dimension mismatch between `names` and `mean`"  %.nt%
        "`mean`  of length: " %.% length(mean) %.nt%
        "`names` of length: " %.% length(names)
    )
  covariance <- parseCovMatrix(covariance, nCov)
  validNames(names, idCol)
  maxDraws <- as.integer(maxDraws)
  if (maxDraws < 1)
    ectdStop("The maximum number of draws should be a positive integer")


  if (!missing(digits)) {
    for (i in 1:length(digits)) {
      if (digits[i] < 0) {
        ectdStop("The `digits` argument must be positive")
      }
      i = i + 1
    }
  }

  if (is.null(range)) {
    out <-
      as.data.frame(MASS::mvrnorm(nSubjects, mu = mean, Sigma = covariance))
  }
  else {
    # deal with range code
    range <- parseRangeCode(range)

    nGen <- 0
    out <-
      do.call(data.frame, structure(rep(list(
        rep(0, nSubjects)
      ), nCov), names = names))
    for (i in 1:maxDraws) {
      # generate a new set of data
      newsets <-
        as.data.frame(MASS::mvrnorm(nSubjects, mu = mean, Sigma = covariance))
      names(newsets) <- names
      alright <- try(eval(range, newsets) , silent = TRUE)
      if (class(alright) == "try-error")
        next
      indxs <- which(alright)
      howManyToAdd <- min(nSubjects - nGen, length(indxs))
      if (howManyToAdd == 0)
        next

      out[nGen + 1:howManyToAdd,] <-
        newsets[indxs[1:howManyToAdd], , drop = FALSE]
      nGen <- nGen + howManyToAdd

      .log(
        "..(createContinuousCovariates) iteration $i , $nGen generated (",
        sprintf("%6.2f", round(nGen / nSubjects * 100, 2)),
        "%)"
      )

      if (nGen == nSubjects)
        break
    }
    if (nGen != nSubjects)
      ectdStop(
        paste(
          "After",
          maxDraws,
          "attempts, covariate data for only",
          nGen,
          "of the",
          nSubjects,
          "subjects matched the range criteria"
        )
      )
  }
  names(out) <- names
  out <- .roundIt(out, digits)

  if (includeIDCol)
    out <- .eval("data.frame( $idCol = subjects, out )")
  out

}