R/mft_test.R

#' Functional t-test for mean curves from two spline mixed effects ('sme') models
#' 
#' Testing hypothesis mean(grp1) = mean(grp2).
#' @param data [vector] with the response variable, i.e. values of some blood marker
#' @param group [vector] with grouping information. Has to be same length as data and
#' contain two groups.
#' @param timepoints [vector] with timepoints of measurments of the values in data
#' @param id [vector] with index for individuals
#' @param points [Integer] number of grid points used for discretizing the functions when integrating.
#' @param nboot [Integer] number of Bootstrap samples to use for estimating distribution of test statistic under H0.
#' @param ncpus [Integer] number of CPUs to use, parallelization for > 2.
#' @return pvalue for group differences
#' @import sme
#' @import parallel
#' @importFrom plyr rbind.fill
#' @export
mft_test <- function(data, group, timepoints, id, 
                     points = 500,
                     nboot = 1000, 
                     ncpus = 1, ...) {
  if(any(missing(data), missing(group), missing(timepoints), missing(id)) ||
       any(is.null(data), is.null(group), is.null(timepoints), is.null(id)))
    stop("Missing inputs")

  # check inputs
  checkmate::assertNumeric(data, all.missing = FALSE, min.len = 10)
  if (length(group) != length(data) || !is.numeric(group) || any(!group %in% 1:2))
    stop("Group has to be a grouping vector of 1s and 2s of same length as data.")
  checkmate::assertNumeric(timepoints, all.missing = FALSE, len = length(data))
  checkmate::assertInteger(id, lower = 0, all.missing = FALSE, len = length(data))
  
  if (.Platform$OS.type == "windows" && ncpus > 1) ncpus = 1
  method <- match.arg(method)

  stopifnot(length(unique(group)) == 2)

  # compute original statistic
  grp1 <- which(group == unique(group)[!is.na(unique(group))][1])
  grp2 <- which(group == unique(group)[!is.na(unique(group))][2])

  # exclude NAs
  NAs <- 0
  if(any(is.na(data)))
    NAs <- which(is.na(data))
  
  # sanity check
  #   simdata1 <- simdata[grp1, ]
  #   datframe1 <- data.frame(y = simdata1$data,
  #                          tme = simdata1$time,
  #                          ind = simdata1$id)
  #   fit1 <- sme::sme.data.frame(object = datframe1)
  
  fit1 <- sme::sme(object = data[grp1[!grp1 %in% NAs]], tme = timepoints[grp1[!grp1 %in% NAs]],
                   ind = id[grp1[!grp1 %in% NAs]], ...)
  fit2 <- sme::sme(object = data[grp2[!grp2 %in% NAs]], tme = timepoints[grp2[!grp2 %in% NAs]],
                   ind = id[grp2[!grp2 %in% NAs]], ...)
  t0 <- compute_statistic(fit1, fit2, points)

  # compute statistics on resampled data
  statistics <- parallel::mclapply(seq_len(nboot), function(x) {
    resampled_data <- resample_coef(fit1, fit2, points, method)

    # stopifnot(!all(is.na(t(resampled_data$y1)[ , 2:(ncol(t(resampled_data$y1)) -1)])))
    # stopifnot(!all(is.na(t(resampled_data$y2[ , 2:(ncol(t(resampled_data$y2)) -1)]))))


    usemat <- c(as.numeric(t(resampled_data$y1)), as.numeric(t(resampled_data$y2)))
    i1 <- intersect(grp1, which(!is.na(usemat)))
    i2 <- intersect(grp2, which(!is.na(usemat)))

    refit1 <- sme::sme(object = usemat[i1],
                       tme = timepoints[i1],
                       ind = id[i1], ...)
    refit2 <- sme::sme(object = usemat[i2],
                       tme = timepoints[i2],
                       ind = id[i2], ...)
    compute_statistic(refit1, refit2, points)

  }, mc.cores = ncpus)

  # pval
  pval <- sum(t0 < statistics) / nboot

  list("p" = pval, "fit1" = fit1, "fit2" = fit2)
}

resample_coef <- function(fit1, fit2, points, method = "non-parametric") {

  n1 <- nrow(fit1$coef) - 1
  n2 <- nrow(fit2$coef) - 1
  stopifnot(n1 >= 2 & n2 >= 2)
  N <- n1 + n2
#   tau <- seq(min(fit1$data$tme), max(fit1$data$tme))
  tau <- sort(unique(c(fit1$data$tme, fit2$data$tme)))
  # before sort(unique(c(fit1$data$tme, fit2$data$tme)))
  at <- seq(min(tau), max(tau), length.out = points)
  by <- at[2] - at[1]

  if(method == "non-parametric") {
    # first a mean curve is chosen at random
    cbind.fill <- function(...) {
      transposed <- lapply(list(...),t)
      transposed_dataframe <- lapply(transposed, as.data.frame)
      return (data.frame(t(plyr::rbind.fill(transposed_dataframe))))
    }


    # draw mean function
    samplemat <- matrix(c(ifelse(tau %in% colnames(fit1$coef), fit1$coef[1, ], NA),
                          ifelse(tau %in% colnames(fit2$coef), fit2$coef[1, ], NA)
                          ), nrow = length(tau), ncol = 2)
    mu_0 <- samplemat[, sample(1:2, 1)]

    # old, produced only NAs, sometimes
#     samplemat <- matrix(c(ifelse(tau %in% names(fit1$coef[1, , drop = FALSE]), fit1$coef[1, ], NA),
#                           ifelse(tau %in% names(fit2$coef[1, ]), fit1$coef[1, ], NA)
#     ), nrow = length(tau), ncol = 2)

    # draw individual effects from both coefficients of both models
    ind_effects_samplemat <-
      matrix(c(ifelse(rep(tau, times = nrow(fit1$coef[-1,])) %in%
                        rep(colnames(fit1$coef[-1,]), times = nrow(fit1$coef[-1,])), fit1$coef[-1,], NA),
               ifelse(rep(tau, times = nrow(fit2$coef[-1,])) %in%
                        rep(colnames(fit2$coef[-1,]), times = nrow(fit2$coef[-1,])), fit2$coef[-1,], NA)),
             byrow = TRUE, ncol = length(tau))

    v_1 <- ind_effects_samplemat[sample(1:n1, n1, replace = TRUE), ]
    v_2 <- ind_effects_samplemat[sample(1:n2, n2, replace = TRUE), ]

    # draw residuals
    residuals <- c(fit1$residuals, fit2$residuals)
    e_1 <- sample(residuals, size = (n1) * length(mu_0), replace = TRUE)
    e_2 <- sample(residuals, size = (n2) * length(mu_0), replace = TRUE)

    # new obs
    y1_new <- mu_0 + t(v_1) + e_1
    y2_new <- mu_0 + t(v_2) + e_2

    return(list("y1" = t(y1_new), "y2" = t(y2_new)))
  }
  # else if(method == "parametric") {
  #
  #     require(mvtnorm)
  #
  # #     n <- 6
  #
  #     controlgrp <- sample(1:2, 1)
  #     significant <- rbinom(1, 1, prob = prob)
  #     # create control grp values
  #
  #     # "the mean curve spline coefs are taken to be mv normally
  #     # distributed with zero mean and cov D_mu_c
  #     D_mu_c <- list(fit1$parameters$D, fit2$parameters$D)[[controlgrp]]
  #     mu_c <- mvtnorm::rmvnorm(n = 1, mean = rep(0, n), sigma = D_mu_c)
  #
  #     # now the random effects:
  #     # " the  ind level spline coefs are taken to be mv normally distributed
  #     # with zero mean and cov t_c * D_mu_c
  #     # "the scalar t_C serves the purpose of scaling the
  #     # individual level curves to be an order smaller than the mean curves,
  #     # and is drawn from an uniform distribution."
  #     t <- runif(1)
  #     v_c <- mvtnorm::rmvnorm(n = 6, mean = rep(0, n), sigma = t * D_mu_c)
  #
  #
  #     # " the noise terms eps_c are also normally distributed with zero mean
  #     # and cov sigma_c^2 * I, where sigma_c^2 is log normally distributed.
  #     sigma_c <- c(fit1$parameters$sigma, fit2$parameters$sigma)[controlgrp]
  #     eps_c <- rmvnorm(n = n, mean = rep(0, 6), sigma = sigma_c * diag(1, 6) )
  #
  #
  #     if(significant) {
  #
  #       D_mu_t <- list(fit1$parameters$D, fit2$parameters$D)[[as.numeric(!controlgrp) + 1]]
  #       mu_t <- mvtnorm::rmvnorm(n = 1, mean = rep(0, n), sigma = D_mu_t)
  #       t2 <- runif(1)
  #       v_t <- mvtnorm::rmvnorm(n = 6, mean = rep(0, 6), sigma = t2 * D_mu_t)
  #       sigma_t <- c(fit1$parameters$sigma, fit2$parameters$sigma)[as.numeric(!controlgrp) + 1]
  #       eps_t <- rmvnorm(n = n, mean = rep(0, 6), sigma = sigma_t * diag(1, 6) )
  #
  #     } else {
  #       # if this case is not significant, create treatment grp from same distributions
  #
  #       D_mu_t <- list(fit1$parameters$D, fit2$parameters$D)[[controlgrp]]
  #       mu_t <- mvtnorm::rmvnorm(n = 1, mean = rep(0, 6), sigma = D_mu_c)
  #       v_t <- mvtnorm::rmvnorm(n = 6, mean = rep(0, 6), sigma = t * D_mu_c)
  #       sigma_t <- c(fit1$parameters$sigma, fit2$parameters$sigma)[controlgrp]
  #       eps_t <- rmvnorm(n = n, mean = rep(0, 6), sigma = sigma_c * diag(1, 6) )
  #     }
  #
  #     # new obs
  #     y1_new <- matrix(rep(mu_c, n), ncol = n, byrow = FALSE) + v_c + eps_c
  #     y2_new <- matrix(rep(mu_c + mu_t, n), ncol = n, byrow = FALSE) + v_t + eps_t
  #
  #     return(list("y1" = y1_new, "y2" = y2_new))
  #
  # }
}

#' Takes two fitted sme models and returns
#' a moderated functional t-test statistic
#' after Berk.
compute_statistic <- function(fit1, fit2, points) {

  n1 <- nrow(fit1$coef) - 1
  n2 <- nrow(fit2$coef) - 1

  tau <- sort(unique(c(fit1$data$tme, fit2$data$tme)))
  at <- seq(min(tau), max(tau), length.out = points)
  by <- at[2] - at[1]
  #' funktional.norm
  #' x, y curves
  #' p = 2 is euclidean norm
  #' by is stepsize of approximation
  functional.norm <- function(x, y = NULL, p = 2, by) {
#     n <- length(x)
    if(!is.null(y)) {
      x <- x - y
    }
    # compute integral with
    # trapezoid rule:
    x <- abs(x)
    integral <- by/2 * (x[1]^2 + 2*sum(x[-points]^2 + x[-1]^2 ) + x[points]^2)
    sqrt(integral)
  }

  # create mean functions from mu_coefficients
  if(!is.null(fit1$knots)) {
    knots <- c(range(tau)[1], fit1$knots, range(tau)[2])
    mu1 <- spline(x = knots, y = fit1$coef[1, ], xout = at, method = "natural")$y
    mu2 <- spline(x = knots, y = fit2$coef[1, ], xout = at, method = "natural")$y
    
  } else {
    mu1 <- spline(x = tau[tau %in% names(fit1$coef[1, ])], y = fit1$coef[1, ], xout = at, method = "natural")$y
    mu2 <- spline(x = tau[tau %in% names(fit2$coef[1, ])], y = fit2$coef[1, ], xout = at, method = "natural")$y
    
  }
  
  # compute functional standard deviation by computing the integral
  # of the individual effects as functions (splines)
  
  if(!is.null(fit1$knots)) {
    knots <- c(range(tau)[1], fit1$knots, range(tau)[2])
    
  s1 <- mean(sapply(1:n1, function(i) {
    v <- spline(x = knots, y = fit1$coefficients[1 + i, ], xout = at,
                method = "natural")$y
    functional.norm(v, by = by)^2
  }))
  s2 <- mean(sapply(1:n2, function(i) {
    v <- spline(x = knots, y = fit2$coefficients[1 + i, ], xout = at,
                method = "natural")$y
    functional.norm(v, by = by)^2
  })) 
  } else {
    s1 <- mean(sapply(1:n1, function(i) {
      v <- spline(x = tau[tau %in% names(fit1$coef[1 + i, ])], y = fit1$coefficients[1 + i, ], xout = at,
                  method = "natural")$y
      functional.norm(v, by = by)^2
    }))
    s2 <- mean(sapply(1:n2, function(i) {
      v <- spline(x = tau[tau %in% names(fit2$coef[1 + i, ])], y = fit2$coefficients[1 + i, ], xout = at,
                  method = "natural")$y
      functional.norm(v, by = by)^2
    }))
  }


  denominator <- sqrt(((s1 / n1)) + ((s2 / n2)))
  l2 <- functional.norm(x = mu1, y = mu2, by = by)

  # statistic
  ft <- l2 / denominator
  ft
}
toreerdmann/TPDT documentation built on May 29, 2019, 10:39 a.m.