AHR: Estimation and Testing of Average Hazard Ratios

#' Weighted Kaplan-Meier estimator with discrete time-independent covariate
#'
#' @title wkm
#' @param times a vector of evaluation times
#' @param data data frame containing the variables in formula (if is.null(formula) expected column names are: Y (time), D (status), W (strat. factor), V (left-truncation times))
#' @param param list of parameters containing:
#' alpha: fractional parameter (default=1)
#' var: if TRUE (default) calculate variance estimate
#' cov: if FALSE (default) do not calculate covariance matrix estimate
#' left.limit: if TRUE calculate left-continuous estimates, else calculate right-continuous estimates
#' rr.subset: logical vector defining subset of observations to use for response rate estimation (default: use all observations)
#' @param formula an object of class '"formula"' specifying the conditional survival model (only discrete covariates supported)
#' @return an object of class '"wkm"'
#' @details This function calculates the weighted Kaplan-Meier estimator for the survival function with weights based on a discrete time-independent covariate as described in Murray/Tsiatis (1996).
#' The survival probabilities are evaluated at each entry in the vector \code{times}. The data frame \code{data} must either contain the variable in \code{formula} or, if \code{formula} is \code{NULL},
#' the variables \code{V} (left-truncation time), \code{Y} (censored failure time), \code{D} (censoring indicator), \code{W} (stratification variable).
#' If \code{var} is \code{TRUE} then an estimate of the asmyptotic variance is calculated for each entry in vector \code{times}. If \code{cov} is \code{TRUE} then the \code{n x n} asymptotic
#' covariance matrix is estimated, where \code{n} is the length of vector \code{times}. If \code{left.limit} is \code{TRUE} then a left-continuous estimate of the survival function is calculated instead
#' of a right-continuous estimate (default). If a logical vector \code{rr.subset} is supplied, then only a subset of observations is used to estimate the response rates.
#' @references S.~Murray and A.~A. Tsiatis. Nonparametric survival estimation using prognostic longitudinal covariates. \emph{Biometrics}, 52(1):137--151, Mar. 1996.
#' @export
wkm <- function(times, data, param=list(alpha=1, var=TRUE, cov=FALSE, left.limit=FALSE, rr.subset=rep(TRUE, nrow(data))), formula=NULL) {

    if(is.null(param)) param <- list(alpha=1, var=TRUE, cov=FALSE, left.limit=FALSE, rr.subset=rep(TRUE, nrow(data)))

    alpha <- param$alpha    
    var <- param$var
    cov  <- param$cov   
    left.limit <- param$left.limit
    rr.subset <- param$rr.subset    
    
    if(!is.null(formula)) data <- parseFormula(formula, data, one.sample=TRUE)
    ## if is.null(formula) assume that the variables in data are named V,Y,D,W
    
    strata <- levels(factor(data$W))
    n.strata <- length(strata)

    n.times <- length(times)
    n <- length(data$Y)
    
    S <- numeric(n.times)  
    P <- numeric(n.strata)
    n.atrisk <- numeric(n.times)

    if(var) {
        logV <- 0
        V <- 0
    } else {
        logV <- NULL
        V <- NULL
    }

    if(n.times == 1) cov <- FALSE
    
    if(cov) COV <- matrix(0, nrow=n.times, ncol=n.times)
    else {
        COV <- NULL
        logCOV <- NULL
    }

    ## set left-truncation times to 0 if not supplied by user
    if(is.null(data$V)) data$V <- rep.int(0, n)

    ## estimate response rates only from subset
    if(!is.null(param$rr.subset)) data2 <- data[rr.subset, ]
    else data2 <- data
    
    for(s in 1:n.strata) {
        s.data <- data[data$W == strata[s], ]

        if(nrow(s.data) == 0) stop("Empty stratum!")

        fit <- fastkm(s.data$Y, s.data$D, s.data$V, left.limit, times)

        ## total number-at-risk
        n.atrisk <- n.atrisk + fit$n.atrisk
        
        ## unbiased response rates
        p <- length(data2$W[data2$W == strata[s]]) / nrow(data2)
        stopifnot(!is.nan(p))
        
        P[s] <- p
        
        fs <- fit$surv
        pfs <- p * fs

        S <- S + pfs
        
        ## fit$variance (Greenwood) is variance of -log(\hat{S}), but we need variance of -\sqrt{ns}log(\hat{S})
        fvar <- fit$variance * nrow(s.data)
        fvar[fit$surv == 0] <- 0

        if(cov) { ## calculate covariance matrix
            ## exploit independent increments structure! (cov(X(s), X(t)) = var(X(min(s,t))))
            v <- matrix(fvar, nrow=n.times, ncol=n.times, byrow=FALSE) 
            v[lower.tri(v)] <- t(v)[lower.tri(v)]

            dfs <- diag(fs)
            
            COV <- COV + p * dfs %*% v %*% dfs + pfs %*% t(fs)            
        }
        
        if(var) { ## calculate variance
            V <- V + pfs * fs * (1 + fvar)
        }
    }

    Sa.log <- 1/(S^alpha)
    Sa.log[S == 0] <- 0
    
    if(cov) {
        COV <- COV - S %*% t(S)
        
        if(alpha != 1) {
            Sa <- S^(alpha - 1)
            Sa[S == 0] <- 0  ## if alpha - 1 < 0 then S == 0 implies V = NA but should be V = 0
            ss <- diag(Sa)
            COV <- alpha^2 * ss %*% COV %*% ss
        }
        
        ssl <- diag(Sa.log)
        logCOV <- ssl %*% COV %*% ssl

        ##V <- diag(COV)
        ##logV <- diag(logCOV)        
    }
    
    if(var) { 
        V <- V - S^2
        
        if(alpha != 1) {
            Sa <- S^(alpha - 1)
            Sa[S == 0] <- 0  ## if alpha - 1 < 0 then S == 0 implies V = NA but should be V = 0
            V <- alpha^2 * Sa^2 * V
        }
        
        logV <- Sa.log^2 * V
    }

    obj <- list(times=times, alpha=alpha, p=P, S=S^alpha, COV=COV, logCOV=logCOV, logV=logV, V=V, n.atrisk=n.atrisk)
    class(obj) <- "wkm"
    obj
}

## calculate diag(x) %*% A %*% diag(x)
matrixProd <- function(A, x) sweep(sweep(A, MARGIN=2, STATS=x, FUN="*"), MARGIN=1, STATS=x, FUN="*")

## same as matrix(x, nrow=length(x), ncol=length(x), byrow=FALSE)
vectorToMatrixByCol <- function(x) {
    n <- length(x)
    dim(x) <- c(n, 1)
    x[, rep.int(1, n)]
}

##v <- matrix(fvar, nrow=n.times, ncol=n.times, byrow=FALSE)

f <- function(fvar, n.times) {
    v <- fvar
    dim(v) <- c(n.times, 1)
    v <- v[, rep.int(1, n.times)]
}
            

################################################
### WKM with time-dependent discrete covariate
################################################
wkmSelect <- function(path, times, data) {
    if(is.null(path)) data
    else {
        col <- length(path)
        sel <- (data$Y > times[col]) & (data[, col+2] == path[col])
        data[sel, ]
    }
}

## depth-first traversal of all possible covariate paths
wkmTraverse <- function(path, k, tt, times, data) {
  s.data <- wkmSelect(path, times, data)
  n <- dim(s.data)[1]
  if(is.null(n)) c(0, 0) ## no subjects on this covariate path
  else if(tt <= times[length(path)+1]) { ## leaf
    c(n, n * fastkm(s.data$Y, s.data$D, s.data$V, FALSE, tt)$surv[1])
  } else {
    x <- sapply(0:k, function(i) wkmTraverse(c(path, i), k, tt, times, s.data))
    tmp <- sum(x[2,]) / sum(x[1,])
    if(is.null(path)) tmp  ## root
    else c(n, n * fastkm(s.data$Y, s.data$D, s.data$V, FALSE, times[length(path)])$surv[1] * tmp)
  }
}

wkmTime <- function(times, k, data) {
  sapply(times, function(tt) wkmTraverse(NULL, k-1, tt, c(0, Inf), data))
}