R/shr.R
In SmoothHazard: Estimation of Smooth Hazard Models for Interval-Censored Data

Documented in shr

#' Fit a survival model
#' 
#' Fit a survival model using either a semi-parametric approach (penalized
#' likelihood with an approximation of the hazard function by linear
#' combination of M-splines) or a parametric approach (specifying a Weibull
#' distribution on the hazard function). Left-truncated, right-censored, and
#' interval-censored data are allowed.
#' 
#' The estimated parameters are obtained using the robust Marquardt algorithm
#' (Marquardt, 1963) which is a combination between a Newton-Raphson algorithm
#' and a steepest descent algorithm.
#' 
#' @param formula a formula object with the response on the left hand side
#' and the terms on the right hand side. The
#' response must be a survival object or Hist object as returned by
#' the \code{'Surv'} or \code{'Hist'} function.
#' @param data a data frame in which to interpret the variables named
#' in the \code{formula}.
#' @param eps a vector of length 3 for the convergence criteria
#' (criterion for parameters, criterion for likelihood, criterion for
#' second derivatives). The default is \code{c(5,5,3)} and corresponds to
#' criteria equals to \code{10^{-5}}, \code{10^{-5}} and \code{10^{-3}}.
#' @param n.knots  Argument only active for the penalized likelihood approach \code{method="splines"}.
#' Number of knots for the splines to use to approximate
#' the hazard function. The default is 7. If \code{knots} are given as a vector this argument is ignored.
#' The algorithm needs least 5 knots and at most 20 knots.
#' @param knots Argument only active for the penalized likelihood approach \code{method="splines"}.
#' There are three ways to control the placement of the knots between the smallest and the largest
#' of all time points:
#' \describe{
#'  \item{\code{knots="equidistant"}}{Knots are placed with same distance on the time scale.}
#'  \item{\code{knots="quantiles"}}{Knots are placed such that the number of observations is roughly the same between knots.}
#' \item{knots=list()}{List of length 3. The list elements are the actual placements
#' (timepoints) of the knots for the M-spline.}
#' }
#' The algorithm reuqires at least 5 knots and allows no more than 20 knots.
#' @param CV binary variable equals to 1 when search (by approximated
#' cross validation) of the smoothing parameter kappa and 0
#' otherwise. Argument for the penalized likelihood approach. The
#' default is 0.
#' @param kappa Argument only active for the penalized likelihood approach \code{method="splines"}.
#' A positive number (smoothing parameter) 
#' If CV=1 the value is used as a starting value 
#' for a cross validation search to optimize \code{kappa}.
#' @param conf.int Level of confidence pointwise confidence intervals of the survival and hazard functions, i.e.,
#' a value between 0 and 1, the default is \code{0.95}.
#' The default is also used when \code{conf.int=TRUE}.
#' To avoid computation of confidence intervals, set \code{conf.int} to FALSE or NULL.
#' @param maxiter maximum number of iterations. The default is 200.
#' @param method type of estimation method: "Splines" for a penalized
#' likelihood approach with approximation of the hazard function by
#' M-splines, "Weib" for a parametric approach with a Weibull
#' distribution on the hazard function. Default is \code{"Weib"}.
#' @param print.iter boolean parameter. Equals to \code{TRUE} to print
#' the likelihood during the iteration process, \code{FALSE}
#' otherwise. Default is \code{FALSE}. This option is not running on
#' Windows.
#' @param na.action how NAs are treated. The default is first, any
#' na.action attribute of data, second a na.action setting of options,
#' and third 'na.fail' if that is unset. The 'factory-fresh' default
#' is na.omit. Another possible value is NULL.
#' @return
#' \describe{
#' \item{call}{} \item{coef}{regression parameters.}
#' \item{loglik}{vector containing the log-likelihood without and with covariate.}
#' \item{modelPar}{Weibull parameters.}
#' \item{N}{number of subjects.}
#' \item{NC}{number of covariates.}
#' \item{nevents}{number of events.}
#' \item{modelResponse}{model response: \code{Hist} or \code{Surv} object.}
#' \item{converged}{integer equal to 1 when the model converged, 2, 3 or 4 otherwise.}
#' \item{time}{times for which survival and hazard functions have been evaluated for plotting.}
#' \item{hazard}{matched values of the hazard function.}
#' \item{lowerHazard}{lower confidence limits for hazard function.}
#' \item{upperHazard}{upper confidence limits for hazard function.}
#' \item{surv}{matched values of the survival function.}
#' \item{lowerSurv}{lower confidence limits for survival function.}
#' \item{upperSurv}{upper confidence limits for survival function.}
#' \item{RR}{vector of relative risks.}
#' \item{V}{variance-covariance matrix.}
#' \item{se}{standard errors.}
#' \item{knots}{knots of the M-splines estimate of the hazard function.}
#' \item{nknots}{number of knots.}
#' \item{CV}{a binary variable equals to 1 when search of the smoothing parameter
#'          \link{kappa} by approximated cross-validation, 1 otherwise. The default is 0.}
#' \item{niter}{number of iterations.}
#' \item{cv}{vector containing the convergence criteria.}
#' \item{na.action}{observations deleted if missing values.}
#' }
#' @author R: Celia Touraine \email{celia.touraine@icm.unicancer.fr} Fortran:
#' Pierre Joly \email{Pierre.Joly@@isped.u-bordeaux2.fr}
#' @seealso \code{\link{shr}}, \code{\link{print.shr}},
#' \code{\link{summary.shr}}, \code{\link{print.shr}},
#' @references D. Marquardt (1963). An algorithm for least-squares estimation
#' of nonlinear parameters.  \emph{SIAM Journal of Applied Mathematics},
#' 431-441.
#' @keywords methods 
##' @examples
##' 
##' # Weibull survival model
##' library(prodlim)
##' data(testdata)
##' fit.su <- shr(Hist(time=list(l,r),id)~cov,data=testdata) 
##' fit.su
##' summary(fit.su)
##'\donttest{
##' shr.spline <- shr(Hist(time=list(l,r),id)~cov,data=testdata,method="splines",n.knots=6)
##' shr.spline
##' shr.spline.q <- shr(Hist(time=list(l,r),id)~cov,data=testdata,
##'                     method="splines",n.knots=6,knots="quantiles")
##' plot(shr.spline.q)
##'
##' ## manual placement of knots
##' shr.spline.man <- shr(Hist(time=list(l,r),id)~cov,data=testdata,method="splines",knots=seq(0,7,1))
##' }
#' @export
shr <- function(formula,
                data,
                eps=c(5,5,3),
                n.knots=7,
                knots="equidistant",
                CV=FALSE,
                kappa=10000,
                conf.int=.95,
                maxiter=200,
                method="Weib",
                print.iter=FALSE,
                na.action=na.omit){

  call <- match.call()
  ## cat("\n")
  ## cat("Be patient. The program is computing ... \n") 
	
    flush.console()
    ptm<-proc.time()
    # {{{ process formula and data
    # check if formula is a formula
    method <- tolower(method)
    if(!(method %in% c("weib","splines"))) stop("The method must be either 'Weib' or 'Splines'")
    # --------------------------------------------------------------------
    formula.names <- try(all.names(formula),silent=TRUE)
    if (!(formula.names[1]=="~")||(match("$",formula.names,nomatch=0)+match("[",formula.names,nomatch=0)>0)){
        stop("Invalid specification of formula. Perhaps forgotten right hand side?\nNote that any subsetting, ie data$var or data[,\"var\"], is invalid for this function.")
    }else{
        if (!(formula.names[2] %in% c("Surv","Hist"))) stop("formula is NOT a proper survival formula,\nwhich must have a `Surv' or `Hist' object as response.")
    }
    # {{{ parse confidence level
    do.conf.int <- !is.null(conf.int) && !is.na(conf.int) && !conf.int==FALSE
    if (is.logical(conf.int)) conf.int <- .95
    if (do.conf.int == TRUE){
        stopifnot(0<conf.int && conf.int<1)
    }
    # }}}	
    call <- match.call()
    m <- match.call()
    position <- match(c("","formula","data","na.action"),names(m),0)
    m <- m[position]
    m[[1]] <- as.name("model.frame")	
    m <- eval(m,sys.parent())
    na.action <- attr(m,"na.action")
    response <- stats::model.response(m)

  #  FIX for people who use `Surv' instead of `Hist' 
  
  if (inherits(response,"Shr")){
    # classe "Shr"
    attr(response,"model") <- "survival"
    attr(response,"cens.type") <- "rightCensored"
    model.type <- 1
  }else{
    # classe "Hist"
    model.type <- match(attr(response,"model"),c("survival","competing.risks","multi.states"))
  }

  cens.type <- attr(response,"cens.type")
  event.history <- response

  if (cens.type!="intervalCensored"){
    event.time.order <- order(event.history[,"time"],-event.history[,"status"])
  }else{
    event.time.order <- order(event.history[,"L"],-event.history[,"status"])
  }

  # }}}
  # {{{ covariates
  # add Amadou 30/01/12
  #----------------------- factor
  X <- model.matrix(formula,data=m)
  Xnames <- NULL
  if (ncol(X) == 1){ 
    #no covariate
    X <- X-1
    NC <- 0
  }else{
    # covariate
    X <- X[, -1, drop = FALSE]
    NC <- NCOL(X)
    Xnames <- colnames(X)
  } 
  #----------------------- factor	

  # }}}
  # {{{ prepare censored event times 
  N <- NROW(X)

  isIntervalCensored <- cens.type=="intervalCensored"
	
  truncated <- nchar(attr(event.history,"entry.type"))>1  

  if (truncated){
    entrytime <- event.history[,"entry"]
  }else{
    entrytime <- rep(0,N)
  }

  if (isIntervalCensored){
    Ltime <- as.double(event.history[,"L",drop=TRUE])
    Rtime <- as.double(event.history[,"R",drop=TRUE])
  }else{
    Ltime <- as.double(event.history[,"time",drop=TRUE])
    Rtime <- rep(0,N)
  }
  id <- event.history[,"status"]
  # }}}
  # {{{ call Fortran function weib and collect results

    # do not give infinite values to fortran
    Rtime[is.infinite(Rtime)] <- Ltime[is.infinite(Rtime)]
    # find time bounderies
    if (length(entrytime)>0){
        alltimes <- sort(unique(c(Ltime, Rtime,entrytime)))
        amax <- max(alltimes)
        amin <- min(alltimes)
    }
    else{
        alltimes <- sort(unique(c(Ltime, Rtime)))
        amax <- max(alltimes)
        amin <- min(alltimes)
    }

  if (method == "weib"){	
      size1 <- NC
      size2 <- size1^2
      size_V <- size1 + 2
      ffit <- .Fortran("survWeib",
                       as.double(entrytime),
                       as.double(Ltime),
                       as.double(Rtime),
                       as.integer(id),
                       as.double(X),
                       as.integer(N),
                       as.integer(NC),
                       as.integer(truncated),
                       as.integer(isIntervalCensored),
                       as.integer(eps),
                       as.integer(maxiter),
                       loglik=as.double(rep(0,2)),
                       basepar=as.double(rep(0.1,2)),
                       regpar=as.double(rep(0.1,NC)),
                       v=as.double(rep(0,NC*NC)),
                       converged=as.integer(rep(0,2)),
                       cv=as.double(rep(0,3)),
                       niter=as.integer(0),
                       t=as.double(rep(0,100)),
                       S=as.double(rep(0,100)),
                       S_l=as.double(rep(0,100)),
                       S_u=as.double(rep(0,100)),
                       h=as.double(rep(0,100)),
                       h_l=as.double(rep(0,100)),
                       h_u=as.double(rep(0,100)),
                       as.integer(do.conf.int),
                       as.double(conf.int),
                       as.integer(print.iter),
                       V_tot=as.double(matrix(0,nrow=size_V,ncol=size_V)),
                       PACKAGE="SmoothHazard")
  }else{
       #  	cat("------ Program Splines ------ \n")
       if (is.character(knots)){
           if (length(n.knots)!=1) stop("Argument n.knots has to be a single positive integer")
           if((!is.numeric(n.knots) && !is.integer(n.knots)) || (n.knots < 5) || (n.knots >20))
               stop("n.knots has to be an integer between 5 and 20. See help(shr).")
           if (knots=="quantiles"){
               knots <- quantile(alltimes,seq(0,1,1/(n.knots-1)))
           } else {
                 if (knots!="equidistant")
                     warning("Unknown specification of knots. Fall back to equidistant.")
                 knots <- seq(amin,amax,(amax-amin)/(n.knots-1))
             }
       } else{## user specified knots
             if (!is.numeric(knots))
                 stop("Incorrect form of argument knots. See help(shr).")
             if (knots[1]< amin - 0.05*amin) stop(paste("Smallest knot should not be smaller than the time point:",amin))
             if (knots[length(knots)]> amax + 0.05*amax) stop(paste("Largest knot should not be larger than the time point:",amax))
             knots <- sort(knots)
         }
       nknots <- length(knots)
       if (nknots<5) {
           stop("Need at least 5 knots.")
       }
       if (nknots>20){
           stop("Cannot handle more than 20 knots.")
       }
       ## make fake knots needed for M-splines
       knots <- c(rep(knots[1],3),knots,rep(knots[length(knots)],3))
       size1 <- NC
       size2 <- size1^2
       size_V <- size1 + nknots + 2
       ffit <- .Fortran("survPl",
                        as.double(entrytime),
                        as.double(Ltime),
                        as.double(Rtime),
                        as.integer(id),
                        as.double(X),
                        as.integer(N),
                        as.integer(NC),
                        as.integer(truncated),
                        as.integer(isIntervalCensored),
                        as.integer(eps),
                        as.integer(maxiter),
                        loglik=as.double(rep(0,2)),
                        regpar=as.double(rep(0.1,NC)),
                        v=as.double(rep(0,NC*NC)),
                        converged=as.integer(rep(0,2)),
                        cv=as.double(rep(0,3)),
                        niter=as.integer(0),
                        t=as.double(rep(0,99)),
                        S=as.double(rep(0,99)),
                        S_l=as.double(rep(0,99)),
                        S_u=as.double(rep(0,99)),
                        h=as.double(rep(0,99)),
                        h_l=as.double(rep(0,99)),
                        h_u=as.double(rep(0,99)),
                        as.integer(nknots),
                        as.double(knots),
                        as.integer(CV),
                        as.double(kappa),
                        kappa=as.double(0),
                        as.integer(do.conf.int),
                        as.double(conf.int),
                        CVcrit=as.double(0),
                        mdf=as.double(0),
                        ti=as.double(rep(0,(nknots+6))),
                        theta=as.double(rep(0,(nknots+2))),
                        as.integer(print.iter),
                        V_tot=as.double(matrix(0,nrow=size_V,ncol=size_V)),
                        PACKAGE="SmoothHazard")



  }
    ## Fortran delivers
    ## Variable name 	Explanation 	Dimension 	Storage mode 	Remark
    ## loglik 	log-likelihood without and with covariate 	length 2 	double 	
    ## basepar 	Weibull parameters 	length 2 	double 	
    ## regpar 	Regression coefficients 	length P 	double 	
    ## v 	covariance matrix 	length P*P 	double 	
    ## converged 	0=converged,1=invert fails,2=no 	length 1 	integer 	
    ## t 	time to plot S(t) and h(t) 	length 100 	double 	
    ## S 	survival function 	length 100 	double 	
    ## S_l 	lower confidence limit for S 	length 100 	double 	
    ## S_u 	Upper confidence limit for S 	length 100 	double 	
    ## h 	hazard function 	length 100 	double 	
    ## h_l 	lower confidence limit for h function 	length 100 	double 	
    ## h_u 	upper confidence limit for h 	length 100 	double 	
  if (any(ffit$converged[1] == 4)){
      warning("Problem in the loglikelihood computation. The program stopped abnormally. Please check your dataset. \n")    
  }
  if (any(ffit$converged[1] == 2)){
      if (CV==0) 
          warning("Model did not converge. You could try to increase the 'maxit' parameter and set 'CV=1'.")
      else
          warning("Model did not converge. You could try to increase the 'maxit' parameter and modify the start values for 'kappa'.")
  }
  if (any(ffit$converged == 3)){
      warning("The Fisher information matrix is not positive definite.")
  }
  if(method=="weib") weibullParameter <- ffit$basepar
  # }}}
  # {{{ output 
	
  fit <- NULL
  fit$call <- call
	

  fit$loglik <- ffit$loglik
  if(method=="weib"){
    fit$modelPar <- weibullParameter
  }
  fit$N <- N
  fit$NC <- NC
  fit$modelResponse <- event.history
  fit$converged <- ffit$converged
  fit$time <- ffit$t
  fit$hazard <- ffit$h
  fit$lowerHazard <- ffit$h_l
  fit$upperHazard <- ffit$h_u
  fit$surv <- ffit$S
  fit$lowerSurv <- ffit$S_l
  fit$upperSurv <- ffit$S_u
	
  if(NC>0){
    betaCoef <- ffit$regpar
    names(betaCoef) <- Xnames
    fit$coef <- betaCoef
    fit$HR <- exp(betaCoef)
    V <- matrix(ffit$v,nrow=NC,ncol=NC,byrow=T)
    colnames(V) <- Xnames
    rownames(V) <- Xnames
    fit$se <- sqrt(diag(V))
    fit$V_cov <- V
  }
  V <- matrix(ffit$V_tot,nrow=size_V,ncol=size_V,byrow=T)
  if(method=="weib"){
    colnames(V) <- c("sqrt(a)","sqrt(b)",Xnames)
    rownames(V) <- c("sqrt(a)","sqrt(b)",Xnames)
  }else{
    theta_names <- cbind(rep("theta",(nknots+2)),(1:(nknots+2)))
    theta_names <- as.vector(apply(theta_names,1,paste,collapse=" "))
    colnames(V) <- c(theta_names,Xnames)	
    rownames(V) <- c(theta_names,Xnames)	
  }
  fit$V <- V
  fit$niter <- ffit$niter
  fit$cv <- ffit$cv

  if(method=="splines"){
    fit$nknots <- nknots
    fit$knots <- ffit$ti
    fit$theta <-  ffit$theta
    fit$CV <- CV
    fit$igraph <- ffit$igraph
    if(CV){
      fit$kappa <- ffit$kappa
      fit$CVcrit <- ffit$CVcrit
      fit$DoF <- ffit$mdf
    }else{
      fit$kappa <- kappa
    }

  }
    fit$na.action <- na.action
    # }}}
    if (method=="weib") fit$method <- "Weib" else fit$method <- "Splines"
    fit$maxtime <- amax
    fit$mintime <- amin
    class(fit) <- "shr"
    fit$runtime <- proc.time()-ptm
    fit
}