R/distrcox.R

#' Function for simulating a Cox distribution series
#' @description Function for simulating a Cox distribution series
#' @details Cox distribution series are based on the assumption of \emph{proportional hazards} of covariate
#' over a given underneath \emph{baseline hazards} function. The hypotesis of proportionality is essential 
#' for achieving good performance by models generated by this kind of fitting. The \code{coxSimulate} function 
#' works on an input \code{\link[survival]{coxph}} model. The model can be also anonymized by \code{\link{coxAnonymize}}
#' function, in order to use the simulation function in a \emph{distributed learning} context, without moving data from
#' their site of origin and without carrying details of single patients within the \code{\link[survival]{coxph}} object that
#' might even hypothetically be used to reproduce the original dataset coming grom its own (far) source. \emph{Baseline hazard} has
#' to be given in order to produce a cox simulation because it cannot be anymore calculated by using the \code{\link[survival]{coxph}}
#' @param n Number of cases to be simulated
#' @param input.model An object of \code{\link[survival]{coxph}} class, even anonymized (see \code{\link{coxAnonymize}})
#' @param features An object of class \code{features} containing description of single covariates in the dataset
#' @param BH Cumulative baseline hazard as calculated by \code{\link[survival]{basehaz}} function applied to a \code{\link[survival]{coxph}} object
#' @return A \code{\link[base]{data.frame}} containing simulated observation
#' @export
coxSimulate<-function(n = 1000, input.model, features, BH) {
  if (class(features)!="features") stop("features isn't a \"features\" class object")
  # create the empty dataset
  dataset<-data.frame(matrix(data = 0, nrow = n, ncol = length(features$features.names)))
  # checks the features and sample the data
  for(m in 1:length(features$features.names)) {
    if (features$classes[m] == "integer") {
      dataset[,m]<-sample(x = as.integer(names(features$prop[[m]])), size = n, replace = TRUE, prob = features$prop[[m]])
    }
    if (features$classes[m] == "factor") {
      dataset[,m]<-sample(x = names(features$prop[[m]]), size = n, replace = TRUE, prob = features$prop[[m]])
      dataset[,m]<-as.factor(dataset[,m])
    }
    if (features$classes[m] == "numeric") {
      # temp vector of data
      temp.data<-c()
      # denominator for division
      total<-sum(features$prop[[m]]$density)
      #browser()
      for (p in 2:length(features$prop[[m]]$breaks)) {
        # !!! force round of the value of sampled number to overcome a bug in "hist" function that gives back not integer values !!!  
        temp.data<-c(temp.data, runif(n = round(features$prop[[m]]$density[p - 1] * n / total), min = features$prop[[m]]$breaks[p - 1], max = features$prop[[m]]$breaks[p]))        
      }
      dataset[,m]<-temp.data
    }
  }
  colnames(dataset)<-features$features.names
  # now start creating the baseline hazard function from the input model
  BH.diff<-diff(BH$hazard)
  # function for the correction of the times of differential of baseline hazard
  BH.diff.time<-c()
  for (m in 2:length(BH$time)) {
    BH.diff.time<-c(BH.diff.time, BH$time[m - 1] + (BH$time[m] - BH$time[m - 1])/2)
  }
  BHdiff<-as.data.frame(cbind(BH.diff.time, BH.diff))
  colnames(BHdiff)<-c("time", "dHazard")
  BHdiff<-subset.data.frame(x = BHdiff, subset = BHdiff$dHazard != 0)
  return(list(dataset = dataset, BH = BH, BHdiff = BHdiff))
}

#' Function for anonymizing a \code{\link[survival]{coxph}} object
#' @description Function that anonymizes a \code{\link[survival]{coxph}} class object by deleting each reference to single patients data.
#' @details The data to be anonymized are in the structure of the \code{\link[survival]{coxph}} object. They are:
#' \itemize{
#'    \item linear.predictors
#'    \item residuals
#'    \item nevent
#'    \item y
#' } An anonymized object can be transmitted far from the origin without giving the chance to reconstruct the single cases details.
#' @param model A \code{\link[survival]{coxph}} object
#' @return An anonymized \code{\link[survival]{coxph}} object
#' @export
coxAnonymize<-function(model)  {
  model$linear.predictors<-NULL
  model$residuals<-NULL
  model$nevent<-NULL
  model$y<-NULL
  return(model)
}

#' Function for extracting summary of covariates in a \code{\link[base]{data.frame}} object
#' @description This function is used for building a summary list of the covariates in a \code{\link[base]{data.frame}} object. It has to be used before 
#' using \code{\link{coxSimulate}} in order to generate new dataset with given baseline hazards and a set of coefficients set for a given outcome.
#' @param dataset A \code{data.frame} object
#' @param breaks Numnber of \code{breaks} in the \code{hist} objects that returns the summary of \code{numeric} covariates
#' @param times Numeric vector of observation times or death in the dataset
#' @param status Numeric vector of censored \code{(0)} or event \code{(1)} cases in the dataset
#' @details When a simulation of new Cox dataset is performed according the features of a given dataset a pre-analysis of the
#'  dataset has to be performed by using this function. The result given by \code{extractFeatures} can be directly put in \code{\link{coxSimulate}}
#'  function to achieve a new dataset similar to the one that gave the model used for \code{coeff}, \code{cov} and \code{BH} arguments.
#' @return An object of class \code{features} with following structure:
#' \itemize{
#'    \item \code{classes}. A character vector with names of classes of each covariate.
#'    \item \code{prop}. A list of \code{table} objects for integer and factor covaraites, or \code{hist} objects for numeric covariates.
#'    \item \code{numCases}. The total number of cases in the dataset.
#' }
#' @export
extractFeatures<-function(dataset, breaks = 20) {
  # extract names of covariates
  features.names<-colnames(dataset)
  # vector of classes of covariates in the dataset
  classes<-c()
  # list of output of analysis of features in the dataset
  prop<-list()
  # number of cases in the dataset
  numCases<-nrow(dataset)
  
  for (n in 1:ncol(dataset)) {
    classes<-c(classes, class(dataset[[n]]))
    if ((class(dataset[[n]])=="factor") || (class(dataset[[n]])=="integer")){
      prop[[n]] <- table(dataset[[n]])
      prop[[n]] <- prop[[n]]/nrow(dataset)
    }
    if (class(dataset[[n]])=="numeric") prop[[n]]<-hist(x = dataset[[n]], breaks = breaks, plot = FALSE)
  }
  
  data<-list(features.names=features.names, classes=classes, prop=prop, numCases=numCases)
  class(data)<-"features"
  return(data)
}

#' Function that extracts the hazard function from a \code{coxph} object
#' @description This function extracts the hazard function \eqn{h_0} from a given \code{\link[survival]{coxph}} object. The object must be provided with
#' the original dataset used for the modeling. The result of the function isn't equivalent to the result of \code{\link[survival]{basehaz}} function, because
#' the latter provides the \emph{cumulative baseline hazard} \eqn{H_0(t)} and not the \emph{hazard function} itself.
#' @param dataset The dataset as \code{data.frame} for modeling the \emph{baseline hazard rate}
#' @param model The \code{\link[survival]{coxph}} object for calculating the \emph{baseline hazard rate}
#' @param times The vector of times where \emph{events} are recorded
#' @param events The vector of the events \code{(0)} for censored cases, \code{(1)} for recorded events
#' @param smoothed If \code{TRUE} return values smoothed according a \code{\link[stats]{smooth.spline}} function
#' @param spar Smoothing parameter, used only if \code{smoothed = TRUE}
#' @param grid Number of estimation point of \eqn{h_0(t)} function
#' @details The survival function of a Cox proportional hazards model is given by the equation: 
#' \deqn{S\left ( t|x \right )=\exp \left [ -H_0\left ( t \right )*\exp\left ( \beta'X \right ) \right ]}
#' where:
#' \deqn{H_0\left ( t \right )=\int_{0}^{t}h_0\left ( u \right )du}
#' Knowing the value of \eqn{h_0\left ( t \right )} it is possible for a given value of \eqn{\beta'X} to compute the expected survival time. In fact given:
#' \deqn{h(t)=h_0(t)*\exp\left ( \beta'X \right )}
#' and censoring time \eqn{c=U*max(t)} where \eqn{U \sim Uni\left [ 0,1 \right ]}, that is \eqn{U} follows a uniform distribution on the interval from 0 to 1,
#' and \eqn{max(t)} is the maximum observation time for the population to be simulated, the calculated survival time is \deqn{T=\frac{-\log{\left (U  \right )}}{h_t}}
#' It is possible to simulate the \emph{censoring} in the population by considering that censored cases are they where \eqn{c<T}.
#' Starting from a base \code{\link[survival]{coxph}} model it is possible to give in output a matrix of values of time and \eqn{h_0(t)} needed for the simulation
#' of a Cox distribution with the same covariates, coefficients and baseline hazard of the generating one.
#' @return A \code{data.frame} with \eqn{h_0} values and related times.
#' @export
extracth0<-function(dataset, model, times, events, smoothed = FALSE, spar = .75, grid = 1000) {
  library(muhaz)
  cov  <- attr(model$terms, "dataClasses")[2:length(attr(model$terms, "dataClasses"))]  # get the type of covariates and names
  BetaX.matrix<-matrix(data = 0, nrow = nrow(dataset), ncol = length(cov))
  h<-muhaz(times = times, delta = events, n.min.grid = grid, n.est.grid = grid * 2) # observed "real" hazard
  hfun<-approxfun(x = h$est.grid, y = h$haz.est)              # approximate function for calculating the overall instant hazard
  dataset.order<-dataset[with(dataset, order(times)),]        # order the dataset according time value
  
  # cov:                                                                                      named vector with covariates names
  # model$xlevels[[names(cov)[n]]]:                                                           names of factors in a factorial covariate
  # model$coefficients[[paste(names(cov)[n], model$xlevels[[names(cov)[n]]][m], sep = "")]]:  name of the coefficient of a named single factor
  
  for (n in 1:length(cov))  {   # iteration through covariates to construct BetaX matrix
    if (cov[n] == "numeric") BetaX.matrix[,n] <- dataset.order[[names(cov)[n]]] * model$coefficients[[names(cov)[n]]]  # numeric covariates    
    if (cov[n] == "factor" ) { 
      nf <- length(model$xlevels[[names(cov)[n]]]) # length of different factors in the covariate cov[n]
      for (m in 2:nf) { 
        BetaX.matrix[,n] <- BetaX.matrix[,n] + model$coefficients[[paste(names(cov)[n], model$xlevels[[names(cov)[n]]][m], sep = "")]] *
        (dataset.order[[names(cov)[n]]]==model$xlevels[[names(cov)[n]]][m])
      }
    }
  }
  dataset.order$BetaX<-apply(X = BetaX.matrix, MARGIN = 1, FUN = function(x) exp(sum(x))) # final values of exp(sum(Beta'X)) for each patient
  
  dataset.order.table<-cbind(as.numeric(names(table(times))), unname((table(times))))  # ordered table with survival times and number of events for each step
  colnames(dataset.order.table)<-c("times", "freq")

  h0<-c() # vector of h0
  n <- 1  # counter of rows in dataset.order.table
  LB <- nrow(dataset)  # arbitrary high number of LB (length Beta) for while iteration
  while (LB > 10) {
    h0 <- c(h0, hfun(h$est.grid[n]) / mean( dataset.order$BetaX[which(sort(times) >= h$est.grid[n])] ))
    LB <- length(which(sort(times) >= h$est.grid[n]))
    n <- n + 1
  }
  if (smoothed == TRUE) {
    spl <- smooth.spline(x = h$est.grid[1:length(h0)], y = h0, spar = spar)
    h0 <- spl$y
    h0[which(h0<0)]<-0   # set negative values to 0
  }
  return (as.data.frame(cbind(h0 = h0, time = h$est.grid[1:length(h0)])))
}

#' Function for calculating Weibull based hazard rate H0
#' @description Weibull function describing the hazard rate as function of the time.
#' @details Given a Cox fit object returned by a \code{\link[survival]{coxph}} function this function calulates the parameters to achieve the value of baseline hazard rate
#' as function of the time. The result returned by function is either \deqn{\frac{1}{H_0 \left ( t \right )}=\left ( \lambda^{-1}t \right )^{\frac{1}{\nu}}} or
#' \deqn{H_0 \left ( t \right )=\left ( \lambda^{-1}t \right )^{-\frac{1}{\nu}}}
#' @param t time
#' @param lambda \eqn{\lambda} parameter of Weibull function
#' @param ni \eqn{\nu} parameter of Weibull equation
#' @param inv if \code{FALSE} returns the inverse value of baseline hazard, if \code{TRUE} returns the baseline hazard.
#' @return A numeric vector of the baseline hazard values \eqn{H_{0}\left (t \right)} or their inverse \eqn{H_{0}^{-1}\left ( t \right )}.
#' @export
HR.weibull<-function (t, lambda, ni, inv = FALSE) {
  invH0t<-(1/lambda*t)^(1/ni)
  if (inv==TRUE) return(invH0t) else return (1/invH0t)
}


#' Function for calculating Gompertz based hazard rate H0
#' @description Gompertz function describing the hazard rate as function of the time
#' @details Given a Cox fit object returned by a \code{\link[survival]{coxph}} function this function calulates the parameters to achieve the value of baseline hazard rate
#' as function of the time. The result returned by function is either \deqn{H_0 \left ( t \right )=\left [\frac{1}{\alpha}log\left ( \frac{\alpha}{\lambda}t+1 \right )  \right ]^{-1}}
#' or \deqn{\frac{1}{H_0 \left ( t \right )}=\frac{1}{\alpha}log\left ( \frac{\alpha}{\lambda}t+1 \right )}
#' @param t time
#' @param lambda \eqn{\lambda} parameter of Gompertz function
#' @param alpha \eqn{\alpha} parameter of Gompertz equation
#' @return A numeric vector of the baseline hazard values \eqn{H_{0}\left (t \right)} or their inverse \eqn{H_{0}^{-1}\left ( t \right )}.
#' @export
HR.gompertz<-function (t, lambda, alpha, inv = FALSE) {
  invH0t<-(1/alpha)*log(alpha/lambda*t + 1)
  if (inv==TRUE) return(invH0t) else return (1/invH0t)
}


#' Function for fitting the baseline hazard function
#' @description The function fits a baseline hazard model according a value of discrete baseline hazards as function of time \eqn{t} achieved by using
#' \code{\link[survival]{basehaz}} function over a \code{\link[survival]{coxph}} object.
#' @export
HR.fit<-function (coxobj, init = c(.1, .1)) {
  if (class(coxobj)!="coxph") stop("coxobj MUST be a \"coxph\" object" )
  # compute the baseline hazard
  H0<-basehaz(coxobj)
  invH0<-H0
  invH0$hazard<-1/H0$hazard
  weibull.fit<-nls(formula = hazard ~ 1/(HR.weibull(t = time, lambda = lambda, ni = ni)), start = list(lambda = init[1], ni = init[2]), data = invH0, trace = T, control = list(maxiter=200))
  return(weibull.fit)
}

#' Beijing post-operative rectal cancer patients dataset
#' 
#' A dataset containing the Beijing University data about post-operative radiotherapy in rectal cancer patients. The recorded features are:
#' \itemize{
#'   \item Age. Age of patient
#'   \item Sex. Male, Female
#'   \item ECOG. Pre-treatment performance status
#'   \item Histology. Adenocarcinoma, Mucinous
#'   \item Grade. Histological differentiation grade
#'   \item RTCT. If treated by chemoradiotherapy (1) or radiotherapy alone (0)
#'   \item pT. Pathological T stage
#'   \item pN. Pathological N stage
#'   \item pM. Pathological M stage
#'   \item AdjCT. If patient underwent (1) or not (0) to adjuvant chemotherapy
#'   \item CTcicles. Number of cicles of adjuvant chemotherapy
#'   \item RTbreak. Duration (in days) of breaks during radiotherapy treatment
#'   \item RTtox. Maximum grade of RTOG toxicity shown during (chemo)radiotherapy treatment
#'   \item Residual. Presence (1) or absence (0) of residual tumor after surgery
#'   \item IMRT. If radiotherapy treatment was delivered by IMRT (1) or not (0)
#'   \item status. Censored cases (0) or dead (1)
#'   \item OS. Overall survival time (in months)
#' }
#' @name rectum
#' @docType data
#' @author Liu Wen Yang \email{liuwenyang26@@163.com}
#' @author Nicola Dinapoli \email{nicola.dinapoli@@rm.unicatt.it}
#' @usage rectum
#' @keywords data
#' @format A data frame with 1798 rows and 17 variables
"rectum"
kbolab/distrcox documentation built on May 20, 2019, 8:10 a.m.