#' Convert digitised Kaplan-Meier curve data from published figures into individual-level data
#' 
#' This function is a simple adaptation of the algorithm published by Guyot et al. (2012),
#' which reconstructs individual patient data from digitised Kaplan-Meier curve coordinates
#' and the reported numbers at risk.
#' 
#' @param x X axis coordinates taken from the digitised Kaplan-Meier curve
#' @param y Y axis coordinates taken from the digitised Kaplan-Meier curve; same length as `x`
#' @param t times at which number at risk `r` is given
#' @param r number at risk; same length as `t`
#' @param tot.events total number of events reported (integer); if supplied, the
#'        number censored in the final interval is adjusted so that the
#'        reconstructed total number of events matches it
#' @param km_write_path optional file path; if supplied, the reconstructed
#'        Kaplan-Meier table (time, number at risk, events, censorings) is
#'        written there as a tab-separated file
#' @param ipd_write_path optional file path; if supplied, the reconstructed
#'        individual patient data (time, event indicator) are written there as a
#'        tab-separated file
#'
#' @return A list with two data frames: `curve`, the reconstructed Kaplan-Meier
#'         table, and `patient`, the individual-level data with columns `time`
#'         and `event`.
#' @export
#' 
#' @examples
#' #Make up some data:
#' ipd_curve1 <- guyot.method(x = seq(0, 100), y = 1-pexp(seq(0, 100), rate = 1/50), 
#'              t=c(0, 10, 50), r = c(1000, 800, 250))
#' ipd_curve2 <- guyot.method(x = seq(0, 100), y = 1-pexp(seq(0, 100), rate = 1/30), 
#'              t=c(0, 10, 50), r = c(1000, 700, 100))
#' library(survival)
#' ipd_curve1$patient$treatment <- "active"
#' ipd_curve2$patient$treatment <- "chemo"
#' ipd <- rbind(ipd_curve1$patient, ipd_curve2$patient)
#' survival_fit <- survfit(Surv(time, event) ~ treatment, data = ipd)
#' survival_fit
#' plot(survival_fit, col = c("blue", "red"))      
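#' # If the reported total number of events is known (hypothetical value below),
#' # it can be supplied so that the reconstruction is adjusted to match it:
#' # guyot.method(x = seq(0, 100), y = 1 - pexp(seq(0, 100), rate = 1/50),
#' #              t = c(0, 10, 50), r = c(1000, 800, 250), tot.events = 600)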
#' 
#' @references Guyot, Patricia, AE Ades, Mario JNM Ouwens, and Nicky J. Welton. 
#'             “Enhanced Secondary Analysis of Survival Data: Reconstructing the 
#'             Data from Published Kaplan-Meier Survival Curves.” 
#'             BMC Medical Research Methodology 12, no. 1 (February 1, 2012): 9. 
#'             https://doi.org/10.1186/1471-2288-12-9.


guyot.method <- function(x, y, t, r,
                         km_write_path=NULL,
                         ipd_write_path=NULL,
                         tot.events=NA) {
  if(max(y) > 1)
    stop("Survival (y coordinates) can't be greater than 1. Please scale your y to be in [0, 1] range.")
  if(min(x) > 0){
    message("Adding coordinate (x=0, y=1) to your data.")
    x <- c(0, x)
    y <- c(1, y)
  }
  if(min(t) > 0) {
    warning("Number at risk at time 0 is needed. The calculation might not work correctly otherwise.")
  }
  if(max(t) > max(x)) {
    warning("Times for number at risk (t) where t > x (x = coordinates of the curve) are not used.")
    r <- r[t < max(x)]
    t <- t[t < max(x)]
  }
  # Parameter names used by the script below
  t.S<-x; S<-y; n.risk<-r; t.risk<-t
  
  # Compute `lower` and `upper`: the indices of the curve coordinates x that fall
  # within each risk-set interval [t[i], t[i+1])
  # See Guyot et al., Table 2, for a visual explanation of this matching
  lower <- sapply(t, function(current_t) min(which(x >= current_t)))
  upper <- sapply(c(t[-1], Inf), function(current_t) max(which(x < current_t)))
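  # Hypothetical illustration: with x = c(0, 2, 5, 8, 10) and t = c(0, 4, 9),
  # lower = c(1, 3, 5) and upper = c(2, 4, 5), so risk-set interval i covers
  # curve coordinates lower[i]..upper[i]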
  
  n.int<-length(n.risk)
  n.t<- upper[n.int]
  #Initialise vectors
  n.censor<- rep(0,(n.int-1))
  n.hat<-rep(n.risk[1]+1,n.t)
  cen<-rep(0,n.t)
  d<-rep(0,n.t)
  KM.hat<-rep(1,n.t)
  last.i<-rep(1,n.int)
  sumdL<-0
  if (n.int > 1){
    #Time intervals 1,...,(n.int-1)
    for (i in 1:(n.int-1)){
      #First approximation of no. censored on interval i
      n.censor[i]<- round(n.risk[i]*S[lower[i+1]]/S[lower[i]]- n.risk[i+1])
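      # (With no censoring, roughly n.risk[i]*S[lower[i+1]]/S[lower[i]] patients would still
      # be at risk at the start of interval i+1; the shortfall relative to the reported
      # n.risk[i+1] is attributed to censoring on interval i.)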
      #Adjust tot. no. censored until n.hat = n.risk at start of interval (i+1)
      while((n.hat[lower[i+1]]>n.risk[i+1])||((n.hat[lower[i+1]]<n.risk[i+1])&&(n.censor[i]>0))){
        if (n.censor[i]<=0){
          cen[lower[i]:upper[i]]<-0
          n.censor[i]<-0
        }
        if (n.censor[i]>0){
          cen.t<-rep(0,n.censor[i])
          for (j in 1:n.censor[i]){
            cen.t[j]<- t.S[lower[i]] +
              j*(t.S[lower[(i+1)]]-t.S[lower[i]])/(n.censor[i]+1)
          }
          #Distribute censored observations evenly over time. Find no. censored on each time interval.
          cen[lower[i]:upper[i]]<-graphics::hist(cen.t,breaks=t.S[lower[i]:lower[(i+1)]],
                                       plot=F)$counts
        }
        #Find no. events and no. at risk on each interval to agree with K-M estimates read from curves
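        # The product-limit step gives S[k] = KM.hat[last]*(1 - d[k]/n.hat[k]);
        # inverting it yields the event count d[k] estimated below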
        n.hat[lower[i]]<-n.risk[i]
        last<-last.i[i]
        for (k in lower[i]:upper[i]){
          if (i==1 & k==lower[i]){
            d[k]<-0
            KM.hat[k]<-1
          }
          else {
            d[k]<-round(n.hat[k]*(1-(S[k]/KM.hat[last])))
            KM.hat[k]<-KM.hat[last]*(1-(d[k]/n.hat[k]))
          }
          n.hat[k+1]<-n.hat[k]-d[k]-cen[k]
          if (d[k] != 0) last<-k
        }
        n.censor[i]<- n.censor[i]+(n.hat[lower[i+1]]-n.risk[i+1])
      }
      if (n.hat[lower[i+1]]<n.risk[i+1]) n.risk[i+1]<-n.hat[lower[i+1]]
      last.i[(i+1)]<-last
    }
  }
  #Time interval n.int.
  if (n.int>1){
    #Assume same censor rate as average over previous time intervals.
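    # Scale the total censoring count from intervals 1..(n.int-1) by the ratio of the
    # final interval's length to the length spanned by the preceding intervals, and
    # cap it at the number still at risk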
    n.censor[n.int] <- min(round(sum(n.censor[1:(n.int-1)]) *
                                   (t.S[upper[n.int]] - t.S[lower[n.int]]) /
                                   (t.S[upper[(n.int-1)]] - t.S[lower[1]])),
                           n.risk[n.int])
  }
  if (n.int==1){n.censor[n.int]<-0}
  if (n.censor[n.int] <= 0){
    cen[lower[n.int]:(upper[n.int]-1)]<-0
    n.censor[n.int]<-0
  }
  if (n.censor[n.int]>0){
    cen.t<-rep(0,n.censor[n.int])
    for (j in 1:n.censor[n.int]){
      cen.t[j]<- t.S[lower[n.int]] +
        j*(t.S[upper[n.int]]-t.S[lower[n.int]])/(n.censor[n.int]+1)
    }
    cen[lower[n.int]:(upper[n.int]-1)]<-graphics::hist(cen.t,breaks=t.S[lower[n.int]:upper[n.int]],
                                             plot=F)$counts
  }
  #Find no. events and no. at risk on each interval to agree with K-M estimates read from curves
  n.hat[lower[n.int]]<-n.risk[n.int]
  last<-last.i[n.int]
  for (k in lower[n.int]:upper[n.int]){
    if(KM.hat[last] !=0){
      d[k]<-round(n.hat[k]*(1-(S[k]/KM.hat[last])))} else {d[k]<-0}
    KM.hat[k]<-KM.hat[last]*(1-(d[k]/n.hat[k]))
    n.hat[k+1]<-n.hat[k]-d[k]-cen[k]
    #No. at risk cannot be negative
    if (n.hat[k+1] < 0) {
      n.hat[k+1]<-0
      cen[k]<-n.hat[k] - d[k]
    }
    if (d[k] != 0) last<-k
  }
  #If total no. of events reported, adjust no. censored so that total no. of events agrees.
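  # The loop below nudges n.censor[n.int] by the current discrepancy (sumd - tot.events),
  # redistributes those censorings evenly over the final interval, and recomputes the
  # event counts until the reconstructed total agrees with tot.events (or no censorings remain)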
  if (!is.na(tot.events)){
    if (n.int>1){
      sumdL<-sum(d[1:upper[(n.int-1)]])
      #If total no. events already too big, then set events and censoring = 0 on all further time intervals
      if (sumdL >= tot.events){
        d[lower[n.int]:upper[n.int]]<- rep(0,(upper[n.int]-lower[n.int]+1))
        cen[lower[n.int]:(upper[n.int])]<- rep(0,(upper[n.int]-lower[n.int]+1))
        n.hat[(lower[n.int]+1):(upper[n.int]+1)]<- rep(n.risk[n.int],(upper[n.int]+1-lower[n.int]))
      }
    }
    #Otherwise adjust no. censored to give correct total no. events
    if ((sumdL < tot.events)|| (n.int==1)){
      sumd<-sum(d[1:upper[n.int]])
      while ((sumd > tot.events)||((sumd< tot.events)&&(n.censor[n.int]>0))){
        n.censor[n.int]<- n.censor[n.int] + (sumd - tot.events)
        if (n.censor[n.int]<=0){
          cen[lower[n.int]:(upper[n.int]-1)]<-0
          n.censor[n.int]<-0
        }
        if (n.censor[n.int]>0){
          cen.t<-rep(0,n.censor[n.int])
          for (j in 1:n.censor[n.int]){
            cen.t[j]<- t.S[lower[n.int]] +
              j*(t.S[upper[n.int]]-t.S[lower[n.int]])/(n.censor[n.int]+1)
          }
          cen[lower[n.int]:(upper[n.int]-1)]<-graphics::hist(cen.t,breaks=t.S[lower[n.int]:upper[n.int]],
                                                   plot=F)$counts
        }
        n.hat[lower[n.int]]<-n.risk[n.int]
        last<-last.i[n.int]
        for (k in lower[n.int]:upper[n.int]){
          d[k]<-round(n.hat[k]*(1-(S[k]/KM.hat[last])))
          KM.hat[k]<-KM.hat[last]*(1-(d[k]/n.hat[k]))
          if (k != upper[n.int]){
            n.hat[k+1]<-n.hat[k]-d[k]-cen[k]
            #No. at risk cannot be negative
            if (n.hat[k+1] < 0) {
              n.hat[k+1]<-0
              cen[k]<-n.hat[k] - d[k]
            }
          }
          if (d[k] != 0) last<-k
        }
        sumd<- sum(d[1:upper[n.int]])
      }
    }
  }
  wt1 <- matrix(c(t.S,n.hat[1:n.t],d,cen),ncol=4,byrow=F)
  if(!is.null(km_write_path))
    utils::write.table(wt1,km_write_path,sep="\t")
  
  ### Now form IPD ###
  #Initialise vectors
  t.IPD<-rep(t.S[n.t],n.risk[1])
  event.IPD<-rep(0,n.risk[1])
  #Write event time and event indicator (=1) for each event, as separate row in t.IPD and event.IPD
  k=1
  for (j in 1:n.t){
    if(d[j]!=0){
      t.IPD[k:(k+d[j]-1)]<- rep(t.S[j],abs(d[j]))
      event.IPD[k:(k+d[j]-1)]<- rep(1,abs(d[j]))
      k<-k+d[j]
    }
  }
  #Write censor time and event indicator (=0) for each censoring, as separate row in t.IPD and event.IPD
  #Censoring times are placed at the midpoint between adjacent curve time points
  for (j in 1:(n.t-1)){
    if(cen[j]!=0){
      t.IPD[k:(k+cen[j]-1)]<- rep(((t.S[j]+t.S[j+1])/2),cen[j])
      event.IPD[k:(k+cen[j]-1)]<- rep(0,cen[j])
      k<-k+cen[j]
    }
  }
  
  #Output IPD
  IPD<-matrix(c(t.IPD,event.IPD),ncol=2,byrow=F)
  if(!is.null(ipd_write_path))
    utils::write.table(IPD,ipd_write_path,sep="\t")
  
  return(list("curve"=as.data.frame(wt1), 
              "patient"=data.frame(time = t.IPD, event = event.IPD)))
}

# Example: fit Kaplan-Meier estimates to the reconstructed IPD
# ipd <- guyot.method(...)$patient
# KM.est <- survival::survfit(survival::Surv(time, event) ~ 1, data = ipd, type = "kaplan-meier")
# KM.est
# summary(KM.est)
# plot(KM.est)