R/years.R

Defines functions plot_years plot_f gg_color_hue plot.helper ylboot.iter ylboot.apply ylboot years nessie_spi colVars

Documented in plot_f plot_years years

colVars <- function(x, na.rm = FALSE){
  # Column-wise variance for a data.frame (or any list of columns).
  # Numeric-like columns (numeric, logical, complex) get stats::var();
  # any other column type yields NA. Names are preserved.
  per_col <- lapply(x, function(col) {
    if (is.numeric(col) || is.logical(col) || is.complex(col)) {
      stats::var(col, na.rm = na.rm)
    } else {
      NA
    }
  })
  unlist(per_col)
}

# Copied function from mstate:::NAfix.
# Last-observation-carried-forward NA fill: every NA is replaced by the
# closest preceding non-NA value; NAs before the first observed value are
# replaced by `subst` (default -Inf).
mstateNAfix <- function (x, subst = -Inf)
{
  # Sentinel larger than any observed value, prepended so that leading NAs
  # have something to copy from; it is swapped for `subst` at the end.
  sentinel <- max(x[!is.na(x)]) + 1
  filled <- c(sentinel, x)
  # Repeatedly copy each NA's left neighbour until no NAs remain
  # (runs of NAs need several passes).
  while (anyNA(filled)) {
    gaps <- which(is.na(filled))
    filled[gaps] <- filled[gaps - 1]
  }
  filled[filled == sentinel] <- subst
  filled[-1]
}

# Helper function for years():
# Builds population-based ("expected") at-risk and event curves on the time
# grid `tis`, summed over all individuals in `data`. For each individual the
# population survival S_pi(t) = exp(-cumulative population hazard) is computed
# from the ratetable via exp.prep(); the function returns
#   yi:    sum over individuals of S_pi(t) at each time in `tis`
#   yidli: sum over individuals of lambda_pi(t) * S_pi(t)
# NOTE(review): relies on the package-internal rformulate() and exp.prep();
# the exact meaning of their outputs (yi, yidli) is assumed from usage here —
# confirm against the relsurv internals.
nessie_spi <- function(formula = formula(data), data, ratetable = relsurv::slopop,
                       tis, starting.time, include.censoring=FALSE,
                       arg.example=FALSE, rmap){
  data_orig <- data
  call <- match.call()
  # rformulate() expects rmap unevaluated, hence substitute().
  if (!missing(rmap)) {
    rmap <- substitute(rmap)
  }
  na.action <- NA
  rform <- rformulate(formula, data, ratetable, na.action,
                      rmap)

  data <- rform$data
  data$Xs <- rep(1, nrow(data))
  n_rows <- nrow(data)

  # Fix demographic covariates:
  # For left-truncated data, rebase the ratetable lookup to the start of
  # follow-up: calendar year is shifted back by the entry age and age is set
  # to 0, so that time 0 on the `tis` grid corresponds to age 0.
  if(starting.time == "left.truncated"){
    rform$R[,"year"] <- rform$R[,"year"] - rform$R[,"age"]
    rform$R[,"age"] <- 0
  }
  if(include.censoring){
    # browser()
    # Push the follow-up of individuals with an event (status==1) to the
    # latest observed time, so only censoring curtails the population curve.
    wh <- which(rform$status==1)
    rform$Y[wh] <- max(rform$Y)

    if(arg.example){
      # Temporary example/debug branch (see arg.example in years()):
      # individuals aged exactly 50 years (18262 days) are cut at 5 years
      # (1826 days). NOTE(review): magic numbers — confirm still needed.
      wh2 <- which(rform$status==1 & data$age==18262)
      rform$Y[wh2] <- 1826
    }
  }
  else{
    # Ignore censoring entirely: everyone is followed to the latest time.
    rform$Y <- rep(max(rform$Y), length(rform$Y))
    # status is not relevant in this case
  }

  out <- NULL
  out$yi <- NULL
  out$yidli <- NULL
  l_tis <- length(tis)
  # Per-individual population curves on the `tis` grid; each element of
  # `temps` stacks c(S_pi, lambda_pi * S_pi) into one vector of length 2*l_tis.
  temps <- lapply(1:n_rows, function(inx) {
    temp <- exp.prep(rform$R[inx, , drop = FALSE], rform$Y[inx], rform$ratetable,
                     rform$status[inx], times = tis, fast = TRUE, cmp=FALSE,ys=data$start[inx])
    # Population survival from the cumulative population hazard increments.
    s_pi <- exp(-cumsum(temp$yidli))

    # Index just before the first non-zero hazard increment: grid points
    # before the individual enters the at-risk set get S_pi = 0.
    # NOTE(review): which.min on a logical returns the first FALSE, i.e. the
    # first non-zero yidli — confirm this is the intended entry-time marker.
    s_pi_helper <- which.min(temp$yidli==0)-1
    if(s_pi_helper>1){ s_pi[1:s_pi_helper] <- 0}

    # After entry, cap S_pi by the individual's at-risk indicator so the
    # curve drops when follow-up ends (censoring taken into account).
    if(include.censoring){ s_pi[(s_pi_helper+1):l_tis] <- pmin(s_pi[(s_pi_helper+1):l_tis],
                                                               temp$yi[(s_pi_helper+1):l_tis])}

    c(s_pi, # s_pi
      temp$yidli*s_pi) # l_pi * s_pi
  })

  # Sum the stacked per-individual vectors across individuals, then split the
  # result back into the S_pi half and the lambda*S_pi half.
  temps2 <- do.call("cbind", temps)

  temps2 <- rowSums(temps2)

  out$yi <- temps2[1:(length(temps2)/2)]
  out$yidli <- temps2[(length(temps2)/2+1):length(temps2)]
  return(out)
}

# Copied scales::trans_new:
# scales_trans_new <- function (name, transform, inverse, breaks = extended_breaks(),
#           minor_breaks = regular_minor_breaks(), format = format_format(),
#           domain = c(-Inf, Inf))
# {
#   if (is.character(transform))
#     transform <- match.fun(transform)
#   if (is.character(inverse))
#     inverse <- match.fun(inverse)
#   structure(list(name = name, transform = transform, inverse = inverse,
#                  breaks = breaks, minor_breaks = minor_breaks, format = format,
#                  domain = domain), class = "trans")
# }


#' Compute one of the life years measures
#'
#' Provides an estimate for one of the following measures: years lost (Andersen, 2013), years lost/saved (Andersen, 2017), or
#' life years difference (Manevski, Ruzic Gorenjec, Andersen, Pohar Perme, 2022).
#'
#' The life years difference (\code{measure='yd'}) is taken by default. If other
#' measures are of interest, use the \code{measure} argument.
#'
#' The follow-up time must be specified in days. The \code{ratetable}
#' being used may have different variable names and formats than the user's
#' data set; this is dealt with via the \code{rmap} argument. For example, if
#' age is in years in the data but in days in the \code{ratetable} object,
#' age=age*365.241 should be used. The calendar year can be in any date format
#' (date, Date and POSIXt are allowed), the date formats in the
#' \code{ratetable} and in the data may differ.
#'
#' Numerical integration is performed, argument
#' precision is set with argument \code{precision}, which defaults to 30-day
#' intervals for integration. For higher accuracy take a smaller value (e.g. precision=1 makes
#' the integration on a daily basis).
#'
#' The observed curves are reported at event and censoring times. The
#' population curves are reported at all times used for the numerical integration.
#' Note that for the years lost (Andersen, 2013) measure, only the excess absolute risk is reported.
#'
#' @param formula a formula object, with the response as a \code{Surv} object
#' on the left of a \code{~} operator, and, \code{~1} specified on the right.
#'
#' NOTE: The follow-up time must be in days.
#' @param data a data.frame in which to interpret the variables named in the
#' \code{formula}.
#' @param measure choose which measure is used: 'yd' (life years difference; Manevski, Ruzic Gorenjec, Andersen, Pohar Perme, 2022), 'yl2017' (years lost/saved; Andersen 2017),
#' 'yl2013' (years lost/saved; Andersen 2013).
#' @param ratetable a table of event rates, organized as a \code{ratetable}
#' object, such as \code{slopop}.
#' @param rmap an optional list to be used if the variables are not organized
#' and named in the same way as in the \code{ratetable} object. See details
#' below.
#' @param var.estimator Choose the estimator for the variance ('none', 'bootstrap', 'greenwood'). Default is 'none'.
#' The 'greenwood' option is possible only for \code{measure='yd'}.
#' @param B if \code{var.estimator} is 'bootstrap'. The number of bootstrap replications. Default is 100.
#' @param precision precision for numerical integration of the population curve. Default is 30 (days).
#' The value may be decreased to get a
#' higher precision or increased to achieve a faster calculation.
#' @param add.times specific times at which the curves should be reported.
#' @param na.action a missing-data filter function. Default is \code{na.omit}.
#' @param conf.int the confidence level for a two-sided confidence interval. Default is 0.95.
#' @param timefix the timefix argument in survival::survfit.formula. Default is FALSE.
#' @param is.boot if TRUE, the function \code{years} has been called during a bootstrap replication.
#' @param first.boot if TRUE, this is the first bootstrap replication.
#' @return A list containing the years measure, the observed and population curves (or the excess curve for Andersen 2013).
#' The values are given as separate data.frames through time. Times are given in days, all areas are given in years.
#' For \code{measure='yl2017'} values are reported only at the last time point.
#' Functions \code{plot_f} and \code{plot_years} can be then used for plotting.
#' @seealso \code{\link{plot_f}}, \code{\link{plot_years}}
#' @examples
#'
#' library(relsurv)
#' # Estimate the life years difference for the rdata dataset.
#' mod <- years(Surv(time, cens)~1, data=rdata, measure='yd', ratetable=slopop,
#'              rmap=list(age=age*365.241), var.estimator = 'none')
#' # Plot the absolute risk (observed and population curve):
#' plot_f(mod)
#' # Plot the life years difference estimate:
#' plot_years(mod, conf.int=FALSE)
years <- function(
    formula=formula(data),
    data,
    measure=c('yd', 'yl2017', 'yl2013'),
    # estimator=c("F_P_final"),#, "F_P_Spi", "F_P_Spi2", "F_P", "F_P2", "all"),
    ratetable=relsurv::slopop,
    rmap,
    var.estimator=c('none', 'bootstrap', 'greenwood'),
    B=100,
    precision=30,
    add.times,
    na.action=stats::na.omit,
    conf.int=0.95,
    timefix=FALSE,
    # admin.cens,
    # cause.val,
    is.boot=FALSE,
    first.boot=FALSE
    # ,estimator.observed='Kaplan-Meier'
){

  # OLD ARGUMENTS:
  # F_P_Spi: Tako kot F_P_final, ignorira censoring. Ali pa vzame samo admin cens
  # F_P_Spi2: Vzame ves censoring
  # @param cause.val for competing risks, to be added.
  # @param admin.cens if a Date is supplied, administrative censoring is taken into account at that time
  # in the population curve. Works only if there's late entry, e.g. if the formula is \code{Surv(start,stop,event)~1}.

  ############ #
  # PREPARE OBJECTS:
  ############ #

  estimator=c("F_P_final") #  #' @param estimator which estimator should be used for calculating
  # estimator <- match.arg(estimator)
  arg.example <- FALSE # @param arg.example temporary argument, used for checking additionalities.

  Call <- match.call()
  if(!missing(rmap) & !is.boot & !first.boot)  rmap <- substitute(rmap)
  measure <- match.arg(measure)
  var.estimator <- match.arg(var.estimator)

  if(var.estimator=='bootstrap'){
    bootstrap <- TRUE
  } else if(var.estimator %in% c('none', 'greenwood')){
    bootstrap <- FALSE
  } else{
    stop('Incorrect value provided in argument var.estimator.')
  }

  if(!is.data.frame(data)) stop('Argument data is not a data.frame object.')
  data <- as.data.frame(data)

  out <- NULL
  late.values <- FALSE

  # These were arguments. To be deleted?
  exact.hazards <- FALSE # calculate hazards on a daily basis (to be checked)
  find.cond.time <- FALSE # if TRUE, return time at which there are at least 5 individuals in the at-risk set.

  # if(!missing(cause.val)){
  #   data$status <- ifelse(data$cause == cause.val, 1, 0)
  #   # Remove NAs:
  #   eniNAs <- which(is.na(data$status))
  #   if(length(eniNAs)>0) data <- data[-eniNAs,]
  # }

  # data$age <- round(data$age*365.241)
  # data$stop <- round(data$stop*365.241)

  # If Surv(start,stop, event) (possibly + mstate)
  if_start_stop <- length(as.character(formula[[2]])) %in% c(4,5)

  if(if_start_stop){
    start_col <- as.character(formula[[2]])[2]
    stop_col <- as.character(formula[[2]])[3]
    status_col <- as.character(formula[[2]])[4]
    starting_age <- as.vector(as.matrix(data[, start_col]))
  } else{
    stop_col <- as.character(formula[[2]])[2]
    if(!(stop_col %in% colnames(data))){
      stop(paste0('Instead of \'', stop_col, '\', please use a column from the data in the formula.'))
    }
  }

  # Check if no. at risk falls to zero at some point:
  if(if_start_stop){
    # Prepare at-risk matrix:
    find_tajms <- unique(sort(c(data[,start_col], data[,stop_col])))
    mat <- lapply(1:nrow(data), function(x) ifelse((data[x, start_col] < find_tajms) & (find_tajms <= data[x, stop_col]), 1, 0))
    mat2 <- matrix(unlist(mat), nrow = nrow(data), byrow = TRUE)
    # The sum of the individual at-risk processes:
    yi_left <- colSums(mat2)
    # If there's an empty at-risk at a later timepoint, censor the data:
    wh_yi <- which(yi_left==0)
    if(length(wh_yi)>1){
      if((!is.boot) & (!first.boot)){
        warning(paste0('In the time interval ', find_tajms[wh_yi[2]-1], '-', find_tajms[wh_yi[2]],
                       ' the at-risk sample is empty (nobody is followed). Survival cannot be estimated in this time interval.',
                       ' The data is censored at time ', find_tajms[wh_yi[2]-1], '.'))
      }
      # Censor data:
      data <- data[data[,start_col] <= find_tajms[wh_yi[2]-1], ]
      wh_cen <- which(data[, stop_col] > find_tajms[wh_yi[2]-1])
      data[wh_cen, stop_col] <- find_tajms[wh_yi[2]-1]
      data[wh_cen, status_col] <- 0
      if(!missing(add.times)){
        if(any(add.times > find_tajms[wh_yi[2]-1])) add.times <- add.times[add.times<=find_tajms[wh_yi[2]-1]]
      }
    }
    rm(mat,mat2)
  }
  data_orig <- data

  # if(starting.time=="left.truncated"){
  # if(!missing(admin.cens)){
  #   if(!inherits(admin.cens, 'Date')) warning('Object of class Date should be supplied to admin.cens.')
  #   end_date <- data$year+(data$stop-data$age)
  #   if(any(end_date > admin.cens)) warning('There are events that occur after the date of administrative censoring. Please check the values in arguments data and admin.cens.')
  #   id_admin_cens <- which(admin.cens==end_date)
  # }
  # }

  if(if_start_stop){
    starting.time <- 'left.truncated'
  } else{
    starting.time <- 'zero'
  }

  # Starting age
  starting_age <- rep(0,nrow(data))
  if(if_start_stop){
    starting_age <- as.vector(as.matrix(data[, start_col]))
  }
  starting_age <- as.numeric(starting_age)

  ############ #
  # YEARS ON DATA - GENERAL:
  ############ #

  surv_obj <- as.character(formula[[2]])

  if(missing(formula)){
    stop('Missing formula argument value.')
  } else{
    if('mstate' %in% surv_obj){
      juh <- 1:nrow(data)
      mod <- survival::survfit.formula(as.formula(Reduce(paste, deparse(formula))), data=data, timefix=timefix, id = juh, na.action=na.action)
    } else{
      mod <- survival::survfit.formula(formula, data=data, timefix=timefix, na.action=na.action)
    }
  }

  if('mstate' %in% surv_obj){
    surv_obj_new <- paste0(surv_obj[1], '(', surv_obj[2], ',', surv_obj[3])
    if(length(surv_obj)==5){
      surv_obj_new <- paste0(surv_obj_new, ',', surv_obj[4], ')')
    } else{
      surv_obj_new <- paste0(surv_obj_new, ')')
    }
    formula <- paste0(surv_obj_new, '~1')
  }
  status_obj <- surv_obj[length(surv_obj)]

  # if(!missing(cause.val)){
  #   mod$n.risk <- mod$n.risk[,1]
  #   mod$n.event <- mod$n.event[,cause.val+1]
  #   mod$surv <- 1-mod$pstate[,cause.val+1]
  #   mod$std.err <- mod$std.err[,cause.val+1]
  #   mod$cumhaz <- mod$cumhaz[,cause.val]
  # }

  if(!missing(add.times)){
    mod_sum <- summary(mod, times = sort(unique(c(mod$time, add.times))))
    if(any(!(add.times %in% mod_sum$time))){
      if(!is.boot){
        if(!first.boot){
          warning('Some values in add.times are after the last follow-up time. All measures are extrapolated up to these times. Please consider removing them.')
        }
        late.values <- TRUE

        miss_tajms <- add.times[!(add.times %in% mod_sum$time)]
        mod_sum$time <- c(mod_sum$time, miss_tajms)
        mod_sum$n.risk <- c(mod_sum$n.risk, rep(mod_sum$n.risk[length(mod_sum$n.risk)], length(miss_tajms)))
        mod_sum$n.event <- c(mod_sum$n.event, rep(0, length(miss_tajms)))
        mod_sum$surv <- c(mod_sum$surv, rep(mod_sum$surv[length(mod_sum$surv)], length(miss_tajms)))
        mod_sum$cumhaz <- c(mod_sum$cumhaz, rep(mod_sum$cumhaz[length(mod_sum$cumhaz)], length(miss_tajms)))

        # First fix std.err:
        if(is.nan(mod_sum$std.err[length(mod_sum$std.err)])){
          mod_sum$std.err[length(mod_sum$std.err)] <- mod_sum$std.err[length(mod_sum$std.err) - 1]
        }
        mod_sum$std.err <- c(mod_sum$std.err, rep(mod_sum$std.err[length(mod_sum$std.err)], length(miss_tajms)))
      }
    }
    mod$time <- mod_sum$time
    mod$n.risk <- mod_sum$n.risk
    mod$n.event <- mod_sum$n.event
    mod$surv <- mod_sum$surv
    mod$std.err <- mod_sum$std.err
    mod$cumhaz <- mod_sum$cumhaz
  }

  if(find.cond.time) return(mod$time[which.min(mod$n.risk<5)])

  # Calculate AUC:
  if(length(mod$time)>1){
    if(if_start_stop){
      survs <- c(1, mod$surv[1:(length(mod$surv)-1)])
      t_diff <- diff(c(mod$time[1], mod$time))
    } else{
      survs <- mod$surv
      t_diff <- diff(c(0, mod$time))
    }
    auc_data <- sum(t_diff*(1 - survs))
    auc_data_vec <- cumsum(t_diff*(1 - survs))
  } else{
    auc_data <- mod$time*mod$surv
    auc_data_vec <- auc_data
  }

  out$F_data <- 1-mod$surv
  out$auc_data <- auc_data/365.241
  out$auc_data_vec <- auc_data_vec/365.241

  # Exact hazards:
  if(exact.hazards){
    mod$time <- seq(min(mod$time), max(mod$time), by=1)
    mod$surv <- exp(-cumsum(rep(ratetable[1,1,1], max(mod$time)-min(mod$time)+1)))

    out$F_data <- 1-exp(-cumsum(c(0, rep(ratetable[1,1,1], max(mod$time)-min(mod$time)))))
    out$auc_data <- sum(out$F_data)/365.241
  }

  ############ #
  # SEPARATE YEARS FOR EVERY MEASURE:
  ############ #

  if(measure %in% c('yl2017', 'yl2013')){
    # YL_P preparation:
    data_yi <- data
    rform <- rformulate(formula, data, ratetable, na.action=na.action, rmap = rmap)
    data <- rform$data
    if(if_start_stop){
      if(!(start_col %in% colnames(data))){
        data[,start_col] <- data_orig[, start_col]
      }
    }

    # Check covariates:
    p <- rform$m
    if (p > 0) stop("There shouldn't be any covariates in the formula. This function gives non-parametric estimates of the hazards.")
    else data$Xs <- rep(1, nrow(data)) #if no covariates, just put 1

    out_n <- table(data$Xs) #table of strata
    out$time <- out$haz.excess <- out$haz.pop <- out$std.err <- out$strata <-  NULL

    kt <- 1 # the only stratum
    inx <- which(data$Xs == names(out_n)[kt]) #individuals within this stratum
    # tis <- sort(unique(rform$Y[inx])) #unique times
    if(!if_start_stop){
      tis <- rform$Y[inx] #unique times
      tis_seq <- seq(0, max(rform$Y[inx]), precision)
    } else{
      tis <- sort(unique(c(rform$Y[inx], data[, start_col]))) #unique times
      tis_seq <- seq(min(data[, start_col]), max(rform$Y[inx], data[, start_col]), precision)
    }

    if(!is.boot){
      tis <- sort(unique(c(tis, tis_seq)))
    }

    if(!missing(add.times)){
      tis <- sort(unique(c(tis, add.times)))
    }
    ltis <- length(tis)

    # Fix demographic covariates:
    if(if_start_stop){
      rform$R[,"year"] <- rform$R[,"year"] - rform$R[,"age"]
      rform$R[,"age"] <- 0
    }

    if(measure == 'yl2017'){
      # YL_O (used only for yl2017):
      if(if_start_stop){
        it_auc <- rep(NA, nrow(data_orig))
        mod_sum <- summary(mod, times=tis) # unique(sort(c(data_orig[,start_col], data_orig[,stop_col])))
        lsurv <- length(mod_sum$surv)
        val_mat <- matrix(0, nrow=nrow(data_orig), ncol=lsurv)
        for(it in 1:nrow(data_orig)){
          it_wh <- which(data_orig[it, start_col] == mod_sum$time)
          it_surv <- mod_sum$surv[it_wh:lsurv]/mod_sum$surv[it_wh]
          it_auc[it] <- sum(c(0, diff(mod_sum$time[it_wh:lsurv]))*(1 - it_surv))/365.241
          val_mat[it, it_wh:lsurv] <- cumsum(c(0, diff(mod_sum$time[it_wh:lsurv]))*(1 - it_surv))/365.241
        }
        # spodaj <- mod_sum$n.risk + cumsum(mod_sum$n.event) + cumsum(mod_sum$n.censor)
        YL_O_vec <- colMeans(val_mat) # colSums(val_mat)/spodaj
        YL_O <- mean(it_auc)
        F_O_time <- mod_sum$time
        F_O_ext <- data.frame(time=F_O_time, area=YL_O_vec)
        # Subset:
        F_O_ext2 <- subset(F_O_ext, time %in% mod$time)
        F_O_time <- F_O_ext2$time
        YL_O_vec <- F_O_ext2$area
      } else{
        YL_O_vec <- out$auc_data_vec
        YL_O <- out$auc_data
        F_O_time <- mod$time
        if(!(0 %in% F_O_time)){
          F_O_time <- c(0, F_O_time)
          YL_O_vec <- c(0, YL_O_vec)
        }
        # Prepare extended F_O object:
        if(0 %in% mod$time){
          F_O_temp <- data.frame(time=mod$time, surv=mod$surv)
        } else{
          F_O_temp <- data.frame(time=c(0, mod$time), surv=c(1, mod$surv))
        }
        F_O_ext <- data.frame(time=tis)
        F_O_ext <- merge(F_O_ext, F_O_temp, by='time', all.x=TRUE)
        F_O_ext$surv <- mstateNAfix(F_O_ext$surv, 0)

        tis_diff <- diff(c(0, F_O_ext$time))
        F_O_ext$area <- cumsum(tis_diff*(1 - F_O_ext$surv))/365.241
        F_O_ext <- F_O_ext[,c('time', 'area')]
      }

      F_O <- data.frame(time=F_O_time, area=YL_O_vec)

      ###
      # YL_P continue:
      it_auc_P <- rep(NA, nrow(data))
      it_auc_P_mat <- matrix(0, nrow=nrow(data), ncol=ltis)

      for(it in 1:nrow(data)){
        temp <- exp.prep(rform$R[it,,drop=FALSE],max(rform$Y),rform$ratetable,rform$status[it],times=tis,fast=FALSE, cmp=FALSE, ys=starting_age[it], netweiDM = FALSE)
        if(if_start_stop){
          it_wh <- which(data[it, start_col] == tis)
          hazs <- temp$yidli[it_wh:ltis]
          hazs[1] <- 0
          cumhazs <- cumsum(hazs)
          F_P <- 1 - exp(-cumhazs)
          it_auc_P[it] <- sum(c(tis[it_wh], diff(tis[it_wh:ltis]))*c(0, F_P[1:(length(F_P)-1)]))/365.241
          it_auc_P_mat[it,it_wh:ltis] <- cumsum(c(0, diff(tis[it_wh:ltis]))*c(0, F_P[1:(length(F_P)-1)]))/365.241
        } else{
          # it_wh <- which(data$age[it] == tis)
          hazs <- temp$yidli[1:ltis]
          hazs[1] <- 0
          cumhazs <- cumsum(hazs)
          F_P <- 1 - exp(-cumhazs)
          it_auc_P[it] <- sum(c(0, diff(tis))*c(0, F_P[1:(length(F_P)-1)]))/365.241
          it_auc_P_mat[it,] <- cumsum(c(0, diff(tis))*c(0, F_P[1:(length(F_P)-1)]))/365.241
        }

      }
      YL_P <- mean(it_auc_P)
      F_P <- data.frame(time=tis, area=colMeans(it_auc_P_mat))

      yd_curve <- data.frame(time=tis, est=F_O_ext$area - F_P$area)

      # Bootstrap:
      if(bootstrap){
        data_b <- data_orig
        data_b$id <- 1:nrow(data_b)
        yl_boot <- ylboot(theta=ylboot.iter, data=data_b, id="id",
                          B=B, verbose=0, #all_times = all_times,
                          ratetable=ratetable#, add.times=add.times
                          , starting.time, estimator, precision,
                          add.times = add.times,
                          formula = formula,
                          rmap = rmap, measure=measure
        )
        if(ncol(yl_boot[[2]])>nrow(F_O)){
          varsincol <- colVars(yl_boot[[2]], na.rm=TRUE)^(1/2)
          varsincol_df <- data.frame(time=yl_boot[[4]], area.se=varsincol)
          varsincol_df <- varsincol_df[varsincol_df$time %in% F_O$time,]
          F_O$area.se <- varsincol_df$area.se
        } else{
          F_O$area.se <- colVars(yl_boot[[2]], na.rm=TRUE)^(1/2)
        }
        F_P$area.se <- colVars(yl_boot[[3]], na.rm=TRUE)^(1/2)
        yl_boot <- as.data.frame(t(yl_boot[[1]]))
        yd_curve$est.se <- (colVars(yl_boot, na.rm=TRUE))^(1/2)
      }

      # Add CI:
      if((!is.boot) & (!first.boot)){
        if(!is.null(yd_curve$est.se)){
          yd_curve$lower <- yd_curve$est - yd_curve$est.se*stats::qnorm(0.5+conf.int/2)
          yd_curve$upper <- yd_curve$est + yd_curve$est.se*stats::qnorm(0.5+conf.int/2)
        }
      }
      # Values to be reported:
      if((!is.boot) & (!first.boot)){
        if(if_start_stop){
          # Report only at last time point - the values until this time are not suitable to report:
          out <- list(years=utils::tail(yd_curve,1), F_O=utils::tail(F_O,1), F_P=utils::tail(F_P,1), measure=measure)
        } else{
          # Report full measures:
          out <- list(years=yd_curve, F_O=F_O, F_P=F_P, measure=measure)
        }
      } else{
        out <- list(years=yd_curve, F_O=F_O, F_P=F_P, measure=measure)
      }
      return(out)

    } else{ # measure == 'yl2013'
      temp <- exp.prep(rform$R[,,drop=FALSE],rform$Y[inx],rform$ratetable,rform$status,times=tis, fast=TRUE, cmp=FALSE, ys=starting_age)
      temp$yi[temp$yi==0] <- Inf

      # Calculate measures:
      haz.pop <- temp$yidli/temp$yi
      mod_tis <- summary(mod, times = tis)
      F_E <- cumsum(mod_tis$surv*(mod_tis$n.event/mod_tis$n.risk - haz.pop))
      ltis <- length(tis)

      # To be checked, doesn't work ok
      # # Var as in Pavlic2018:
      # F_E_st <- sapply(1:ltis, function(s){
      #   (sum(mod_tis$surv[s:ltis]*(mod_tis$n.event[s:ltis]/mod_tis$n.risk[s:ltis] - haz.pop[s:ltis]))/mod_tis$surv[s]) # *c(0, diff(tis[s:ltis]))  /365.241
      # })
      # # Klemnova:
      # F_Ese <- (cumsum((mod_tis$surv)^2*(1 - F_E_st)^2*((mod_tis$n.event)/(mod_tis$n.risk^2))*c(0, diff(tis)))/365.241)^(1/2)
      # surv_int <- rev(cumsum(rev(c(0, diff(tis))*c(1, mod_tis$surv[1:(length(mod_tis$surv)-1)])))/365.241)
      #
      # # Moja:
      # F_E_int <- rev(cumsum(rev(c(0, diff(tis))*c(0, F_E[1:(length(F_E)-1)])))/365.241)
      # F_Ese <- (cumsum((surv_int)^2*(1 - F_E_st)^2*((mod_tis$n.event)/(mod_tis$n.risk^2))*c(0, diff(tis)))/365.241)^(1/2)
      #
      # # Observed:
      # F_Ese <- (cumsum(surv_int^2*((mod_tis$n.event)/(mod_tis$n.risk^2))*c(0, diff(tis)))/365.241)^(1/2)
      #
      # # Predlog glede na Andersen 2013:
      # F_Ese <- (cumsum((surv_int^2*(mod_tis$n.event - temp$yidli) + F_E_int^2*temp$yidli)/(mod_tis$n.risk^2)*c(0, diff(tis)))/365.241)^(1/2)

      # Calculate measures:
      YL <- cumsum(F_E*c(0, diff(tis)))/365.241
      F_E_area <- cumsum(c(0, diff(tis))*c(0, F_E[1:(length(F_E)-1)]))/365.241
      F_E_df <- data.frame(time=tis, prob=F_E, area=F_E_area) # , prob.se=F_Ese
      yd_curve <- data.frame(time=tis, est=YL)

      # Bootstrap:
      if(bootstrap){
        data_b <- data_orig
        data_b$id <- 1:nrow(data_b)
        yl_boot <- ylboot(theta=ylboot.iter, data=data_b, id="id",
                          B=B, verbose=0, #all_times = all_times,
                          ratetable=ratetable#, add.times=add.times
                          , starting.time, estimator, precision,
                          add.times = add.times,
                          formula = formula,
                          rmap = rmap, measure=measure
        )

        # Calculate area.se:
        area.se <- yl_boot[[2]]
        for(itar in 1:nrow(yl_boot[[2]])){
          prob_tmp <- as.vector(as.matrix(yl_boot[[2]][itar,]))
          area_tmp <- cumsum(c(0, diff(tis))*c(0, prob_tmp[1:(length(prob_tmp)-1)]))/365.241
          area.se[itar,] <- area_tmp
        }
        area.se <- as.vector(colVars(area.se, na.rm=TRUE))

        F_E_df$prob.se <- (colVars(yl_boot[[2]], na.rm=TRUE))^(1/2)
        F_E_df$area.se <- area.se
        yl_boot <- as.data.frame(t(yl_boot[[1]]))
        yd_curve$est.se <- (colVars(yl_boot, na.rm=TRUE))^(1/2)
      }

      if((!is.boot) & (!first.boot)){
        if(!is.null(yd_curve$est.se)){
          yd_curve$lower <- yd_curve$est - yd_curve$est.se*stats::qnorm(0.5+conf.int/2)
          yd_curve$upper <- yd_curve$est + yd_curve$est.se*stats::qnorm(0.5+conf.int/2)
        }
      }

      out <- list(years=yd_curve, F_E=F_E_df, measure=measure)
      return(out)
    }
  } else{ # measure == 'yd'

    ################################################### #
    # CIF on population:

    data_yi <- data
    rform <- rformulate(formula, data, ratetable, na.action=na.action, rmap = rmap)
    data <- rform$data
    if(if_start_stop){
      if(!(start_col %in% colnames(data))){
        data[,start_col] <- data_orig[, start_col]
      }
    }

    # Check covariates:
    p <- rform$m
    if (p > 0) stop("There shouldn't be any covariates in the formula. This function gives non-parametric estimates of the hazards.")
    else data$Xs <- rep(1, nrow(data)) #if no covariates, just put 1

    out_n <- table(data$Xs) #table of strata
    out$time <- out$haz.excess <- out$haz.pop <- out$std.err <- out$strata <-  NULL

    kt <- 1 # the only stratum
    inx <- which(data$Xs == names(out_n)[kt]) #individuals within this stratum
    if(!if_start_stop) tis <- sort(unique(c(rform$Y[inx], seq(0, max(rform$Y[inx]), precision)))) #unique times
    else tis <- sort(unique(c(rform$Y[inx], data[, start_col], seq(min(data[, start_col]), max(rform$Y[inx], data[, start_col]), precision)))) #unique times

    if(!missing(add.times)){
      tis <- sort(unique(c(tis, add.times)))
    }

    # Fix demographic covariates:
    if(if_start_stop){
      rform$R[,"year"] <- rform$R[,"year"] - rform$R[,"age"]
      rform$R[,"age"] <- 0
    }

    ### #
    # Greenwood Variance of area (not F):
    # First prepare objects:
    mod_gw <- summary(mod, times = tis)
    gw_df <- data.frame(time=mod_gw$time, surv=mod_gw$surv, n.risk=mod_gw$n.risk, n.event=mod_gw$n.event)
    # Then calculate:
    times_all2 <- c(0, diff(gw_df$time))/365.241
    surv_all <- c(1, gw_df$surv[1:(length(gw_df$surv)-1)])
    auc_all <- cumsum(times_all2*surv_all)
    area_var <- sapply(1:length(auc_all), function(x) {
      numer <- gw_df$n.risk[1:x]*(gw_df$n.risk[1:x] - gw_df$n.event[1:x])
      numer[numer==0] <- Inf
      sum(((auc_all[x] - auc_all[1:x])^2*gw_df$n.event[1:x])/numer)
    })
    if(is.nan(area_var[length(area_var)])){
      area_var[length(area_var)] <- area_var[length(area_var)-1]
    }
    ### #

    if(estimator=='F_P' | estimator=="all"){
      # Prepare at-risk matrix:
      # browser()
      # mat <- lapply(1:nrow(data), function(x) ifelse((data$start[x] < tis) & (tis <= data$Y[x]), 1, NA))
      # mat2 <- matrix(unlist(mat), nrow = nrow(data_yi), byrow = TRUE)
      # # The sum of the individual at-risk processes:
      # yi_left <- colSums(mat2)
      # yi_left[yi_left == 0] <- Inf
      #
      # mat3 <- lapply(1:nrow(data), function(x) data$age[x] + c(0, diff(tis)))

      if(any(rform$Y[inx]<=starting_age)) browser()

      temp <- exp.prep(rform$R[inx,,drop=FALSE],rform$Y[inx],rform$ratetable,rform$status[inx],times=tis,fast=TRUE, cmp=FALSE, ys=starting_age)
      # Fix at-risk process, if needed:
      temp$yi[temp$yi==0] <- Inf

      out$time <- c(out$time, tis)						#add times

      # Calculate hazards:
      haz.pop <- temp$yidli/temp$yi
      out$haz.pop <- c(out$haz.pop,haz.pop)
      out$cum.haz.pop <- cumsum(out$haz.pop)
      out$F_P <- 1-exp(-out$cum.haz.pop)
      out$auc_pop <- sum(c(tis[1], diff(tis))*c(0, out$F_P[1:(length(out$F_P)-1)]))/365.241

    }

    data_spi2 <- data
    if(estimator=='F_P_Spi2' | estimator=="all"){

      if(any(data_spi2$start>=data_spi2$Y)) browser()
      # Take into account censoring:
      exp.surv2 <- nessie_spi(Surv(start, Y, stat)~1, data=data_spi2, ratetable=ratetable,
                              tis=tis, starting.time=starting.time, include.censoring = TRUE,
                              arg.example)
      out$haz.pop.spi2 <- exp.surv2$yidli/exp.surv2$yi
      out$cum.haz.pop.spi2 <- cumsum(out$haz.pop.spi2)
      out$F_P_Spi2 <- 1-exp(-out$cum.haz.pop.spi2)
      out$auc_pop_Spi2 <- sum(c(tis[1], diff(tis))*c(0, out$F_P_Spi2[1:(length(out$F_P_Spi2)-1)]))/365.241
    }
    if(estimator=='F_P_Spi' | estimator=="all"){
      if(TRUE){ # (!missing(admin.cens))    - tega nimamo vec
        data_spi2$stat <- 1
        # data_spi2$stat[id_admin_cens] <- 0  # - tole ni bilo zakomentirano, ko smo imeli admin.cens
        exp.surv <- nessie_spi(Surv(start, Y, stat)~1, data=data_spi2, ratetable=ratetable,
                               tis=tis, starting.time=starting.time, include.censoring = TRUE,
                               arg.example)
      } else{
        # Don't take into account censoring:
        exp.surv <- nessie_spi(Surv(start, Y, stat)~1, data=data_spi2, ratetable=ratetable,
                               tis=tis, starting.time=starting.time, include.censoring = FALSE,
                               arg.example)
      }

      out$haz.pop.spi <- exp.surv$yidli/exp.surv$yi
      out$cum.haz.pop.spi <- cumsum(out$haz.pop.spi)
      out$F_P_Spi <- 1-exp(-out$cum.haz.pop.spi)
      out$auc_pop_Spi <- sum(c(tis[1], diff(tis))*c(0, out$F_P_Spi[1:(length(out$F_P_Spi)-1)]))/365.241
    }
    if(estimator=='F_P_final'){
      # Shift all to the end:
      if(if_start_stop) data_yi[,stop_col] <- max(data_yi[,stop_col])

      rform2 <- rform
      rform <- rformulate(formula, data_yi, ratetable, na.action=na.action, rmap = rmap)

      # Shift all to the end:
      if(!if_start_stop){
        rform$Y <- rep(max(rform$Y), length(rform$Y))
        rform$data[,"Y"] <- rform$Y
      }
      data <- rform$data
      if(if_start_stop){
        if(!(start_col %in% colnames(data))){
          data[,start_col] <- data_orig[, start_col]
        }
      }

      # Check covariates:
      p <- rform$m
      if (p > 0) stop("There shouldn't be any covariates in the formula. This function gives non-parametric estimates of the hazards.")
      else data$Xs <- rep(1, nrow(data)) #if no covariates, just put 1

      out$haz.pop2 <- NULL

      kt <- 1 # the only stratum
      inx <- which(data$Xs == names(out_n)[kt]) #individuals within this stratum

      # Fix demographic covariates:
      if(if_start_stop){
        rform$R[,"year"] <- rform$R[,"year"] - rform$R[,"age"]
        rform$R[,"age"] <- 0
      }

      if(any(starting_age>=rform$Y[inx])) browser()
      temp <- exp.prep(rform$R[inx,,drop=FALSE],rform$Y[inx],rform$ratetable,rform$status[inx],times=tis,fast=FALSE, cmp=FALSE, ys=starting_age, netweiDM = TRUE)
      temp$sidliD[1] <- 0
      # temp$sisD[1] <- 1
      temp$sisD[temp$sisD==0] <- Inf
      haz.pop2 <- temp$sidliD/temp$sisD

      out$haz.pop2 <- c(out$haz.pop2, haz.pop2)
      out$cum.haz.pop2 <- cumsum(out$haz.pop2)
      out$F_P2 <- 1-exp(-out$cum.haz.pop2)
      out$auc_pop2 <- sum(c(tis[1], diff(tis))*c(0, out$F_P2[1:(length(out$F_P2)-1)]))/365.241

      out$sidli <- temp$sidli
      out$sis <- temp$sis

      # DODATEK:
      haz.pop.ves.cas <- temp$sidli
      haz.pop.ves.cas[1] <- 0
      haz.pop.ves.cas <- haz.pop.ves.cas/temp$sis
      out$cum.haz.pop.ves.cas <- cumsum(haz.pop.ves.cas)
      out$F_P_ves_cas <- 1 - exp(-out$cum.haz.pop.ves.cas)
      out$auc_pop_ves_cas <- sum(c(tis[1], diff(tis))*c(0, out$F_P_ves_cas[1:(length(out$F_P_ves_cas)-1)]))/365.241
    }
    if(estimator=='F_P2' | estimator=="all"){
      # Shift all to the end:
      if(if_start_stop) data_yi[,stop_col] <- max(data_yi[,stop_col])

      rform2 <- rform

      rform <- rformulate(formula, data_yi, ratetable, na.action=na.action, rmap = rmap)

      # Shift all to the end:
      if(!if_start_stop){
        rform$Y <- rep(max(rform$Y), length(rform$Y))
        rform$data[,"Y"] <- rform$Y
      }
      data <- rform$data
      if(if_start_stop){
        if(!(start_col %in% colnames(data))){
          data[,start_col] <- data_orig[, start_col]
        }
      }

      # Check covariates:
      p <- rform$m
      if (p > 0) stop("There shouldn't be any covariates in the formula. This function gives non-parametric estimates of the hazards.")
      else data$Xs <- rep(1, nrow(data)) #if no covariates, just put 1

      out$haz.pop2 <- NULL

      kt <- 1 # the only stratum
      inx <- which(data$Xs == names(out_n)[kt]) #individuals within this stratum

      # Fix demographic covariates:
      if(if_start_stop){
        rform$R[,"year"] <- rform$R[,"year"] - rform$R[,"age"]
        rform$R[,"age"] <- 0
      }

      if(any(starting_age>=rform$Y[inx])) browser()

      # temp <- exp.prep(rform$R[inx,,drop=FALSE],rform$Y[inx],rform$ratetable,rform$status[inx],times=tis,fast=TRUE, cmp=FALSE, ys=0)
      temp <- exp.prep(rform$R[inx,,drop=FALSE],rform$Y[inx],rform$ratetable,rform$status[inx],times=tis,fast=TRUE, cmp=FALSE, ys=starting_age)

      # Fix at-risk process, if needed:
      temp$yi[temp$yi==0] <- Inf

      # Calculate hazards:
      haz.pop2 <- temp$yidli/temp$yi

      out$haz.pop2 <- c(out$haz.pop2, haz.pop2)
      out$cum.haz.pop2 <- cumsum(out$haz.pop2)
      out$F_P2 <- 1-exp(-out$cum.haz.pop2)
      # out$auc_pop2 <- sum(c(tis[1], diff(tis))*out$F_P2)/365.241
      out$auc_pop2 <- sum(c(tis[1], diff(tis))*c(0, out$F_P2[1:(length(out$F_P2)-1)]))/365.241
    }

    ###
    # Bootstrap:
    if(bootstrap){
      # browser()
      data_b <- data_orig
      data_b$id <- 1:nrow(data_b)
      yl_boot <- ylboot(theta=ylboot.iter, data=data_b, id="id",
                        B=B, verbose=0, #all_times = all_times,
                        ratetable=ratetable#, add.times=add.times
                        , starting.time, estimator, precision,
                        add.times = add.times,
                        formula = formula,
                        rmap = rmap, measure=measure
      )
      L_OP <- yl_boot[[3]]
      F_boot <- yl_boot[[2]]
      yl_boot <- as.data.frame(t(yl_boot[[1]]))
    }
    ###
    estimator.orig <- estimator
    if(estimator=='F_P_final') estimator = 'F_P2'

    out$strata <- c(out$strata, length(tis))				#number of times in this strata
    names(out$strata) <-  names(out_n)
    out$strata <-  NULL
    out$auc <- c(auc_data=out$auc_data, auc_pop=out$auc_pop, auc_pop2=out$auc_pop2, auc_pop_Spi=out$auc_pop_Spi, auc_pop_Spi2=out$auc_pop_Spi2)

    if(estimator=='all'){
      F_P_final <- data.frame(time=out$time,F_P=out$F_P, F_P2=out$F_P2, F_P_Spi=out$F_P_Spi, F_P_Spi2=out$F_P_Spi2)
    } else if(estimator=='F_P'){
      F_P_final <- data.frame(time=tis,prob=out$F_P)
    } else if(estimator=='F_P2'){
      F_P_final <- data.frame(time=tis,prob=out$F_P2)
    } else if(estimator=='F_P_Spi'){
      F_P_final <- data.frame(time=tis,prob=out$F_P_Spi)
    } else if(estimator=='F_P_Spi2'){
      F_P_final <- data.frame(time=tis,prob=out$F_P_Spi2)
    }

    # YD through time:
    F_data_yd <- data.frame(time=mod$time, F_data=out$F_data)
    pop.times <- F_P_final$time[!(F_P_final$time %in% mod$time)]
    if(length(pop.times) > 0){
      F_data_yd_tmp <- data.frame(time=pop.times, F_data=NA)
      F_data_yd <- rbind(F_data_yd, F_data_yd_tmp)
      F_data_yd <- F_data_yd[order(F_data_yd$time),]
      F_data_yd$F_data <- mstateNAfix(F_data_yd$F_data, 0)
    }
    F_data_yd$var <- area_var

    yd_data <- cumsum(c(F_data_yd$time[1], diff(F_data_yd$time))*c(0, F_data_yd$F_data[1:(nrow(F_data_yd)-1)]))/365.241

    # Population part:
    F_P_yd <- F_P_final
    yd_pop <- cumsum(c(F_P_yd$time[1], diff(F_P_yd$time))*c(0, F_P_yd$prob[1:(nrow(F_P_yd)-1)]))/365.241
    yd_curve <- data.frame(time=F_data_yd$time, yd=yd_data - yd_pop,
                           obs_var=F_data_yd$var,
                           # obs_var22=obs_var_time22,
                           yd_data=yd_data,
                           yd_pop=yd_pop
    )
    ###
    # Greenwood for prob:
    greenwood_est <- (mod$surv^2*cumsum(mod$n.event/((mod$n.risk - mod$n.event)*mod$n.risk)))^(1/2)
    # If Surv(t)=0 in the end, take the last var estimate:
    if(any(rev(mod$surv)==0)){
      greenwood_wh <- which(mod$surv==0)
      greenwood_est[greenwood_wh] <- greenwood_est[greenwood_wh[1]-1]
    }

    F_data_tmp <- data.frame(time=mod$time,
                             prob=out$F_data,
                             prob.se=greenwood_est,
                             area=NA,
                             area.se=NA)
    # Add values at time zero:
    F_tmp <- F_data_tmp[1,]
    F_tmp$time <- min(starting_age)
    F_tmp$prob <- 0
    F_tmp$prob.se <- 0
    if(!(F_tmp$time %in% F_data_tmp$time)) F_data_tmp <- rbind(F_tmp, F_data_tmp)

    if(!if_start_stop){
      F_P_final_tmp <- F_P_final[1,]
      F_P_final_tmp$time <- min(starting_age)
      F_P_final_tmp$prob <- 0
      if(!(F_P_final_tmp$time %in% F_P_final$time)) F_P_final <- rbind(F_P_final_tmp, F_P_final)
    }

    yd_curve_tmp <- yd_curve[1,]
    yd_curve_tmp$time <- min(starting_age)
    yd_curve_tmp[,2:ncol(yd_curve_tmp)] <- 0
    if(!(yd_curve_tmp$time %in% yd_curve$time)) yd_curve <- rbind(yd_curve_tmp, yd_curve)

    # Bootstrap:
    if(bootstrap){
      yd_curve$boot_var <- colVars(yl_boot, na.rm=TRUE)
      if(late.values){
        last_val <- utils::tail(yd_curve$boot_var[!is.na(yd_curve$boot_var)],1)
        yd_curve$boot_var[is.na(yd_curve$boot_var)] <- last_val
      }
      yl_sd_boot <- stats::sd(yl_boot[, ncol(yl_boot)], na.rm=TRUE)
    }

    # Add areas:
    F_data_tmp$area <- yd_curve$yd_data[yd_curve$time %in% F_data_tmp$time]
    F_P_final$area <- yd_curve$yd_pop#[yd_curve$time %in% F_P_final$time]
    F_data_tmp$area.se <- yd_curve$obs_var[yd_curve$time %in% F_data_tmp$time]^(1/2)

    # If, add boot variance:
    if(bootstrap & (!is.boot)){
      F_data_tmp$prob.se <- (F_boot$F_data[F_boot$time %in% F_data_tmp$time])^(1/2)
      F_P_final$prob.se <- (F_boot$F_P#[F_boot$time %in% F_P_final$time]
      )^(1/2)
      F_data_tmp$area.se <- L_OP$L_O[L_OP$time %in% F_data_tmp$time]^(1/2)
      F_P_final$area.se <- L_OP$L_P^(1/2)
    }

    # Column order:
    F_data_tmp <- F_data_tmp[, c('time', 'prob', 'area', 'prob.se', 'area.se')]

    # Choose relevant columns:
    if(bootstrap){
      yd_curve <- yd_curve[,c('time', 'yd', 'boot_var')]
    } else{
      yd_curve <- yd_curve[,c('time', 'yd', 'obs_var')]
    }
    yd_curve[,3] <- yd_curve[,3]^(1/2)
    colnames(yd_curve)[2:3] <- c('est', 'est.se')
    yd_curve$lower <- yd_curve$est - yd_curve$est.se*stats::qnorm(0.5+conf.int/2)
    yd_curve$upper <- yd_curve$est + yd_curve$est.se*stats::qnorm(0.5+conf.int/2)

    return_obj <- list(F_data=F_data_tmp,
                       F_P=F_P_final,
                       auc=out$auc,
                       yd_curve=yd_curve,
                       starting.time=starting.time,
                       estimator=estimator.orig,
                       out=out
    )

    if(bootstrap){
      return_obj[[length(return_obj)+1]] <- F_boot
      names(return_obj)[length(return_obj)] <- 'F_boot'
      return_obj[[length(return_obj)+1]] <- L_OP
      names(return_obj)[length(return_obj)] <- 'L_OP'
      return_obj <- append(return_obj, yl_sd_boot)
      names(return_obj)[length(return_obj)] <- 'yl_sd_boot'
    }

    return_short <- list(years=return_obj$yd_curve, F_O=return_obj$F_data, F_P=return_obj$F_P, measure=measure)

    if((bootstrap & (!is.boot)) #| ((!bootstrap) & (!is.boot))
    ){
      return_obj <- return_short
    }
    if((!bootstrap) & (!is.boot)){
      return_obj <- return_short
    }
    if(is.boot){
      return_obj <- return_short
    }

    if(var.estimator=='none'){
      return_obj$years <- return_obj$years[,1:2]
      find_cols <- (!grepl('.se', colnames(return_obj[[2]])))
      return_obj[[2]] <- return_obj[[2]][,find_cols]
      if(length(return_obj)==4){
        find_cols <- (!grepl('.se', colnames(return_obj[[3]])))
        return_obj[[3]] <- return_obj[[3]][,find_cols]
      }
    }

    return(return_obj)
  }
}

# Register the column names used via non-standard evaluation inside
# ggplot2::aes() calls below, so R CMD check does not flag them as
# undefined global variables.
utils::globalVariables(c("time", "prob", "Curve", "est", "lower", "upper"))

# Bootstrap function:
# Bootstrap driver for the years() measures. Resamples individuals (by `id`)
# B times and re-estimates the measure on each bootstrap sample.
#
# Note: `theta` is accepted in the signature but never used in the body; the
# per-replicate computation is done by ylboot.iter/ylboot.apply directly.
#
# Returns, depending on `measure`:
#   'yl2013' : list(res, F_E)           - replicate estimates and F_E draws
#   'yl2017' : list(res, Fo, Fp, times) - replicate estimates and both curves
#   otherwise: list(res, F_obj, L_OP)   - replicate estimates plus the
#              bootstrap VARIANCES (via colVars) of the F and area curves
#              at every time point
ylboot <- function(theta, data, B = 5, id = "id", verbose = 0,
                   #all_times,
                   ratetable=relsurv::slopop, #add.times,
                   starting.time, estimator, precision,
                   add.times,
                   formula,
                   rmap, measure,
                   ...){
  ids <- unique(data[, id])
  n <- length(ids)
  # First (non-bootstrap) evaluation on the original data; its time grid
  # defines the layout of all replicate results below.
  if(!missing(add.times)){
    th <- ylboot.iter(formula, data, starting.time = starting.time, estimator = estimator, precision = precision,
                      ratetable=ratetable, first=TRUE, add.times = add.times, rmap = rmap, measure=measure, ...)
  } else{
    th <- ylboot.iter(formula, data, starting.time = starting.time, estimator = estimator, precision = precision,
                      ratetable=ratetable, first=TRUE, rmap = rmap, measure=measure, ...)
  }

  # simple_par records whether user-supplied add.times were given:
  simple_par <- TRUE
  if(missing(add.times)) simple_par <- FALSE

  # Prepare objects:
  res <- data.frame(matrix(NA, nrow=B, ncol=nrow(th[[1]])))
  # Full evaluation time grid (original grid merged with any add.times):
  if(!missing(add.times)){
    add.times <- sort(unique(c(th[[1]]$time, add.times)))
  } else{
    add.times <- th[[1]]$time
  }

  # One row per replicate, one column per time point:
  Fdata <- data.frame(matrix(NA, nrow=B, ncol=length(add.times)))
  Fo <- data.frame(matrix(NA, nrow=B, ncol=nrow(th[[2]])))
  Fp <- data.frame(matrix(NA, nrow=B, ncol=length(add.times)))
  L_O <- data.frame(matrix(NA, nrow=B, ncol=length(add.times)))
  L_P <- data.frame(matrix(NA, nrow=B, ncol=length(add.times)))
  F_E <- data.frame(matrix(NA, nrow=B, ncol=length(add.times)))

  # Iteration:
  for (b in 1:B) {
    nek_obj <- ylboot.apply(formula, b, verbose, ids, data, id, add.times, starting.time, estimator, precision, ratetable, th, simple_par, rmap, measure, ...)
    res[b,1:length(nek_obj[[1]])] <- nek_obj[[1]]
    if(measure=='yl2013'){
      F_E[b,1:length(nek_obj[[2]])] <- nek_obj[[2]]
    }
    if(measure=='yl2017'){
      Fo[b,1:length(nek_obj[[2]])] <- nek_obj[[2]]
      Fp[b,1:length(nek_obj[[3]])] <- nek_obj[[3]]
    }
    if(measure=='yd'){
      # Keep only the rows on the common time grid:
      subnek <- subset(nek_obj[[2]], time %in% add.times)

      sub_vec <- 1:nrow(subnek)
      Fdata[b,sub_vec] <- subnek$F_data
      Fp[b,sub_vec] <- subnek$F_P

      subnek2 <- subset(nek_obj[[3]], time %in% add.times)

      sub2_vec <- 1:nrow(subnek2)
      L_O[b,sub2_vec] <- subnek2$yd_data
      L_P[b,sub2_vec] <- subnek2$yd_pop
    }
  }
  # Transpose so that rows are time points and columns are replicates:
  res <- as.data.frame(t(res))
  if(measure == 'yl2013'){
    return(list(res, F_E))
  }
  if(measure == 'yl2017'){
    return(list(res, Fo, Fp, add.times))
  }
  else{
    # Remaining measures ('yd'; yl2013/yl2017 returned above):
    if (verbose)
      cat("\n")

    # Bootstrap variances of the curves at every time point:
    F_obj <- data.frame(time=add.times,
                        F_data=colVars(Fdata, na.rm = TRUE),
                        F_P=colVars(Fp, na.rm = TRUE))
    L_OP <- data.frame(time=add.times,
                       L_O=colVars(L_O, na.rm = TRUE),
                       L_P=colVars(L_P, na.rm = TRUE))

    return(list(res, F_obj, L_OP))
  }
}

# One bootstrap replication: draw a resample of individuals, re-estimate the
# measure on it, and align the resulting curves with the time grid of the
# original (non-bootstrap) estimate `th` so the replicate can be stored
# row-wise by ylboot().
#
# Arguments mirror ylboot(); `th` is the first evaluation on the original
# data, `simple_par` indicates whether user-supplied add.times were given.
#
# Returns, depending on `measure`:
#   'yl2013' : list(est, prob)               - estimate and F_E curve values
#   'yl2017' : list(est, FoO$area, FpP$area) - estimate and curve areas
#   otherwise: list(est, Fobj, L_OP)         - estimate plus curve data frames
#
# Fixes vs. the original implementation:
#   * a bare `next` on a degenerate sample was an error outside of a loop
#     ("no loop for break/next"); the sample is now redrawn instead;
#   * two references to the undefined object `foO` (case typo for `FoO`)
#     in the yl2017 branch, which raised "object 'foO' not found".
ylboot.apply <- function(formula, b, verbose, ids, data, id, add.times, starting.time, estimator, precision, ratetable, th, simple_par,
                         rmap, measure,
                         ...){

  # Column names of the survival times, read off the Surv() call in formula:
  if(starting.time=='left.truncated'){
    start_col <- as.character(formula[[2]])[2]
    stop_col <- as.character(formula[[2]])[3]
  } else{
    stop_col <- as.character(formula[[2]])[2]
  }

  if (verbose > 0) {
    cat("\nBootstrap replication", b, "\n")
  }
  # Draw individuals with replacement and build the bootstrap data set:
  bootdata <- NULL
  bids <- sample(ids, replace = TRUE)
  bidxs <- unlist(sapply(bids, function(x) which(x ==
                                                   data[, id])))
  bootdata <- data[bidxs, ]
  if (verbose > 0) {
    cat("applying theta ...")
  }

  # A sample consisting of a single individual is degenerate; redraw until
  # at least two distinct individuals are present (the original code used
  # `next` here, which errors outside of a loop).
  if (length(unique(bootdata[, id])) == 1) {
    if (length(ids) == 1) {
      stop("Bootstrap resampling requires more than one unique id in the data.",
           call. = FALSE)
    }
    repeat {
      bids <- sample(ids, replace = TRUE)
      bidxs <- unlist(sapply(bids, function(x) which(x == data[, id])))
      bootdata <- data[bidxs, ]
      if (length(unique(bootdata[, id])) > 1) break
    }
  }

  # Evaluation times for this replicate:
  if(!missing(add.times) && simple_par){
    add.times.arg <- sort(unique(c(th[[1]]$time, add.times)))
  } else{
    add.times.arg <- th[[1]]$time
  }
  add.times.arg2 <- add.times.arg
  # Remove unnecessary times (beyond the maximum follow-up in this sample):
  if(starting.time == 'left.truncated'){
    add.times.arg <- add.times.arg[add.times.arg<=max(bootdata[,stop_col])]
  } else{
    add.times.arg <- add.times.arg[add.times.arg<=max(bootdata[,stop_col])]# - bootdata[,start_col])]
  }

  # Re-estimate the measure on the bootstrap sample:
  thstar <- ylboot.iter(formula, bootdata, starting.time = starting.time, estimator = estimator, precision = precision,
                        ratetable=ratetable, add.times=add.times.arg, rmap=rmap, measure=measure, ...)
  if(measure == 'yl2013'){
    return(list(thstar[[1]]$est, thstar[[2]]$prob))
  }
  if(measure == 'yl2017'){
    FoO <- thstar[[2]]
    FpP <- thstar[[3]]
    thstar <- thstar[[1]]
    # if(nrow(th[[1]]) != nrow(thstar)) browser()

    # If the replicate's observed curve has fewer time points than the
    # original, add the missing times and carry the last value forward:
    if(nrow(FoO) < nrow(th[[2]])){
      mis.tajms <- th[[2]]$time[!(th[[2]]$time %in% FoO$time)]
      mis.tajms <- mis.tajms[mis.tajms <= max(FoO$time)]
      temp_df <- data.frame(time=mis.tajms, area=NA)
      FoO <- rbind(FoO, temp_df)
      FoO <- FoO[order(FoO$time),]
      FoO$area <- mstateNAfix(FoO$area, 0)
    }

    # If the replicate has extra time points, keep only the original grid
    # (fixed: the original referenced the undefined object `foO` here):
    if(nrow(th[[1]]) < nrow(thstar)){
      thstar <- thstar[thstar$time %in% th[[1]]$time, ]
      FpP <- FpP[FpP$time %in% th[[1]]$time, ]
      FoO <- FoO[FoO$time %in% th[[1]]$time, ]
    }

    # Debug sentinel: the grids should agree up to the replicate's last time.
    if(length(th[[1]]$time[th[[1]]$time <= max(thstar$time)]) != length(thstar$time)) browser()
    pogoj <- any(th[[1]]$time[th[[1]]$time <= max(thstar$time)] != thstar$time)
    if(pogoj){
      missing_times <- th[[1]]$time[which(!(th[[1]]$time %in% thstar$time))]

      if(length(missing_times)>0){
        # There are times missing in thstar, add them:
        add_df <- thstar[1:length(missing_times),]
        add_df$time <- missing_times
        add_df$yd <- NA
        add_df$obs_var <- NA
        add_df$yd_data <- NA

        thstar <- rbind(thstar, add_df)
        thstar <- thstar[order(thstar$time),] # redundant, kept for safety

        # Carry values forward over the inserted times:
        thstar$yd <- mstateNAfix(thstar$yd, 0)
        thstar$obs_var <- mstateNAfix(thstar$obs_var, 0)
        thstar$yd_data <- mstateNAfix(thstar$yd_data, 0)

        if(nrow(th[[1]]) < nrow(thstar)){
          thstar <- thstar[thstar$time %in% th[[1]]$time, ]
        }

        if(nrow(th[[1]]) != nrow(thstar)) browser()
      } else{
        # This means there's more times in thstar than needed. Remove
        # unnecessary times (fixed: the original referenced the undefined
        # object `foO` here):
        thstar <- thstar[-which(!(thstar$time %in% th[[1]]$time)),]
        FpP <- FpP[-which(!(FpP$time %in% th[[1]]$time)),]
        FoO <- FoO[-which(!(FoO$time %in% th[[1]]$time)),]

        if(nrow(th[[1]]) != nrow(thstar)) browser()
      }
    }

    return(list(thstar$est, FoO$area, FpP$area))
  }

  # Remaining measures ('yd'):
  L_OP <- thstar[[3]]
  Fobj <- thstar[[2]]
  thstar <- thstar[[1]]

  # If the replicate has extra time points, keep only the original grid:
  if(nrow(th[[1]]) < nrow(thstar)){
    thstar <- thstar[thstar$time %in% th[[1]]$time, ]
    L_OP <- L_OP[L_OP$time %in% th[[1]]$time, ]
    Fobj <- Fobj[Fobj$time %in% th[[1]]$time, ]
  }

  # Check whether any values are missing:
  if(length(th[[1]]$time[th[[1]]$time <= max(thstar$time)]) != length(thstar$time)) browser()
  pogoj <- any(th[[1]]$time[th[[1]]$time <= max(thstar$time)] != thstar$time)
  if(pogoj){

    missing_times <- th[[1]]$time[which(!(th[[1]]$time %in% thstar$time))]

    if(length(missing_times)>0){
      # There are times missing in thstar, add them:
      add_df <- thstar[1:length(missing_times),]
      add_df$time <- missing_times
      add_df$yd <- NA
      add_df$obs_var <- NA
      add_df$yd_data <- NA

      thstar <- rbind(thstar, add_df)
      thstar <- thstar[order(thstar$time),] # redundant, kept for safety

      # Carry values forward over the inserted times:
      thstar$yd <- mstateNAfix(thstar$yd, 0)
      thstar$obs_var <- mstateNAfix(thstar$obs_var, 0)
      thstar$yd_data <- mstateNAfix(thstar$yd_data, 0)

      if(nrow(th[[1]]) < nrow(thstar)){
        thstar <- thstar[thstar$time %in% th[[1]]$time, ]
      }

      if(nrow(th[[1]]) != nrow(thstar)) browser()
    } else{
      # This means there's more times in thstar than needed. Remove unnecessary times:
      thstar <- thstar[-which(!(thstar$time %in% th[[1]]$time)),]
      L_OP <- L_OP[-which(!(L_OP$time %in% th[[1]]$time)),]
      Fobj <- Fobj[-which(!(Fobj$time %in% th[[1]]$time)),]

      if(nrow(th[[1]]) != nrow(thstar)) browser()
    }
  }

  # Return the replicate estimate plus the aligned curve data frames:
  list(thstar$est, Fobj, L_OP)
}

# One evaluation of the years() measure, used both for the initial estimate
# on the original data (first = TRUE) and for every bootstrap replicate
# (first = FALSE). Arguments are forwarded to years(); `rmap` arrives
# unevaluated and is converted back to a call before being passed on.
#
# Returns, depending on `measure`:
#   'yd', first = TRUE  : list(years, F_O)
#   'yd', first = FALSE : list(years, Fobj, L_OP) - the observed/population
#         probability curves and area curves merged on the population grid
#   'yl2013'            : list(years, F_E)
#   otherwise ('yl2017'): list(years, F_O, F_P)
#
# Change vs. the original: removed an unused local constant
# (`tolerance <- 1e-15`, dead code) and simplified the first/is.boot flags.
ylboot.iter <- function(formula, data, #all_times,
                        starting.time, estimator, precision,
                        ratetable=relsurv::slopop,
                        first=FALSE, add.times,
                        rmap, measure
){
  if(!missing(rmap)) rmap <- as.call(rmap)

  # The first call is the estimate on the original data; all later calls
  # are bootstrap replications:
  is.boot <- !first
  first.boot <- first

  if(missing(add.times)){
    object <- years(formula = formula, data = data, ratetable = ratetable,
                    precision=precision, var.estimator='greenwood', is.boot=is.boot, first.boot = first.boot, rmap = rmap, measure=measure)
  } else{
    object <- years(formula = formula, data = data, ratetable = ratetable,
                    precision=precision, var.estimator='greenwood', add.times=add.times, is.boot=is.boot, first.boot = first.boot, rmap = rmap, measure=measure)
  }

  if(measure=='yd'){
    if(first) return(list(object$years, object$F_O))

    # Merge observed and population curves on the population time grid:
    Fobj <- merge(object$F_P[,c('time','prob')], object$F_O[,c('time','prob')], by='time', all.x=TRUE)
    Fobj <- Fobj[,c(1,3,2)]
    colnames(Fobj)[2:3] <- c('F_data','F_P')

    L_OP <- merge(object$F_P[,c('time','area')], object$F_O[,c('time','area')], by='time', all.x = TRUE)
    L_OP <- L_OP[,c(1,3,2)]
    colnames(L_OP)[2:3] <- c('yd_data', 'yd_pop')

    return(list(object$years, Fobj, L_OP))
  } else if(measure=='yl2013'){
    return(list(object$years, object$F_E))
  } else{
    return(list(object$years, object$F_O, object$F_P))
  }
}

# Build the polygon coordinates for shading under a step curve.
#
# Takes the `obj` component (e.g. 'F_O' or 'F_P') of a years() result,
# converts time from days to years (365.241 days/year), and pads the curve
# with a zero-probability point at both ends so the shaded ribbon closes
# on the x axis.
plot.helper <- function(years, obj) {
  curve <- years[[obj]]
  shaded <- data.frame(time = curve$time / 365.241,
                       prob = curve$prob)

  # Duplicate the first and last points with prob = 0:
  first_pt <- shaded[1, ]
  first_pt$prob <- 0
  last_pt <- shaded[nrow(shaded), ]
  last_pt$prob <- 0

  rbind(first_pt, shaded, last_pt)
}

# Reproduce ggplot2's default discrete colour palette: n equally spaced
# hues around the HCL colour wheel at luminance 65 and chroma 100.
gg_color_hue <- function(n) {
  # n + 1 angles are generated because the first and last hue coincide
  # (15 and 375 degrees); only the first n are kept.
  hue_angles <- seq(15, 375, length.out = n + 1)
  palette <- grDevices::hcl(h = hue_angles, l = 65, c = 100)
  palette[1:n]
}

#' Plot the absolute risk (observed and population curve)
#'
#' Plots the estimated observed and population curve for the
#' life years difference (Manevski, Ruzic Gorenjec, Andersen, Pohar Perme, 2022).
#'
#' A ggplot2 implementation for plotting the observed and population curves. The type of curves is
#' dependent upon the measure calculated using the \code{years} function (argument \code{measure}).
#'
#' @param years the object obtained using function \code{years}.
#' @param xlab a title for the x axis.
#' @param ylab a title for the y axis.
#' @param xbreak the breaks on the x axis (this is supplied to \code{scale_x_continuous}).
#' @param ybreak the breaks on the y axis (this is supplied to \code{scale_y_continuous}).
#' @param xlimits define the limits on the x axis (this is supplied to \code{scale_x_continuous}).
#' @param ylimits define the limits on the y axis (this is supplied to \code{scale_y_continuous}).
#' @param show.legend if TRUE, the legend is shown on the graph.
#' @return A ggplot object
#' @seealso \code{\link{years}}, \code{\link{plot_years}}
#'
plot_f <- function(years, xlab='Time interval', ylab='Absolute risk', xbreak, ybreak, xlimits, ylimits, show.legend=TRUE){
  # years: object given from the years() function
  # xlab: define xlab
  # ylab: define ylab
  # xbreak: The breaks on x axis
  # ybreak: The breaks on y axis
  # xlimits: Define the limits on the x axis
  # ylimits: Define the limits on the y axis
  # show.legend: TRUE by default (shows the legend)

  # Checks: only the 'yd' measure carries the F_O/F_P curves plotted here.
  if(years$measure != 'yd'){
    stop("The plot_f function is available only for the YD measure (argument measure='yd' in the years function).")
  }

  # Stack both curves in long format so one colour scale covers both:
  out <- rbind(
    cbind(years$F_O[,c('time', 'prob')], Curve='Observed'),
    cbind(years$F_P[,c('time', 'prob')], Curve='Population')
  )

  # Default axis limits; time is converted from days to years (365.241 days/year):
  if(missing(xlimits)){
    xlimits <- c(min(out$time), max(out$time))/365.241
  }
  if(missing(ylimits)){
    ylimits <- c(0,max(out$prob))*1.1
  }

  # Use the first and third of the default ggplot hues for the two curves:
  colorji <- gg_color_hue(3)
  colorji <- colorji[c(1,3)]

  g <- ggplot2::ggplot(out)+
    ggplot2::geom_step(ggplot2::aes(time/365.241, prob, color=Curve)#, size=1.001
    )+
    ggplot2::scale_color_manual(values=colorji)+
    ggplot2::xlab(xlab)+
    ggplot2::ylab(ylab)

  # NOTE(review): poly_data and poly_P are computed but not used below —
  # geom_stepribbon inherits `out` from the ggplot object. Confirm whether
  # these calls can be removed.
  poly_data <- plot.helper(years, 'F_O')
  poly_P <- plot.helper(years, 'F_P')

  # Shade the area under each step curve:
  g <- g+
    pammtools::geom_stepribbon(ggplot2::aes(x=time/365.241, ymin=0, ymax=prob, fill=Curve), alpha=0.3, linetype='dashed')+
    ggplot2::scale_fill_manual(values = colorji)

  if(!missing(xbreak)){
    g <- g +
      ggplot2::scale_x_continuous(expand = c(0, 0), limits=xlimits, breaks = xbreak)
  } else{
    g <- g +
      ggplot2::scale_x_continuous(expand = c(0, 0), limits=xlimits)
  }

  if(!missing(ybreak)){
    g <- g +
      ggplot2::scale_y_continuous(expand = c(0, 0), limits=ylimits, breaks = ybreak)
  } else{
    g <- g +
      ggplot2::scale_y_continuous(expand = c(0, 0), limits=ylimits)
  }

  # Theme: legend at the bottom, dashed grey grid lines:
  g <- g +
    ggplot2::theme_bw()+
    ggplot2::theme(legend.position = 'bottom',
                   legend.title = ggplot2::element_blank())+
    ggplot2::theme(text = ggplot2::element_text(size=14))+
    ggplot2::theme(
      panel.grid.major.x = ggplot2::element_line(linetype='dashed', colour = 'grey85'),
      panel.grid.minor.x = ggplot2::element_line(linetype='dashed', colour = 'grey85'),
      panel.grid.major.y = ggplot2::element_line(linetype='dashed', colour = 'grey85'),
      panel.grid.minor.y = ggplot2::element_line(linetype='dashed', colour = 'grey85'))

  if(!show.legend){
    g <- g +
      ggplot2::theme(legend.position = 'none')
  }

  g
}

#' Plot the years measure
#'
#' Plot the years measure obtained from the \code{years} function.
#'
#' A ggplot2 implementation for plotting the years measure. The type of curve is
#' dependent upon the measure calculated using the \code{years} function (argument \code{measure}).
#'
#' @param years the object obtained using function \code{years}.
#' @param xlab a title for the x axis.
#' @param ylab a title for the y axis.
#' @param xbreak the breaks on the x axis (this is supplied to \code{scale_x_continuous}).
#' @param ybreak the breaks on the y axis (this is supplied to \code{scale_y_continuous}).
#' @param xlimits define the limits on the x axis (this is supplied to \code{scale_x_continuous}).
#' @param ylimits define the limits on the y axis (this is supplied to \code{scale_y_continuous}).
#' @param conf.int if TRUE, the confidence interval is plotted.
#' @param ymirror mirror the y values (w.r.t. the x axis).
#' @param yminus use function y -> -y when plotting.
#' @return A ggplot object
#' @seealso \code{\link{years}}, \code{\link{plot_f}}
#'
plot_years <- function(years, xlab='Time interval', ylab='Years', xbreak, ybreak, xlimits, ylimits, conf.int=FALSE, ymirror=FALSE, yminus=FALSE){

  out <- years$years

  # Confidence intervals require lower/upper columns produced by years():
  if(conf.int){
    if(is.null(out$lower)){
      stop('Confidence intervals not present in the years object. Please set conf.int=FALSE or use the var.estimator argument in the years function.')
    }
  }

  # For yl2017 with a single row there is no curve to draw:
  if(years$measure=='yl2017' & nrow(out)==1){
    stop('The years measure is reported at the end of follow-up thus it is not plotted.')
  }

  # Negate the estimate; the CI bounds are negated AND swapped so that
  # lower stays below upper:
  if(yminus){
    out$est <- -out$est
    if(!is.null(out$lower)){
      tmp_lower <- out$lower
      out$lower <- -out$upper
      out$upper <- -tmp_lower
    }
  }

  # Default limits; time is converted from days to years (365.241 days/year):
  if(missing(xlimits)){
    xlimits <- c(min(out$time[1]), max(out$time))/365.241
  }
  if(missing(ylimits)){
    # Include the CI bounds in the y range when present:
    tmp_vec <- out$est
    if(!is.null(out$lower)) tmp_vec <- c(out$est, out$lower, out$upper)
    ymax <- max(tmp_vec)
    ymin <- min(tmp_vec)
    ylimits <- c(ymin,ymax)*1.1
  }

  g <- ggplot2::ggplot(out)+
    ggplot2::geom_step(ggplot2::aes(time/365.241, est)#, size=1.001
                       )

  if(conf.int){
    g <- g+
      ggplot2::geom_step(ggplot2::aes(time/365.241, lower), linetype='dashed')+
      ggplot2::geom_step(ggplot2::aes(time/365.241, upper), linetype='dashed')
  }

  g <- g+
    ggplot2::xlab(xlab)+
    ggplot2::ylab(ylab)

  if(!missing(xbreak)){
    g <- g+
      ggplot2::scale_x_continuous(expand = c(0, 0), limits=xlimits, breaks = xbreak)
  } else{
    g <- g+
      ggplot2::scale_x_continuous(expand = c(0, 0), limits=xlimits)
  }

  # Helper: a y-axis transformation that flips values around the x axis,
  # used below when ymirror = TRUE:
  trans <- function(x) -x
  inv <- function(x) -x
  reverse_fun <- scales::trans_new(name = "reverse_new",
                                   transform = trans,
                                   inverse = inv
  )

  if(!missing(ybreak)){
    g <- g +
      ggplot2::scale_y_continuous(expand = c(0, 0), limits = ylimits, breaks = ybreak)
  } else{
    g <- g +
      ggplot2::scale_y_continuous(expand = c(0, 0), limits = ylimits)
  }
  if(ymirror){
    g <- g +
      ggplot2::coord_trans(y=reverse_fun)
  }

  # Theme: dashed grey grid lines, y axis anchored at 0:
  g <- g +
    ggplot2::theme_bw()+
    ggplot2::theme(text = ggplot2::element_text(size=14))+
    ggplot2::expand_limits(y = 0)+
    ggplot2::theme(
      panel.grid.major.x = ggplot2::element_line(linetype='dashed', colour = 'grey85'),
      panel.grid.minor.x = ggplot2::element_line(linetype='dashed', colour = 'grey85'),
      panel.grid.major.y = ggplot2::element_line(linetype='dashed', colour = 'grey85'),
      panel.grid.minor.y = ggplot2::element_line(linetype='dashed', colour = 'grey85'))

  g

}

# NOTE(review): the lines below are website residue (rdrr.io package page
# footer), not R source; commented out so the file parses as valid R.
# Try the relsurv package in your browser
# Any scripts or data that you put into this service are public.
# relsurv documentation built on Dec. 28, 2022, 2:25 a.m.