inst/doc/Tutorial_toy_data.R

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Prepare_data <- function(dat, population_incidence, breaks){

  ### Input:

  #' @param dat Data.frame with one row per individual that at least includes
  #' a column **id** with a unique identifier, a column **d** with the event
  #' indicator (1 for event, 0 for censored), and a column **y** with the
  #' event/censoring time.
  #' @param population_incidence A vector (in combination with breaks) or
  #' a data.frame with three columns: 1) start of age group, 2) end of age group,
  #' 3) population incidence per 100,000 in interval k.
  #' @param breaks Cut-points for the (age/time) groups. Only needed when
  #' population_incidence is a vector.

  ### Output:

  #' @return Data.frame with one row per individual and, among other columns: *id* unique ID;
  #' *d* non-censoring indicator; **k** interval of (age) group; **S_k**
  #' population interval-based proportion of individuals experiencing the
  #' event in intervals later than k; **S_k.** sample
  #' proportion of individuals experiencing the event in intervals later
  #' than k.

  # --------------- Load packages.

#  require(survival)
#  require(dplyr)
#  require(tidyr)

  # --------------- Organize external information (population incidence rate).
  # N.B.: there are different options regarding the breaks (1-4).

  if(is.vector(population_incidence)){

    if(is.numeric(breaks)){ # Option 1: two vectors population_incidence and breaks.
      if(length(breaks) != (length(population_incidence) + 1))
        warning("Number of breaks should equal the number of groups plus one.")
      n_agegroups <- length(population_incidence)
      from <- breaks[-(n_agegroups + 1)]  # Remove last
      to <- breaks[-1] # Remove first
      dat_inc <- data.frame(start = from, end = to, incidence_rate =
                              population_incidence/100000)

    } else if(breaks == "5 years"){ # Option 2: 5 years age groups.
      n_agegroups <- length(population_incidence)
      breaks <- seq(0, 5*(n_agegroups), by = 5)
      from <- breaks[-(n_agegroups + 1)]  # Remove last
      to <- breaks[-1] # Remove first
      dat_inc <- data.frame(start = from, end = to, incidence_rate =
                              population_incidence/100000)

    } else if(breaks == "10 years"){ # Option 3: 10 years age groups.
      n_agegroups <- length(population_incidence)
      breaks <- seq(0, 10*(n_agegroups), by = 10)
      from <- breaks[-(n_agegroups + 1)]  # Remove last
      to <- breaks[-1] # Remove first
      dat_inc <- data.frame(start = from, end = to, incidence_rate =
                              population_incidence/100000)
    }

  } else { # Option 4: if population incidence is given as a data.frame.
    dat_inc <- data.frame(start = population_incidence[,1],
                          end = population_incidence[,2],
                          incidence_rate = population_incidence[,3]/100000)
    breaks <- population_incidence[,1]
  }

  dat_inc <- dat_inc %>% mutate( k = paste("(", start, ",", end, "]", sep=""),
                                 t = end - start)
  # t is the width of the age interval, in years.
  # Note that in the paper, this is referred to
  # as (alpha_k - alpha_{k-1}).

  # --------------- Calculate S_k population from population incidence.

  dat_inc$S_k <- c(exp(-(dat_inc$t * dat_inc$incidence_rate)))
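  # Under a constant hazard within the interval, the probability of remaining
  # event-free across an interval of width t is exp(-incidence_rate * t).
  # For example, a rate of 1,000 per 100,000 over a 10-year interval gives
  # exp(-0.01 * 10) = exp(-0.1), approximately 0.905.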

  # --------------- Calculate S_k. based on sample data.

  dat$y_cat <- cut(dat$y, breaks = breaks)
  index <- which(is.na(dat$y_cat))
  if(length(index) > 0){
    dat <- dat[-index, ]  # Include only those that fall within an age category.
  }

  # Store number of cases and unaffected individuals per age group.
  agegroups.info <- aggregate(id ~ y_cat + d, data = dat, length)

  agegroups.info <- agegroups.info %>% tidyr::spread(d, -y_cat)
  colnames(agegroups.info) <- c("k","s_k","r_k")

  # Transform NA to 0 (its real meaning).
  agegroups.info$s_k <- ifelse(!is.na(agegroups.info$s_k), agegroups.info$s_k, 0)
  agegroups.info$r_k <- ifelse(!is.na(agegroups.info$r_k), agegroups.info$r_k, 0)

  # Calculate person years per outcome and age group.
  personyears <- aggregate(y ~ y_cat + d, data =
                             dat[,c("d","y_cat", "y"),], sum) %>%
    tidyr::spread(d, -y_cat)
  colnames(personyears) <- c("k","q_k","p_k")
  agegroups.info <- merge(agegroups.info, personyears, by = "k")  # Merge.

  agegroups.info$q_k <- ifelse(!is.na(agegroups.info$q_k),
                               agegroups.info$q_k, 0)
  agegroups.info$p_k <- ifelse(!is.na(agegroups.info$p_k),
                               agegroups.info$p_k, 0)

  agegroups.info$years.before <- breaks[which(breaks != max(breaks))]

  # Person years among censored individuals.
  q <- agegroups.info$q_k - (agegroups.info$years.before * agegroups.info$s_k)

  # Person years among cases.
  p <- agegroups.info$p_k - (agegroups.info$years.before * agegroups.info$r_k)

  # Years per age group.
  # N.B. in the paper referred to as (alpha_k - alpha_{k-1}).
  t <- agegroups.info$t <- dat_inc$t

  # Number of censored.
  s <- agegroups.info$s_k

  # Number of cases.
  r <- agegroups.info$r_k

  # Age group label.
  k <- agegroups.info$k

  # Obtain incidence rate (mu) based on sample data.
  n_agegroups <- nrow(agegroups.info)
  for (k in 1:n_agegroups){
    if(k != n_agegroups){
      calcsum <- sum(agegroups.info$r_k[(k+1):n_agegroups],
                     agegroups.info$s_k[(k+1):n_agegroups])
    } else {
      calcsum <- 0
    }
    agegroups.info$mu_k[k] <- agegroups.info$r_k[k] /
      (agegroups.info$p_k[k] + agegroups.info$q_k[k] + agegroups.info$t[k]*calcsum)
  }
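  # As implemented above, the sample incidence rate for group k is
  #   mu_k = r_k / (p_k + q_k + t_k * sum_{j > k} (r_j + s_j)),
  # where p_k and q_k are the summed event/censoring times of the cases and the
  # censored individuals observed in group k, and the last term adds a full
  # interval width for every individual whose event/censoring time falls in a
  # later group.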

  # Recalculate S_k. (in the sample!) based on the intervals.
  S_k. <- exp(-(agegroups.info$mu_k * t))

  agegroups.info$S_k. <- S_k.
  agegroups.info$S_k <- dat_inc$S_k

  # --------------- Add S_k. to the sample data.

  dat_out <- merge(dat, agegroups.info, by.x = "y_cat", by.y = "k") %>%
    mutate(k = y_cat) %>%
    mutate(population_incidence = mu_k)

  dat_out$years.before <- NULL; dat_out$t <- NULL;
  dat_out$y_cat <- NULL; dat_out$mu_k <- NULL;
  dat_out$s_k <- dat_out$r_k <- dat_out$p_k <- dat_out$q_k <- NULL

  # --------------- Output the data.frame.

  dat_out

}
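# A minimal, hypothetical sketch of the expected input for Prepare_data()
# (made-up numbers; assumes dplyr and tidyr are attached, as done further below).
# Note that the bookkeeping above expects every (age) group to be represented in
# the sample, so the sketch places one individual in each 10-year group.
# Uncomment to try:
# toy_dat <- data.frame(id = 1:10,
#                       d  = rep(c(1, 0), 5),      # event indicator
#                       y  = seq(5, 95, by = 10))  # event/censoring age
# toy_inc <- c(50, 120, 300, 600, 900, 1100, 1300, 1200, 1000, 900)  # per 100,000
# Prepare_data(dat = toy_dat, population_incidence = toy_inc, breaks = "10 years")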
#' Calculate inverse probability of selection weights.
#'
#' @description
#' This function calculates weights to correct for ascertainment bias in
#' time-to-event data where clusters are outcome-dependently sampled,
#' for example high-risk families in genetic epidemiological studies in
#' cancer research.
#'
#' @details
#' Weights are based on a comparison of survival between the sample and the
#' population. Therefore, besides the sample data, the population incidence rate
#' (per 100 000) is needed as input, as well as the cut-offs of the
#' (age/time-to-event) groups for which this rate is available. The function provides
#' two options for the latter: cut-offs can be provided manually or the
#' standard 5- or 10-year (age) categories can be used (0-4, 5-9, ... or 0-9, 10-14, ...).
#' Note that the resulting intervals are of the form (xx, xx].
#'
#' @export


Calculate_weights <- function(dat){

### Input:

#' @param dat Data.frame with one row per individual, containing the columns *d*
#' non-censoring indicator; **k** interval of (age) group; **S_k**
#' population interval-based proportion of individuals experiencing the
#' event in intervals later than k; **S_k.** sample
#' proportion of individuals experiencing the event in intervals later
#' than k.

### Output:

#' @return Vector with weights.

  # --------------- Extract variables from input data.frame.

  # Group/interval.
  k <- dat$k

  # Population proportion of individuals experiencing the event in intervals
  # later than k.
  S_k <- dat$S_k

  # Sample proportion of individuals experiencing the event in intervals
  # later than k.
  S_k. <- dat$S_k.

  # --------------- Create empty containers.

  v <- w <- rep(NA, nrow(dat))

  # --------------- Calculate the weights.

  for (n in 1:nrow(dat)){

    # Weights for unaffected.
    v[n] = 1

    # Weights for cases.
    w[n] = ( (1 - S_k[n]) / S_k[n] ) * (S_k.[n] / ( 1 - S_k.[n]))
  }
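  # Example with made-up values: if S_k = 0.95 in the population and
  # S_k. = 0.80 in the sample, a case in that interval gets
  # w = ((1 - 0.95) / 0.95) * (0.80 / (1 - 0.80)), approximately 0.21.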

  merged <- cbind(dat, w, v)
  merged$weight <- NA

  # --------------- Assign w for uncensored, v for censored (interval-wise).

  merged$weight[which(merged$d == 1)] <- merged$w[which(merged$d == 1)]
  merged$weight[which(merged$d == 0)] <- merged$v[which(merged$d == 0)]

  merged$w <- merged$v <- NULL  # Remove old variable.

  # --------------- Collect output.

  vec_weights <- merged$weight

  # If a weight is 0, set it to a very small value (coxph does not accept weights of 0).
  vec_weights[which(vec_weights==0)] <- 0.0000001

  # --------------- Print warning if weights are invalid.

  if(any(vec_weights < 0)){
    print("Invalid (negative) weights!")
  } else {
    print("No negative weights")
  }

  # --------------- Return output.

  vec_weights

}
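# A minimal, hypothetical sketch of the input expected by Calculate_weights()
# (made-up column values, same column names as produced by Prepare_data()).
# Uncomment to try:
# toy_prepared <- data.frame(d    = c(1, 0, 1),
#                            k    = c("(0,10]", "(0,10]", "(10,20]"),
#                            S_k  = c(0.95, 0.95, 0.90),
#                            S_k. = c(0.80, 0.80, 0.70))
# Calculate_weights(dat = toy_prepared)
# This would return weight 1 for the censored row and the ratio-based weight for
# the two cases.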







## ----Loading, message = F-----------------------------------------------------

library(wcox)

require(dplyr)
require(tidyr)
require(survival)

## ----Load data set, message = F-----------------------------------------------
# Load toy data.
data("fam_dat")

## ----Show first lines data----------------------------------------------------
# Show the first few lines of the toy data.
head(fam_dat)

## ----Distribution of family size----------------------------------------------
# Show the number of families.
cat("There are", unique(fam_dat$family_id) %>% length(), "families in the data set.")

# Examine family sizes.
fam_sizes <- fam_dat %>% group_by(family_id) %>% count()

cat( "Family size is on average ", 
     fam_sizes$n %>% mean() %>% round(1), 
     " and ranges from ", 
     fam_sizes$n %>% min(), " to ", 
     fam_sizes$n %>% max(), ".", sep = "")

## ----External information-----------------------------------------------------

# Enter the incidence by age group per 100 000, starting with the youngest age group.
incidence_rate <- c(2882, 1766, 1367, 1155, 987, 845, 775, 798, 636, 650)

# Define the age group cutoffs.
breaks_manually <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
breaks_10yrs <- "10 years"
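
# The "10 years" preset in Prepare_data() expands to seq(0, 10 * n_groups, by = 10),
# so with the 10 incidence groups above it reproduces the manual cut-offs:
all.equal(seq(0, 10 * length(incidence_rate), by = 10), breaks_manually)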

## ----Rename variables---------------------------------------------------------
# Rename variables.
my_dat <- rename(fam_dat, id = individual_id, d = event_indicator, y = age)

## ----Prepare data, message = F------------------------------------------------

# Using option 1 (manual cut-offs):
my_dat_out <- Prepare_data(dat = my_dat, population_incidence = incidence_rate, 
                           breaks = breaks_manually)

# Uncomment to use option 2 (pre-set cut-offs):
# my_dat_out <- Prepare_data(dat = my_dat, population_incidence = incidence_rate, 
# breaks = "10 years")

## ----Look at prepared data----------------------------------------------------

# Select the newly added columns.
my_dat_out %>% select(id, k, d, S_k, S_k.) %>% arrange(id) %>% head(8)

## ----Calculate weights--------------------------------------------------------
# Calculate weights using the object that is output from the Prepare_data() function.

w <- Calculate_weights(dat = my_dat_out)
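
# Optional quick check: by construction, unaffected individuals (d = 0) keep
# weight 1, while cases get the ratio-based weight.
summary(w)
all(w[my_dat_out$d == 0] == 1)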

## ----Show weights-------------------------------------------------------------
# Show the weights.
my_dat_out %>% mutate(weight = w) %>% 
  select(id, d, weight) %>% 
  arrange(id) %>% 
  filter(id %in% c(1,3))

## ----Fit model using weights, message = F-------------------------------------
# Fit the model.
fit <- coxph(Surv(y, d==1) ~ x + cluster(family_id), weights = w, data = my_dat_out)

fit

## ----Examine estimates of fitted model----------------------------------------
# Extract estimates.
fit.summary <- summary(fit)
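
# With cluster(family_id), summary(fit)$coefficients contains the columns
# coef, exp(coef), se(coef), robust se, z and Pr(>|z|), so element [4] used
# below is the robust standard error.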

# Summarize findings.
cat("Covariate effect (95% CI): ",
exp(fit.summary$coefficients[1]), " (",
exp(fit.summary$coefficients[1] - 1.96*fit.summary$coefficients[4]), ", ",
exp(fit.summary$coefficients[1] + 1.96*fit.summary$coefficients[4]), ").", sep = "")
