Nothing
## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
Prepare_data <- function(dat, population_incidence, breaks){
### Input:
#' @param dat Data.frame with one row per individual which at least includes
#' a column **d** with event indicator (1 for event, 0 for censored), a column
#' **y** with event/censoring time.
#' @param population_incidence A vector (in combination with breaks) or
#' a data.frame (columns 1) 'start age group', 2) 'end age group', 3)'S_pop')
#' with population incidence per 100,000 per interval k.
#' @param breaks Cut-points for the (age/time) groups. Only needed when
#' population_incidence is a vector.
### Output:
#' @return Data.frame one row per individual and a.o. columns *id* unique ID;
#' *d* non-censoring indicator; **k** interval of (age) group; **S_k**
#' population interval-based proportion of individuals experiencing the
#' event in intervals later than k; **S_k.** sample
#' proportion of individuals experiencing the event in intervals later
#' than k.
# --------------- Load packages.
# require(survival)
# require(dplyr)
# require(tidyr)
# --------------- Organize external information (population incidence rate).
# N.B.: there are different options regarding the breaks (1-4).
if(is.vector(population_incidence)){
if(is.numeric(breaks)){ # Option 1: two vectors population_incidence and breaks.
if(length(breaks)!=(length(population_incidence)+1) ) warning("Number of
breaks should
equal the
number of
groups plus
one.")
n_agegroups <- length(population_incidence)
from <- breaks[-(n_agegroups + 1)] # Remove last
to <- breaks[-1] # Remove first
dat_inc <- data.frame(start = from, end = to, incidence_rate =
population_incidence/100000)
} else if(breaks == "5 years"){ # Option 2: 5 years age groups.
n_agegroups <- length(population_incidence)
breaks <- seq(0, 5*(n_agegroups), by = 5)
from <- breaks[-(n_agegroups + 1)] # Remove last
to <- breaks[-1] # Remove first
dat_inc <- data.frame(start = from, end = to, incidence_rate =
population_incidence/100000)
} else if(breaks == "10 years"){ # Option 3: 10 years age groups.
n_agegroups <- length(population_incidence)
breaks <- seq(0, 10*(n_agegroups), by = 10)
from <- breaks[-(n_agegroups + 1)] # Remove last
to <- breaks[-1] # Remove first
dat_inc <- data.frame(start = from, end = to, incidence_rate =
population_incidence/100000)
}
} else { # Option 4: if population incidence is given as a data.frame.
dat_inc <- data.frame(start = population_incidence[,1],
end = population_incidence[,2],
incidence_rate = population_incidence[,3]/100000)
breaks <- population_incidence[,1]
}
dat_inc <- dat_inc %>% mutate( k = paste("(", start, ",", end, "]", sep=""),
t = end - start)
# t is the width of the age interval, in years.
# Note that in the paper, this is referred to
# as (alpha_k - alpha_({k-1}).
# --------------- Calculate S_k population from population incidence.
dat_inc$S_k <- c(exp(-(dat_inc$t * dat_inc$incidence_rate)))
# --------------- Calculate S_k. based on sample data.
dat$y_cat <- cut(dat$y, breaks = breaks)
index <- which(is.na(dat$y_cat)==T)
if(length(index)>0){
dat<- dat[-index,]} # Include only those that fall within an age category.
# Store number of cases and unaffected individuals per age group.
agegroups.info <- aggregate(id ~ y_cat + d, data = dat, length)
agegroups.info <- agegroups.info %>% tidyr::spread(d, -y_cat)
colnames(agegroups.info) <- c("k","s_k","r_k")
# Transform NA to 0 (its real meaning).
agegroups.info$s_k <- ifelse(!is.na(agegroups.info$s_k), agegroups.info$s_k, 0)
agegroups.info$r_k <- ifelse(!is.na(agegroups.info$r_k), agegroups.info$r_k, 0)
# Calculate person years per outcome and age group.
personyears <- aggregate(y ~ y_cat + d, data =
dat[,c("d","y_cat", "y"),], sum) %>%
tidyr::spread(d, -y_cat)
colnames(personyears) <- c("k","q_k","p_k")
agegroups.info <- merge(agegroups.info, personyears, by = "k") # Merge.
agegroups.info$q_k <- ifelse(!is.na(agegroups.info$q_k),
agegroups.info$q_k, 0)
agegroups.info$p_k <- ifelse(!is.na(agegroups.info$p_k),
agegroups.info$p_k, 0)
agegroups.info$years.before <- breaks[which(breaks != max(breaks))]
# Person years among censored individuals.
q <- agegroups.info$q_k - (agegroups.info$years.before * agegroups.info$s_k)
# Person years among cases.
p <- agegroups.info$p_k - (agegroups.info$years.before * agegroups.info$r_k)
# Years per age group.
# N.B. in the paper referred to as (alpha_k - alpha_{k-1}).
t <- agegroups.info$t <- dat_inc$t
# Number of censored.
s <- agegroups.info$s_k
# Number of cases.
r <- agegroups.info$r_k
# Age group label.
k <- agegroups.info$k
# Obtain incidence rate (mu) based on sample data.
n_agegroups <- nrow(agegroups.info)
for (k in 1:n_agegroups){
if(k!=n_agegroups){calcsum <- sum(agegroups.info$r_k[(k+1):n_agegroups],
agegroups.info$s_k[(k+1):n_agegroups])
} else {calcsum <- 0}
agegroups.info$mu_k[k] = agegroups.info$r_k[k]/
(agegroups.info$p_k[k]+agegroups.info$q_k[k]+agegroups.info$t[k]*calcsum)
}
# Recalculate S_k. (in the sample!) based on the intervals.
S_k. <- c()
S_k. <- c(exp(-(agegroups.info$mu_k*t)))
agegroups.info$S_k. <- S_k.
agegroups.info$S_k <- dat_inc$S_k
# --------------- Add S_k. to the sample data.
dat_out <- merge(dat, agegroups.info, by.x = "y_cat", by.y = "k") %>%
mutate(k = y_cat) %>%
mutate(population_incidence = mu_k)
dat_out$years.before <- NULL; dat_out$t <- NULL;
dat_out$y_cat <- NULL; dat_out$mu_k <- NULL;
dat_out$s_k <- dat_out$r_k <- dat_out$p_k <- dat_out$q_k <- NULL
# --------------- Output the data.frame.
dat_out
}
#' Calculate inverse probability of selection weights.
#'
#' @description
#' This function calculates weights to correct for ascertainment bias in
#' time-to-event data where clusters are outcome-dependently sampled,
#' for example high-risk families in genetic epidemiological studies in
#' cancer research.
#'
#' @details
#' Weights are based on a comparison between the survival between sample and
#' population. Therefore, besides the sample data, the population incidence rate
#' (per 100 000) is needed as input, as well as the cut-offs of the
#' (age/time-to-event) groups for which this is available. The function provides
#' two options for the latter: cut-offs can be provided manually or using the
#' standard 5- or 10-years (age) categories (0-4, 5-9, ... or 0-9, 10-14, ...).
#' Note that resulting intervals are of the form [xx, xx).
#'
#' @export
Calculate_weights <- function(dat){
### Input:
#' @param dat Data.frame with one row per individual with columns *d*
#' non-censoring indicator; **k** interval of (age) group; **S_k**
#' population interval-based proportion of individuals experiencing the
#' event in intervals later than k; **S_k.** sample
#' proportion of individuals experiencing the event in intervals later
#' than k.
### Output:
#' @return Vector with weights.
# --------------- Extract variables from input data.frame.
# Group/interval.
k <- dat$k
# Population proportion of individuals experiencing the event in intervals
# later than k.
S_k <- dat$S_k
# Sample proportion of individuals experiencing the event in intervals
# later than k.
S_k. <- dat$S_k.
# --------------- Create empty containers.
v <- w <- rep(NA, nrow(dat))
# --------------- Calculate the weights.
for (n in 1:nrow(dat)){
# Weights for unaffected.
v[n] = 1
# Weights for cases.
w[n] = ( (1 - S_k[n]) / S_k[n] ) * (S_k.[n] / ( 1 - S_k.[n]))
}
merged <- cbind(dat, w, v)
merged$weight <- NA
# --------------- Assign w for uncensored, v for censored (interval-wise).
merged$weight[which(merged$d == 1)] <- merged$w[which(merged$d == 1)]
merged$weight[which(merged$d == 0)] <- merged$v[which(merged$d == 0)]
merged$w <- merged$v <- NULL # Remove old variable.
# --------------- Collect output.
vec_weights <- merged$weight
# If weight is 0, add very small value (coxph does not accept weights of 0).
vec_weights[which(vec_weights==0)] <- 0.0000001
# --------------- Print warning if weights are invalid.
ifelse((sum(vec_weights<0)>0), print("Invalid (negative) weights!"),
print("No negative weights"))
# --------------- Return output.
vec_weights
}
## ----Loading, message = F-----------------------------------------------------
library(wcox)
require(dplyr)
require(tidyr)
require(survival)
## ----Load data set, message = F-----------------------------------------------
# Load toy data.
data("fam_dat")
## ----Show first lines data----------------------------------------------------
# Show the first few lines of the toy data.
head(fam_dat)
## ----Distribution of family size----------------------------------------------
# Show the number of families and top rows.
cat( "There are", unique(fam_dat$family_id) %>% length() , "families in data set. ")
# Examine family sizes.
fam_sizes <- fam_dat %>% group_by(family_id) %>% count()
cat( "Family size is on average ",
fam_sizes$n %>% mean() %>% round(1),
" and ranges from ",
fam_sizes$n %>% min(), " to ",
fam_sizes$n %>% max(), ".", sep = "")
## ----External information-----------------------------------------------------
# Enter the incidence by age group per 100 000, starting with the youngest age group.
incidence_rate <- c(2882, 1766, 1367, 1155, 987, 845, 775, 798, 636, 650)
# Define the age group cutoffs.
breaks_manually <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
breaks_10yrs <- "10 years"
## ----Rename variables---------------------------------------------------------
# Rename variables.
my_dat <- rename(fam_dat, id = individual_id, d = event_indicator, y = age)
## ----Prepare data, message = F------------------------------------------------
# Using option 1 (manual cut-offs):
my_dat_out <- Prepare_data(dat = my_dat, population_incidence = incidence_rate,
breaks = breaks_manually)
# Unhash to use option 2 (pre-set cut-offs):
# my_dat_out <- Prepare_data(dat = my_dat, population_incidence = incidence_rate,
# breaks = "10 years")
## ----Look at prepared data----------------------------------------------------
# Select the newly add columns.
my_dat_out %>% select(id, k, d, S_k, S_k.) %>% arrange(id) %>% head(8)
## ----Calculate weights--------------------------------------------------------
# Calculate weights using the object that is output from the Prepare_data() function.
w <- Calculate_weights(dat = my_dat_out)
## ----Show weights-------------------------------------------------------------
# Show the weights.
my_dat_out %>% mutate(weight = w) %>%
select(id, d, weight) %>%
arrange(id) %>%
filter(id %in% c(1,3))
## ----Fit model using weights, message = F-------------------------------------
# Fit the model.
fit <- coxph(Surv(y, d==1) ~ x + cluster(family_id), weights = w, data = my_dat_out)
fit
## ----Examine estimates of fitted model----------------------------------------
# Extract estimates.
fit.summary <- summary(fit)
# Summarize findings.
cat("Covariate effect (95% CI): ",
exp(fit.summary$coefficients[1]), " (",
exp(fit.summary$coefficients[1] - 1.96*fit.summary$coefficients[4]), ", ",
exp(fit.summary$coefficients[1] + 1.96*fit.summary$coefficients[4]), ").", sep = "")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.