#' Update clinical data from follow-up table
#'
#' Survival and vital status are not updated in the clinical data
#' obtained from TCGA, so we need to do it manually.
#'
#' The data is assumed to be obtained from TCGAbiolinks
#'
#' @param clinical data.frame with clinical information from TCGA project
#' @param follow.up data.frame with follow-up information from TCGA project
#'
#' @return
#' @export
#'
#' @examples
#' library(dplyr)
#' data(clinical, gdc, package = 'skcm.data')
#' clinical.new <- update.survival.from.followup(clinical$all, gdc$follow.up)
#'
#' problem.barcodes <- c('TCGA-FR-A726', 'TCGA-D3-A8GP', 'TCGA-DA-A1HW', 'TCGA-DA-A1I1',
#' 'TCGA-DA-A1I5', 'TCGA-DA-A1I7', 'TCGA-DA-A1IB', 'TCGA-DA-A95W',
#' 'TCGA-DA-A95X', 'TCGA-EB-A41A', 'TCGA-EE-A2GJ', 'TCGA-FR-A726',
#' 'TCGA-HR-A5NC', 'TCGA-XV-A9VZ')
#'
#' clinical.new %>% filter(bcr_patient_barcode %in% problem.barcodes)
#'
#'
#' #
#' #
#' # Original clinical information from TCGAbiolinks
#'
#' clinical$all %>% filter(bcr_patient_barcode %in% problem.barcodes) %>%
#' select(bcr_patient_barcode, days_to_last_followup, days_to_death, vital_status,
#' year_of_form_completion, month_of_form_completion, day_of_form_completion) %>% arrange(bcr_patient_barcode)
#'
#' #
#' #
#' # Original follow-up information from TCGAbiolinks
#'
#' gdc$follow.up %>% filter(bcr_patient_barcode %in% problem.barcodes) %>%
#' select(bcr_patient_barcode, days_to_last_followup, days_to_death, vital_status,
#' year_of_form_completion, month_of_form_completion, day_of_form_completion) %>% arrange(bcr_patient_barcode)
update.survival.from.followup <- function(clinical, follow.up) {
# Replace empty vital status to NA
levels(follow.up$vital_status) <- levels(follow.up$vital_status) %>% {replace(., . == '', NA)}
levels(clinical$vital_status) <- levels(clinical$vital_status) %>% {replace(., . == '', NA)}
#
# Build up follow-up information
f.up.short <- follow.up %>%
#
# Keep individual if it has at least one of the following columns
dplyr::filter(!is.na(vital_status) | !is.na(days_to_last_followup) | !is.na(days_to_death)) %>%
#
# Group by patient barcode, so that operations are performed per barcode
dplyr::group_by(bcr_patient_barcode) %>%
#
# Change vital status to 1: Dead 0: Alive
dplyr::mutate(vital_status = (vital_status == 'Dead') * 1) %>%
#
# Build date for form completion
dplyr::mutate(date_form_completion = as.Date(paste0(year_of_form_completion,
month_of_form_completion,
day_of_form_completion, collapse = ''),
format = '%Y%M%d')) %>%
#
# Keep only:
# * highest follow-up date
# * biggest vital_status (no comming back from dead)
# * highest value for days_to_death
# * date of form completion (calculated field)
dplyr::summarise(days_to_last_followup = max(days_to_last_followup, na.rm = TRUE),
days_to_death = max(days_to_death, na.rm = TRUE),
vital_status = max(vital_status),
follow.up.new = 1,
date_form_completion = max(date_form_completion, na.rm = TRUE)) %>%
#
# Perform following mutate operations per row
dplyr::rowwise() %>%
#
# Keep only highest value from days to death/follow-up
dplyr::mutate(surv_event_time = suppressWarnings(max(days_to_death, days_to_last_followup, na.rm = TRUE))) %>%
#
# Replace infinite values by NA (these come from max(NA) = -Inf)
dplyr::mutate(surv_event_time = replace(surv_event_time, is.infinite(surv_event_time), NA)) %>%
#
# Keep only some columns
dplyr::select(bcr_patient_barcode, vital_status, surv_event_time, follow.up.new, date_form_completion)
#
# Prepare clinical data to merge
# * vital status 0 or 1 (see above)
# * date of form completion (built from day, month and year values)
clinical.up <- clinical %>%
#
# perform row operations
dplyr::rowwise() %>%
#
# prepare vital status and form completion date
dplyr::mutate(vital_status = (vital_status == 'Dead') * 1,
date_form_completion = as.Date(paste0(year_of_form_completion,
month_of_form_completion,
day_of_form_completion, collapse = ''),
format = '%Y%M%d')) %>%
dplyr::rowwise() %>%
#
# Keep only highest value from days to death/follow-up
dplyr::mutate(surv_event_time = suppressWarnings(max(days_to_death, days_to_last_followup, na.rm = TRUE))) %>%
#
# Replace infinite values by NA (these come from max(NA) = -Inf)
dplyr::mutate(surv_event_time = replace(surv_event_time, is.infinite(surv_event_time), NA))
#
# Merge two tables
clinical.new <- dplyr::left_join(clinical.up,
f.up.short,
by = 'bcr_patient_barcode',
suffix = c('__clinical', '__followup')) %>%
#
# keep only interesting fields to survival
dplyr::select(bcr_patient_barcode,
vital_status__clinical, vital_status__followup,
surv_event_time__clinical, surv_event_time__followup,
date_form_completion__clinical, date_form_completion__followup,
follow.up.new) %>% dplyr::as.tbl() %>%
# Next mutate operations are performed by row
dplyr::rowwise() %>%
#
# Build:
# * vital status with highest value between clinical and follow-up data (again, can't come back from dead)
# * same with event time, only biggest one should matter
# * calculate days between form_completion
dplyr::mutate(vital_status = suppressWarnings(max(vital_status__clinical, vital_status__followup, na.rm = TRUE)),
surv_event_time = suppressWarnings(max(surv_event_time__clinical, surv_event_time__followup, na.rm = TRUE)),
days_between_forms = as.numeric(date_form_completion__followup - date_form_completion__clinical)) %>%
#
# replace infinite values by NA on days_between_form_completion
dplyr::mutate(surv_event_time = replace(surv_event_time, is.infinite(surv_event_time), NA),
vital_status = replace(vital_status, is.infinite(vital_status), NA),
days_between_forms = replace(days_between_forms, is.infinite(days_between_forms), NA)) %>%
#
# Select columns
dplyr::select(bcr_patient_barcode,
vital_status,
surv_event_time) %>%
#
# sort by patient barcode
dplyr::arrange(bcr_patient_barcode)
clinical.new$vital_status <- factor(clinical.new$vital_status,
levels = c(0, 1),
labels = c('Alive', 'Dead'))
return(clinical.new)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.