R/update_data.R

Defines functions update.survival.from.followup

Documented in update.survival.from.followup

#' Update clinical data from follow-up table
#'
#' Survival and vital status are not updated in the clinical data
#' obtained from TCGA, so we need to do it manually.
#'
#' The data is assumed to be obtained from TCGAbiolinks
#'
#' @param clinical data.frame with clinical information from TCGA project
#' @param follow.up data.frame with follow-up information from TCGA project
#'
#' @return
#' @export
#'
#' @examples
#' library(dplyr)
#' data(clinical, gdc, package = 'skcm.data')
#' clinical.new <- update.survival.from.followup(clinical$all, gdc$follow.up)
#'
#' problem.barcodes <- c('TCGA-FR-A726', 'TCGA-D3-A8GP', 'TCGA-DA-A1HW', 'TCGA-DA-A1I1',
#'                       'TCGA-DA-A1I5', 'TCGA-DA-A1I7', 'TCGA-DA-A1IB', 'TCGA-DA-A95W',
#'                       'TCGA-DA-A95X', 'TCGA-EB-A41A', 'TCGA-EE-A2GJ', 'TCGA-FR-A726',
#'                       'TCGA-HR-A5NC', 'TCGA-XV-A9VZ')
#'
#' clinical.new %>% filter(bcr_patient_barcode %in% problem.barcodes)
#'
#'
#' #
#' #
#' # Original clinical information from TCGAbiolinks
#'
#' clinical$all %>% filter(bcr_patient_barcode %in% problem.barcodes) %>%
#'  select(bcr_patient_barcode, days_to_last_followup, days_to_death, vital_status,
#'         year_of_form_completion, month_of_form_completion, day_of_form_completion) %>% arrange(bcr_patient_barcode)
#'
#' #
#' #
#' # Original follow-up information from TCGAbiolinks
#'
#' gdc$follow.up %>% filter(bcr_patient_barcode %in% problem.barcodes) %>%
#'  select(bcr_patient_barcode, days_to_last_followup, days_to_death, vital_status,
#'         year_of_form_completion, month_of_form_completion, day_of_form_completion) %>% arrange(bcr_patient_barcode)
update.survival.from.followup <- function(clinical, follow.up) {

  # Replace empty vital status to NA
  levels(follow.up$vital_status) <- levels(follow.up$vital_status) %>% {replace(., . == '', NA)}
  levels(clinical$vital_status)  <- levels(clinical$vital_status) %>% {replace(., . == '', NA)}

  #
  # Build up follow-up information
  f.up.short <- follow.up %>%
    #
    # Keep individual if it has at least one of the following columns
    dplyr::filter(!is.na(vital_status) | !is.na(days_to_last_followup) | !is.na(days_to_death)) %>%
    #
    # Group by patient barcode, so that operations are performed per barcode
    dplyr::group_by(bcr_patient_barcode) %>%
    #
    # Change vital status to 1: Dead 0: Alive
    dplyr::mutate(vital_status = (vital_status == 'Dead') * 1) %>%
    #
    # Build date for form completion
    dplyr::mutate(date_form_completion = as.Date(paste0(year_of_form_completion,
                                                 month_of_form_completion,
                                                 day_of_form_completion, collapse = ''),
                                          format = '%Y%M%d')) %>%
    #
    # Keep only:
    #  * highest follow-up date
    #  * biggest vital_status (no comming back from dead)
    #  * highest value for days_to_death
    #  * date of form completion (calculated field)
    dplyr::summarise(days_to_last_followup = max(days_to_last_followup, na.rm = TRUE),
                     days_to_death         = max(days_to_death, na.rm = TRUE),
                     vital_status          = max(vital_status),
                     follow.up.new         = 1,
                     date_form_completion  = max(date_form_completion, na.rm = TRUE)) %>%
    #
    # Perform following mutate operations per row
    dplyr::rowwise() %>%
    #
    # Keep only highest value from days to death/follow-up
    dplyr::mutate(surv_event_time = suppressWarnings(max(days_to_death, days_to_last_followup, na.rm = TRUE))) %>%
    #
    # Replace infinite values by NA (these come from max(NA) = -Inf)
    dplyr::mutate(surv_event_time = replace(surv_event_time, is.infinite(surv_event_time), NA)) %>%
    #
    # Keep only some columns
    dplyr::select(bcr_patient_barcode, vital_status, surv_event_time, follow.up.new, date_form_completion)


  #
  # Prepare clinical data to merge
  #   * vital status 0 or 1 (see above)
  #   * date of form completion (built from day, month and year values)
  clinical.up <- clinical %>%
    #
    # perform row operations
    dplyr::rowwise() %>%
    #
    # prepare vital status and form completion date
    dplyr::mutate(vital_status         = (vital_status == 'Dead') * 1,
                  date_form_completion = as.Date(paste0(year_of_form_completion,
                                                        month_of_form_completion,
                                                        day_of_form_completion, collapse = ''),
                                                 format = '%Y%M%d')) %>%
    dplyr::rowwise() %>%
    #
    # Keep only highest value from days to death/follow-up
    dplyr::mutate(surv_event_time = suppressWarnings(max(days_to_death, days_to_last_followup, na.rm = TRUE))) %>%
    #
    # Replace infinite values by NA (these come from max(NA) = -Inf)
    dplyr::mutate(surv_event_time = replace(surv_event_time, is.infinite(surv_event_time), NA))


  #
  # Merge two tables
  clinical.new <- dplyr::left_join(clinical.up,
                                   f.up.short,
                                   by = 'bcr_patient_barcode',
                                   suffix = c('__clinical', '__followup')) %>%
    #
    # keep only interesting fields to survival
    dplyr::select(bcr_patient_barcode,
                  vital_status__clinical,         vital_status__followup,
                  surv_event_time__clinical,      surv_event_time__followup,
                  date_form_completion__clinical, date_form_completion__followup,
                  follow.up.new) %>% dplyr::as.tbl() %>%
    # Next mutate operations are performed by row
    dplyr::rowwise() %>%
    #
    # Build:
    #   * vital status with highest value between clinical and follow-up data (again, can't come back from dead)
    #   * same with event time, only biggest one should matter
    #   * calculate days between form_completion
    dplyr::mutate(vital_status       = suppressWarnings(max(vital_status__clinical, vital_status__followup, na.rm = TRUE)),
                  surv_event_time    = suppressWarnings(max(surv_event_time__clinical, surv_event_time__followup, na.rm = TRUE)),
                  days_between_forms = as.numeric(date_form_completion__followup - date_form_completion__clinical)) %>%
    #
    # replace infinite values by NA on days_between_form_completion
    dplyr::mutate(surv_event_time    = replace(surv_event_time, is.infinite(surv_event_time), NA),
                  vital_status       = replace(vital_status, is.infinite(vital_status), NA),
                  days_between_forms = replace(days_between_forms, is.infinite(days_between_forms), NA)) %>%
    #
    # Select columns
    dplyr::select(bcr_patient_barcode,
                  vital_status,
                  surv_event_time) %>%
    #
    # sort by patient barcode
    dplyr::arrange(bcr_patient_barcode)

  clinical.new$vital_status <- factor(clinical.new$vital_status,
                                      levels = c(0, 1),
                                      labels = c('Alive', 'Dead'))

  return(clinical.new)
}
averissimo/glmSparseNetPaper documentation built on Jan. 25, 2021, 12:11 p.m.