# Chunk defaults for the whole vignette: collapsed output, with "#>" as the
# prefix of printed output lines.
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

# This vignette contains the code for downloading and processing the BRCA mutation and clinical data.

library(tidyverse)
library(data.table)
library(RTCGA.clinical)
library(RTCGA.mutations)

# Take the BRCA clinical table from RTCGA.clinical as a data.table. It comes
# in wide format.
brca_clinical_dt_wide <- data.table(RTCGA.clinical::BRCA.clinical)

# Reshape wide -> long, keeping the patient barcode as the id column, and drop
# rows with missing entries. In long form the original column names become
# values, so they can be rewritten (periods removed) further down.
brca_clinical_dt_long <- na.omit(
  melt(brca_clinical_dt_wide, id.vars = "patient.bcr_patient_barcode")
)

# Convert the clinical field names to snake case: lower-case them and replace
# every period with an underscore.
#
# The conversion is a plain row-wise string transformation, so it is done in a
# single vectorised pass over `variable`. Grouping by (barcode, value) is not
# needed for that and makes data.table evaluate the string functions once per
# group.
#
# Resulting columns (unchanged): patient.bcr_patient_barcode, value,
# snake_case_variable. Rows stay in the order of brca_clinical_dt_long.
brca_clinical_dt_long_snake_case <-
  brca_clinical_dt_long[
    ,
    .(
      patient.bcr_patient_barcode,
      value,
      snake_case_variable = str_replace_all(str_to_lower(variable), "\\.", "_")
    )
  ]

# Before going back to wide format, look at the (now much nicer) field names:
# the first 100 distinct names in sorted order.
brca_clinical_dt_long_snake_case[
  order(snake_case_variable),
  unique(snake_case_variable)
] %>%
  head(100)

# Spot check: spread every field whose name matches the anatomic neoplasm
# subdivision pattern into one column per field, one row per patient barcode.
brca_clinical_dt_long_snake_case[
  snake_case_variable %like%
    "patient_anatomic_neoplasm_subdivisions_anatomic_neoplasm_subdivision"
] %>%
  dcast(patient.bcr_patient_barcode ~ snake_case_variable, value.var = "value")

# Keep a hand-picked set of clinical fields and spread them back to wide
# format: one row per patient barcode, one column per retained field.
brca_clinical_dt_long_snack_case_wide <- dcast(
  data = brca_clinical_dt_long_snake_case[
    snake_case_variable %in% c(
      "admin_bcr",
      "admin_file_uuid",
      "admin_project_code",
      "patient_bcr_patient_uuid",
      "patient_days_to_birth",
      "patient_days_to_death",
      "patient_age_at_initial_pathologic_diagnosis",
      "patient_ethnicity",
      "patient_biospecimen_cqcf_tumor_samples_tumor_sample_tumor_necrosis_percent",
      "patient_number_of_lymphnodes_positive_by_he",
      "patient_number_of_lymphnodes_positive_by_ihc",
      "patient_lymph_node_examined_count",
      "patient_anatomic_neoplasm_subdivisions_anatomic_neoplasm_subdivision"
    )
  ],
  formula = patient.bcr_patient_barcode ~ snake_case_variable,
  value.var = "value"
)

# Transpose the wide clinical table: afterwards each row is one column of the
# wide table and each V<n> column is one row (patient) of the wide table.
brca_clinical_dt_long_snack_case_wide_transpose <-
  brca_clinical_dt_long_snack_case_wide %>%
  t() %>%
  data.table()

# t() loses the original column names, so store them as a label column.
x <- "column_name"
brca_clinical_dt_long_snack_case_wide_transpose$column_name <-
  colnames(brca_clinical_dt_long_snack_case_wide)

# Move the label column to the front and keep every other column in its
# existing order. Selecting "everything except the label" avoids rebuilding
# the V<n> names by hand from 1:(ncol - 1).
brca_clinical_dt_long_snack_case_wide_transpose_reorder <-
  brca_clinical_dt_long_snack_case_wide_transpose[
    ,
    c(x, setdiff(names(brca_clinical_dt_long_snack_case_wide_transpose), x)),
    with = FALSE
  ]

dim(brca_clinical_dt_long_snack_case_wide_transpose_reorder)

# Preview at most 50 rows and 5 columns. head() stops at the last real row;
# indexing with 1:50 on this short table pads the output with all-NA rows.
head(
  brca_clinical_dt_long_snack_case_wide_transpose_reorder[
    ,
    seq_len(min(5L, ncol(brca_clinical_dt_long_snack_case_wide_transpose_reorder))),
    with = FALSE
  ],
  50L
)

# Strip the leading category prefix from every row label, i.e. everything up
# to and including the first underscore ("admin_", "patient_", ...).
# str_replace() is vectorised, so column_name stays a plain character column
# (lapply() turned it into a list column), and a label without any underscore
# is left as is instead of becoming "NA_<label>".
brca_clinical_dt_long_snack_case_wide_transpose_reorder$column_name <-
  str_replace(
    brca_clinical_dt_long_snack_case_wide_transpose_reorder$column_name,
    "^[^_]*_",
    ""
  )

# The first row is the barcode row ("patient.bcr_patient_barcode"). The step
# above removed its "patient.bcr_" prefix, so put the "bcr_" part back.
brca_clinical_dt_long_snack_case_wide_transpose_reorder$column_name[1] <-
  paste0("bcr_", brca_clinical_dt_long_snack_case_wide_transpose_reorder$column_name[1])

brca_clinical_dt_long_snack_case_wide_transpose_reorder

#brca_clinical_dt_long_snack_case_wide_transpose_reorder %>% fwrite("../data/brca_clinical.csv")

# Collect the patient barcodes from the transposed clinical table as a plain,
# unnamed character vector.
#
# By this point the barcode row is labelled "bcr_patient_barcode": the label
# "patient.bcr_patient_barcode" was rewritten when the prefixes were stripped,
# so filtering on the old label never matched and produced an empty result.
# unlist() makes the comparison work whether column_name is a character or a
# list column. The label column itself is excluded so that only barcodes are
# returned.
barcodes <-
  brca_clinical_dt_long_snack_case_wide_transpose_reorder[
    unlist(column_name) == "bcr_patient_barcode",
    setdiff(
      names(brca_clinical_dt_long_snack_case_wide_transpose_reorder),
      "column_name"
    ),
    with = FALSE
  ] %>%
  unlist() %>%
  unname()
# Take the BRCA mutation table from RTCGA.mutations as a data.table.
brca_mutation_dt_wide <- data.table(RTCGA.mutations::BRCA.mutations)

# Reshape wide -> long with the barcode as the id column and drop rows with
# missing entries.
brca_mutation_dt_long <- na.omit(
  melt(brca_mutation_dt_wide, id.vars = "bcr_patient_barcode")
)

# Lookup table with one row per distinct mutation barcode. `split` is a list
# column holding the dash-separated fields of the barcode.
bcr_map <-
  unique(brca_mutation_dt_long[, .(bcr_patient_barcode)])[
    ,
    .(bcr_patient_barcode, split = str_split(bcr_patient_barcode, "-"))
  ]

# Shortened barcode: the first three dash-separated fields, lower-cased and
# re-joined with "-". It later replaces bcr_patient_barcode in the mutation
# data. vapply() guarantees a character vector, also for an empty table, where
# sapply() would return a list.
bcr_map$split_join <- vapply(
  bcr_map$split,
  function(parts) paste0(str_to_lower(parts[1:3]), collapse = "-"),
  character(1)
)

# Attach the shortened barcode to every long-format mutation row.
tmp <- merge(
  x = bcr_map[, .(bcr_patient_barcode, split_join)],
  y = brca_mutation_dt_long,
  by = "bcr_patient_barcode"
)

# Replace the full barcode with the shortened one: drop the original column,
# then give the shortened column the original column's name.
tmp[, bcr_patient_barcode := NULL]
setnames(tmp, old = "split_join", new = "bcr_patient_barcode")

# Lower-case the mutation field names, evaluated per (barcode, value) group.
# NOTE: the grouping also determines the row order of the result, which the
# later first-value aggregation (fun.aggregate taking x[1]) depends on, so the
# grouping is kept as is.
brca_mutation_dt_long_snake_case <- tmp[
  ,
  list(snake_case_variable = str_to_lower(variable)),
  by = list(bcr_patient_barcode, value)
]

# Show the distinct field names that are available.
unique(brca_mutation_dt_long_snake_case$snake_case_variable)

# Keep a hand-picked set of mutation fields and spread them to wide format:
# one row per barcode, one column per retained field. A barcode can have many
# rows per field, so each cell keeps the first value encountered.
brca_mutation_dt_long_snack_case_wide <- dcast(
  data = brca_mutation_dt_long_snake_case[
    snake_case_variable %in% c(
      "tumor_sample_uuid",
      "ncbi_build",
      "sequencer",
      "bam_file",
      "matched_norm_sample_barcode",
      "tumor_sample_barcode",
      "validation_method",
      "sequence_source"
    )
  ],
  formula = bcr_patient_barcode ~ snake_case_variable,
  fun.aggregate = function(cell_values) cell_values[1],
  value.var = "value"
)

# Transpose the wide mutation table: afterwards each row is one column of the
# wide table and each V<n> column is one row (barcode) of the wide table.
brca_mutation_dt_long_snack_case_wide_transpose <-
  brca_mutation_dt_long_snack_case_wide %>%
  t() %>%
  data.table()

# t() loses the original column names, so store them as a label column.
x <- "column_name"
brca_mutation_dt_long_snack_case_wide_transpose$column_name <-
  colnames(brca_mutation_dt_long_snack_case_wide)

# Move the label column to the front and keep every other column in its
# existing order. Selecting "everything except the label" avoids rebuilding
# the V<n> names by hand from 1:(ncol - 1).
brca_mutation_dt_long_snack_case_wide_transpose_reorder <-
  brca_mutation_dt_long_snack_case_wide_transpose[
    ,
    c(x, setdiff(names(brca_mutation_dt_long_snack_case_wide_transpose), x)),
    with = FALSE
  ]

dim(brca_mutation_dt_long_snack_case_wide_transpose_reorder)

# Preview at most 10 rows and 5 columns. The transposed table has one row per
# column of the wide table (the barcode plus at most eight fields), so indexing
# with 1:10 pads the output with all-NA rows; head() stops at the last real
# row.
head(
  brca_mutation_dt_long_snack_case_wide_transpose_reorder[
    ,
    seq_len(min(5L, ncol(brca_mutation_dt_long_snack_case_wide_transpose_reorder))),
    with = FALSE
  ],
  10L
)

brca_mutation_dt_long_snack_case_wide_transpose_reorder

#brca_mutation_dt_long_snack_case_wide_transpose_reorder %>% fwrite("../data/brca_mutation.csv")


# AndrewC160/ROMOPOmics documentation built on Jan. 27, 2021, 6:57 p.m.