R/augment_data.R

Defines functions augment_data

Documented in augment_data

#' Augment the cleaned barcode data
#'
#' Takes the cleaned data and adds the derived columns `response`, `organ`,
#' `treatment`, `cell_line`, `percent_count_fraction`, `estimated_frequency`,
#' `identifier`, `count_norm_signif` and `estimated_frequency_norm`. The
#' MHC rank, expression level and self-similarity columns are converted to
#' numeric and the rows are sorted by `response`.
#'
#' @param data Cleaned data frame. The function reads the columns
#'   `log_fold_change`, `p_value`, `input_1`, `input_2`, `input_3`, `count`,
#'   `sample`, `percent_pe`, `neoepitope_sequence`, `hla`,
#'   `count_normalised_edger`, `mut_mhcrank_el`, `mut_mhcrank_ba`,
#'   `expression_level`, `norm_mhcrank_el`, `norm_mhcrank_ba` and
#'   `self_similarity`. Defaults to `my_clean_data`, which has to be defined
#'   elsewhere when the argument is omitted.
#' @return The augmented data, returned visibly. The `sample` grouping set
#'   while computing the per-sample fractions is not removed.
#' @export
augment_data <- function(data = my_clean_data) {

  # Sample labels of each treatment arm: 4T1 samples are matched by prefix,
  # CT26 samples by their exact name.
  cpi_4t1_prefix <- "^4T1_(16|19|20|23)"
  cpi_ct26_samples <- c("CT26_C1", "CT26_D1", "CT26_D2")
  isotype_4t1_prefix <- "^4T1_(17|18|22)"
  isotype_ct26_samples <- c("CT26_C3", "CT26_C4", "CT26_D4")

  augmented <- data %>%
    # response: "yes" only when every criterion holds, otherwise "no"
    # (rows where the condition evaluates to NA also end up as "no").
    # Each input is compared with 0 explicitly; `a & b & c != 0` would parse
    # as `a & b & (c != 0)` because `!=` binds tighter than `&`.
    mutate(response = case_when(
      log_fold_change >= 2 & p_value <= 0.01 &
        input_1 != 0 & input_2 != 0 & input_3 != 0 &
        count > input_1 ~ "yes",
      TRUE ~ "no"
    )) %>%
    # organ: derived from the sample suffix; CT26 samples that do not end in
    # "TU" are labelled spleen. Anything else stays NA.
    mutate(organ = case_when(
      str_detect(sample, "TU$") ~ "tumor",
      str_detect(sample, "SP$") | str_detect(sample, "^CT26") ~ "spleen"
    )) %>%
    # Replace the last three characters of samples ending in "TU"/"SP".
    # NOTE(review): the replacement is a single space, so the new label
    # carries a trailing space (the intent looks like removing the suffix,
    # i.e. ""). Kept as-is because downstream code may rely on these labels;
    # the exact CT26 matches below cannot hit such a label.
    mutate(sample = case_when(
      str_detect(sample, "(TU|SP)$") ~ str_replace(sample, ".{3}$", " "),
      TRUE ~ as.character(sample)
    )) %>%
    # treatment: samples in neither list stay NA.
    mutate(treatment = case_when(
      str_detect(sample, cpi_4t1_prefix) |
        sample %in% cpi_ct26_samples ~ "CPI",
      str_detect(sample, isotype_4t1_prefix) |
        sample %in% isotype_ct26_samples ~ "Isotype control"
    )) %>%
    # cell_line: taken from the sample prefix.
    mutate(cell_line = case_when(
      str_detect(sample, "^4T1") ~ "4T1",
      str_detect(sample, "^CT26") ~ "CT26"
    )) %>%
    # percent_count_fraction = count / sum(count) within the sample * 100
    group_by(sample) %>%
    mutate(percent_count_fraction = count / sum(count) * 100) %>%
    mutate(estimated_frequency = percent_pe * percent_count_fraction / 100) %>%
    # identifier: sequence, HLA and cell line joined by "_"
    mutate(identifier = paste(neoepitope_sequence, hla, cell_line, sep = "_"))

  # Per-sample sum of the normalised counts over the responding rows only
  # (response == "yes"); used as denominator of the normalised frequency.
  signif_totals <- augmented %>%
    group_by(sample) %>%
    filter(response == "yes") %>%
    summarise(count_norm_signif = sum(count_normalised_edger))

  # The pipeline is the last expression (no assignment), so the result is
  # returned visibly. The join key is spelled out instead of being guessed
  # from the common column names.
  augmented %>%
    full_join(signif_totals, by = "sample") %>%
    mutate(
      estimated_frequency_norm =
        (count_normalised_edger * percent_pe) / count_norm_signif
    ) %>%
    # The normalised frequency is not relevant for non-responding rows: set
    # it to 0 there.
    mutate(estimated_frequency_norm = case_when(
      response == "no" ~ 0,
      TRUE ~ as.numeric(estimated_frequency_norm)
    )) %>%
    # Convert rank / expression / similarity columns to numeric so the data
    # can be joined with the other files.
    mutate(
      mut_mhcrank_el = as.numeric(mut_mhcrank_el),
      mut_mhcrank_ba = as.numeric(mut_mhcrank_ba),
      expression_level = as.numeric(expression_level),
      norm_mhcrank_el = as.numeric(norm_mhcrank_el),
      norm_mhcrank_ba = as.numeric(norm_mhcrank_ba),
      self_similarity = as.numeric(self_similarity)
    ) %>%
    arrange(response)
}
rforbiodatascience/barcc documentation built on May 17, 2020, 5:31 p.m.