src/functions.R

# """
# Script defining functions used in cleaning, summarizing, 
# and plotting clinical center labs exported from BTRIS
# """

library(tidyverse)
library(lubridate)
library(janitor)
library(here)


loaded_packages <- function(){
    for (i in .packages()) {
        cat(paste0(i, ": ", packageVersion(i)), sep = "\n")
    }
}
#### QUICKPLOT---------------------------------------------------------------------------------------

# Quick plot of non-transformed labs

quickplot_numeric_btris_lab <- function(data, lab_id, group = mrn){
    group <- enquo(group)
    
    y_units <- data[which(data$cluster_id == lab_id),'unit_of_measure'] %>% sample_n(1) %>% c()
    y_name <- data[which(data$cluster_id == lab_id),'cluster_name'] %>% sample_n(1) %>% c()
    y_name <- str_extract(y_name, "[^(]+")
    title <- paste0(y_name, "vs Age")
    
    data %>% filter(data$cluster_id == lab_id) %>% mutate(observation_value = as.numeric(observation_value)) %>% 
       
        ggplot(aes(age_collected, observation_value)) + 
         
        geom_smooth(method = "lm", se = FALSE, 
                    color = "grey50", alpha = 0.5, linetype = 'dashed') + 
       
        geom_jitter(aes(color = !!group), alpha = 0.75) + 
        
        geom_smooth(aes(color = !!group), method = "lm", se = FALSE) + 
        
        labs(
            x = "Age (Years)", 
            y = paste0(y_name, " (", y_units, ")"),
            title = title) + 
        
        theme(axis.ticks.y.left = element_blank(),
              panel.grid.minor = element_blank(),
              panel.grid.major.y = element_blank(),
              legend.position = 'bottom',
              legend.title = element_blank()) +
        
        #scale_color_brewer(palette = "Dark2")
        viridis::scale_color_viridis(discrete = TRUE)
}


#### ELAPSED TIME-----------------------------------------------------------------------------------------------
# Calculating ages / time between dates

elapsed_years <- function(end_date, start_date) {
    lubridate::time_length(difftime(as.POSIXlt(end_date), as.POSIXlt(start_date)), "years")
}

elapsed_days <- function(end_date, start_date) {
    lubridate::time_length(difftime(as.POSIXlt(end_date), as.POSIXlt(start_date)), "days")
}


#### CLEANING & EXTRACTING--------------------------------------------------------

# CLEANING

clean_btris_lab_df <- function(file_path_csv) {
    df <- readr::read_csv(file_path_csv)
    df <- janitor::clean_names(df)
    
    df %>% rename(date_collected = collected_date_time) %>% 
        mutate(date_collected = date(mdy_hm(date_collected)),
               mrn = as_factor(mrn))
}

##### If BTRIS query did not include demogs, it's possible to merge them afterwards. 

merge_btris_demogs <- function(clean_lab_df, demogs_file_path_csv) {
    df_lab <- clean_lab_df
    df_demogs <- readr::read_csv(demogs_file_path_csv)
    df_demogs <- janitor::clean_names(df_demogs)
    
    df_lab$mrn <- as.character(df_lab$mrn)
    df_demogs$mrn <- as.character(df_demogs$mrn)
    
    demogs_columns <- c("mrn", "date_of_birth", "gender", "first_name", "last_name")
    
    df_merged <- dplyr::left_join(df_lab, df_demogs[, demogs_columns], by = "mrn")
    
    # Change class of merged columns:
    
    df_merged <- df_merged %>% 
        mutate_at(vars(contains("name|mrn|gender|cluster")), as_factor) %>%
         mutate(date_of_birth = mdy(date_of_birth))
    
    df_merged
}

# EXTRACTING 

list_numeric_labs <- function(){
    c(
        aldolase = "E994",
        basophil_abs = "E7",
        basophil_percent = "E6",
        calcium = "E11",
        c3_complement = "E2299",
        c4_complement = "E2300",
        cd3_percent = "E416",
        cd3_count = "E415",
        cd8_cd3_percent = "E714",
        cd8_cd3_count = "E713",
        cd4_cd3_percent = "E630",
        cd4_cd3_count = "E629",
        cd19_percent = "E336",
        cd19_count = "E335",
        eosinophils_abs = "E19",
        eosinophils_percent = "E18",
        fibrinogen = "E2620",
        haptoglobin = "E2628",
        ferritin = "E124",
        hematocrit = "E21",
        hemoglobin = "E25",
        iga = "E227",
        igd = "E247",
        ige = "E249",
        igg = "E251",
        igm = "E256",
        immature_granulocytes_percent = "E31",
        immature_granulocytes_abs = "E32",
        iron = "E93",
        iron_sat_persent = "E92",
        ldh = "E33",
        lymphocytes_abs = "E35",
        lymphocytes_percent = "E34",
        mch = "E37",
        mchc = "E38",
        mcv = "E39",
        mpv = "E42",
        monocytes_abs = "E41",
        monocytes_percent = "E40",
        neutrophils_abs = "E44",
        neutrophils_percent = "E43",
        nk_percent = "E824",
        nk_count = "E823",
        platelets = "E48",
        rbc = "E51",
        transferrin = "E124",
        crp = "E74",
        wbc = "E55"
    )
}

table_numeric_labs <- function(){
   df <- list_numeric_labs() %>% enframe(value = "cluster_id")
   df %>% arrange(name)
}

n_missing_units <- function(df_labs, percent_cutoff = 0){
# Summary of observations missing units, percent_cutoff filters for proportion missing. 
    df_labs %>%
        group_by(cluster_name) %>%
        summarise(n_missing_units = sum(is.na(unit_of_measure)), 
                  total_observations = n(),
                  perc_missing_units = round((n_missing_units / n() * 100), 1)) %>% 
        filter(perc_missing_units >= percent_cutoff)
}


unique_units <- function(df_labs, n_unique_cutoff = 1){
    df_labs %>% 
        filter(!is.na(unit_of_measure)) %>% 
        group_by(cluster_name) %>% 
        summarise(units = str_c(unique(unit_of_measure), collapse = "; "),
                  n_unique = length(unique(unit_of_measure))) %>% 
        filter(n_unique >= n_unique_cutoff) %>% 
        arrange(cluster_name)
        
}




clean_numeric_values <- function(df_labs) {
    df_labs %>% 
        
        mutate(observation_value = 
                           as.numeric(str_extract(df_labs$observation_value, "\\d+\\.*\\d*"))) %>%
        
        mutate(observation_value = 
                           case_when(
                               cluster_id == "E74" & unit_of_measure == "mg/dL" ~ 
                                   observation_value * 10,
                               TRUE ~ observation_value
                           ),
               
               unit_of_measure = 
                   case_when(
                       cluster_id == "E994" ~ "U/L",
                       cluster_id == "E38" ~ "g/dL",
                       unit_of_measure %in% c("%", "PERCENT") ~ "%", 
                       unit_of_measure %in% c("K/uL", "THOU/MM3","K/UL", 
                                              "K/mcL", "/MM3", "/uL", "UL", "K/uL", "/mcL") ~ "K/uL",
                       unit_of_measure %in% c("g/dL", "G/DL", "G/100ML") ~ "g/dL",
                       unit_of_measure %in% c("pg", "UUG") ~ "pg",
                       unit_of_measure %in% c("M/uL", "MILL/MM3", "M/mcL") ~ "M/uL",
                       unit_of_measure %in% c("fL", "CU MCRON") ~ "fL", 
                       unit_of_measure %in% c("mg/dL", "mg/L") & cluster_id == "E74" ~ "mg/L", 
                       unit_of_measure %in% c("mg/dL", "MG/DL") & cluster_id != "E74" ~ "mg/dL",
                       TRUE ~ unit_of_measure
                   ))
}




### PLOTS & TABLES---------------------------------------------------------------------------------

#plot_lab <- function(lab_dataframe, lab_id, )
Ryan-Laird/ClinDash documentation built on Jan. 20, 2020, 12:04 a.m.