In usnistgov/peprDnaStability: For Analysis of NIST Genomic DNA PFGE Gels Images

library(peprDnaStability)
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)

gel_data_dir <- "stability_example"
gel_metadata <- "stability_example/MG001_stability_metadata.csv"

## number of gel lanes ran
number_of_lanes <- 12

Stability Study Experimental Design

gel_design <- read.csv(gel_metadata) %>% 
                mutate(gel = sub(pattern = ".*_Gel_0",replacement =  "Gel-", x = gel),
                       gel = sub(pattern = ".jpg",replacement =  "", x = gel))

ggplot(gel_design) + geom_raster(aes(x = gel, y = lane, fill = condition), alpha = 0.5) +
    geom_text(aes(x = gel, y = lane, label = vial)) + facet_wrap(~gel,nrow = 1, scales = "free_x") + theme_bw() + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "bottom", legend.direction = "horizontal") + labs(y = "Lane", fill = "Conditions")

Raw Image processing

Images were croped using preview. Below are the quality control plots for processing the raw image files.

Image Processing QC Figures

Rdat_files <- list.files(path = gel_data_dir,pattern = "Rdata",full.names = TRUE)
lane_labels <- paste0("L",c(paste0("0",1:9),10:12))[1:number_of_lanes]

for(gel_dat in Rdat_files){
    dat_list <- process_gel(gel_dat)
    print(gel_dat)
    annotated_gel(image_mat = dat_list['image_dat'], 
                        lane_edges_df = dat_list[5][[1]]$lane_edges_df , 
                        peaks_df = dat_list['marker_dat'], 
                        lane_labels = lane_labels)
}

Statistical Analysis of Stability Results

batch_data <- batch_process_gels(gel_data_dir)

## combine data frames
intensity_df <- dplyr::data_frame()
markers_df <- dplyr::data_frame()
binned_df <- dplyr::data_frame()
for(x in batch_data){
    gel_name <- x$gel
    intensity_df <- x$intensity_dat %>%
        dplyr::mutate(gel = gel_name) %>%
        dplyr::bind_rows(intensity_df)
    markers_df <- x$marker_dat %>%
        dplyr::mutate(gel = gel_name) %>%
        dplyr::bind_rows(markers_df)
    binned_df <- x$bin_dat %>%
        dplyr::mutate(gel = gel_name) %>%
        dplyr::bind_rows(binned_df)
}

## adding metadata
meta <- read.csv(gel_metadata,stringsAsFactors = F) %>% 
    filter(condition != "Blank") %>% 
    mutate(lane =gsub("L","", lane),
           lane = as.numeric(lane),
           lane = ifelse(lane < 10, 
                         paste0("L0", lane),
                         paste0("L", lane)))

intensity <- intensity_df  %>% 
    mutate(lane =gsub("L","", lane),
           lane = as.numeric(lane),
           lane = ifelse(lane < 10, 
                         paste0("L0", lane),
                         paste0("L", lane))) %>% 
    left_join(meta)    

markers <- markers_df  %>% 
    mutate(lane =gsub("L","", lane),
           lane = as.numeric(lane),
           lane = ifelse(lane < 10, 
                         paste0("L0", lane),
                         paste0("L", lane))) %>% 
    left_join(meta)

binned <- binned_df  %>% 
    mutate(lane =gsub("L","", lane),
           lane = as.numeric(lane),
           lane = ifelse(lane < 10, 
                         paste0("L0", lane),
                         paste0("L", lane))) %>% 
    left_join(meta)

highlow_bin <- binned %>%
    filter(!is.na(vial), bins %in% c("23.1-9.42", "48.5-23.1",
                                     "97-48.5","194-97")) %>%
    mutate(bins = ifelse(bins %in% c("23.1-9.42", "48.5-23.1"), 
                         "low","high")) %>% 
    group_by(lane, bins, gel, vial, condition) %>% 
    summarise(bin_intensity = sum(bin_intensity))

highlow_ratio <- highlow_bin %>% 
    spread(bins, bin_intensity) %>% 
    mutate(bin_ratio = high/(low + high))

highlow_ratio_control<- highlow_ratio %>% filter(condition == "Control")

control_ratio <- highlow_ratio_control %>% 
                    group_by(gel) %>% 
                    summarise(control_ratio = mean(bin_ratio))
highlow_diff <- highlow_ratio %>% 
                    left_join(control_ratio) %>% 
                    mutate(ratio_diff = bin_ratio - control_ratio) %>% 
                    filter(condition != "Control" & condition != "Ladder") %>% 
                    mutate(gel = sub(pattern = ".*Gel_0",replacement = "", x = gel),
                           gel = sub(".jpg","", gel))
ggplot(highlow_diff) + geom_point(aes(x = condition, y = ratio_diff, color = gel)) + 
    labs(x = "Treatment", y = "Ratio Difference from Control") + 
    theme_bw()

make_one_sided_ttest_df <- function (dat, value, gelAsFactor = TRUE) {
    tt_df <- data_frame()
    ## Pairwise comparison to control for each bin and each condition
    for(bin in unique(dat$bins)){
        df <- dat %>% filter(bins == bin)
        y <- df  %>% filter(condition == "Control")  %>% .[[value]]
        for(treatment in unique(df$condition[df$condition != "Control"])){
            x <-df  %>% filter(condition == treatment)  %>% .[[value]]
            tt <- t.test(x,y,alternative = "less")
            tt_df <- data_frame(size_bins = bin,
                                condition = treatment,
                                tstatistic = tt$statistic,
                                pvalue = tt$p.value) %>% 
                bind_rows(tt_df)
        }
    }
    tt_df %>% mutate(sig = ifelse(pvalue < 0.05/n(),TRUE,FALSE))
}

tt_df <- highlow_ratio %>% mutate(bins = "H") %>% 
            filter(condition != "Ladder") %>% 
            make_one_sided_ttest_df(value = "bin_ratio", gelAsFactor = FALSE)
tt_table <- tt_df %>% mutate(pvalue = round(pvalue, 4) %>% as.character(),
                             pvalue = ifelse(sig == TRUE, paste0("__",pvalue,"__"),pvalue),
                             pvalue = ifelse(pvalue == "__0__", "__<0.0000__", pvalue),
                             tstatistic = round(tstatistic, 2)) %>% 
            select(-sig) %>% 
            spread(size_bins, pvalue)
colnames(tt_table) <- c("Treatment", "T-Statistic", "p-value") 

kable(tt_table, row.names = FALSE, caption = "P-value of one sided t-tests comparing proprotion of DNA in high and low molecular weight bins between treatment and control. Significant p-values (alpha = 0.0125) are indicated in bold.")