#' Get FP concentrations using ECmax method
#'
#' Get protein's concentration from a dilution series measured with an
#' absorbance spectrum. Based on `get_conc_a280()`, but uses the FPbase-stated
#' extinction coefficient (EC) for the FPbase-stated maximal excitation
#' wavelength (which usually corresponds to its maximal absorbance wavelength),
#' which we call the protein's 'ECmax', in order to convert absorbance values to
#' concentrations. Expects 'processed' data such as that produced by
#' `process_absorbance_spectrum()`, with a file name ending `_processed.csv`,
#' which contains values corrected for path length and normalised to blanks as a
#' column called `normalised_cm1_value`, but retains replicate data containing
#' positional (well) information required for exporting predicted concentrations
#' at the end of this function. Uses `get_fpbase_properties()` to get FPbase EC
#' in M-1cm-1 and wavelength, and converts it to an ECmax mass extinction
#' coefficient in `(mgml)-1cm-1` using the MW (worked out from `protein_seq` and
#' `fpcountr::get_mw`). Then the function uses the `EC_max_mgml` to work out the
#' concentration of protein in each well, using three correction methods.
#' Instead of using the normalised data directly, the values used are based on a
#' LOESS fit through the absorption spectra to minimise fluctuations due to
#' noise. Finally, linear models are fitted to each concentration prediction
#' method, and a dataframe is built, returned and saved, containing predicted
#' concentrations according to the user's chosen correction method. Plots
#' showing each of the analytical steps are saved concurrently. Troubleshooting:
#' for 'incompatible lengths' errors, adjust `xrange` to avoid noisy
#' wavelengths.
#'
#' @param protein_slug character string of protein name in 'slug' form to match
#' slug of FPbase entry.
#' @param protein_seq character string of protein sequence using 1-letter code.
#' Required for MW calculation.
#' @param processed_spectrum_csv Path to CSV file of a processed absorbance
#' spectrum. Processing should be done with `process_absorbance_spectrum()`,
#' which corrects for path lengths and normalises to blank wells.
#' @param wells_to_remove list of wells to remove before analysis. Defaults to
#' NULL.
#' @param xrange list of two numerical values corresponding to the wavelength
#' range to keep when fitting the LOESS model across the absorbance spectrum.
#' By default these values are 250nm and 800nm but where the data at the UV
#' range is noisy, adjusting the `xrange` can prevent errors in the fitting.
#' @param corr_method string corresponding to type of correction method to use
#' for the data to remove contribution of light scatter. Options are `none`,
#' `baseline` and `scatter`. Method `none` applies no correction. Method
#' `baseline` subtracts the absorbance value of the wavelength supplied in
#' `wav_to_use1`. Method `scatter` subtracts a fraction of the absorbance
#' value of the wavelength supplied in `wav_to_use2` according to scatter
#' theory (details in section).
#' @param wav_to_use1 numerical value of wavelength (nm) to use for `baseline`
#' correction. Defaults to 340nm.
#' @param wav_to_use2 numerical value of wavelength (nm) to use for `scatter`
#' correction. Defaults to 333nm.
#' @param outfolder path to folder where output files should be saved. Defaults
#' to current working directory.
#' @param csv_only logical. Saves only CSV files as outputs when `TRUE`.
#' Defaults to `FALSE`.
#'
#' @export
#'
#' @importFrom dplyr %>%
#' @importFrom rlang .data
#'
#' @examples
#' \dontrun{
#' ecmax_concs <- get_conc_ecmax(
#' protein_slug = "mcherry", protein_seq = protein_seq,
#' processed_spectrum_csv = "abs_parsed_processed.csv",
#' corr_method = "scatter", wav_to_use1 = 700, wav_to_use2 = 315,
#' outfolder = "protquant_ecmax/mCherry_T5N15pi"
#' )
#' }
get_conc_ecmax <- function(protein_slug, protein_seq,
processed_spectrum_csv, wells_to_remove = NULL,
xrange = c(250,800),
corr_method = "none", # "none", "baseline", "scatter"
wav_to_use1 = 340, wav_to_use2 = 333,
outfolder,
csv_only = FALSE
){
# Get data -------------------------------------------------
spectrum_data <- utils::read.csv(processed_spectrum_csv, header = TRUE, check.names = FALSE)
# Where protein is none, dilution is usually NA, so these rows are lost in later steps.
# To keep them, they need a value.
spectrum_data <- spectrum_data %>%
dplyr::mutate(dilution = ifelse(.data$protein == "none", 0, .data$dilution))
spectrum_data
unique(spectrum_data$dilution)
# Remove wells if requested -------------------------------------------------
if(!is.null(wells_to_remove)){
spectrum_data_subset <- spectrum_data %>%
dplyr::filter(!.data$well %in% wells_to_remove)
# unique(spectrum_data_subset$well)
} else {
spectrum_data_subset <- spectrum_data
}
# Location for saved outputs -------------------------------------------------
# make folder if it doesn't exist already
ifelse(test = !dir.exists(file.path(outfolder)), yes = dir.create(file.path(outfolder)), no = FALSE)
# 1. Replicates data -------------------------------------------------
# Check input data
# Plot absorbance spectra (250-800nm) of replicates separately as sanity check for reproducibility
data.to.plot <- spectrum_data_subset %>%
dplyr::filter(.data$measure > 250 & .data$measure < 800)
data.to.plot$dilution <- as.factor(data.to.plot$dilution) # make dilution a factor
newlist <- levels(data.to.plot$dilution)
data.to.plot$dilution <- factor(data.to.plot$dilution, levels = rev(newlist)) # reverse the order
plot1 <- ggplot2::ggplot(data.to.plot) +
ggplot2::geom_hline(yintercept = 0, colour = "grey") + # bottom
ggplot2::geom_point(ggplot2::aes(x = .data$measure, y = .data$normalised_cm1_value,
colour = as.factor(replicate)), # where you want to plot >1 replicate
size = 0.5) +
ggplot2::scale_x_continuous("wavelength (nm)", limits = c(250,800)) +
ggplot2::scale_y_continuous("absorbance (cm-1)") +
ggplot2::scale_color_discrete("replicate") +
ggplot2::facet_wrap(dilution ~ ., scales = "free") +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
legend.position = "bottom",
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
panel.grid.minor = ggplot2::element_blank(),
strip.background = ggplot2::element_blank(),
strip.text = ggplot2::element_text(hjust = 0, face = "bold")
)
plot1
plotname <- "plot1_abs_spectra_replicates.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 18, height = 18, units = "cm")
}
##
# 2. Averaged data -------------------------------------------------
# Take mean of duplicate readings for: raw_value, normalised_value, normalised_cm1_value
names(spectrum_data_subset)
summ_data <- spectrum_data_subset %>%
# For each dilution and wavelength
dplyr::group_by(.data$dilution, .data$measure,
.drop = FALSE) %>% # don't remove groups w no values
# Take mean of raw and normalised values
dplyr::mutate(raw_value = mean(.data$raw_value, na.rm = TRUE)) %>%
dplyr::mutate(raw_cm1_value = mean(.data$raw_cm1_value, na.rm = TRUE)) %>%
dplyr::mutate(normalised_cm1_value = mean(.data$normalised_cm1_value, na.rm = TRUE)) %>%
dplyr::ungroup() %>%
# Tidy
dplyr::select(-c(.data$replicate, .data$well, .data$row, .data$column,
.data$kfactor_well, .data$pathlength_each)) %>%
dplyr::distinct() # remove duplicate rows
names(summ_data)
summ_data # half the rows bc going from duplicates to averaged rows
##
# 3. Smooth data -------------------------------------------------
# Instead of taking the raw absorbance values, fit a loess model through the points to make the quantifications less sensitive to noise
# Plot absorbance spectrum loess with 'geom_smooth' (250-800nm)
data.to.plot <- summ_data %>%
dplyr::filter(.data$measure > 250 & .data$measure < 800)
data.to.plot$dilution <- as.factor(data.to.plot$dilution) # make dilution a factor
newlist <- levels(data.to.plot$dilution)
data.to.plot$dilution <- factor(data.to.plot$dilution, levels = rev(newlist)) # reverse the order
# Get ECmax wavelength of your FP for plotting
fp_properties <- fpcountr::get_fpbase_properties(slug = protein_slug, verbose = TRUE, outfolder = outfolder)
fp_properties
# fp_properties$ex_max # is excitation max
# fp_properties$ext_coeff # is EC
# If fp properties are missing, stop here
if(nrow(fp_properties) == 0){
message("Error: The FP properties table for ", protein_slug, " is empty.")
print(fp_properties)
message("Stopping..")
return()
}
# If key fp properties are missing, throw an error here (stop later)
if(is.na(fp_properties$ex_max) | is.na(fp_properties$ext_coeff)){
message("Error: The FP properties table for ", protein_slug, " does not have all the properties required for calculating protein concentration via the ECmax method.\nBoth excitation maximum (ex_max) and extinction coefficient (ext_coeff) are required.")
print(fp_properties)
# if fp_properties$ext_coeff = NA: section 4 will lead to EC_max_mgml = NA, and section 5 will fail.
# if fp_properties$ex_max = NA: section 3 plots ok with as.numeric(), then will fail at section 5.
# stop both after section 3.
}
plot1 <- ggplot2::ggplot(data.to.plot) +
ggplot2::geom_hline(yintercept = 0, colour = "grey") + # bottom
ggplot2::geom_point(ggplot2::aes(x = .data$measure, y = .data$normalised_cm1_value),
colour = "lightblue") +
ggplot2::geom_smooth(ggplot2::aes(x = .data$measure, y = .data$normalised_cm1_value),
colour = "black",
span = 0.5/5.5 ### could leave as default, us 0.5, or add as argument in function # 0.5 used for 250-350
) +
ggplot2::geom_vline(xintercept = as.numeric(fp_properties$ex_max), colour = "red") + # as.numeric to handle missing (NA) values without error
ggplot2::scale_x_continuous("wavelength (nm)", limits = c(250,800)) +
ggplot2::scale_y_continuous("absorbance (cm-1)") +
ggplot2::facet_wrap(dilution ~ ., scales = "free") +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
panel.grid.minor = ggplot2::element_blank(),
strip.background = ggplot2::element_blank(),
strip.text = ggplot2::element_text(hjust = 0, face = "bold")
)
plot1
plotname <- "plot3a_abs_spectra_geomsmooth.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 18, height = 18, units = "cm")
}
##
## Fit loess model for absorbance ~ wavelength (separately for each dilution)
# method slightly complex. using both:
# https://r4ds.had.co.nz/many-models.html
# https://stackoverflow.com/questions/50163106/loess-regression-on-each-group-with-dplyrgroup-by
# subset
spectrum_data_subset <- summ_data %>%
dplyr::filter(.data$measure > xrange[1]) %>% # default 250
dplyr::filter(.data$measure < xrange[2]) # default 800
# nest
spectrum_data_nested <- spectrum_data_subset %>%
dplyr::group_by(.data$instrument, .data$media, .data$calibrant, .data$dilution) %>%
tidyr::nest()
spectrum_data_nested
spectrum_data_nested$data
# purrr map to make model
spectrum_data_nested <- spectrum_data_nested %>%
dplyr::mutate(model = purrr::map(.data$data, stats::loess, formula = normalised_cm1_value ~ measure, span = 0.5/5.5)) # 0.5 for 250-350, 0.5/5.5 for 250-800
spectrum_data_nested
# extract fitted values!!
spectrum_data_nested <- spectrum_data_nested %>%
dplyr::mutate(fitted_cm1_value = purrr::map(.data$model, `[[`, "fitted"))
spectrum_data_nested
# remove model column (awkward and no longer necessary) and unnest
spectrum_data_model <- spectrum_data_nested %>%
dplyr::select(-.data$model) %>%
tidyr::unnest(cols = c(.data$data, .data$fitted_cm1_value))
spectrum_data_model
spectrum_data_model$fitted_cm1_value
# Model check, entire spectrum:
# Plot loess with geom_line - effectively make line plot of the loess fit using fitted_values column
# to check my loess fits are the same as what geom_smooth suggests:
data.to.plot <- spectrum_data_model
data.to.plot$dilution <- as.factor(data.to.plot$dilution) # make dilution a factor
newlist <- levels(data.to.plot$dilution)
data.to.plot$dilution <- factor(data.to.plot$dilution, levels = rev(newlist)) # reverse the order
plot1 <- ggplot2::ggplot(data.to.plot) +
ggplot2::geom_hline(yintercept = 0, colour = "grey") + # bottom
ggplot2::geom_point(ggplot2::aes(x = .data$measure, y = .data$normalised_cm1_value), colour = "lightblue") +
# geom_line of extracted loess fitted values
ggplot2::geom_line(ggplot2::aes(x = .data$measure, y = .data$fitted_cm1_value)) +
ggplot2::geom_vline(xintercept = as.numeric(fp_properties$ex_max), colour = "red") + # as.numeric to handle missing (NA) values without error
ggplot2::scale_x_continuous("wavelength (nm)", limits = c(250,800)) +
ggplot2::scale_y_continuous("absorbance (cm-1)") +
ggplot2::facet_wrap(dilution ~ ., scales = "free") +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
panel.grid.minor = ggplot2::element_blank(),
strip.background = ggplot2::element_blank(),
strip.text = ggplot2::element_text(hjust = 0, face = "bold")
)
plot1
plotname <- "plot3b_abs_spectra_modelcheck.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 18, height = 18, units = "cm")
}
##
# If key properties are missing, stop here
if(is.na(fp_properties$ex_max) | is.na(fp_properties$ext_coeff)){
message("Stopping..")
return()
}
# 4. Get extinction coefficient ------------------------------------------------------------
## 4a. Get protein sequence
protein_seq
## 4b. Get MW
protein_mw <- fpcountr::get_mw(protein = protein_seq)
protein_mw
## 4c. Get mgml extinction coefficient (get_fpbase_properties fn call moved up to allow indication of ECmax in previous plots)
# fp_properties <- fpcountr::get_fpbase_properties(slug = protein_slug, verbose = TRUE, outfolder = outfolder, filename = filename)
# fp_properties
# fp_properties$ex_max # is excitation max
# fp_properties$ext_coeff # is EC
## 4d. Add 'ECmax of 1mg/ml of protein' to table
# EC of 0.1% solution = 0.1g/100ml = 0.001g/ml = 1mg/ml
# EC/MW = (M-1cm-1)/(g/mol) = (mol/L)-1 * (cm)-1 / (g/mol) = (L/mol)*(1/cm)*(g/mol) = L/g * 1/cm = (g/L)-1(cm)-1 = (mg/ml)-1(cm)-1
# = EC (mg/ml)-1(cm)-1
df_2 <- spectrum_data_model %>%
dplyr::mutate(EC_max_mgml = fp_properties$ext_coeff / protein_mw)
utils::head(df_2)
##
# 5. Calculate protein concentration ------------------------------------------------------------
# 5a. Correction method - none ------------------------------------------------------------
# Standard method with no correction
df_3 <- c()
for(diln in unique(df_2$dilution)){
# Subset
temp_diln_values <- df_2 %>%
dplyr::filter(.data$dilution == diln)
# Abs
abs_max <- temp_diln_values %>%
dplyr::filter(.data$measure == fp_properties$ex_max) %>%
dplyr::ungroup() %>%
dplyr::select(.data$fitted_cm1_value) %>% # so pathlength = 1cm
as.numeric()
abs_max
# Add this Abs value as column
temp_diln_values <- temp_diln_values %>%
dplyr::mutate(absmax_std = abs_max)
temp_diln_values
# Get conc from this Abs and add it into df
temp_diln_values <- temp_diln_values %>%
dplyr::mutate(conc_max_std = .data$absmax_std/.data$EC_max_mgml)
temp_diln_values
# Rbind
df_3 <- rbind(df_3, temp_diln_values)
}
df_3
# Plot processing steps
plot1 <- ggplot2::ggplot(data = df_3 %>%
dplyr::filter(.data$measure == fp_properties$ex_max)) +
ggplot2::geom_point(ggplot2::aes(x = .data$dilution, y = .data$raw_value), colour = "grey") +
ggplot2::geom_point(ggplot2::aes(x = .data$dilution, y = .data$raw_cm1_value), colour = "black") +
ggplot2::geom_point(ggplot2::aes(x = .data$dilution, y = .data$normalised_cm1_value), colour = "red") +
ggplot2::geom_point(ggplot2::aes(x = .data$dilution, y = .data$fitted_cm1_value), colour = "blue") +
ggplot2::scale_x_continuous("dilution", limits = c(0,1)) +
ggplot2::scale_y_continuous("absorbance") +
ggplot2::labs(subtitle = "raw data and processing steps",
caption = "raw: grey, raw_cm1: black,\nnormalised_cm1: red, fitted: blue") +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
panel.grid.minor = ggplot2::element_blank()
)
plot1
plotname <- "plot5a_ecmax.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 8, height = 8, units = "cm")
}
# Subset data to get rid of negatives:
df_3_subset <- subset(df_3, df_3$conc_max_std >= 0)
# Fit model as (Y ~ X):
model1 <- stats::lm(conc_max_std ~ dilution + 0, data = df_3_subset) # force through zero to compare points to ideal
model1
plot1 <- ggplot2::ggplot() +
# all points
ggplot2::geom_point(data = df_3,
ggplot2::aes(x = .data$dilution, y = .data$conc_max_std*1000)) +
# points used in model
ggplot2::geom_point(data = df_3_subset,
ggplot2::aes(x = .data$dilution, y = .data$conc_max_std*1000),
colour = "red") +
ggplot2::geom_line(data = df_3, # to extend line to zero
ggplot2::aes(x = .data$dilution,
y = (stats::predict(model1, df_3))*1000),
colour = "red") +
ggplot2::scale_x_continuous("dilution", limits = c(0,1)) +
ggplot2::scale_y_continuous("concentration (ng/ul)") +
ggplot2::labs(subtitle = "predicted conc (correction = none)") +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
panel.grid.minor = ggplot2::element_blank()
)
plot1
plotname <- "plot5b_ecmax_stdmethod.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 8, height = 8, units = "cm")
}
##
# 5b. Correction method1 - baseline ------------------------------------------------------------------------
# Correction method1 to remove scatter
# Baseline aka Nanodrop method: normalise to A340
# So use Abs = abs_ecmax - abs_340
# ND-1000 Spectrophotometer V3.5 User’s Manual > Section 8- Protein A280 Spectrum Normalization
# The baseline is automatically set to the absorbance value of the sample at 340 nm,
# which should be very nearly zero absorbance. All spectra are referenced off of this zero.
# Edit: wavelength used here defined by wav_to_use1, with default 340nm.
df_4 <- c()
for(diln in unique(df_3$dilution)){
# Subset
temp_diln_values <- df_3 %>%
dplyr::filter(.data$dilution == diln)
# max
abs_max <- temp_diln_values %>%
dplyr::filter(.data$measure == fp_properties$ex_max) %>%
dplyr::ungroup() %>%
dplyr::select(.data$fitted_cm1_value) %>% # so pathlength = 1cm
as.numeric()
abs_max
abs_340 <- temp_diln_values %>%
dplyr::filter(.data$measure == wav_to_use1) %>%
dplyr::ungroup() %>%
dplyr::select(.data$fitted_cm1_value) %>% # so pathlength = 1cm
as.numeric()
abs_340
# Add corrected value as column
temp_diln_values <- temp_diln_values %>%
dplyr::mutate(wav_corr1 = wav_to_use1) %>% # record wav_to_use1
dplyr::mutate(absmax_corr1 = abs_max - abs_340) # normalise
temp_diln_values
# Get conc from this Abs and add it into df
temp_diln_values <- temp_diln_values %>%
dplyr::mutate(conc_max_corr1 = .data$absmax_corr1/.data$EC_max_mgml)
temp_diln_values
# Rbind
df_4 <- rbind(df_4, temp_diln_values)
}
df_4
# Baseline fit check, entire spectrum:
data.to.plot <- df_4
data.to.plot$dilution <- as.factor(data.to.plot$dilution) # make dilution a factor
newlist <- levels(data.to.plot$dilution)
data.to.plot$dilution <- factor(data.to.plot$dilution, levels = rev(newlist)) # reverse the order
# use top dilution
data.to.plot <- subset(data.to.plot, data.to.plot$dilution == levels(data.to.plot$dilution)[1])
# fit baseline to data at chosen wavelength
baselinefit <- data.to.plot %>%
dplyr::filter(.data$measure == wav_to_use1) %>%
dplyr::ungroup() %>%
dplyr::select(.data$fitted_cm1_value) %>% # so pathlength = 1cm
as.numeric()
baselinefit
#
plot1 <- ggplot2::ggplot(data.to.plot) +
ggplot2::geom_hline(yintercept = 0, colour = "grey") + # bottom
ggplot2::geom_point(ggplot2::aes(x = .data$measure, y = .data$normalised_cm1_value), colour = "lightblue") +
# geom_line of extracted loess fitted values
ggplot2::geom_line(ggplot2::aes(x = .data$measure, y = .data$fitted_cm1_value)) +
ggplot2::geom_vline(xintercept = fp_properties$ex_max, colour = "red") +
# baseline fit
ggplot2::geom_hline(yintercept = baselinefit, colour = "blue") +
ggplot2::scale_x_continuous("wavelength (nm)", limits = c(xrange[1],xrange[2])) +
ggplot2::scale_y_continuous("absorbance (cm-1)") +
ggplot2::facet_wrap(dilution ~ ., scales = "free") +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
panel.grid.minor = ggplot2::element_blank(),
strip.background = ggplot2::element_blank(),
strip.text = ggplot2::element_text(hjust = 0, face = "bold")
)
plot1
plotname <- "plot5c_ecmax_baselinenorm_baselinecheck.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 8, height = 8, units = "cm")
}
# Subset data to get rid of negatives:
df_4_subset <- subset(df_4, df_4$conc_max_corr1 >= 0)
# Fit model as (Y ~ X):
model2 <- stats::lm(conc_max_corr1 ~ dilution + 0, data = df_4_subset) # force through zero to compare points to ideal
model2
plot1 <- ggplot2::ggplot() +
# all points
ggplot2::geom_point(data = df_4,
ggplot2::aes(x = .data$dilution, y = .data$conc_max_corr1*1000)) +
# points used in model
ggplot2::geom_point(data = df_4_subset,
ggplot2::aes(x = .data$dilution, y = .data$conc_max_corr1*1000),
colour = "red") +
ggplot2::geom_line(data = df_4, # to extend line to zero
ggplot2::aes(x = .data$dilution,
y = (stats::predict(model2, df_4))*1000),
colour = "red") +
ggplot2::scale_x_continuous("dilution", limits = c(0,1)) +
ggplot2::scale_y_continuous("concentration (ng/ul)") +
ggplot2::labs(subtitle = "predicted conc (correction = baseline)") +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
panel.grid.minor = ggplot2::element_blank()
)
plot1
plotname <- "plot5c_ecmax_baselinenorm.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 8, height = 8, units = "cm")
}
##
# 5c. Correction method2 - scatter ------------------------------------------------------------
# Correction method2 to remove scatter
# Scatter method:
# For A280 measurements, A333 gives 1/2 the scatter as A280, so use Abs = abs_280 - 2*abs_333
# For ECmax measurements, calculate scatter_ratio between A333 and ECmax wavelength, then use Abs = abs_ecmax - scatter_ratio*abs_333
# Edit: wavelength used here defined by wav_to_use2, with default 333nm.
# ## Correcting for ECmax - how and why?
# # the optical density due to light scatter varies with 1/wavelength^4
# # y = 1/x^4
# eq = function(x){1/x^4}
# ggplot2::ggplot(data = data.frame(x = c(200,800)),
# ggplot2::aes(x = x)) +
# ggplot2::stat_function(fun = eq) +
# ggplot2::scale_x_continuous("wavelength (nm)") +
# ggplot2::scale_y_continuous("light scatter (au)") +
# ggplot2::theme_bw() +
# ggplot2::theme(
# aspect.ratio = 1,
# panel.grid.minor = ggplot2::element_blank()
# )
# # light scatter at ecmax (eg 500nm) is not same level as light scatter elsewhere
# scatter_a500 <- eq(500)
# scatter_a500 # 1.626926e-10
# # to find light scatter at ecmax when there's a peak in the way,
# # need to find it by looking for light scatter at a different wavelength, then doing a transformation
# # use A333 or a wavelength where FP absorbs minimally, and find ratio
# # eg. A333
# scatter_a333 <- eq(333)
# scatter_ecmax <- eq(587) # for eg
# scatter_ratio <- scatter_ecmax/scatter_a333
# scatter_ratio
# # so scatter at ecmax should be ~0.1x scatter at a333...
# Work out scatter_ratio.
# What is ratio between scatter at ECmax wavelength and scatter at the user-defined wav_to_use2 wavelength?
eq = function(x){1/x^4}
scatter_a333 <- eq(wav_to_use2) # 333 by default
scatter_a333
scatter_ecmax <- eq(fp_properties$ex_max)
scatter_ecmax
scatter_ratio <- scatter_ecmax/scatter_a333
scatter_ratio # the scatter ratio should be <1.
df_5 <- c()
for(diln in unique(df_4$dilution)){
# Subset
temp_diln_values <- df_4 %>%
dplyr::filter(.data$dilution == diln)
# max
abs_max <- temp_diln_values %>%
dplyr::filter(.data$measure == fp_properties$ex_max) %>%
dplyr::ungroup() %>%
dplyr::select(.data$fitted_cm1_value) %>% # so pathlength = 1cm
as.numeric()
abs_max
abs_333 <- temp_diln_values %>%
dplyr::filter(.data$measure == wav_to_use2) %>%
dplyr::ungroup() %>%
dplyr::select(.data$fitted_cm1_value) %>% # so pathlength = 1cm
as.numeric()
abs_333
# Add corrected value as column
temp_diln_values <- temp_diln_values %>%
dplyr::mutate(wav_corr2 = wav_to_use2) %>% # record wav_to_use2
dplyr::mutate(abs_corr2 = abs_333) %>% # record abs of wav_to_use2 (needed for plot5d scatter check)
dplyr::mutate(absmax_corr2 = abs_max - scatter_ratio*abs_333) # normalise
temp_diln_values
# Get conc from this value and add it into df
temp_diln_values <- temp_diln_values %>%
dplyr::mutate(conc_max_corr2 = .data$absmax_corr2/.data$EC_max_mgml)
temp_diln_values
# Rbind
df_5 <- rbind(df_5, temp_diln_values)
}
df_5
# Scatter fit check, entire spectrum:
data.to.plot <- df_5
data.to.plot$dilution <- as.factor(data.to.plot$dilution) # make dilution a factor
newlist <- levels(data.to.plot$dilution)
data.to.plot$dilution <- factor(data.to.plot$dilution, levels = rev(newlist)) # reverse the order
# use only one dilution (top dilution), as stat_function doesn't play well with faceted plots
data.to.plot <- subset(data.to.plot, data.to.plot$dilution == levels(data.to.plot$dilution)[1])
## fit scatter eqn to data at chosen wavelength
eq = function(x){1/x^4}
# eq(wav_to_use2) # value at scatter wavelength for 1/x^4 eqn
# unique(data.to.plot$abs_corr2) # abs at scatter wavelength
# coefficient to supply to coefficient*(1/x^4) eqn to fit scatter curve to data:
scatterfit_coefficient <- unique(data.to.plot$abs_corr2)/eq(wav_to_use2)
scatterfit_coefficient
#
plot1 <- ggplot2::ggplot(data.to.plot) +
ggplot2::geom_hline(yintercept = 0, colour = "grey") + # bottom
ggplot2::geom_point(ggplot2::aes(x = .data$measure, y = .data$normalised_cm1_value), colour = "lightblue") +
# geom_line of extracted loess fitted values
ggplot2::geom_line(ggplot2::aes(x = .data$measure, y = .data$fitted_cm1_value)) +
ggplot2::geom_vline(xintercept = fp_properties$ex_max, colour = "red") +
# scatter fit
ggplot2::stat_function(fun = function(x) scatterfit_coefficient/x^4, colour = "blue") +
ggplot2::scale_x_continuous("wavelength (nm)", limits = c(xrange[1],xrange[2])) +
ggplot2::scale_y_continuous("absorbance (cm-1)") +
ggplot2::facet_wrap(dilution ~ ., scales = "free") +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
panel.grid.minor = ggplot2::element_blank(),
strip.background = ggplot2::element_blank(),
strip.text = ggplot2::element_text(hjust = 0, face = "bold")
)
plot1
plotname <- "plot5d_ecmax_scatternorm_scattercheck.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 8, height = 8, units = "cm")
}
# Subset data to get rid of negatives:
df_5_subset <- subset(df_5, df_5$conc_max_corr2 >= 0)
# Fit model as (Y ~ X):
model3 <- stats::lm(conc_max_corr2 ~ dilution + 0, data = df_5_subset) # force through zero to compare points to ideal
model3
# Plot
plot1 <- ggplot2::ggplot() +
# all points
ggplot2::geom_point(data = df_5,
ggplot2::aes(x = .data$dilution, y = .data$conc_max_corr2*1000)) +
# points used in model
ggplot2::geom_point(data = df_5_subset,
ggplot2::aes(x = .data$dilution, y = .data$conc_max_corr2*1000),
colour = "red") +
ggplot2::geom_line(data = df_5, # to extend line to zero
ggplot2::aes(x = .data$dilution,
y = (stats::predict(model3, df_5))*1000),
colour = "red") +
ggplot2::scale_x_continuous("dilution", limits = c(0,1)) +
ggplot2::scale_y_continuous("concentration (ng/ul)") +
ggplot2::labs(subtitle = "predicted conc (correction = scatter)") +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
panel.grid.minor = ggplot2::element_blank()
)
plot1
plotname <- "plot5d_ecmax_scatternorm.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 8, height = 8, units = "cm")
}
##
# 5d. Compare methods
# Redundant unless fits fail.
# # Plot
# df_6 <- df_5 %>%
# # dplyr::ungroup() %>%
# dplyr::select(calibrant, protein, dilution,
# measure,
# conc_max_std, conc_max_corr1, conc_max_corr2) %>%
# dplyr::filter(measure == fp_properties$ex_max)
# df_6 <- df_6 %>%
# tidyr::pivot_longer(cols = c(
# conc_max_std, conc_max_corr1, conc_max_corr2),
# names_to = "type", values_to = "value")
# df_6
#
# df_6$type <- factor(df_6$type, levels = c("conc_max_std", "conc_max_corr1", "conc_max_corr2"))
# type.labs <- c("standard", "baseline", "scatter")
# names(type.labs) <- c("conc_max_std", "conc_max_corr1", "conc_max_corr2")
# plot1 <- ggplot2::ggplot(data = df_6) +
#
# ggplot2::geom_point(ggplot2::aes(x = dilution, y = value*1000)) +
# ggplot2::geom_hline(yintercept = 0) +
#
# ggplot2::scale_x_continuous("dilution", limits = c(0,1)) +
# ggplot2::scale_y_continuous("concentration (ng/ul)") +
# ggplot2::facet_grid(. ~ type,
# labeller = ggplot2::labeller(type = type.labs)) +
# ggplot2::theme_bw() +
# ggplot2::theme(
# aspect.ratio = 1,
# axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
# panel.grid.minor = ggplot2::element_blank(),
# strip.background = ggplot2::element_blank(),
# strip.text = ggplot2::element_text(face = "bold", hjust = 0)
# )
# plot1
# plotname <- "plot5e1_ecmax_all.pdf"
# ggplot2::ggsave(file.path(outfolder, plotname),
# plot = plot1,
# width = 12, height = 12, units = "cm")
#
# plot1 <- ggplot2::ggplot(data = df_6 %>%
# dplyr::filter(dilution != 0)) +
# ggplot2::geom_point(ggplot2::aes(x = dilution, y = value*1000)) +
# ggplot2::scale_x_log10("dilution") +
# ggplot2::scale_y_log10("concentration (ng/ul)") +
# ggplot2::facet_grid(. ~ type,
# labeller = ggplot2::labeller(type = type.labs)) +
# ggplot2::theme_bw() +
# ggplot2::theme(
# aspect.ratio = 1,
# axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
# panel.grid.minor = ggplot2::element_blank(),
# strip.background = ggplot2::element_blank(),
# strip.text = ggplot2::element_text(face = "bold", hjust = 0),
# )
# plot1
# plotname <- "plot5e2_ecmax_all_logplot.pdf"
# ggplot2::ggsave(file.path(outfolder, plotname),
# plot = plot1,
# width = 12, height = 12, units = "cm")
##
# 6. Fit models (conc ~ dilution) ----------------------------------------------------------------
## Tidy data
df_6 <- df_5 %>%
dplyr::ungroup() %>%
dplyr::select(.data$media, .data$calibrant, .data$protein, .data$dilution,
.data$measure,
.data$conc_max_std, .data$conc_max_corr1, .data$conc_max_corr2) %>%
dplyr::filter(.data$measure == fp_properties$ex_max)
df_6 <- df_6 %>%
tidyr::pivot_longer(cols = c(
.data$conc_max_std, .data$conc_max_corr1, .data$conc_max_corr2),
names_to = "type", values_to = "value")
df_6
df_6$type <- factor(df_6$type, levels = c("conc_max_std", "conc_max_corr1", "conc_max_corr2"))
## Fit model for conc ~ dilution ---------
# remove blanks
df_7 <- df_6 %>%
dplyr::filter(.data$protein != "none")
# nest to get to df with three rows, one per model required
# nesting methods from https://r4ds.had.co.nz/many-models.html
df_7_nested <- df_7 %>%
dplyr::group_by(.data$media, .data$calibrant, .data$protein, .data$measure, .data$type) %>%
tidyr::nest() # basically nesting dilution and value cols only
df_7_nested
df_7_nested$data
# purrr map to make models
df_7_nested <- df_7_nested %>%
dplyr::mutate(model = purrr::map(.data$data, stats::lm, formula = value ~ dilution + 0))
df_7_nested
df_7_nested$model
# extract fitted values
df_7_nested <- df_7_nested %>%
dplyr::mutate(tidied = purrr::map(.data$model, broom::tidy)) %>% # tidy fn from broom # https://cran.r-project.org/web/packages/broom/vignettes/broom_and_dplyr.html
tidyr::unnest(.data$tidied)
df_7_nested
# save model coefficients
df_7_model_tosave <- df_7_nested %>%
dplyr::select(-.data$model, -.data$data)
df_7_model_tosave
csvname <- "ecmax_coeffs.csv"
utils::write.csv(x = df_7_model_tosave, file = file.path(outfolder, csvname), row.names = FALSE)
### Plot points and model fits
df_9 <- df_7_nested %>%
tidyr::unnest(.data$data) %>%
dplyr::mutate(pred_conc = .data$estimate*.data$dilution)
df_9
type.labs <- c("none", "baseline", "scatter")
names(type.labs) <- c("conc_max_std", "conc_max_corr1", "conc_max_corr2")
plot1 <- ggplot2::ggplot() +
# data
ggplot2::geom_point(data = df_6, ggplot2::aes(x = .data$dilution, y = .data$value*1000)) +
# model
ggplot2::geom_line(data = df_9, ggplot2::aes(x = .data$dilution, y = .data$pred_conc*1000)) +
ggplot2::geom_hline(yintercept = 0) +
ggplot2::scale_x_continuous("dilution", limits = c(0,1)) +
ggplot2::scale_y_continuous("concentration (ng/ul)") +
ggplot2::facet_grid(. ~ type,
labeller = ggplot2::labeller(type = type.labs)) +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
panel.grid.minor = ggplot2::element_blank(),
strip.background = ggplot2::element_blank(),
strip.text = ggplot2::element_text(face = "bold", hjust = 0)
)
plot1
plotname <- "plot6a_ecmax_models_all.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 12, height = 12, units = "cm")
}
plot1 <- ggplot2::ggplot() +
# data
ggplot2::geom_point(data = df_6 %>%
dplyr::filter(.data$dilution != 0),
ggplot2::aes(x = .data$dilution, y = .data$value*1000)) +
# model
ggplot2::geom_line(data = df_9 %>%
dplyr::filter(.data$dilution != 0),
ggplot2::aes(x = .data$dilution, y = .data$pred_conc*1000)) +
ggplot2::scale_x_log10("dilution") +
ggplot2::scale_y_log10("concentration (ng/ul)") +
ggplot2::facet_grid(. ~ type,
labeller = ggplot2::labeller(type = type.labs)) +
ggplot2::theme_bw() +
ggplot2::theme(
aspect.ratio = 1,
axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
panel.grid.minor = ggplot2::element_blank(),
strip.background = ggplot2::element_blank(),
strip.text = ggplot2::element_text(face = "bold", hjust = 0),
)
plot1
plotname <- "plot6b_ecmax_models_all_logplot.pdf"
if(isFALSE(csv_only)){
ggplot2::ggsave(file.path(outfolder, plotname),
plot = plot1,
width = 12, height = 12, units = "cm")
}
# 7. Export concentration predictions ---------------
### Tidy dfs for export
# NB Need to use a base df that contains columns like well, row, column - use input df
## Prep base df
names(spectrum_data)
spectrum_data_tidy <- spectrum_data %>%
dplyr::select(c( .data$media:.data$well )) %>% # take all cols between A:B - https://suzan.rbind.io/2018/01/dplyr-tutorial-1/
dplyr::select(c( -.data$volume )) # remove volume as downstream assays may have used different volumes
names(spectrum_data_tidy)
# loads of duplicated rows here now that measure (wavelength) and absorption value columns have been deleted
nrow(spectrum_data_tidy) # eg. 19,224
spectrum_data_tidy <- spectrum_data_tidy %>%
dplyr::distinct()
nrow(spectrum_data_tidy) # eg. 24 (so there were 801 wavelengths)
spectrum_data_tidy
## Build new df
conc_data <- spectrum_data_tidy
conc_data
# complete MW column
protein_seq
conc_data$mw_gmol1 <- fpcountr::get_mw(protein = protein_seq)
# Pick correction method
df_7_nested
if(corr_method == "none"){
conc_fit_coef <- df_7_nested %>%
dplyr::filter(.data$type == "conc_max_std") %>%
dplyr::ungroup() %>%
dplyr::select(.data$estimate) %>%
as.numeric()
conc_fit_coef
}
if(corr_method == "baseline"){
conc_fit_coef <- df_7_nested %>%
dplyr::filter(.data$type == "conc_max_corr1") %>%
dplyr::ungroup() %>%
dplyr::select(.data$estimate) %>%
as.numeric()
conc_fit_coef
}
if(corr_method == "scatter"){
conc_fit_coef <- df_7_nested %>%
dplyr::filter(.data$type == "conc_max_corr2") %>%
dplyr::ungroup() %>%
dplyr::select(.data$estimate) %>%
as.numeric()
conc_fit_coef
}
# add concs
conc_data <- conc_data %>%
# Concentration = predicted concentration for the highest dilution, multiplied down for each dilution
dplyr::mutate(concentration_ngul = conc_fit_coef * .data$dilution * 1000) # *1000 to convert from ug/ul to ng/ul
conc_data
# fill in protein column where dilution exists (for ease of future joining)
conc_data <- conc_data %>%
dplyr::mutate(protein = ifelse(.data$protein == "" & !is.na(.data$dilution), .data$calibr, .data$protein))
# if protein column is empty AND dilution is not NA, add the calibr into the protein column, else, leave protein entry as it is
conc_data
##
# Save and Return -----------
# Everything
csvname <- gsub(".csv", paste0("_ecmax.csv"), basename(processed_spectrum_csv))
utils::write.csv(x = df_5, file = file.path(outfolder, csvname), row.names = FALSE)
# Protein concentrations
csvname <- "protein_concs_ecmax.csv"
utils::write.csv(x = conc_data, file = file.path(outfolder, csvname), row.names = FALSE)
return(conc_data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.