# TODO: replace tidyverse with base R
# TODO: include sample datasets that would allow running of examples
# TODO: add export functions for saving processed data
#### Use openxlsx to create multi-sheet files that document the processing choices made along with the data
#' Ensure proper formatting of sample names
#'
#' Eliminate special characters and ensure proper formatting of sample names to
#' comply with GCalignR formatting requirements. This function is used
#' internally in the \code{df_to_gcinput()} function
#'
#' Input must be a named list of dataframes, where each list element is named
#' with corresponding sample name, with no special characters in sample names.
#' Each dataframe must have at minimum one column for retention time, but can
#' also include columns for additional data. Retention time column must be
#' named 'rt'
#'
#' @param \code{vector} Character vector containing sample names.
#'
#' @return Returns a vector of sample names with special characters removed.
#' ' ' and '-' are replaced with '_' and '_to_', respectively, and removes file
#' '.raw' file extension. The special characters replaced are currently
#' limited, but future versions will be more comprehensive.
#'
#' @examples
#'
#'
#'
format_names_gcalignr <- function(vector){
# Function replaces ' ' and '-' with '_' and '_to_', respectively, and
# removes file '.raw' file extension. Utlimately, should replace all
# special characters, but these are the only ones found in SPRUCE data.
# This is used in df_to_gc_input()
# strips special characters and file extension
samp_name <- vector %>%
str_replace_all(pattern = '\\s', replacement = '_') %>%
str_replace_all(pattern = '-', replacement = '_to_') %>%
str_remove('.raw')
samp_name
}
# Example
#format_names_gcalignr(batch_fame_df[['BatchDataFileName']])
#' Format GC data for GCalignR
#'
#' Convert GC peak lists into the format required for alignment using the
#' GCalignR package. This function formats data to meet the folowing
#' requirements:
#'
#' Input must be a named list of dataframes, where each list element is named
#' with corresponding sample name, with no special characters in sample names.
#' Each dataframe must have at minimum one column for retention time, but can
#' also include columns for additional data. Retention time column must be
#' named 'rt'
#'
#' @param \code{batch_df} peak list dataframe or tibble from GC run in
#' IonVantage. The recommended column are as follows:
#' Batch, DataFileName, RetTimeSecs, MajorHeightnA, TotalPeakArea1,
#' DisplayDelta1, Name. This can be the output from \code{import_batch()} or
#' \code{import_batch_multi()}.
#'
#' @return Returns a nanmed list of dataframes, with each dataframe
#' reepresenting the peak list for a given sample.
#'
#' @examples
#'
#' @export
#'
df_to_gcinput <- function(batch_df){ # same as above, but avoids for loop
#
# Converts dataframe of imported GC batch to suitable input for GCalignR
#
# Args:
# batch_df: dataframe created from Isoprime batch log file (cols = ID_With batch,
# Name, RT (Sec), Height (nA), Corrected 13C Peak Name Sum Peak Area)
# - This should be direct import of 'F1CO2-log' sheet of results
#
# Returns:
# input_list: list of dataframes formatted for direct input to align_chromatograms()
# - each element will be sample name (whitespace and special char replaced with "_")
# - dataframe cols = "rt" and "area"
# Convert input dataframe to list of dataframes w/proper colnames
input_list <- lapply(unique(batch_df$DataFileName),
function(x){df <- batch_df %>%
filter(DataFileName == x) %>%
select(RetTimeSecs, TotalPeakArea1, MajorHeightnA) %>%
arrange(RetTimeSecs) %>% # Need to sort on RT or interferes w/alignment
rename(rt = RetTimeSecs, area = TotalPeakArea1,
height = MajorHeightnA)
})
# Extract sample names from dataframe and replace special char w/_
sample_names <- vapply(unique(batch_df$DataFileName),
function(x){
format_names_gcalignr(x)
},
FUN.VALUE = 'character')
# Replace column names
names(input_list) <- unique(sample_names)
input_list
}
# Example
#check_input(df_to_gcinput(batch_fame_df))
#' Apply lipid names to aligned chromatograms
#'
#' Applies lipid names to all samples Using the specified named reference
#' chromatogram
#'
#'
#' @param \code{gc_align} GCalignR object containing peak alignment.
#'
#' @param \code{ref_samp_df} Named peak list dataframe in same format as output
#' \code{from import_batch()} or \code{import_batch_multi()}.
#'
#' @param \code{ref_samp} Character string of the name of the sample used as
#' the reference chromatogram for naming. THe sample name must be in proper
#' GCalignR format.
#'
#' @return This version returnss a long-form dataframe while maintaining peak
#' height and area data. Output includes following columns: BatchDataFileName,
#' RetTimeSecs, TotalPeakArea1, MajorHeightnA, Peak_Name
#'
#' @examples
#'
#' @export
#'
name_alignment <- function(gcalign, ref_samp_df, ref_samp){
# This function attaches names to peak list from GCalignR. This version
# outputs long-form dataframe while maintaining peak height and area
# data. Output includes following columns: DataFileName,
# RetTimeSecs, TotalPeakArea1, MajorHeightnA, Peak_Name
# extract aligned peak list from GCalignR obj
# sort = F should prevent resorting df, but still sorts on the merge col
# Identical samp names will get prefixed w/name of list item (eg. rt., area.)
gcalign[['aligned']][['rt']] <- merge(gcalign[['aligned']][['rt']],
ref_samp_df,
# could make this more automatic if desired
# Is reference stored anywhere?
by.x = c(ref_samp), # Formatted name of sample used as reference
by.y = c('RetTimeSecs'),
all.x = T,
sort = F) %>%
# Need to re-sort on mean_RT, which is how all alignments are initially sorted
arrange(mean_RT)
# also sort the area and height df in the aligned list to ensure matching
# Before I had both sorted, the sometimmes failed for unknown reason
gcalign[['aligned']][['area']] <- arrange(gcalign[['aligned']][['area']], mean_RT)
gcalign[['aligned']][['height']] <- arrange(gcalign[['aligned']][['height']], mean_RT)
# Convert to long format
# names_sep arg specifies new separate val cols created for each prefix
# portion after prefix will become entry in DataFileName col
tmp_df <- do.call('cbind', gcalign[['aligned']]) %>%
pivot_longer(cols = -rt.Name,
names_to = c('.value', 'DataFileName'),
names_sep = '\\.') %>%
rename(Name = rt.Name, RetTimeSecs = rt,
TotalPeakArea1 = area, MajorHeightnA = height)
# If aligned rt, area, height somehow get out of order, data will get
# associated with wrong peak. This adds a check for correct binding
if(nrow(tmp_df %>% filter(RetTimeSecs == 0 & TotalPeakArea1 != 0,
RetTimeSecs == 0 & MajorHeightnA != 0)) > 0){
warning('Peak data improperly aligned with retention time')
}
# Filter RetTimeSecs == 0, since these represent no peak
out_df <- tmp_df %>% filter(RetTimeSecs != 0) %>%
select(DataFileName, Name, RetTimeSecs,
MajorHeightnA, TotalPeakArea1)
out_df
}
# Example
#print(name_alignment(al, fame_ref, format_names_gcalignr(ref_samp)))
#' Run peak alignment and assign peak names
#'
#' Uses a single reference chromatogram to align a batch of chromatograms and
#' assign peak names based on the reference. This function wraps all stages of
#' data prep.
#'
#'
#' @param \code{df} peak list dataframe or tibble from GC run in
#' IonVantage. The recommended column are as follows:
#' Batch, DataFileName, RetTimeSecs, MajorHeightnA, TotalPeakArea1,
#' DisplayDelta1, Name. This can be the output from \code{import_batch()} or
#' \code{import_batch_multi()}.
#'
#' @param \code{reference} Character string of the name of file used as your
#' reference chromatogram for alignment and naming. This should be in the same
#' format as in the peak list dataframe (i.e. not formatted for GCalignR)
#'
#' @return
#'
#' @examples
#'
#' @export
#'
align_name_chrom <- function(df, #ref_samp,
rt_col_name = 'rt', # retention time variable name
rt_cutoff_low = 400, # remove peaks below 15 Minutes
rt_cutoff_high = 3600, # remove peaks exceeding 45 Minutes
reference = NULL, # choose automatically NOTE: Needs to be in format matching source df DataFileName
max_linear_shift = 25, # max. shift for linear corrections
max_diff_peak2mean = 20, # max. distance of a peak to the mean across samples
min_diff_peak2peak = 20, # min. expected distance between peaks
blanks = NULL, # negative control
delete_single_peak = FALSE, # delete peaks that are present in just one sample
write_output = NULL,# add variable names to write aligned data to text files)
remove_empty = F){ # removes row if no peak
# This is just a giant wrapper function created specifically for the purpose
# of quickly testing automated naming of PLFA batches using GCalignR. Input a
# PLFA peak list, the filename of a reference sample, and any
# GCalignR::align_chromatograms() args. This will return a dataframe showing
# retention time of each peak for each file in the batch along with the
# automatically generated name and the originally assigned name. Must use
# named peak lists for this to work properly.
# Additionally, this will print GCalignR::gc_heatmap() and summary tibbles
# showing the number of orginal to auto-assigned name mismatches for each
# and overa the entire batch.
# (user-defined fun) NOTE: spaces (and special chars) converted to underscore
input_list <- df_to_gcinput(df)
# Check the input for formatting errors
GCalignR::check_input(input_list)
# summarize distribution of dist between peaks - use to set alignment params
GCalignR::peak_interspace(data = input_list, rt_col_name = 'rt')
### NEW ###
# Extract named peak list for reference sample
# This will be used to assign names to aligned peak lists
# use a dataframe to preserve original names so we can restore in the end
sample_name_df <- df %>%
mutate(GCalignName = format_names_gcalignr(DataFileName)) %>%
select(OriginDataFileName = DataFileName, GCalignName) %>%
distinct()
### NEW ###
# store a dataframe of the original names and corresponding GCalignR names in order
# to restore at the end
#origin_name_df <- data.frame()
name_ref <- df %>%
filter(DataFileName == reference) %>%
select(RetTimeSecs, Name) #####switched from Peak_Name to Name to be consistent with plfaR format
# Remove special characters and make sure supplied sample name matches
# GCalignR format
ref_samp <- format_names_gcalignr(reference)
# Function just wraps a few steps that I'm using to test how alignment works
# Mostly just modifying the input list and re-running
align <- GCalignR::align_chromatograms(data = input_list, # input data
rt_col_name = rt_col_name, # retention time variable name
rt_cutoff_low = rt_cutoff_low, # remove peaks below 15 Minutes
rt_cutoff_high = rt_cutoff_high, # remove peaks exceeding 45 Minutes
reference = ref_samp, # choose automatically
max_linear_shift = max_linear_shift, # max. shift for linear corrections
max_diff_peak2mean = max_diff_peak2mean, # max. distance of a peak to the mean across samples
min_diff_peak2peak = min_diff_peak2peak, # min. expected distance between peaks
blanks = blanks, # negative control
delete_single_peak = delete_single_peak, # delete peaks that are present in just one sample
write_output = write_output,
remove_empty = remove_empty) # add variable names to write aligned data to text files
# Display heatmap figure of alignment
#print(GCalignR::gc_heatmap(align))
named <- name_alignment(align, name_ref, ref_samp)
final_named_df <- named %>%
left_join(sample_name_df, by = c('DataFileName' = 'GCalignName')) %>%
select(-DataFileName, DataFileName = OriginDataFileName)
}
compare_naming <- function(named_aligned_df, origin_df){
# Function joins peak names from named peak list dataframe to the named
# GCalignR output. This allows you to compare the hand naming data to
# automated naming.
# This works, because the GCalignR doesn't change the retention times
# for samples in the $aligned$rt dataframe.
# Make a DF to merge with peak alignment in order to check new naming
# Since we changed the naming function to restore file names, no longer
# need to convert names in ref df
ref_df <- origin_df %>%
select(DataFileName, Name, RetTimeSecs) #%>%
#mutate(DataFileName = format_names_gcalignr(DataFileName))
full_df <- named_aligned_df %>%
filter(RetTimeSecs != 0) %>%
select(DataFileName, NewName = Name, RetTimeSecs) %>%
full_join(ref_df, by = c('DataFileName', 'RetTimeSecs')) %>%
arrange(DataFileName, RetTimeSecs) %>%
select(-NewName, NewName) # Not necessary, but makes old and new adjacent
full_df
} # Compare GCalignR naming to previous naming
# Example
#compare_naming(name_alignment(al, fame_ref, format_names_gcalignr(ref_samp)), batch_fame_df)
# Input a df that is already named, and this will align, name, and produce summary stats
test_align <- function(df, #ref_samp,
rt_col_name = 'rt', # retention time variable name
rt_cutoff_low = 400, # remove peaks below 15 Minutes
rt_cutoff_high = 3600, # remove peaks exceeding 45 Minutes
reference = NULL, # choose automatically NOTE: Needs to be in format matching source df DataFileName
max_linear_shift = 25, # max. shift for linear corrections
max_diff_peak2mean = 20, # max. distance of a peak to the mean across samples
min_diff_peak2peak = 20, # min. expected distance between peaks
blanks = NULL, # negative control
delete_single_peak = FALSE, # delete peaks that are present in just one sample
write_output = NULL,# add variable names to write aligned data to text files)
remove_empty = F){ # removes row if no peak
# This is just a giant wrapper function created specifically for the purpose
# of quickly testing automated naming of PLFA batches using GCalignR. Input a
# PLFA peak list, the filename of a reference sample, and any
# GCalignR::align_chromatograms() args. This will return a dataframe showing
# retention time of each peak for each file in the batch along with the
# automatically generated name and the originally assigned name. Must use
# named peak lists for this to work properly.
# Additionally, this will print GCalignR::gc_heatmap() and summary tibbles
# showing the number of orginal to auto-assigned name mismatches for each
# and overa the entire batch.
# (user-defined fun) NOTE: spaces (and special chars) converted to underscore
input_list <- df_to_gcinput(df)
# Check the input for formatting errors
check_input(input_list)
# summarize distribution of dist between peaks - use to set alignment params
peak_interspace(data = input_list, rt_col_name = 'rt')
# Extract named peak list for reference sample
# This will be used to assign names to aligned peak lists
ref_samp <- format_names_gcalignr(reference)
name_ref <- df %>%
filter(DataFileName == reference) %>%
select(RetTimeSecs, Name)
# Function just wraps a few steps that I'm using to test how alignment works
# Mostly just modifying the input list and re-running
align <- align_chromatograms(data = input_list, # input data
rt_col_name = rt_col_name, # retention time variable name
rt_cutoff_low = rt_cutoff_low, # remove peaks below 15 Minutes
rt_cutoff_high = rt_cutoff_high, # remove peaks exceeding 45 Minutes
reference = ref_samp, # choose automatically
max_linear_shift = max_linear_shift, # max. shift for linear corrections
max_diff_peak2mean = max_diff_peak2mean, # max. distance of a peak to the mean across samples
min_diff_peak2peak = min_diff_peak2peak, # min. expected distance between peaks
blanks = blanks, # negative control
delete_single_peak = delete_single_peak, # delete peaks that are present in just one sample
write_output = write_output,
remove_empty = remove_empty) # add variable names to write aligned data to text files
# Display heatmap figure of alignment
print(gc_heatmap(align))
named <- name_alignment(align, name_ref, ref_samp)
named_long <- compare_naming(named, df)
# summary stats that indicate how well the alignment matches
# Count number of mis-identified peaks
## During intitial alignment of reference RT/names to the alignment, any
## rows that aren't aligned with a named reference peak will become NA
## Since we then merge original hand-named peak list to the auto named peak
## list based on file/sample name and RT, any mismatches in name become NA in
## cases where the RT doesn't correspond to the same name should also produce
## and NA
## If there is NA in both columns, that would indicate unnamed peak matched
## to unnamed peak
#print(
# named_long %>%
# filter(!(is.na(Name) & is.na(NewName))) %>% # Remove rows with no Name or NewName
# group_by(DataFileName) %>%
# summarise(Misnamed = sum(is.na(NewName)))
#)
#print(
# named_long %>%
# filter(!(is.na(Name) & is.na(NewName))) %>% # Remove rows with no Name or NewName
# group_by(DataFileName) %>%
# summarise(Misnamed = sum(is.na(NewName))) %>%
# summarise(MeanMissed = mean(Misnamed), TotalMissed = sum(Misnamed))
#)
print(
named_long %>%
filter(!(is.na(Name) & is.na(NewName)),
DataFileName != 'mean_RT') %>% # Remove rows with no Name or NewName
mutate(Mismatch = if_else(is.na(NewName) | is.na(Name), TRUE,
(Name != NewName))) %>%
group_by(DataFileName) %>%
summarise(Misnamed = sum(Mismatch))
)
print(
named_long %>%
filter(!(is.na(Name) & is.na(NewName)),
DataFileName != 'mean_RT') %>% # Remove rows with no Name or NewName
mutate(Mismatch = if_else(is.na(NewName) | is.na(Name), TRUE,
(Name != NewName))) %>%
group_by(DataFileName) %>%
summarise(Misnamed = sum(Mismatch)) %>%
summarise(MeanMissed = mean(Misnamed), TotalMissed = sum(Misnamed))
)
named_long
# NOTE: although the above summary stats exclude rows with NA for both Name
# and NewName, these remain in the output dataframe for transparency
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.