#' Import an Agilent MassHunter chromatogram from a csv file into a useful
#' format
#'
#' This function takes a csv file that was generated by exporting TIC, SIM, MRM,
#' or binary-pump pressure chromatograms from Agilent MassHunter Qualitative
#' Analysis and tidies the data into a useful data.frame since the output from
#' Agilent software is so weird. Input is a character string that is the name of
#' the csv file.
#'
#'
#' @param csvfile The csv file name that was exported from MassHunter Qual
#' @param saveFile TRUE or FALSE (default) for whether to also save the tidied
#'   data as a csv file. The new file is written next to the input file and has
#'   " - tidy" appended to the input file name.
#'
#' @return Output is a tidy data.frame with columns indicating the file, the
#'   chromatogram type, the ionization mode, the retention time, the counts,
#'   etc. If \code{saveFile} is set to TRUE, a new csv file of the tidied
#'   chromatographic data will be saved in the same directory, and it will have
#'   " - tidy" appended to the input file name.
#' @export
#'
#'
#'
importChrom <- function(csvfile, saveFile = FALSE){

  stopifnot(is.character(csvfile), length(csvfile) == 1)

  # Defining the pipe operator
  `%>%` <- magrittr::`%>%`

  ## Local helpers ----------------------------------------------------------

  # Collapse the pieces of a file name that was split on spaces back into a
  # single string, dropping quotes and surrounding white space.
  concat <- function(x){
    gsub("\"", "",
         stringr::str_trim(stringr::str_c(x, collapse = " ")))
  }

  # Column positions in the first row of Inj whose text matches pattern.
  firstRowDetect <- function(Inj, pattern){
    FirstRow <- vapply(Inj, function(col) as.character(col[1]),
                       character(1))
    which(stringr::str_detect(FirstRow, pattern))
  }

  # Name the column at position FileStart "File". When the file name
  # contained spaces it is spread over columns FileStart:ncol(Inj), so those
  # pieces are pasted back together first. If there is no such column, File
  # is set to NA and that injection is dropped later.
  nameFileColumn <- function(Inj, FileStart){
    FileStart <- FileStart[1]
    if(is.na(FileStart) || FileStart > ncol(Inj)){
      Inj$File <- rep(NA_character_, nrow(Inj))
      return(Inj)
    }
    if(ncol(Inj) > FileStart){
      Inj[[FileStart]] <- apply(Inj[, FileStart:ncol(Inj), drop = FALSE],
                                MARGIN = 1, FUN = concat)
    }
    names(Inj)[FileStart] <- "File"
    Inj
  }

  # Where the file name starts for SIM and SIM TIC titles. ChemStation only
  # creates 3 columns for stuff that's not a file name, so the file starts
  # in column 4. More modern QQQ files have a "DF=" piece and the file name
  # starts right after it.
  simFileStart <- function(Inj){
    AfterDF <- firstRowDetect(Inj, "DF=") + 1
    if(ncol(Inj) > 4 && length(AfterDF) > 0){
      AfterDF[1]
    } else {
      4
    }
  }

  ## Reading the file -------------------------------------------------------

  # Newer vs. older versions of MassHunter export chromatograms differently.
  # Checking on whether the file provided is one of the older versions,
  # circa mid 2000s, I think.
  FirstLine <- scan(csvfile, nlines = 1, what = "character", sep = "|",
                    quiet = TRUE)
  if(length(FirstLine) == 0){
    stop("The file `", csvfile, "` appears to be empty.", call. = FALSE)
  }
  FileEra <- if(isTRUE(FirstLine[1] == "Sample Information")){
    "older"
  } else {
    "newer"
  }

  if(FileEra == "newer"){
    # Newer versions of MassHunter put the title of the 1st chromatogram
    # on the 1st line but only in one cell, and this causes R to interpret
    # the file as having only 1 column. Thus the odd way of reading in the
    # file w/nlines = 1 above.
    # Sometimes getting extra commas at end of this. Removing.
    FirstTitle <- gsub(",", "", FirstLine)[1]
    DF <- utils::read.csv(csvfile, stringsAsFactors = FALSE, skip = 1)
    names(DF) <- c("Point", "Time_min", "Count")
    # With a single chromatogram the Point column is read as integer.
    DF$Point <- as.character(DF$Point)
    DF$Point[DF$Point %in% "#Point"] <- "Point"
    # Titles of the 2nd and later chromatograms are the rows with a "#".
    InjNameRows <- which(stringr::str_detect(DF$Point, "\\#"))
  } else {
    FirstCol <- data.table::fread(csvfile, sep = ",", header = FALSE,
                                  select = 1, fill = TRUE,
                                  data.table = FALSE)
    # Finding 1st row w/actual data
    StartRow <- which(FirstCol == "Raw Data")[1]
    if(is.na(StartRow)){
      stop("Could not find the `Raw Data` section in `", csvfile, "`.",
           call. = FALSE)
    }
    DF <- utils::read.csv(csvfile, header = FALSE, skip = StartRow,
                          stringsAsFactors = FALSE)
    names(DF) <- c("Point", "Time_min", "Count")
    DF$Point <- as.character(DF$Point)
    # The title of each chromatogram is the row just above its header row.
    InjNameRows <- which(DF$Point == "Point") - 1
    InjNameRows <- InjNameRows[InjNameRows > 0]
  }

  if(nrow(DF) == 0){
    stop("No chromatographic data were found in `", csvfile, "`.",
         call. = FALSE)
  }

  ## Labelling every row with the title of its chromatogram -----------------

  # BlockID counts how many title rows are at or above each row, so it is
  # the position within InjNameRows of the title that applies to that row
  # (0 = above the first title row). This works for any number of title
  # rows, including none.
  BlockID <- cumsum(seq_len(nrow(DF)) %in% InjNameRows)
  Labelled <- BlockID > 0
  DF$Chromatogram <- rep(NA_character_, nrow(DF))
  DF$Chromatogram[Labelled] <- DF$Point[InjNameRows[BlockID[Labelled]]]

  # Taking care of the special case of 1st chromatogram
  if(FileEra == "newer"){
    DF$Chromatogram[!Labelled] <- FirstTitle
    InjNameRows <- c(1, InjNameRows)
  }

  # Sometimes, I think only w/SIM experiments, there are quotes around
  # Chromatogram. Removing those. Also trimming white space.
  DF$Chromatogram <- stringr::str_trim(gsub("\"", "", DF$Chromatogram))
  # If there were any injections where there was "ZERO ABUNDANCE" or
  # "NO DATA POINTS", removing that flag from the title.
  DF$Chromatogram <- sub(" ...ZERO ABUNDANCE...| ...NO DATA POINTS...",
                         "", DF$Chromatogram)
  AllInjections <- DF$Chromatogram[InjNameRows]

  ## Parsing the titles -----------------------------------------------------

  Injections_init <- as.data.frame(
    stringr::str_split(AllInjections, " ", simplify = TRUE),
    stringsAsFactors = FALSE)

  # j-th space-separated piece of every title; NA when no title has that
  # many pieces.
  piece <- function(j){
    if(j <= ncol(Injections_init)){
      Injections_init[[j]]
    } else {
      rep(NA_character_, nrow(Injections_init))
    }
  }
  Piece1 <- piece(1)
  Piece2 <- piece(2)
  Piece3 <- piece(3)

  # "SIM" seems to show up in piece 3 for pretty much every instrument and
  # mode. MRM shows up in piece 2 when it's exclusively an MRM trace and in
  # piece 3 when it's an MRM TIC trace.
  SIMrows <- which(stringr::str_detect(Piece3, "SIM") &
                     !stringr::str_detect(Piece2, "TIC"))
  SIM_TICrows <- which(stringr::str_detect(Piece3, "SIM") &
                         stringr::str_detect(Piece2, "TIC"))
  MRMrows <- which(stringr::str_detect(Piece2, "MRM"))
  MRM_TICrows <- which(stringr::str_detect(Piece3, "MRM") &
                         stringr::str_detect(Piece2, "TIC"))
  BinProws <- which(stringr::str_detect(Piece1, "BinP"))

  # Trace types that are absent stay NULL, which bind_rows ignores. Using
  # local NULLs rather than exists() so that objects with these names in
  # the user's workspace cannot leak into the result.
  SIM <- NULL
  SIM_TIC <- NULL
  MRM <- NULL
  MRM_TIC <- NULL
  BinP <- NULL

  ## SIM traces
  if(length(SIMrows) > 0){
    SIM <- Injections_init[SIMrows, , drop = FALSE]
    SIM <- nameFileColumn(SIM, simFileStart(SIM))
    SIM$Ion <- gsub("SIM\\(|\\)|EIC\\(", "", Piece2[SIMrows])
    SIM$ChromatogramType <- "SIM"
    SIM$Chromatogram <- AllInjections[SIMrows]
  }

  ## SIM TIC traces
  if(length(SIM_TICrows) > 0){
    SIM_TIC <- Injections_init[SIM_TICrows, , drop = FALSE]
    SIM_TIC <- nameFileColumn(SIM_TIC, simFileStart(SIM_TIC))
    SIM_TIC$ChromatogramType <- "TIC"
    SIM_TIC$Chromatogram <- AllInjections[SIM_TICrows]
  }

  ## MRM traces
  if(length(MRMrows) > 0){
    MRM <- Injections_init[MRMrows, , drop = FALSE]
    # Determining which column the file name starts in.
    FileStart <- if(isTRUE(stringr::str_detect(piece(4)[MRMrows[1]],
                                               "DF="))){
      8
    } else {
      6
    }
    # FileStart also informs where to find the ions: the title ends with
    # "(precursor", "->", "product)" and then the file name.
    PrecursorIon <- as.numeric(sub("\\(", "",
                                   piece(FileStart - 3)[MRMrows]))
    ProductIon <- as.numeric(sub("\\)", "",
                                 piece(FileStart - 1)[MRMrows]))
    MRM <- nameFileColumn(MRM, FileStart)
    MRM$PrecursorIon <- PrecursorIon
    MRM$ProductIon <- ProductIon
    MRM$Ion <- paste(PrecursorIon, "->", ProductIon)
    MRM$ChromatogramType <- "MRM"
    MRM$Chromatogram <- AllInjections[MRMrows]
  }

  ## MRM TIC traces
  if(length(MRM_TICrows) > 0){
    MRM_TIC <- Injections_init[MRM_TICrows, , drop = FALSE]
    # Determining which column the file name starts in.
    FileStart <- if(isTRUE(stringr::str_detect(piece(5)[MRM_TICrows[1]],
                                               "DF="))){
      9
    } else {
      7
    }
    MRM_TIC <- nameFileColumn(MRM_TIC, FileStart)
    MRM_TIC$ChromatogramType <- "TIC"
    MRM_TIC$Chromatogram <- AllInjections[MRM_TICrows]
  }

  ## BinP traces
  if(length(BinProws) > 0){
    BinP <- Injections_init[BinProws, , drop = FALSE]
    # The file name starts right after the piece containing "Pressure".
    BinP <- nameFileColumn(BinP, firstRowDetect(BinP, "Pressure") + 1)
    BinP$ChromatogramType <- "binary pump pressure"
    BinP$Chromatogram <- AllInjections[BinProws]
  }

  ## Putting everything together --------------------------------------------

  Injections <- dplyr::bind_rows(MRM_TIC, SIM_TIC, MRM, SIM, BinP)
  # Making sure the columns used below exist even when no title could be
  # parsed.
  for(Col in c("V1", "File", "Chromatogram")){
    if(!Col %in% names(Injections)){
      Injections[[Col]] <- rep(NA_character_, nrow(Injections))
    }
  }

  # distinct() so that a title that shows up more than once in the file
  # does not duplicate data rows in the join below.
  Injections <- Injections %>%
    dplyr::mutate(Mode = stringr::str_extract(V1, "\\+|-")) %>%
    dplyr::select(tidyselect::any_of(c("Mode", "ChromatogramType", "Ion",
                                       "PrecursorIon", "ProductIon",
                                       "File", "Chromatogram"))) %>%
    dplyr::filter(!is.na(File)) %>%
    dplyr::distinct()

  DF <- dplyr::left_join(DF, Injections, by = "Chromatogram")
  # Title and header rows are not numbers and become NA here, which is how
  # they get removed, so the coercion warnings are expected.
  DF$Time_min <- suppressWarnings(as.numeric(DF$Time_min))
  DF$Count <- suppressWarnings(as.numeric(DF$Count))
  DF <- dplyr::filter(DF, !is.na(Time_min))

  if(isTRUE(saveFile)){
    # Only replacing the file extension, not ".csv"-like text elsewhere in
    # the path.
    OutFile <- sub("\\.csv$", " - tidy.csv", csvfile, ignore.case = TRUE)
    if(identical(OutFile, csvfile)){
      # No csv extension to replace. Appending instead so that the input
      # file is never overwritten.
      OutFile <- paste0(csvfile, " - tidy.csv")
    }
    utils::write.csv(DF, file = OutFile, row.names = FALSE)
  }

  return(DF)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.