R/importChrom.R

Defines functions importChrom

Documented in importChrom

#' Import an Agilent MassHunter chromatogram from a csv file into a useful
#' format
#'
#' This function takes a csv file that was generated by exporting TIC, SIM, MRM,
#' or binary-pump pressure chromatograms from Agilent MassHunter Qualitative
#' Analysis and tidies the data into a useful data.frame, since the layout of
#' the exported file is not directly usable. Input is a character string that
#' is the name of the csv file.
#'
#'
#' @param csvfile The csv file name that was exported from MassHunter Qual
#' @param saveFile TRUE or FALSE (default) for whether to also write the tidied
#'   data to a new csv file next to the input file.
#'
#' @return Output is a tidy data.frame with columns indicating the file, the
#'   chromatogram type, the ionization mode, the retention time, the counts,
#'   etc. If \code{saveFile} is set to TRUE, a new csv file of the tidied
#'   chromatographic data will be saved in the same directory, and it will have
#'   " - tidy" appended to the input file name.
#' @export
#'
#'
#'
importChrom <- function(csvfile, saveFile = FALSE){

      # Defining the pipe operator locally so magrittr need not be attached
      `%>%` <- magrittr::`%>%`

      # Newer vs. older versions of MassHunter export chromatograms differently.
      # Checking on whether the file provided is one of the older versions,
      # circa mid 2000s, I think.
      DF1 <- scan(csvfile, nlines = 1, what = "character", sep = "|")

      if(length(DF1) == 0){
            stop("The file supplied for csvfile appears to be empty.",
                 call. = FALSE)
      }

      FileEra <- if(DF1[[1]] == "Sample Information") "older" else "newer"

      if(FileEra == "newer"){
            # Newer versions of MassHunter put the file name on the 1st line but
            # only in one cell, and this causes R to interpret the file as
            # having only 1 column. Thus the odd way of reading in the file
            # w/nlines = 1 above.

            # Sometimes getting extra commas at end of this. Removing.
            DF1 <- gsub(",", "", DF1)
            DF <- read.csv(csvfile, stringsAsFactors = FALSE,
                           skip = 1)
            names(DF) <- c("Point", "Time_min", "Count")
            DF$Point[DF$Point == "#Point"] <- "Point"

            # Rows that still contain "#" hold the names of the 2nd and later
            # chromatograms.
            InjNameRows <- which(stringr::str_detect(DF$Point, "\\#"))

      } else {
            DF1 <- data.table::fread(csvfile, sep = ",", header = FALSE,
                                     select = 1, fill = TRUE, data.table = FALSE)
            # Finding 1st row w/actual data
            StartRow <- which(DF1 == "Raw Data")[1]
            DF <- read.csv(csvfile, header = FALSE, skip = StartRow,
                           stringsAsFactors = FALSE)
            names(DF) <- c("Point", "Time_min", "Count")

            # The chromatogram name sits on the row just above each "Point"
            # header row.
            InjNameRows <- which(DF$Point == "Point") - 1

      }

      # Labelling every row with the name of the chromatogram it belongs to.
      # Each chromatogram runs from its name row to the row before the next
      # name row; the last one runs to the end of the file. Using seq_along
      # so that files with 0 or 1 name rows work.
      DF$Chromatogram <- NA_character_

      if(length(InjNameRows) > 0){
            SegmentEnds <- c(InjNameRows[-1] - 1, nrow(DF))
            for(i in seq_along(InjNameRows)){
                  DF$Chromatogram[InjNameRows[i]:SegmentEnds[i]] <-
                        DF$Point[InjNameRows[i]]
            }
      }

      # Taking care of the special case of 1st chromatogram: in newer files
      # its name is on the 1st line of the file rather than in a "#" row.
      if(FileEra == "newer"){
            FirstEnd <- if(length(InjNameRows) > 0){
                  InjNameRows[1] - 1
            } else {
                  nrow(DF)
            }

            if(FirstEnd >= 1){
                  DF$Chromatogram[seq_len(FirstEnd)] <- DF1[[1]]
                  InjNameRows <- c(1, InjNameRows)
            }
      }

      # Sometimes, I think only w/SIM experiments, there are quotes around
      # Chromatogram. Removing those. Also trimming white space.
      DF$Chromatogram <- stringr::str_trim(gsub("\"", "", DF$Chromatogram))

      AllInjections <- DF[InjNameRows, "Chromatogram"]
      # If there were any injections where there was "ZERO ABUNDANCE", that adds
      # a bunch of spaces after the .d. Removing those from AllInjections AND
      # removing them from DF.
      AllInjections <- sub("    ...ZERO ABUNDANCE...|    ...NO DATA POINTS...",
                           "", AllInjections)
      DF$Chromatogram <- sub("    ...ZERO ABUNDANCE...|    ...NO DATA POINTS...",
                             "", DF$Chromatogram)

      # One row per chromatogram, one column (V1, V2, ...) per
      # space-separated token of its name.
      Injections_init <- as.data.frame(stringr::str_split(AllInjections, " ",
                                                          simplify = TRUE),
                                       stringsAsFactors = FALSE)

      ## Helper functions ----------------------------------------------------

      # Dealing with spaces and quotes in the file name columns
      concat <- function(x){
            gsub("\"", "",
                 stringr::str_trim(stringr::str_c(x, collapse = " "))
            )
      }

      # Index of the 1st column whose value in the 1st row of X matches
      # pattern, or NA if there is no such column.
      firstMatchCol <- function(X, pattern){
            FirstRow <- as.character(unlist(X[1, ], use.names = FALSE))
            which(stringr::str_detect(FirstRow, pattern))[1]
      }

      # Collapsing the columns from FileStart onward into a single column
      # named "File". If there is nothing at or after FileStart, File is NA
      # and those rows get dropped later.
      setFileColumn <- function(X, FileStart){
            if(is.na(FileStart) || FileStart > ncol(X)){
                  X$File <- NA_character_
                  return(X)
            }

            # Only concatenating when the file name was split over more than
            # one column, i.e., when it contained spaces.
            if(ncol(X) > FileStart){
                  X[, FileStart] <-
                        apply(X[, FileStart:ncol(X), drop = FALSE],
                              MARGIN = 1, FUN = concat)
            }

            names(X)[FileStart] <- "File"
            X
      }

      # Where the file name starts for SIM and SIM TIC chromatograms.
      # ChemStation only creates 3 columns for stuff that's not a file name,
      # so with exactly 4 columns the file is in V4. With more than 4 columns,
      # it could be a more modern QQQ file, where the file name starts after
      # the column containing "DF=", or a ChemStation file whose name has
      # spaces, where it starts at V4.
      simFileStart <- function(X){
            if(ncol(X) == 4){
                  return(4)
            }

            DFcol <- firstMatchCol(X, "DF=")
            if(is.na(DFcol)) 4 else DFcol + 1
      }

      # Placeholders so that all chromatogram types can be bound into one
      # data.frame even when some types are absent from this file. Their File
      # is NA after binding, so they are filtered out below.
      Placeholder <- data.frame(V1 = NA)
      MRM_TIC <- Placeholder
      SIM_TIC <- Placeholder
      MRM <- Placeholder
      SIM <- Placeholder
      BinP <- Placeholder

      ## SIM experiments

      # "SIM" seems to show up in V3 for pretty much every instrument and mode,
      # so that makes things a little easier.
      if(any(stringr::str_detect(Injections_init$V3, "SIM"), na.rm = TRUE)){

            # SIM traces
            SIMrows <- which(stringr::str_detect(Injections_init$V3, "SIM") &
                                   !stringr::str_detect(Injections_init$V2, "TIC"))

            if(length(SIMrows) > 0){
                  SIM <- Injections_init[SIMrows, ]
                  SIM <- setFileColumn(SIM, simFileStart(SIM))
                  SIM$Ion <- gsub("SIM\\(|\\)|EIC\\(", "", SIM$V2)
                  SIM$ChromatogramType <- "SIM"
                  # For all of these, adding the column "Chromatogram" from
                  # AllInjections.
                  SIM$Chromatogram <- AllInjections[SIMrows]
            }

            # SIM TIC traces
            SIM_TICrows <- which(stringr::str_detect(Injections_init$V3, "SIM") &
                                       stringr::str_detect(Injections_init$V2, "TIC"))

            if(length(SIM_TICrows) > 0){
                  SIM_TIC <- Injections_init[SIM_TICrows, ]
                  SIM_TIC <- setFileColumn(SIM_TIC, simFileStart(SIM_TIC))
                  SIM_TIC$ChromatogramType <- "TIC"
                  # For all of these, adding the column "Chromatogram" from
                  # AllInjections.
                  SIM_TIC$Chromatogram <- AllInjections[SIM_TICrows]
            }
      }

      ## MRM experiments

      # MRM shows up only in V2 when it's exclusively an MRM trace and not an
      # MRM TIC trace.
      if(any(stringr::str_detect(Injections_init$V2, "MRM"), na.rm = TRUE)){

            MRMrows <- which(stringr::str_detect(Injections_init$V2, "MRM"))

            if(length(MRMrows) > 0){
                  MRM <- Injections_init[MRMrows, ]

                  # Determining which columns to check for file names.
                  FileStart <- if(isTRUE(stringr::str_detect(MRM$V4[1],
                                                             "DF="))){
                        8
                  } else {
                        6
                  }

                  MRM <- setFileColumn(MRM, FileStart)

                  # FileStart also informs where to find ions, etc.
                  if(FileStart == 8){
                        MRM <- MRM %>%
                              dplyr::mutate(
                                    PrecursorIon = as.numeric(sub("\\(", "", V5)),
                                    ProductIon = as.numeric(sub("\\)", "", V7)))

                  } else {
                        MRM <- MRM %>%
                              dplyr::mutate(
                                    PrecursorIon = as.numeric(sub("\\(", "", V3)),
                                    ProductIon = as.numeric(sub("\\)", "", V5)))
                  }

                  MRM$Ion <- paste(MRM$PrecursorIon, "->", MRM$ProductIon)
                  MRM$ChromatogramType <- "MRM"
                  # For all of these, adding the column "Chromatogram" from
                  # AllInjections.
                  MRM$Chromatogram <- AllInjections[MRMrows]
            }
      }

      # MRM TIC
      MRM_TICrows <- which(stringr::str_detect(Injections_init$V3, "MRM") &
                                 stringr::str_detect(Injections_init$V2, "TIC"))

      if(length(MRM_TICrows) > 0){
            MRM_TIC <- Injections_init[MRM_TICrows, ]

            # Determining which columns to check for file names.
            FileStart <- if(isTRUE(stringr::str_detect(MRM_TIC$V5[1], "DF="))){
                  9
            } else {
                  7
            }

            MRM_TIC <- setFileColumn(MRM_TIC, FileStart)
            MRM_TIC$ChromatogramType <- "TIC"
            # For all of these, adding the column "Chromatogram" from
            # AllInjections.
            MRM_TIC$Chromatogram <- AllInjections[MRM_TICrows]
      }


      ## BinP traces

      BinProws <- which(stringr::str_detect(Injections_init$V1, "BinP"))

      if(length(BinProws) > 0){
            BinP <- Injections_init[BinProws, ]

            # The file name begins in the column after the one containing
            # "Pressure".
            BinP <- setFileColumn(BinP, firstMatchCol(BinP, "Pressure") + 1)
            BinP$ChromatogramType <- "binary pump pressure"
            # For all of these, adding the column "Chromatogram" from
            # AllInjections.
            BinP$Chromatogram <- AllInjections[BinProws]
      }

      ## Combining the information on all chromatograms

      Injections <- dplyr::bind_rows(MRM_TIC, SIM_TIC, MRM, SIM, BinP) %>%
            dplyr::mutate(Mode = stringr::str_extract(V1, "\\+|-"))

      # If no chromatogram type was recognized, there is no File or
      # Chromatogram column to filter or join on. Adding empty ones so that
      # the data are still returned, just without the extra information.
      if(!"File" %in% names(Injections)){
            warning("No SIM, MRM, TIC, or binary-pump pressure chromatograms were recognized in the file, so no information on the chromatogram type, ion, or file will be included.",
                    call. = FALSE)
            Injections$File <- NA_character_
            Injections$Chromatogram <- NA_character_
      }

      Injections <- Injections %>%
            dplyr::select(tidyselect::any_of(c("Mode", "ChromatogramType", "Ion",
                                               "PrecursorIon", "ProductIon",
                                               "File", "Chromatogram"))) %>%
            dplyr::filter(complete.cases(File))

      # The name rows and the "Point" header rows can't be converted to
      # numbers, which gives a warning and an NA for Time_min. That's how
      # those rows get removed here.
      DF <- suppressWarnings(
            DF %>% dplyr::left_join(Injections, by = "Chromatogram") %>%
                  dplyr::mutate(dplyr::across(
                        tidyselect::matches("Time_min|Count"), as.numeric)) %>%
                  dplyr::filter(complete.cases(Time_min))
      )

      if(saveFile){
            # Only replacing a literal ".csv" at the very end of the file name.
            # If the name doesn't end in ".csv", appending instead so that the
            # input file never gets overwritten.
            TidyFile <- if(grepl("\\.csv$", csvfile, ignore.case = TRUE)){
                  sub("\\.csv$", " - tidy.csv", csvfile, ignore.case = TRUE)
            } else {
                  paste0(csvfile, " - tidy.csv")
            }

            write.csv(DF, file = TidyFile, row.names = FALSE)
      }

      return(DF)


}
shirewoman2/LaurasHelpers documentation built on Oct. 22, 2023, 2:07 p.m.