R/make_every_XIC_MS1.R

Defines functions make_every_XIC_MS1

Documented in make_every_XIC_MS1

#' make_every_XIC_MS1
#'
#' @param rawFileDir Directory containing target raw file.
#' @param rawFileName Target raw file name.
#' @param targetSeqData Path to target sequences data. Should be a .csv file.
#' @param outputDir Path to output directory.
#' @param target_col_name Name of column in target sequences data which contains
#' unique identifiers for proteoforms to be identified.Defaults to "UNIPROTKB".
#' @param target_sequence_col_name Name of column in target sequences data which
#' contains the amino acid sequences of target proteoforms.Defaults to
#' "ProteoformSequence".
#' @param PTMname_col_name Name of column in target sequences data which
#' contains names and positions of PTMs. Defaults to "PTMname".
#' @param PTMformula_col_name1 Name of column in target sequences data which
#' contains chemical formulas for PTMs to be added to the formula of the bare
#' proteoform sequence. Defaults to "FormulaToAdd".
#' @param PTMformula_col_name2 Name of column in target sequences data which
#' contains chemical formulas for PTMs to be subtracted from the formula of the
#' bare proteoform sequence. Defaults to "FormulaToSubtract".
#' @param isoAbund Named numeric vector specifying abundances of isotopes to be
#' used for generating theoretical isotopic distributions. See data(isotopes,
#' package = 'enviPat') for isotope names. Defaults to c("12C" = 0.9893, "14N" = 0.99636).
#' @param target_charges Numeric vector, length 2. Range of charges used to
#' generate theoretical isotopic distributions.
#' @param mass_range Numeric vector, length 2. Mass range used to filter the
#' target sequences data on the basis of monoisotopic mass. Defaults to c(0, 100000)
#' - effectively no filter.
#' @param mz_range Numeric vector, length 2. m/z range used to filter putative
#' theoretical isotopic distributions, i.e. only theoretical peaks in this m/z
#' range will be considered. Defaults to c(600,2000).
#' @param abund_cutoff Numeric vector, length 1. Controls the minimum relative
#' abundance (compared to the theoretical highest abundance isotopologue) an
#' isotopologue peak must have to be included in the search. Defaults to 5.
#' @param sample_n_pforms Numeric vector, length 1. Number of proteoforms to
#' randomly sample from the the target sequences data. Defaults to NULL.
#' @param XIC_tol Numeric vector, length 1. Tolerance (in ppm) used to generate extracted ion chromatograms
#' from theoretical isotopic distributions. Defaults to 5.
#' @param use_IAA Boolean value. Controls whether proteoform sequences should be
#' considered to be alkylated with iodoacetamide at all cysteine residues. Argument
#' is passed to OrgMassSpecR::ConvertPeptide.
#' @param include_PTMs Boolean value. Controls whether PTM chemical formulas are
#' added when generating theoretical isotopic distributions. If false, ONLY the
#' bare sequence is considered. Defaults to TRUE.
#' @param save_output Boolean value. Controls whether output is saved to outputDir.
#' Defaults to TRUE.
#' @param scoreMFAcutoff Numeric vector, length 1. Minimum value of ScoreMFA for
#' comparison of theoretical and observed isotopic distributions to be considered
#' valid. Defaults to 0.3.
#' @param cosinesimcutoff Numeric vector, length 1. Minimum value of cosine
#' similarity (AKA dot product) for comparison of theoretical and observed isotopic
#' distributions to be considered valid. Defaults to 0.9.
#' @param SN_cutoff Numeric vector, length 1. Minimum allowed value for the estimated
#' S/N of the observed isotopologue peak corresponding to the theoretical highest
#' abundance isotopologue. Defaults to 20.
#' @param resPowerMS1 Numeric vector, length 1. Resolving power to be used with
#' isotopologue_window_multiplier to determine size of the isotopologue window.
#' USe resolving power at 400 m/z for best results.
#' @param isotopologue_window_multiplier Numeric vector, length 1. After the width
#' at half-max is estimated from resPowerMS1 at a particular m/z value, it is
#' multiplied by this number to determine width of the isotopologue window.
#' Defaults to 8.
#' @param mz_window Numeric vector, length 1. Controls the width of the window used
#' for the "specZoom" output which focuses on isotopic distributions of single charge
#' states of single proteoforms. Defaults to 3.
#' @param return_timers Boolean value. Controls whether the function returns a
#' dataframe of timers (TRUE) or an R object containing the zoomed MS2 spectra
#' (FALSE). Defaults to TRUE.
#' @param rawrrTemp Path to temporary directory to be used by the rawrr package.
#' Defaults to tempdir().
#' @param save_spec_object Boolean value. Controls whether an R object containing
#' the zoomed MS1 spectra is saved to the outputdir. Defaults to TRUE.
#'
#' @return
#' @export
#'
#' @import MSnbase
#' @importFrom magrittr %>%
#'

make_every_XIC_MS1 <-
   function(
      rawFileDir = NULL,
      rawFileName = NULL,
      targetSeqData = NULL,
      outputDir = getwd(),
      target_col_name = "UNIPROTKB",
      target_sequence_col_name = "ProteoformSequence",
      PTMname_col_name = "PTMname",
      PTMformula_col_name1 = "FormulaToAdd",
      PTMformula_col_name2 = "FormulaToSubtract",
      isoAbund = c("12C" = 0.9893, "14N" = 0.99636),
      target_charges = c(1:50),
      mass_range = c(0,100000),
      mz_range = c(600,2000),
      abund_cutoff = 5,
      sample_n_pforms = NULL,
      XIC_tol = 5,
      use_IAA = FALSE,
      include_PTMs = TRUE,
      save_output = TRUE,
      scoreMFAcutoff = 0.3,
      cosinesimcutoff = 0.9,
      SN_cutoff = 20,
      resPowerMS1 = 300000,
      isotopologue_window_multiplier = 8,
      mz_window = 3,
      return_timers = TRUE,
      rawrrTemp = tempdir(),
      save_spec_object = TRUE
   ) {

      library(rawrr)

      options(dplyr.summarise.inform = FALSE)

      # Assertions --------------------------------------------------------------

      assertthat::assert_that(
         assertthat::is.dir(rawFileDir),
         msg = "rawFileDir is not a recognized path"
      )


      assertthat::assert_that(
         assertthat::is.string(rawFileName),
         msg = "rawFileName is not a string"
      )

      assertthat::assert_that(
         assertthat::has_extension(targetSeqData, "csv") |
            is.data.frame(targetSeqData),
         msg = "targetSeqData is not a CSV file or dataframe"
      )

      if (
         assertthat::see_if(assertthat::has_extension(targetSeqData, "csv"))
         == TRUE
      ) {

         assertthat::assert_that(
            assertthat::is.readable(targetSeqData),
            msg = "targetSeqData is not a readable file"
         )

      }

      assertthat::assert_that(
         assertthat::is.dir(outputDir),
         msg = "outputDir is not a recognized directory"
      )

      assertthat::assert_that(
         assertthat::is.flag(use_IAA),
         msg = "use_IAA should be TRUE or FALSE"
      )

      assertthat::assert_that(
         assertthat::is.flag(save_output),
         msg = "save_output should be TRUE or FALSE"
      )

      if (is.null(sample_n_pforms) == FALSE && sample_n_pforms > 0) {

         assertthat::assert_that(
            assertthat::is.count(sample_n_pforms),
            msg = "sample_n_pforms should be a positive integer"
         )

      }

      # Create necessary quosures -----------------------------------------------

      target_col_name <- rlang::enquo(target_col_name)
      target_sequence_col_name_sym <- rlang::sym(target_sequence_col_name)
      target_sequence_col_name <- rlang::enquo(target_sequence_col_name)
      PTMformula_col_name1 <- rlang::enquo(PTMformula_col_name1)
      PTMformula_col_name2 <- rlang::enquo(PTMformula_col_name2)
      PTMname_col_name <- rlang::enquo(PTMname_col_name)

      # Check filename and path -------------------------------------------------

      rawFilesInDir <-
         fs::dir_ls(
            rawFileDir,
            recurse = TRUE,
            type = "file",
            regexp = c("(?i)[.]raw$")
         )

      if (length(rawFilesInDir) == 0) {
         stop("No .raw files found in raw file directory")
      }

      if (rawFilesInDir %>%
          stringr::str_detect(rawFileName) %>%
          any() == FALSE) {
         stop("Input raw file not found in raw file directory")
      }

      rawFile <-
         rawFilesInDir %>%
         stringr::str_subset(rawFileName)

      # ggplot themes -----------------------------------------------------------

      XICtheme <-
         list(
            ggplot2::geom_text(
               data = ~dplyr::filter(.x, int_sum == max(int_sum)),
               ggplot2::aes(
                  x = times,
                  y = int_sum,
                  label = glue::glue(
                     "Max TIC: {format(int_sum, scientific = TRUE, nsmall = 4, digits = 4)}\n RT@Max: {format(times, scientific = FALSE, nsmall = 4, digits = 3)}\n PTM:{!!PTMname}"),
                  alpha = 0.5,
                  size = 6
               ),
               vjust="inward",
               hjust="inward",
               nudge_x = 5
            ),
            ggthemes::theme_clean(base_size = 14),
            ggplot2::labs(
               x = "Retention Time (min)",
               y = "Total Ion Current"
            ),
            ggplot2::guides(
               alpha = "none",
               size = "none"
            )
         )

      # Load isotopes table -----------------------------------------------------

      data(isotopes, package = "enviPat")

      isotopes_to_use <- isotopes

      # Get positions of specific isotopes in dataframe

      indices <-
         list(
            "12C" = which(isotopes$isotope == "12C") %>% .[[1]],
            "13C" = which(isotopes$isotope == "13C") %>% .[[1]],
            "14N" = which(isotopes$isotope == "14N") %>% .[[1]],
            "15N" = which(isotopes$isotope == "15N") %>% .[[1]]
         )

      # Replace values in dataframe with supplied values

      isotopes_to_use$abundance[indices[["12C"]]] <-
         isoAbund[which(names(isoAbund) == "12C")]

      isotopes_to_use$abundance[indices[["13C"]]] <-
         1 - isoAbund[which(names(isoAbund) == "12C")]

      isotopes_to_use$abundance[indices[["14N"]]] <-
         isoAbund[which(names(isoAbund) == "14N")]

      isotopes_to_use$abundance[indices[["15N"]]] <-
         1 - isoAbund[which(names(isoAbund) == "14N")]

      # Process target sequences ------------------------------------------------

      target_seqs_df <-
         targetSeqData %>%
         {if (!is.data.frame(.)) readr::read_csv(.) else .} %>%
         dplyr::mutate(
            MonoisoMass =
               Peptides::mw(!!target_sequence_col_name_sym, monoisotopic = TRUE)
         ) %>%
         dplyr::filter(
            MonoisoMass >= mass_range[[1]],
            MonoisoMass <= mass_range[[2]]
         ) %>%
         {if (!is.null(sample_n_pforms)) dplyr::sample_n(., size = sample_n_pforms) else .}

      target_seqs <-
         target_seqs_df %>%
         dplyr::select(
            !!target_col_name, !!target_sequence_col_name
         ) %>%
         tibble::deframe() %>%
         as.list()

      # Create save directory ---------------------------------------------------

      systime <- format(Sys.time(), "%Y%m%d_%H%M")
      systime2 <- Sys.time()

      saveDir <-
         fs::path(
            outputDir,
            paste0(
               systime,
               "_",
               fs::path_ext_remove(rawFileName),
               "_",
               length(target_seqs),
               "seqs"
            )
         ) %>%
         stringr::str_trunc(246, "right", ellipsis = "")


      if (fs::dir_exists(saveDir) == FALSE) fs::dir_create(saveDir)

      timer <-
         timeR::createTimer(verbose = FALSE)

      ## Write params to text file ------------

      params_path <-
         fs::path(
            saveDir,
            'GEX_params_MS1.txt'
         )

      if (!fs::file_exists(params_path)) {

         readr::write_lines(
            glue::glue(
               "
               ----------------
               {systime2}
               ----------------
               rawFileDir = {toString(rawFileDir)}
               rawFileName = {toString(rawFileName)}
               targetSeqData = {toString(targetSeqData)}
               outputDir = {toString(outputDir)}
               target_col_name = {rlang::as_name(target_col_name)}
               target_sequence_col_name = {rlang::as_name(target_sequence_col_name)}
               PTMname_col_name = {rlang::as_name(PTMname_col_name)}
               PTMformula_col_name1 = {rlang::as_name(PTMformula_col_name1)}
               PTMformula_col_name2 = {rlang::as_name(PTMformula_col_name2)}
               isoNames = {toString(names(isoAbund))}
               isoAbund = {toString(isoAbund)}
               target_charges = {toString(target_charges)}
               mass_range = {toString(mass_range)}
               mz_range = {toString(mz_range)}
               abund_cutoff = {abund_cutoff}
               XIC_tol = {XIC_tol}
               use_IAA = {use_IAA}
               include_PTMs = {include_PTMs}
               save_output = {save_output}
               scoreMFAcutoff = {scoreMFAcutoff}
               cosinesimcutoff = {cosinesimcutoff}
               SN_cutoff = {SN_cutoff}
               resPowerMS1 = {resPowerMS1}
               isotopologue_window_multiplier = {isotopologue_window_multiplier}
               mz_window = {mz_window}
               return_timers = {return_timers}
               abund_cutoff = {abund_cutoff}
               rawrrTemp = {rawrrTemp}
               "
            ),
            file = params_path
         )

      }

      # Get elemental composition -----------------------------------------------

      timer$start("Chemical formulas and isotopic distributions")

      chemform_temp <-
         purrr::map(
            target_seqs,
            OrgMassSpecR::ConvertPeptide,
            IAA = use_IAA
         )

      chemform_names <-
         chemform_temp %>%
         purrr::map(unlist) %>%
         purrr::map(names)

      chemform <-
         chemform_temp %>%
         purrr::map(unlist) %>%
         purrr::map2(
            chemform_names,
            ~purrr::map2_chr(.y, .x, paste0)
         ) %>%
         purrr::map(
            paste0,
            collapse = ""
         )

      if (include_PTMs == FALSE) {

         PTM_names_list <-
            rep("None", length(target_seqs)) %>%
            as.list()

      } else if (include_PTMs == TRUE) {

         ## Extract PTM names and make list for later use

         PTM_names_list <-
            target_seqs_df %>%
            dplyr::select(!!target_col_name, !!PTMname_col_name) %>%
            tibble::deframe() %>%
            as.list()

         PTM_names_list[is.na(PTM_names_list) == TRUE] <- "None"

         ## Make list of PTM formulas to ADD

         PTM_form_to_add <-
            target_seqs_df %>%
            dplyr::pull(!!PTMformula_col_name1) %>%
            as.list()

         PTM_form_to_add[is.na(PTM_form_to_add) == TRUE] <- "C0"


         for (i in seq_along(PTM_form_to_add)) {

            PTM_form_to_add[[i]] <-
               stringr::str_split(PTM_form_to_add[[i]], ";") %>%
               .[[1]] %>%
               stringr::str_trim() %>%
               purrr::reduce(enviPat::mergeform)

         }


         PTM_form_to_add <-
            purrr::map(
               PTM_form_to_add,
               ~enviPat::check_chemform(isotopes = isotopes_to_use, chemforms = .x) %>%
                  dplyr::pull(new_formula)
            )

         ## Make list of PTM formulas to SUBTRACT

         PTM_form_to_sub <-
            target_seqs_df %>%
            dplyr::pull(!!PTMformula_col_name2) %>%
            as.list()

         PTM_form_to_sub[is.na(PTM_form_to_sub) == TRUE] <- "C0"


         for (i in seq_along(PTM_form_to_sub)) {

            PTM_form_to_sub[[i]] <-
               stringr::str_split(PTM_form_to_sub[[i]], ";") %>%
               .[[1]] %>%
               stringr::str_trim() %>%
               purrr::reduce(enviPat::mergeform)

         }

         PTM_form_to_sub <-
            purrr::map(
               PTM_form_to_sub,
               ~enviPat::check_chemform(isotopes = isotopes_to_use, chemforms = .x) %>%
                  dplyr::pull(new_formula)
            )

         ## Add and subtract PTM formulas as needed

         chemform <-
            purrr::map2(
               chemform,
               PTM_form_to_add,
               ~enviPat::mergeform(.x, .y)
            ) %>%
            purrr::map2(
               PTM_form_to_sub,
               ~enviPat::subform(.x, .y)
            )

      }

      # Add one H per positive charge

      H_to_merge <-
         target_charges %>%
         purrr::map_chr(~paste0("H", .x)) %>%
         list() %>%
         rep(length(target_seqs))

      chemform_to_merge <-
         chemform %>%
         purrr::map(
            ~rep(.x, length(target_charges))
         )

      chemform_withH <-
         purrr::map2(
            chemform_to_merge,
            H_to_merge,
            ~purrr::map2_chr(.x, .y, ~enviPat::mergeform(.x, .y))
         )

      # Remove any elements with 0 count, they cause enviPat::isopattern to fail

      for (i in seq_along(chemform_withH)) {

         if (
            any(stringr::str_detect(chemform_withH[[i]], "C0|H0|N0|O0|P0|S0")) == TRUE
         ) {
            chemform_withH[[i]] <-
               stringr::str_remove_all(chemform_withH[[i]], "C0|H0|N0|O0|P0|S0")
         }
      }

      # Get rid of some unneeded junk

      rm(chemform_temp)
      rm(chemform_names)
      rm(chemform_to_merge)
      rm(H_to_merge)

      # Calculate isotopic dist -------------------------------------------------

      purrr::map(
         chemform,
         ~enviPat::check_chemform(isotopes = isotopes_to_use, chemforms = .x) %>%
            dplyr::pull(warning) %>%
            any() %>%
            `if`(., stop("Problem with chemical formula"))
      )

      iso_dist <-
         purrr::map2(
            chemform_withH,
            list(target_charges) %>% rep(length(target_seqs)),
            ~purrr::map2(
               .x,
               .y,
               ~enviPat::isopattern(
                  isotopes_to_use,
                  chemform=.x,
                  threshold=0.1,
                  plotit=FALSE,
                  charge=.y,
                  emass=0.00054858,
                  algo=1,
                  verbose = F
               ) %>%
                  enviPat::envelope(
                     dmz = "get",
                     resolution = resPowerMS1,
                     verbose = F
                  )
            )
         ) %>%
         purrr::modify_depth(
            3,
            ~new(
               "Spectrum1",
               mz = .x[,1],
               intensity = .x[,2],
               centroided = FALSE
            ) %>%
               MSnbase::pickPeaks(
                  SNR = 1,
                  method = "MAD",
                  refineMz = "kNeighbors",
                  k = 2
               )
         )


      target_charges_list <-
         list(target_charges) %>%
         rep(length(target_seqs))

      iso_dist_list_union <-
         iso_dist %>%
         purrr::modify_depth(
            3,
            ~tibble::tibble(
               `m/z` = MSnbase::mz(.x),
               abundance = MSnbase::intensity(.x)
            )
         ) %>%
         purrr::map2(
            .y = target_charges_list,
            ~purrr::map2(
               .x = .x,
               .y = .y,
               ~purrr::map2(
                  .x = .x,
                  .y = .y,
                  ~dplyr::mutate(.x, charge = .y) %>%
                     dplyr::filter(
                        `m/z` > mz_range[[1]] & `m/z` < mz_range[[2]],
                        abundance > abund_cutoff
                     )
               )
            )
         )

      # Remove iso_dist to save space hopefully

      rm(iso_dist)

      # Keep the next three chunks separate, doesn't work if they are condensed

      iso_dist_list_union <-
         purrr::map(
            iso_dist_list_union,
            purrr::reduce,
            dplyr::union_all
         )

      # Save this for later use in making spectra

      iso_dist_vlines <-
         iso_dist_list_union


      iso_dist_list_union <-
         purrr::map(
            iso_dist_list_union,
            ~tibble_list_checker(.x)
         )


      iso_dist_list_union <-
         purrr::map(
            iso_dist_list_union,
            purrr::reduce,
            dplyr::union_all
         )

      timer$stop("Chemical formulas and isotopic distributions")

      # Calculate XIC -----------------------------------------------------------

      timer$start("Calculate and plot XICs")

      # Not sure whether all isotopologue peaks should be included in generating
      # this XIC. Use most abundant peak for every charge state for now

      XIC_target_mz <-
         iso_dist_list_union %>%
         purrr::map(~dplyr::group_by(.x, charge) %>%
                       dplyr::filter(abundance == max(abundance)) %>%
                       dplyr::pull(`m/z`)) %>%
         purrr::map(unique)

      XIC <-
         purrr::map(
            XIC_target_mz,
            ~rawrr::readChromatogram(
               rawfile = rawFile,
               mass = .x,
               filter = "ms",
               type = "xic",
               tol = XIC_tol
            )
         )

      XIC_nonull <-
         XIC %>%
         purrr::map(kickoutXIC)

      # SAVE SPACE BY DELETING XICS!

      rm(XIC)

      sumXIC1 <-
         XIC_nonull %>%
         purrr::modify_depth(
            2,
            ~tibble::tibble(
               times = .x$times,
               intensities = .x$intensities
            )
         ) %>%
         purrr::modify_depth(2, ~dplyr::select(.x, times, intensities)) %>%
         purrr::map(
            ~suppressMessages(
               purrr::reduce(
                  .x,
                  dplyr::full_join
               )
            )
         )


      sumXIC2 <-
         sumXIC1 %>%
         purrr::map(~dplyr::group_by(.x, times)) %>%
         purrr::map(~dplyr::summarize(.x, int_sum = sum(intensities)))

      # Get RT corresponding to maximum TIC for each sequence for later use in
      # making spectra

      RT_of_maxTIC <-
         sumXIC2 %>%
         purrr::map(
            ~dplyr::filter(.x, int_sum == max(int_sum))
         ) %>%
         purrr::map(
            ~dplyr::pull(.x, times)
         )

      rawFileMetadata <-
         rawrr::readIndex(
            rawFile,
            tmpdir = rawrrTemp
         ) %>%
         tibble::as_tibble() %>%
         dplyr::mutate(RT_min = rtinseconds/60)

      scanNumber_and_RT <-
         rawFileMetadata %>%
         dplyr::select(scan, RT_min)

      scanNumber_and_RT_vec <-
         scanNumber_and_RT$RT_min %>%
         purrr::set_names(scanNumber_and_RT$scan)


      scanNumsToRead <-
         purrr::map(
            RT_of_maxTIC,
            ~which(
               abs(.x - scanNumber_and_RT_vec) ==
                  min(abs(.x - scanNumber_and_RT_vec))
            )
         )

      # Make summary of XIC data

      sumXIC_summary <-
         sumXIC2 %>%
         rlang::set_names(chemform) %>%
         purrr::map(
            ~dplyr::summarize(.x, max_TIC = max(int_sum))
         ) %>%
         purrr::map2(
            names(.),
            ~dplyr::mutate(.x, chem_form = .y)
         ) %>%
         purrr::reduce(dplyr::union_all) %>%
         dplyr::mutate(Sequence = unlist(target_seqs)) %>%
         dplyr::mutate(seq_name = names(target_seqs)) %>%
         dplyr::mutate(RT_of_maxTIC = unlist(RT_of_maxTIC)) %>%
         dplyr::mutate(scan_of_maxTIC = unlist(scanNumsToRead)) %>%
         dplyr::select(seq_name, Sequence, chem_form, max_TIC, tidyr::everything()) %>%
         dplyr::arrange(dplyr::desc(max_TIC))

      # Arrange XICs to go in order of decreasing intensity

      sumXIC2 <-
         sumXIC2 %>%
         .[sumXIC_summary %>% dplyr::pull(seq_name)]


      PTM_names_list <-
         PTM_names_list %>%
         .[sumXIC_summary %>% dplyr::pull(seq_name)]

      iso_dist_list_union <-
         iso_dist_list_union %>%
         .[sumXIC_summary %>% dplyr::pull(seq_name)]

      scanNumsToRead <-
         scanNumsToRead %>%
         .[sumXIC_summary %>% dplyr::pull(seq_name)]

      # Plot XICs ---------------------------------------------------------------

      XIC_plots <-
         purrr::pmap(
            list(
               sumXIC2,
               PTM_names_list,
               names(sumXIC2)
            ),
            ~make_XIC_plot(
               ..1,
               times,
               int_sum,
               ..2,
               ..3
            )
         )

      ########## EVERYTHING AFTER THIS IS CONTROLLED BY save_output!

      if (save_output == TRUE) {

         # MultiArrange XIC grobs -------------------------------------------------------

         XIC_groblist <-
            gridExtra::marrangeGrob(
               grobs = XIC_plots,
               ncol = 3,
               nrow = 2,
               top = rawFileName
            )

         timer$stop("Calculate and plot XICs")

         # Save XIC results --------------------------------------------------------

         timer$start("Save XIC results")

         # Save target sequences to saveDir

         message(
            paste0(
               "Saving target sequences to ",
               saveDir,
               "/",
               fs::path_ext_remove(
                  stringr::str_trunc(
                     rawFileName, 90, "right", ellipsis = ""
                  )
               ),
               '_seqs.csv'
            )
         )

         readr::write_csv(
            target_seqs_df,
            file =
               fs::path(
                  saveDir,
                  paste0(
                     fs::path_ext_remove(
                        stringr::str_trunc(
                           rawFileName, 90, "right", ellipsis = ""
                        )
                     ),
                     '_seqs.csv'
                  )
               )
         )

         # Save XIC data to saveDir

         readr::write_csv(
            sumXIC_summary,
            path =
               fs::path(
                  saveDir,
                  paste0(
                     fs::path_ext_remove(
                        stringr::str_trunc(
                           rawFileName, 90, "right", ellipsis = ""
                        )
                     ),
                     '_XICsum.csv'
                  )
               )
         )

         # Save theo isotope distributions to saveDir

         purrr::imap(
            iso_dist_list_union,
            ~dplyr::mutate(.x, accession = .y)
         ) %>%
            purrr::reduce(dplyr::union_all) %>%
            dplyr::select(accession, tidyr::everything()) %>%
            readr::write_csv(
               path =
                  fs::path(
                     saveDir,
                     paste0(
                        fs::path_ext_remove(
                           stringr::str_trunc(
                              rawFileName, 90, "right", ellipsis = ""
                           )
                        ),
                        '_isodist.csv'
                     )
                  )
            )

         # Save chromatograms, all together

         ggplot2::ggsave(
            filename =
               fs::path(
                  saveDir,
                  paste0(
                     fs::path_ext_remove(
                        stringr::str_trunc(
                           rawFileName, 90, "right", ellipsis = ""
                        )
                     ),
                     '_XICs.pdf'
                  )
               ),
            plot = XIC_groblist,
            width = 20,
            height = 15,
         )

         timer$stop("Save XIC results")

      }


      # make_every_spectrum -----------------------------------------------------

      ## ggplot themes -----------------------------------------------------------


      MStheme01 <-
         list(
            ggthemes::theme_clean(base_size = 16),
            ggplot2::labs(
               x = "m/z",
               y = "Intensity"
            ),
            ggplot2::guides(
               alpha = "none",
               size = "none"
            )
         )



      ## Analyze data ------------------------------------------------------------


      timer$start("Make MS, Top 1 most intense PART 1")

      scansToPlot <-
         purrr::map(
            scanNumsToRead,
            ~rawrr::readSpectrum(
               rawfile = rawFile,
               scan = .x,
               tmpdir = rawrrTemp
            )
         )

      spectra_highestTIC <-
         purrr::imap(
            scansToPlot,
            ~tibble::tibble(
               UNIPROTKB = .y,
               scan = .x[[1]]$scan,
               mz = .x[[1]]$mZ,
               intensity = .x[[1]]$intensity)
         ) %>%
         .[sumXIC_summary %>% dplyr::pull(seq_name)]

      # Estimate noise for all spectra using MALDIquant::estimateNoise

      spectra_noiseEstimates <-
         spectra_highestTIC %>%
         purrr::map(
            ~MALDIquant::createMassSpectrum(
               mass = .x$mz,
               intensity = .x$intensity,
               metaData = list(name = .x$UNIPROTKB[[1]])
            ) %>%
               MALDIquant::estimateNoise(method = "MAD") %>%
               .[,2] %>%
               .[[1]]
         )

      # Add noise estimates to spectra

      spectra_highestTIC <-
         purrr::map2(
            spectra_highestTIC,
            spectra_noiseEstimates,
            ~dplyr::mutate(.x, noise = .y)
         )

      # m/z value of highest abundance theoretical isotopologue peaks.
      # These are the standard for comparison of list lengths!

      message("Getting highest intensity isotopologues")

      mz_max_abund <-
         iso_dist_list_union %>%
         purrr::map(
            ~dplyr::group_by(.x, charge) %>%
               dplyr::filter(abundance == max(abundance)) %>%
               dplyr::slice(1)
         ) %>%
         purrr::map(
            ~dplyr::pull(.x, `m/z`)
         ) %>%
         purrr::map(as.list) %>%
         .[sumXIC_summary %>% dplyr::pull(seq_name)]

      standard_lengths <-
         purrr::map(mz_max_abund, length)

      message("Getting charges of highest intensity isotopologues")

      mz_max_abund_charge <-
         iso_dist_list_union %>%
         purrr::map(
            ~dplyr::group_by(.x, charge) %>%
               dplyr::filter(abundance == max(abundance)) %>%
               dplyr::slice(1)
         ) %>%
         purrr::map(
            ~dplyr::pull(.x, charge)
         ) %>%
         purrr::map(as.list) %>%
         .[sumXIC_summary %>% dplyr::pull(seq_name)] %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )

      # Get m/zs for all theoretical isotopologues by charge state
      # for use in getting observed intensities for each isotopologue

      message("Getting relative abundances of isotopologues")

      mz_theo_MS1 <-
         iso_dist_list_union %>%
         purrr::map(
            ~dplyr::group_by(.x, charge) %>%
               dplyr::group_split() %>%
               purrr::map(
                  ~dplyr::select(.x, `m/z`) %>%
                     tibble::deframe()
               )
         ) %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )

      # Get abundances for all theoretical isotopologues by charge state
      # for use in calculating cosine similarities

      message("Getting isotopologue abundance per charge state")

      intensity_theo_MS1 <-
         iso_dist_list_union %>%
         purrr::map(
            ~dplyr::group_by(.x, charge) %>%
               dplyr::group_split() %>%
               purrr::map(
                  ~dplyr::select(.x, abundance) %>%
                     tibble::deframe()
               )
         ) %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )


      # Get first noise value from spectrum to use in results_chargestateTICs2
      # Theyre all the same anyway

      message("Getting noise estimate for highest intensity isotopologue")

      mz_max_abund_noise <-
         spectra_highestTIC %>%
         purrr::map(
            ~dplyr::pull(.x, noise) %>%
               .[[1]]
         ) %>%
         purrr::map(as.list) %>%
         .[sumXIC_summary %>% dplyr::pull(seq_name)]

      # Get values to use for vertical lines on plots

      message("Get values to use for vertical lines on plots")

      iso_dist_vlines2 <-
         iso_dist_vlines %>%
         purrr::map(
            tibble_list_checker
         ) %>%
         purrr::modify_depth(
            2,
            ~dplyr::filter(
               .x,
               abundance != 100
            ) %>%
               dplyr::top_n(10, abundance)
         ) %>%
         purrr::modify_depth(
            2,
            ~dplyr::pull(.x, `m/z`)
         ) %>%
         purrr::map(as.list) %>%
         .[sumXIC_summary %>% dplyr::pull(seq_name)] %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )

      spectra_highestTIC_names <-
         spectra_highestTIC %>%
         names() %>%
         as.list()

      timer$stop("Make MS, Top 1 most intense PART 1")

      ## Extract highest TICs from MS made in the previous step from within a
      ## narrow window of the theoretical most abundant peak (mz_window * mz_window_scaling),
      ## add all charge states together

      timer$start("Make MS, Top 1 most intense, PART 2")

      message("Getting intensity for theoretical highest abundance isotopologue")

      intensity_THAI_MS1 <-
         purrr::map2(
            spectra_highestTIC %>% purrr::map(list),
            mz_max_abund,
            ~purrr::map2(
               .x,
               .y,
               ~get_maxY_in_Xrange(
                  df = .x,
                  x = mz,
                  y = intensity,
                  xrange =
                     c(
                        .y - (.y/resPowerMS1), .y + (.y/resPowerMS1)
                     )
               )
            )
         ) %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )

      ## Extract highest TICs from MS made in the previous step from within a
      ## narrow window of the all isotopologue peaks

      message("Getting intensities for all isotopologues")

      intensity_obs_MS1 <-
         purrr::map2(
            spectra_highestTIC %>% purrr::map(list),
            mz_theo_MS1,
            ~purrr::map2(
               .x,
               .y,
               ~get_maxY_in_Xrange_vector(
                  df = .x,
                  x = mz,
                  y = intensity,
                  mz = .y,
                  res_power = resPowerMS1,
                  isotopologueWinMultiplier = isotopologue_window_multiplier
               )
            )
         ) %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )

      ## Extract highest TICs from MS made in the previous step from within a
      ## narrow window of the all isotopologue peaks (mz_window * mz_window_scaling)

      message("Getting m/z values at max intensity for all isotopologues")

      mz_obs_MS1 <-
         purrr::map2(
            spectra_highestTIC %>% purrr::map(list),
            mz_theo_MS1,
            ~purrr::map2(
               .x,
               .y,
               ~get_maxX_in_Xrange_vector(
                  df = .x,
                  x = mz,
                  y = intensity,
                  mz = .y,
                  res_power = resPowerMS1,
                  isotopologueWinMultiplier = isotopologue_window_multiplier
               )
            )
         ) %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )


      # Get abundances for theoretical isotopologues by charge state
      # for use in plotting by scaling theoretical isotopologue
      # intensity distribution by observed values

      message("Scaling theoretical isotopic abundances")


      intensity_theo_scaling_factors <-
         purrr::map2(
            intensity_obs_MS1,
            intensity_theo_MS1,
            ~purrr::map2(
               .x = .x,
               .y = .y,
               ~max(.x)/max(.y)
            )
         )


      intensity_theo_MS1_scaled <-
         purrr::pmap(
            list(
               intensity_theo_MS1,
               intensity_theo_scaling_factors,
               intensity_obs_MS1
            ),
            ~purrr::pmap(
               list(
                  ..1,
                  ..2,
                  ..3
               ),
               ~as.numeric(.x)*as.numeric(.y) %>%
                  {if (length(.) == 0) rep(0, length(..3)) else .}
            )
         ) %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )

      ## Try to determine peak resolving power using linear interpolation

      rp_obs_MS1 <-
         purrr::pmap(
            list(
               spectra_highestTIC,
               mz_theo_MS1
            ),
            ~purrr::pmap(
               list(
                  ..1 = list(..1),
                  ..2 = ..2
               ),
               ~purrr::pmap(
                  list(
                     ..1 = list(..1),
                     ..2 = ..2
                  ),
                  ~get_rp_in_Xrange_vector(
                     df = ..1,
                     x = mz,
                     y = intensity,
                     mz = ..2,
                     res_power = resPowerMS1,
                     isotopologueWinMultiplier = isotopologue_window_multiplier
                  )
               ) %>%
                  unlist()
            )
         )

      # Convert all resolving powers into a data frame for linear modeling

      rp_obs_MS1_df <-
         purrr::map2(
            rp_obs_MS1,
            mz_theo_MS1,
            ~purrr::map2(
               .x,
               .y,
               ~tibble::tibble(
                  rp = .x,
                  mz = .y
               ) %>%
                  tidyr::unnest() %>%
                  dplyr::filter(rp != 0)
            )
         ) %>%
         dplyr::bind_rows() %>%
         dplyr::arrange(mz)

      # Calculate linear model for RP based on determined non-zero values

      x <- rp_obs_MS1_df$mz
      y <- rp_obs_MS1_df$rp

      resolving_power_model <-
         fit <- lm(y ~ I(1/x))

      # Calculate theoretical resolving powers for every isotopologue peak

      rp_theo_MS1 <-
         purrr::map_depth(
            mz_theo_MS1,
            2,
            ~predict(
               resolving_power_model,
               newdata =
                  data.frame(
                     x = .x
                  )
            )
         )

      # Replace all zeros in rp_obs_MS1 with theoretical values from the fit

      rp_obs_theo_hybrid_MS1 <-
         purrr::map2(
            rp_obs_MS1,
            rp_theo_MS1,
            ~purrr::map2(
               .x,
               .y,
               ~purrr::map2_dbl(
                  .x,
                  .y,
                  ~{if (.x == 0) .y else .x}
               )
            )
         )

      ## Calculate S/N estimates

      SN_estimate_obs_MS1 <-
         purrr::map2(
            intensity_obs_MS1,
            mz_max_abund_noise,
            ~purrr::map2(
               .x,
               .y,
               ~(.x/.y) %>%
                  {if (length(.) == 0 | is.nan(.) | is.null(.)) 0 else .}
            )
         ) %>%
         purrr::map(
            ~purrr::map_if(
               .x,
               ~length(.x) == 0,
               ~0
            )
         ) %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )


      SN_estimate_theo_MS1 <-
         purrr::pmap(
            list(
               intensity_theo_MS1_scaled,
               mz_max_abund_noise,
               SN_estimate_obs_MS1
            ),
            ~purrr::pmap(
               list(
                  ..1,
                  ..2,
                  ..3
               ),
               ~(..1/..2) %>%
                  {if (length(.) == 0) rep(0, length(..3)) else .}
               # {if (is.nan(.) | is.null(.) | is.infinite(.)) rep(0, length(.x)) else .}
            )
         ) %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )

      ## Calculate Mel's ScoreMFA

      score_MFA_MS1 <-
         purrr::pmap(
            list(
               mz_obs_MS1,
               mz_theo_MS1,
               rp_obs_theo_hybrid_MS1,
               SN_estimate_obs_MS1,
               SN_estimate_theo_MS1
            ),
            ~purrr::pmap(
               list(
                  ..1,
                  ..2,
                  ..3,
                  ..4,
                  ..5
               ),
               ~ScoreMFA(
                  ..1,
                  ..2,
                  ..3,
                  ..4,
                  ..5,
                  rp_mult = 2
               ) %>%
                  {if (is.nan(.) | is.null(.) | length(.) == 0) 0 else .}
            )
         ) %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )

      ## Calculate cosine similarities

      message("Calculating cosine similarities")

      cosine_sim_MS1 <-
         purrr::map2(
            intensity_obs_MS1,
            intensity_theo_MS1,
            ~purrr::map2(
               .x,
               .y,
               ~calculate_cosine_similarity(
                  .x,
                  .y
               ) %>%
                  {if (length(.) == 0 | is.nan(.) | is.null(.)) 0 else .}
            )
         ) %>%
         purrr::map2(
            standard_lengths,
            ~fix_list_length(.x, .y)
         )

      # Calculate MMA for every peak

      MMA_MS1 <-
         purrr::pmap(
            list(
               mz_obs_MS1,
               mz_theo_MS1
            ),
            ~purrr::pmap(
               list(
                  ..1,
                  ..2
               ),
               ~purrr::pmap(
                  list(
                     ..1,
                     ..2
                  ),
                  ~calculate_mma_ppm(
                     ..1,
                     ..2
                  )
                  # {if (is.nan(.) | is.null(.) | length(.) == 0) NA else .}
               )
            )
         )

      ## Save score_MFA_MS1 data for further analysis, TESTING PURPOSES ONLY

      # saveRDS(score_MFA_MS1, paste0(saveDir, "/score_MFA_MS1.rds"))
      #
      # saveRDS(mz_obs_MS1, paste0(saveDir, "/mz_obs_MS1.rds"))
      # saveRDS(mz_theo_MS1, paste0(saveDir, "/mz_theo_MS1.rds"))
      #
      # saveRDS(SN_estimate_obs_MS1, paste0(saveDir, "/SN_estimate_obs_MS1.rds"))
      # saveRDS(SN_estimate_theo_MS1, paste0(saveDir, "/SN_estimate_theo_MS1.rds"))

      ## Make table with all charge state TICs

      message("Making table with all charge state TICs")

      results_chargestateTICs2 <-
         purrr::pmap(
            list(
               spectra_highestTIC_names,
               mz_max_abund,
               mz_max_abund_charge,
               PTM_names_list,
               intensity_THAI_MS1,
               mz_max_abund_noise,
               cosine_sim_MS1,
               score_MFA_MS1
            ),
            ~purrr::pmap(
               list(
                  ..1,
                  ..2,
                  ..3,
                  ..4,
                  ..5,
                  ..6,
                  ..7,
                  ..8
               ),
               ~tibble::tibble(
                  name = ..1,
                  mz_max_abund = ..2,
                  mz_max_abund_charge = ..3,
                  PTM_name = ..4,
                  maxTIC = ..5,
                  noise_estimate = ..6,
                  `estimated_S/N` = round(maxTIC/noise_estimate, digits = 0),
                  cosine_sim = ..7,
                  score_MFA = ..8
               )
            )
         ) %>%
         purrr::reduce(dplyr::union_all) %>%
         purrr::reduce(dplyr::union_all) %>%
         dplyr::distinct()

      results_chargestateTICs_summary <-
         results_chargestateTICs2 %>%
         dplyr::group_by(name) %>%
         dplyr::filter(
            `estimated_S/N` > SN_cutoff,
            cosine_sim != 0,
            score_MFA != 0
         ) %>%
         dplyr::summarize(
            charge_states = paste(unique(mz_max_abund_charge), collapse = ", "),
            PTM_name = PTM_name[[1]],
            maxTICsum = sum(maxTIC),
            `highest_S/N` = max(`estimated_S/N`),
            `lowest_S/N` = min(`estimated_S/N`),
            `mean_S/N` = mean(`estimated_S/N`),
            mean_cosine_sim = mean(cosine_sim),
            mean_score_mfa = mean(score_MFA)
         ) %>%
         dplyr::arrange(mean_cosine_sim)

      results_chargestateTICs_summary_filtered <-
         results_chargestateTICs_summary %>%
         dplyr::filter(
            `highest_S/N` > SN_cutoff,
            mean_score_mfa > scoreMFAcutoff,
            mean_cosine_sim > cosinesimcutoff
         )

      ## Make all MS based on mz of max abundance for each charge state

      message("Making plots")

      spectra_highestTIC_plots <-
         purrr::pmap(
            list(
               spectra_highestTIC %>% purrr::map(list),
               spectra_highestTIC_names,
               mz_max_abund,
               mz_max_abund_charge,
               PTM_names_list,
               iso_dist_vlines2,
               intensity_THAI_MS1,
               cosine_sim_MS1,
               mz_theo_MS1,
               intensity_theo_MS1_scaled,
               mz_obs_MS1,
               intensity_obs_MS1,
               MMA_MS1,
               score_MFA_MS1,
               SN_estimate_obs_MS1
            ),
            ~purrr::pmap(
               list(
                  ..1,
                  ..2,
                  ..3,
                  ..4,
                  ..5,
                  ..6,
                  ..7,
                  ..8,
                  ..9,
                  ..10,
                  ..11,
                  ..12,
                  ..13,
                  ..14,
                  ..15
               ),
               ~{
                  if (..14 > scoreMFAcutoff & ..8 > cosinesimcutoff & max(..15) > SN_cutoff) {
                     make_spectrum_top1(
                        df = ..1,
                        x = mz,
                        y = intensity,
                        noise = noise,
                        accession = ..2,
                        scan_num = scan,
                        charge = ..4,
                        xrange = c(..3 - (mz_window/2), ..3 + (mz_window/2)),
                        ..5,
                        vlines = ..6,
                        chargestateTIC = ..7,
                        cosine_sim = ..8,
                        mz_theo = ..9,
                        int_theo = ..10,
                        mz_obs = ..11,
                        int_obs = ..12,
                        mma = ..13,
                        isotopologue_window = isotopologue_window_multiplier*(..9/resPowerMS1),
                        theme = MStheme01,
                        score_mfa = ..14
                     )
                  }
               }
            ) %>%
               purrr::set_names(..4)
         ) %>%
         .[sumXIC_summary %>% dplyr::pull(seq_name)] %>%
         purrr::map(
            ~ggplot_list_checker(.x)
         )

      # SAVE PLOTS OBJECT, FOR DEV PURPOSES

      if (save_spec_object == TRUE) {

         saveRDS(spectra_highestTIC_plots, paste0(saveDir, "/spectra_highestTIC_plots.rds"))
         saveRDS(target_seqs, paste0(saveDir, "/target_seqs.rds"))

      }

      timer$stop("Make MS, Top 1 most intense, PART 2")

      ## Arrange MS Grobs, Top 1 ----------------------------------------------

      timer$start("Arrange MS grobs, Top 1 most intense")

      message("Making tablegrob list for PDF output")

      tablegrob_list_multi <-
         spectra_highestTIC_plots %>%
         unlist(recursive = FALSE) %>%
         gridExtra::marrangeGrob(
            grobs = .,
            ncol = 3,
            nrow = 2,
            top = paste0(rawFileName)
         )

      timer$stop("Arrange MS grobs, Top 1 most intense")

      ## Save arranged MS, Top 1 -------------------------------------------------

      # Save max TIC for charge states

      writexl::write_xlsx(
         list(
            "TIC per CS" = results_chargestateTICs2,
            "TIC summary" = results_chargestateTICs_summary,
            "TIC summary filtered" = results_chargestateTICs_summary_filtered
         ),
         fs::path(
            saveDir,
            paste0(
               fs::path_ext_remove(rawFileName),
               "_maxTIC.xlsx"
            )
         ),
         format_headers = TRUE
      )

      # Multi-arranged

      timer$start("Save MS, Top 1, PDF")

      ggplot2::ggsave(
         filename =
            fs::path(
               saveDir,
               paste0(
                  fs::path_ext_remove(rawFileName),
                  "_specZoom.pdf"
               )
            ),
         plot = tablegrob_list_multi,
         width = 15,
         height = 10,
         limitsize = FALSE
      )


      message("\n\n make_every_spectrum done")

      timer$stop("Save MS, Top 1, PDF")


      if (return_timers == TRUE) {
         return(
            timeR::getTimer(timer)
         )
      } else {
         return(
            spectra_highestTIC_plots
         )
      }


   }
davidsbutcher/meXICan-spectrum documentation built on Sept. 1, 2021, 1:35 p.m.