R/RT_Factor_sorting_with_hummel_dbase.R

Defines functions RT_Factor_Sort

#### RT Factor Filtering Function
# Functional Draft
# Feb. 20, 2019
# input a raw lobset

# Load data and RT Factor database
# setwd("C:/Users/TSQ/Desktop/Daniel Lowenstein/Nano_Take_Two/Intermediate_Third/")
# original_data <- read.csv("Nano_Intermediate_Third_Pos_Raw_LOBSTAHS_screened_peakdata_2019-06-18T2-27-43_PM-0400.csv")
# RT_Factor_Dbase <-read.csv("C:/Users/TSQ/Desktop/Daniel Lowenstein/Older_Projects/RT_Factors/Hummel RtF Master Database - rtf_data.csv")

RT_Factor_Sort <- function(original_data, RT_Factor_Dbase, choose_class = FALSE, plot_data = FALSE, save_plots = FALSE, data_title){

  library(tidyr)
  library(dplyr)

  ### Check Inputs ###

  if (is.null(RT_Factor_Dbase$Mean_DNPPE_Factor)) {

    stop("Looks like input 'RT_Factor_Dbase' is not the right one.\n",
         "Make sure you're using an up to date database with",
         "a Mean DNPPE Factor for each compound.")

  }

  if (is.null(original_data$match_ID)) {

    stop("Input data.frame does not contain a 'match_ID' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(original_data$compound_name)) {

    stop("Input data.frame does not contain a 'compound_name' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(original_data$LOBdbase_mz)) {

    stop("Input data.frame does not contain a 'LOBdbase_mz' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(original_data$peakgroup_rt)) {

    stop("Input data.frame does not contain a 'peakgroup_rt' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(original_data$FA_total_no_C)) {

    stop("Input data.frame does not contain a 'FA_total_no_C' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(original_data$FA_total_no_DB)) {

    stop("Input data.frame does not contain a 'FA_total_no_DB' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }


  # make sure peakgroup rt is numeric
  original_data$peakgroup_rt <- as.numeric(original_data$peakgroup_rt)

  # Extract correct DNPPE retention time
  DNPPE_RT <- as.numeric(original_data$peakgroup_rt[which(grepl("DNPPE", original_data$compound_name))])

  if(length(DNPPE_RT) > 1){
    stop("Looks like there's more than one DNPPE peak.",
         "Please resolve in input LOBpeaklist and try again.")
  }

  # Add column for DNPPE factor and an empty one for flagging
  original_data <- original_data %>%
    mutate(DNPPE_Factor = peakgroup_rt/DNPPE_RT, Flag = "None", DBase_DNPPE_RF = "NA") %>%
    filter(species != "NA")

  # isolate major intact polar lipid classes, unoxidized (need to add pigments, etc.)
  Main_Lipids <- original_data %>%
    filter(degree_oxidation == "0",
           lipid_class == "IP_DAG"|
             lipid_class == "IP_MAG"|
             lipid_class == "TAG"|
             lipid_class == "FFA"
           )

  # if you're choosing a class
  if(choose_class != FALSE){
    Main_Lipids <- original_data[original_data$species == choose_class, ] %>%
      filter(degree_oxidation == "0")
  }

  # isolate oxidized lipids into df
  Ox_Lipids <- original_data %>%
    filter(degree_oxidation > 0)

  # flag known vs unknown by checking whether grepl returns anything in the database
  for (i in 1:length(Main_Lipids$compound_name)){
    which_row <- which(grepl(paste0("^", Main_Lipids$compound_name[i], "$"), RT_Factor_Dbase$compound_name))

    if(length(which(grepl(paste0("^", Main_Lipids$compound_name[i], "$"), RT_Factor_Dbase$compound_name))) >0){
      if(is.na(RT_Factor_Dbase$Mean_DNPPE_Factor[which_row]) != TRUE ){
        Main_Lipids$Flag[i] = "Known"
      }else{
        Main_Lipids$Flag[i] = "Unknown"
      }
    }else{Main_Lipids$Flag[i] = "Not In Database"}
    cat("\r")
    flush.console()
    cat("Checking for databases entries.","Compound",i,"of",length(Main_Lipids$compound_name),"...")
  }
cat("Done!")

  # separate into two dfs
  Known_RtFs <- Main_Lipids %>% filter(Flag == "Known")
  Unknown_RtFs <- Main_Lipids %>% filter(Flag == "Unknown")

  # for each compound in the "Known" df, grab its corresponding row number in RT_Factor_Dbase,
  # then flag it as follows
  # first check if it's in a 10%
cat("\n")
  for (i in 1:length(Known_RtFs$compound_name)){

  which_row <- as.numeric(which(grepl(paste0("^", Known_RtFs$compound_name[i], "$"), RT_Factor_Dbase$compound_name)))
    Known_RtFs$DBase_DNPPE_RF[i] = RT_Factor_Dbase$Mean_DNPPE_Factor[which_row]

    if(Known_RtFs$DNPPE_Factor[i] < (RT_Factor_Dbase$Mean_DNPPE_Factor[which_row]*1.1) & Known_RtFs$DNPPE_Factor[i] > (RT_Factor_Dbase$Mean_DNPPE_Factor[which_row]*0.9)){
      if(Known_RtFs$DNPPE_Factor[i] < (RT_Factor_Dbase$Mean_DNPPE_Factor[which_row]*1.05) & Known_RtFs$DNPPE_Factor[i] > (RT_Factor_Dbase$Mean_DNPPE_Factor[which_row]*0.95)){
        if(RT_Factor_Dbase$ms2_verified[which_row] == "Yes"){
          Known_RtFs$Flag[i] = "ms2v"
        }else{
          Known_RtFs$Flag[i] = "5%_rtv"
        }
      }else{
        (Known_RtFs$Flag[i] = "10%_rtv")
      }
    }else{
      Known_RtFs$Flag[i] = "Red"
    }

    cat("\r")
    flush.console()
    cat("Comparing database RtFs to data.","Compound",i,"of",length(Known_RtFs$compound_name),"...")

    }
  cat("Done!")

  # check for possible double peaks
  Green_Flags <- Known_RtFs %>% filter(Flag == "ms2v" | Flag == "5%_rtv" | Flag == "10%_rtv")
  split_by_compound_name <- split(Green_Flags,
                                  duplicated(Green_Flags$compound_name)|duplicated(Green_Flags$compound_name, fromLast = TRUE))
  Multiple_Flags <- split_by_compound_name[["TRUE"]]
  Multiple_Flags$Flag <- rep("Double_Peak?", length(Multiple_Flags$compound_name))


  cat("\n")
  for (j in 1:length(Multiple_Flags$match_ID)){
    match_row <- as.numeric(which(grepl(paste0("^", Multiple_Flags$match_ID[j], "$"), Known_RtFs$match_ID)))
    Known_RtFs$Flag[match_row] = as.character(Multiple_Flags$Flag[j])

    cat("\r")
    flush.console()
    cat("Flagging possible multiple peaks.","Compound",j,"of",length(Multiple_Flags$match_ID),"...")
  }
  cat("Done!")

  # combine known and unknown dfs
  Combined <- rbind(Known_RtFs, Unknown_RtFs)

  # change levels of colors so they plot in the right order
  #Combined$Flag = factor(Combined$Flag, levels = c("Red", "ms2v", "5%_rtv", "10%_rtv", "Unknown"))

  Flagged_Data <- original_data
  Flagged_Data$Flag <- rep("Unknown", length(Flagged_Data$Flag))

  cat("\n")
  for (j in 1:length(Combined$match_ID)){
    match_row <- as.numeric(which(grepl(paste0("^", Combined$match_ID[j], "$"), Flagged_Data$match_ID)))
    Flagged_Data$Flag[match_row] = as.character(Combined$Flag[j])
    Flagged_Data$DBase_DNPPE_RF[match_row] = Combined$DBase_DNPPE_RF[j]

    cat("\r")
    flush.console()
    cat("Combining data.","Compound",j,"of",length(Combined$match_ID),"...")
  }
cat("Done!")

  Flagged_Data$Flag = factor(Flagged_Data$Flag, levels = c("Red", "ms2v", "5%_rtv", "10%_rtv", "Double_Peak?", "Unknown"))
  Flagged_Data$DBase_DNPPE_RF <- as.numeric(Flagged_Data$DBase_DNPPE_RF)


  if(plot_data == TRUE){
    #####################
    # From here, giving the option to generate mz vs. rt graphs
    # for each of the major classes.

    # add a column for plot labelling by C and DB #
    lipidclass <- Flagged_Data %>%
      mutate(C_DB = paste0(str_extract(FA_total_no_C, "\\d+"), ":", str_extract(FA_total_no_DB, "\\d+"))) %>%
      filter(degree_oxidation == 0)

    # going through the big nine lipids plus TAGs, DAGs, and FFAs
    lipid_classes <- unique(Main_Lipids$species)

    # and plot each, saving a copy to the working directory
    cat("\n")
    for (i in 1:length(lipid_classes)){
      Lipid <- lipidclass %>%
        filter(species == paste(lipid_classes[i]))

      print(ggplot(data = Lipid)+
              geom_point(aes(x = peakgroup_rt, y = LOBdbase_mz, color =  Flag, size = 0.05))+
              scale_color_manual(values=c("Red" = "#e55934","ms2v"="#fdbd4c","5%_rtv"="#9bc53d","10%_rtv"="#0f5f20","Double_Peak?"="#5bc0eb","Unknown"="#ff99ff")) +
              geom_point(aes(x = DBase_DNPPE_RF*DNPPE_RT, y = LOBdbase_mz, color =  Flag), shape = 3)+
              geom_errorbarh(aes(xmax = DBase_DNPPE_RF*DNPPE_RT*1.1, xmin = DBase_DNPPE_RF*DNPPE_RT*0.9, height = 0.2, y = LOBdbase_mz, color = Flag))+
              geom_text(aes(x = peakgroup_rt, y = LOBdbase_mz, label = C_DB, hjust = 1, vjust = 2, color = Flag))+
              ggtitle(paste0("M/Z vs. RT in ", lipid_classes[i])))

      if(save_plots == TRUE){
      ggsave(filename = paste0(lipid_classes[i], "_MZRT_Flag_", data_title, ".tiff"),
             plot = last_plot(),
             device = "tiff",
             width = 22, height = 17)
      }
      cat("\r")
      flush.console()
      cat("Generating plot",i,"of",length(lipid_classes),"...")
    }
    cat("Done!")
  }

  return(Flagged_Data)
}
hholm/LOB_tools documentation built on Oct. 27, 2019, 4:14 a.m.