R/LOB_lpsolveAPI.R

Defines functions LOB_lpsolveAPI

# LOB_lpsolveAPI

LOB_lpsolveAPI <- function(LOBpeaklist, choose_class = NULL, save.files = FALSE, use_ms2 = FALSE, plot_data = FALSE) {
  library(lpSolveAPI)

  ### Check Inputs ###

  if (!class(LOBpeaklist) == "data.frame") {
    stop(
      "Input 'LOBpeaklist' is not an 'data.frame' object.\n",
      "Please use a data.frame generated by 'getLOBpeaklist'."
    )
  }

  if (is.null(LOBpeaklist$match_ID)) {
    stop(
      "Input data.frame does not contain a 'match_ID' column.",
      "Please use a data.frame generated by 'getLOBpeaklist'."
    )
  }

  if (is.null(LOBpeaklist$compound_name)) {
    stop(
      "Input data.frame does not contain a 'compound_name' column.",
      "Please use a data.frame generated by 'getLOBpeaklist'."
    )
  }

  if (is.null(LOBpeaklist$LOBdbase_mz)) {
    stop(
      "Input data.frame does not contain a 'LOBdbase_mz' column.",
      "Please use a data.frame generated by 'getLOBpeaklist'."
    )
  }

  if (is.null(LOBpeaklist$peakgroup_rt)) {
    stop(
      "Input data.frame does not contain a 'peakgroup_rt' column.",
      "Please use a data.frame generated by 'getLOBpeaklist'."
    )
  }

  if (is.null(LOBpeaklist$FA_total_no_C)) {
    stop(
      "Input data.frame does not contain a 'FA_total_no_C' column.",
      "Please use a data.frame generated by 'getLOBpeaklist'."
    )
  }

  if (is.null(LOBpeaklist$FA_total_no_DB)) {
    stop(
      "Input data.frame does not contain a 'FA_total_no_DB' column.",
      "Please use a data.frame generated by 'getLOBpeaklist'."
    )
  }

  if (is.null(LOBpeaklist$Flag) & isTRUE(use_ms2)) {
    stop(
      "Input data.frame does not contain a 'Flag' column despite 'use_ms2' being set too TRUE. ",
      "Please run RT_factor_sort() to produce Flag column."
    )
  }

  ### Define Helper Functions ###

  showplots <- function(X, extra) {
    # for (i in 1:length(unique(X$species))){

    # run <- X[X$species == unique(X$species)[i],]
    # run <-run[run$match_ID %in% LOBpeaklist$match_ID,]

    print(ggplot(X, aes(x = peakgroup_rt, y = LOBdbase_mz, color = lpSolve)) +
      scale_color_manual(values = c("Maybe" = "#e7cd08", "No" = "#e70808", "Yes" = "#08e799")) +
      geom_point() +
      geom_text(
        label = paste0(
          as.character(X$FA_total_no_C), ":",
          as.character(X$FA_total_no_DB)
        ),
        hjust = 1,
        vjust = 2,
        size = 3,
        color = "black"
      ) +
      ggtitle(paste0("lpSolve Screened Data - ", as.character(X$species), "-", extra)) +
      xlab("Peak Group Retention Time (sec)") +
      ylab("Peak Group m/z"))
  }

  screen <- function(X) {
    # for (k in 1:length(unique(X$species))) {

    run <- X
    return <- done
    # Binary string for each point
    Binary_String <- rep(1, nrow(run))

    # Empty String we will build a restrictions from
    Empty_String <- rep(0, nrow(run))

    # Matrix of our Exclusions
    Exclusion_Matrix <- matrix(nrow = 1, ncol = nrow(run))


    # Run a loop to find what to exclude for each point
    cat("\n")
    for (i in 1:nrow(run)) {

      # Get our row
      subject <- run[i, ]
      Exclusion_Table <- run

      # Make a table to store our exclusion info
      Exclusion_Table$Exclude <- rep(FALSE, nrow(run))

      # lets add more info about the peaks to our exclusion table based on info in it
      Exclusion_Table$C_num_diff <- Exclusion_Table$FA_total_no_C - subject$FA_total_no_C
      Exclusion_Table$DB_num_diff <- Exclusion_Table$FA_total_no_DB - subject$FA_total_no_DB
      Exclusion_Table$diff_factor <- Exclusion_Table$C_num_diff / Exclusion_Table$DB_num_diff

      # Lets sort the compounds run above and below our point in terms of rt
      lower_rt <- Exclusion_Table[which(Exclusion_Table$peakgroup_rt < subject$peakgroup_rt), ]
      higher_rt <- Exclusion_Table[which(Exclusion_Table$peakgroup_rt > subject$peakgroup_rt), ]

      # Now find ones that break the rules for lower and higher and set Exclude to TRUE in the Exclusion_Table

      # Exclude the lower with more carbon and less double bonds
      lower_match_ID <- lower_rt[lower_rt$FA_total_no_C >= subject$FA_total_no_C & lower_rt$FA_total_no_DB <= subject$FA_total_no_DB, "match_ID"]
      Exclusion_Table[which(Exclusion_Table$match_ID %in% lower_match_ID), "Exclude"] <- TRUE

      lower_diff <- lower_rt[which(lower_rt$diff_factor > 0 & lower_rt$diff_factor <= 1 & lower_rt$C_num_diff < 0), ]
      lower_diff_exclude <- lower_diff[which(lower_diff$C_num_diff == -1 & lower_diff$DB_num_diff == -1), ]
      if (nrow(lower_diff_exclude) > 0) {
        lower_diff <- lower_diff[!(lower_diff$match_ID %in% lower_diff_exclude$match_ID), ]
      }
      lower_diff_names <- row.names(lower_diff)
      Exclusion_Table[lower_diff_names, "Exclude"] <- TRUE


      # Exclude the higher
      higher_match_ID <- higher_rt[higher_rt$FA_total_no_C <= subject$FA_total_no_C & higher_rt$FA_total_no_DB >= subject$FA_total_no_DB, "match_ID"]
      Exclusion_Table[higher_match_ID, "Exclude"] <- TRUE

      higher_diff <- higher_rt[which(higher_rt$diff_factor > 0 & higher_rt$diff_factor <= 1 & higher_rt$C_num_diff > 0), ]
      higher_diff_exclude <- higher_diff[which(higher_diff$C_num_diff == 1 & higher_diff$DB_num_diff == 1), ]
      if (nrow(higher_diff_exclude) > 0) {
        higher_diff <- higher_diff[!(higher_diff$match_ID %in% higher_diff_exclude$match_ID), ]
      }
      higher_diff_names <- row.names(higher_diff)
      Exclusion_Table[higher_diff_names, "Exclude"] <- TRUE

      # Exclude the compounds with the same name
      Exclusion_Table[which(Exclusion_Table$compound_name == subject$compound_name), "Exclude"] <- TRUE

      Exclusion_String <- Empty_String

      for (j in 1:nrow(run)) {
        if (j != i) {
          if (Exclusion_Table[j, "Exclude"] == TRUE) {
            Exclusion_String <- Empty_String
            Exclusion_String[j] <- 1
            Exclusion_String[i] <- 1
            Exclusion_Matrix <- rbind(Exclusion_Matrix, Exclusion_String)
            rownames(Exclusion_Matrix) <- NULL
            Exclusion_Matrix <- unique(Exclusion_Matrix)
          }
        }
        cat("\r")
        flush.console()
        cat("Writing rules for", as.character(unique(X$species)), "compound number", i, "of", nrow(run), ". Number of Rules created:", nrow(Exclusion_Matrix) - 1, "...")
      }
    }
    cat("Done")
    Final_Exclusion_Matrix <- Exclusion_Matrix[-1, ]

    if (NROW(Final_Exclusion_Matrix) == 0) {
      cat("\n No rules created. Compound class is to small or any lacks noise to screen. Returning all points as Yes")
      return[return$match_ID %in% run$match_ID, "lpSolve"] <- "Yes"
    } else {
      if (NCOL(Final_Exclusion_Matrix) == 1) {
        rows <- 1
        Final_Exclusion_Matrix <- t(matrix(Final_Exclusion_Matrix))
      } else {
        rows <- nrow(Final_Exclusion_Matrix)
      }


      # time to screen

      cat("\nApplying lpSolveAPI algorythm...")

      num_con <- rows
      num_points <- nrow(run)

      lpmodel <- make.lp(num_con, num_points)
      set.constr.type(lpmodel, rep("<=", num_con))
      set.rhs(lpmodel, rep(1, num_con))
      set.type(lpmodel, columns = c(1:num_points), "binary")
      lp.control(lpmodel, sense = "max")
      for (i in 1:num_points) {
        set.column(lpmodel, i, rep(1, length(which(Final_Exclusion_Matrix[, i] == 1))), which(Final_Exclusion_Matrix[, i] == 1))
      }
      set.objfn(lpmodel, rep(1, num_points))

      # find first solution
      status <- solve(lpmodel)
      sols <- matrix(ncol = num_points)
      sols <- sols[-1, ] # create list for more solutions
      obj0 <- get.objective(lpmodel)
      counter <- 0 # construct a counter so you wont get more than 100 solutions

      # find more solutions
      while (counter < 100) {
        sol <- get.variables(lpmodel)
        sols <- rbind(sols, sol)
        add.constraint(lpmodel, 2 * sol - 1, "<=", sum(sol) - 1)
        rc <- solve(lpmodel)
        if (status != 0) break
        if (get.objective(lpmodel) < obj0) break
        counter <- counter + 1
      }

      cat(" Done")

      bar <- data.frame(colSums(sols))
      run$Type <- rep(0, nrow(run))
      run[which(bar == 0), "Type"] <- "No"
      run[which(bar != nrow(sols) & bar != 0), "Type"] <- "Maybe"
      run[which(bar == nrow(sols)), "Type"] <- "Yes"
      return[return$match_ID %in% run$match_ID, "lpSolve"] <- run[order(run$match_ID), "Type"]
    }
    # }
    return(return)
  }

  ### Format our input in a 'run' dataframe

  done <- LOBpeaklist
  done$lpSolve <- rep(NA, length(done$match_ID))
  LOBpeaklist <- LOBpeaklist[which(LOBpeaklist$degree_oxidation == 0), ]

  if (is.null(choose_class) == FALSE) {
    if (any(!(choose_class %in% unique(LOBpeaklist$species)))) {
      stop("Chosen 'choose_class' does not appear in the 'species' column of data.frame.")
    } else {
      LOBpeaklist <- LOBpeaklist[which(LOBpeaklist$species == choose_class), ]
    }
  } else {
    LOBpeaklist <- LOBpeaklist[which(is.na(LOBpeaklist$FA_total_no_DB) == FALSE), ]
  }

  for (k in 1:length(unique(LOBpeaklist$species))) {
    LOBrun <- LOBpeaklist[which(LOBpeaklist$species == unique(LOBpeaklist$species)[k]), ]

    # Put what we need in a dataframe, only include RtF data if it exists
    if (is.null(LOBrun$Flag) == FALSE) {
      PRErun <- data.frame(
        LOBrun$match_ID,
        LOBrun$compound_name,
        LOBrun$LOBdbase_mz,
        LOBrun$peakgroup_rt,
        LOBrun$FA_total_no_C,
        LOBrun$FA_total_no_DB,
        LOBrun$species,
        LOBrun$DNPPE_Factor,
        LOBrun$Flag,
        LOBrun$DBase_DNPPE_RF
      )
      # Re-name our column names
      colnames(PRErun) <- c(
        "match_ID",
        "compound_name",
        "LOBdbase_mz",
        "peakgroup_rt",
        "FA_total_no_C",
        "FA_total_no_DB",
        "species",
        "DNPPE_Factor",
        "Flag",
        "DBase_DNPPE_RF"
      )
    } else {
      PRErun <- data.frame(
        LOBrun$match_ID,
        LOBrun$compound_name,
        LOBrun$LOBdbase_mz,
        LOBrun$peakgroup_rt,
        LOBrun$FA_total_no_C,
        LOBrun$FA_total_no_DB,
        LOBrun$species
      )
      # Re-name our column names
      colnames(PRErun) <- c(
        "match_ID",
        "compound_name",
        "LOBdbase_mz",
        "peakgroup_rt",
        "FA_total_no_C",
        "FA_total_no_DB",
        "species"
      )
    }

    # If we have elected to use MS2 data, limit our options based on that
    Final_Switch <- FALSE
    if (use_ms2 == TRUE) {
      if (any(PRErun$Flag %in% "5%_rtv" | PRErun$Flag %in% "ms2v")) {
        Final_Switch <- TRUE
        cat("\nPerforming Prescreen of MS2 data for use later for", as.character(unique(LOBpeaklist$species)[k]))

        # find ms2v and %5v points from the data set
        ms2v <- PRErun[which(PRErun$Flag == "ms2v"), ]
        rtv <- PRErun[which(PRErun$Flag == "5%_rtv"), ]
        ms2v <- rbind(ms2v, rtv)

        # Screen for points within that that dont fit and plot the results
        ms2v_screened <- screen(ms2v)
        ms2v_screened_noNA <- ms2v_screened[which(is.na(ms2v_screened$lpSolve) == FALSE & ms2v_screened$species == unique(LOBpeaklist$species)[k]), ]

        if (plot_data == TRUE) {
          showplots(ms2v_screened_noNA, extra = "RtF Confirmed Only")
        }

        # Take only the yes points
        use2screen <- ms2v_screened_noNA[ms2v_screened_noNA$lpSolve == "Yes", ]

        cat("\nWill use", nrow(use2screen), "verified points for screening.")
        cat("\n")

        # For each class eliminate everything that doesn't fit with the yes points

        run <- use2screen
        final_cut <- integer()
        for (i in 1:nrow(run)) {

          # Get our row
          subject <- run[i, ]
          Exclusion_Table <- PRErun

          # Make a table to store our exclusion info
          Exclusion_Table$Exclude <- rep(FALSE, nrow(Exclusion_Table))

          # lets add more info about the peaks to our exclusion table based on info in it
          Exclusion_Table$C_num_diff <- Exclusion_Table$FA_total_no_C - subject$FA_total_no_C
          Exclusion_Table$DB_num_diff <- Exclusion_Table$FA_total_no_DB - subject$FA_total_no_DB
          Exclusion_Table$diff_factor <- Exclusion_Table$C_num_diff / Exclusion_Table$DB_num_diff

          # Lets sort the compounds run above and below our point in terms of rt
          lower_rt <- Exclusion_Table[which(Exclusion_Table$peakgroup_rt < subject$peakgroup_rt), ]
          higher_rt <- Exclusion_Table[which(Exclusion_Table$peakgroup_rt > subject$peakgroup_rt), ]

          # Now find ones that break the rules for lower and higher and set Exclude to TRUE in the Exclusion_Table

          # Exclude the lower
          lower_match_ID <- lower_rt[lower_rt$FA_total_no_C >= subject$FA_total_no_C & lower_rt$FA_total_no_DB <= subject$FA_total_no_DB, "match_ID"]

          lower_diff <- lower_rt[which(lower_rt$diff_factor > 0 & lower_rt$diff_factor <= 1 & lower_rt$C_num_diff < 0), ]
          lower_diff_exclude <- lower_diff[which(lower_diff$C_num_diff == -1 & lower_diff$DB_num_diff == -1), ]
          if (nrow(lower_diff_exclude) > 0) {
            lower_diff <- lower_diff[!(lower_diff$match_ID %in% lower_diff_exclude$match_ID), ]
          }

          final_cut <- c(lower_diff$match_ID, lower_match_ID, final_cut)


          # Exclude the higher
          higher_match_ID <- higher_rt[higher_rt$FA_total_no_C <= subject$FA_total_no_C & higher_rt$FA_total_no_DB >= subject$FA_total_no_DB, "match_ID"]

          higher_diff <- higher_rt[which(higher_rt$diff_factor > 0 & higher_rt$diff_factor <= 1 & higher_rt$C_num_diff > 0), ]
          higher_diff_exclude <- higher_diff[which(higher_diff$C_num_diff == 1 & higher_diff$DB_num_diff == 1), ]
          if (nrow(higher_diff_exclude) > 0) {
            higher_diff <- higher_diff[!(higher_diff$match_ID %in% higher_diff_exclude$match_ID), ]
          }

          final_cut <- c(higher_diff$match_ID, higher_match_ID, final_cut)

          cat("\r")
          flush.console()
          cat("Eliminating points that do not fit. Point number", i, "of", nrow(run), ". Number of points excluded:", length(unique(final_cut)), "...")
        }
        cat("Done!")
        cat("\n")
        ms2_excluded_matchids <- PRErun[PRErun$match_ID %in% final_cut, "match_ID"]
        PRErun <- PRErun[!(PRErun$match_ID %in% final_cut), ]
      } else {
        cat("\nNo points within 5% of a retention time window. Moving to next class")
        next()
      }
    }


    # Screen everything
    cat("\nPerforming screening of complete dataset for", as.character(unique(LOBpeaklist$species)[k]))
    screened <- screen(PRErun)
    if (Final_Switch == TRUE) {
      screened[screened$match_ID %in% ms2_excluded_matchids, "lpSolve"] <- "No"
    }
    screened_plot <- screened[which(is.na(screened$lpSolve) == FALSE & screened$species == unique(LOBpeaklist$species)[k]), ]

    if (plot_data == TRUE) {
      showplots(screened_plot, extra = "Final")
    }

    # if (save.files==TRUE){
    #   ggsave(filename = paste0(as.character(run$species), "_LP_solve.tiff"),
    #          plot = last_plot(),
    #          device = "tiff",
    #          width = 22, height = 17)
    # }
    cat("\n")
    if (use_ms2 == TRUE & Final_Switch == TRUE) {
      test <- unique(screened[which(screened$match_ID %in% use2screen$match_ID), "lpSolve"])
      if ("No" %in% test) {
        cat("\n")
        warning("Not all MS2v peaks included in solution.")
        done[done$match_ID %in% screened_plot$match_ID, "lpSolve"] <- screened_plot[order(screened_plot$match_ID), "lpSolve"]
      }
      if ("Maybe" %in% test) {
        cat("\n")
        warning("Not all MS2v peaks included in solution.")
        done[done$match_ID %in% screened_plot$match_ID, "lpSolve"] <- screened_plot[order(screened_plot$match_ID), "lpSolve"]
      }
      cat("\n")
      cat("All MS2v peaks included in solution!")
      done[done$match_ID %in% screened_plot$match_ID, "lpSolve"] <- screened_plot[order(screened_plot$match_ID), "lpSolve"]
    } else {
      done[done$match_ID %in% screened_plot$match_ID, "lpSolve"] <- screened_plot[order(screened_plot$match_ID), "lpSolve"]
      cat("Class Screened without MS2 data.")
    }
    cat("\n")
  }
  return(done)
}
hholm/LOB_tools documentation built on Oct. 27, 2019, 4:14 a.m.