R/LOB_lpsolve.R

Defines functions LOB_lpsolve

# LOB_lpsolve

LOB_lpsolve <- function(LOBpeaklist,choose_class=NULL,save.files=FALSE,use_ms2=FALSE,plot_data = FALSE) {


  library(lpSolve)


  ### Check Inputs ###

  if (!class(LOBpeaklist)=="data.frame") {

    stop("Input 'LOBpeaklist' is not an 'data.frame' object.\n",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(LOBpeaklist$match_ID)) {

    stop("Input data.frame does not contain a 'match_ID' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(LOBpeaklist$compound_name)) {

    stop("Input data.frame does not contain a 'compound_name' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(LOBpeaklist$LOBdbase_mz)) {

    stop("Input data.frame does not contain a 'LOBdbase_mz' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(LOBpeaklist$peakgroup_rt)) {

    stop("Input data.frame does not contain a 'peakgroup_rt' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(LOBpeaklist$FA_total_no_C)) {

    stop("Input data.frame does not contain a 'FA_total_no_C' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  if (is.null(LOBpeaklist$FA_total_no_DB)) {

    stop("Input data.frame does not contain a 'FA_total_no_DB' column.",
         "Please use a data.frame generated by 'getLOBpeaklist'.")

  }

  ### Define Helper Functions ###

  showplots<-function(X,extra){
    #for (i in 1:length(unique(X$species))){

    #run <- X[X$species == unique(X$species)[i],]
    #run <-run[run$match_ID %in% LOBpeaklist$match_ID,]

    print(ggplot(X,aes(x = peakgroup_rt, y = LOBdbase_mz,color=lpSolve)) +
            scale_color_manual(values=c("Maybe" = "#e7cd08","No"="#e70808","Yes"="#08e799")) +
            geom_point() +
            geom_text(label=paste0(as.character(X$FA_total_no_C), ":",
                                   as.character(X$FA_total_no_DB)),
                      hjust = 1,
                      vjust = 2,
                      size=3,
                      color="black")+
            ggtitle(paste0("lpSolve Screened Data - ", as.character(X$species),"-",extra)) +
            xlab("Peak Group Retention Time (sec)")+
            ylab("Peak Group m/z")

    )}

  screen<-function(X){
    #for (k in 1:length(unique(X$species))) {

      run <- X
      return <- done
      #Binary string for each point
      Binary_String <- rep(1, nrow(run))

      #Empty String we will build a restrictions from
      Empty_String <- rep(0, nrow(run))

      #Matrix of our Exclusions
      Exclusion_Matrix <- matrix(nrow = 1,ncol = nrow(run))


      # Run a loop to find what to exclude for each point
      cat("\n")
      for (i in 1:nrow(run)) {

        #Get our row
        subject <- run[i,]
        Exclusion_Table <- run

        # Make a table to store our exclusion info
        Exclusion_Table$Exclude <- rep(FALSE,nrow(run))

        #lets add more info about the peaks to our exclusion table based on info in it
        Exclusion_Table$C_num_diff <- Exclusion_Table$FA_total_no_C - subject$FA_total_no_C
        Exclusion_Table$DB_num_diff <- Exclusion_Table$FA_total_no_DB - subject$FA_total_no_DB
        Exclusion_Table$diff_factor <- Exclusion_Table$C_num_diff/Exclusion_Table$DB_num_diff

        #Lets sort the compounds run above and below our point in terms of rt
        lower_rt <- Exclusion_Table[which(Exclusion_Table$peakgroup_rt < subject$peakgroup_rt),]
        higher_rt <- Exclusion_Table[which(Exclusion_Table$peakgroup_rt > subject$peakgroup_rt),]

        #Now find ones that break the rules for lower and higher and set Exclude to TRUE in the Exclusion_Table

        #Exclude the lower with more carbon and less double bonds
        lower_match_ID <- lower_rt[lower_rt$FA_total_no_C>=subject$FA_total_no_C & lower_rt$FA_total_no_DB<=subject$FA_total_no_DB,"match_ID"]
        Exclusion_Table[which(Exclusion_Table$match_ID %in% lower_match_ID),"Exclude"] <- TRUE

        lower_diff <- lower_rt[which(lower_rt$diff_factor>0 & lower_rt$diff_factor<=1 & lower_rt$C_num_diff<0),]
        lower_diff_exclude <- lower_diff[which(lower_diff$C_num_diff==-1 & lower_diff$DB_num_diff==-1),]
        if (nrow(lower_diff_exclude)>0){
          lower_diff <- lower_diff[!(lower_diff$match_ID %in% lower_diff_exclude$match_ID),]
        }
        lower_diff_names <- row.names(lower_diff)
        Exclusion_Table[lower_diff_names,"Exclude"] <- TRUE


        #Exclude the higher
        higher_match_ID <- higher_rt[higher_rt$FA_total_no_C<=subject$FA_total_no_C & higher_rt$FA_total_no_DB>=subject$FA_total_no_DB,'match_ID']
        Exclusion_Table[higher_match_ID,"Exclude"] <- TRUE

        higher_diff <- higher_rt[which(higher_rt$diff_factor>0 & higher_rt$diff_factor<=1 & higher_rt$C_num_diff>0),]
        higher_diff_exclude <- higher_diff[which(higher_diff$C_num_diff==1 & higher_diff$DB_num_diff==1),]
        if (nrow(higher_diff_exclude)>0){
          higher_diff <- higher_diff[!(higher_diff$match_ID %in% higher_diff_exclude$match_ID),]
        }
        higher_diff_names <- row.names(higher_diff)
        Exclusion_Table[higher_diff_names,"Exclude"] <- TRUE

        #Exclude the compounds with the same name
        Exclusion_Table[which(Exclusion_Table$compound_name==subject$compound_name),"Exclude"] <- TRUE

        Exclusion_String <- Empty_String

        for (j in 1:nrow(run)) {
          if(j!=i){
            if(Exclusion_Table[j,"Exclude"]==TRUE){
              Exclusion_String <- Empty_String
              Exclusion_String[j]<-1
              Exclusion_String[i]<-1
              Exclusion_Matrix <- rbind(Exclusion_Matrix,Exclusion_String)
              rownames(Exclusion_Matrix) <- NULL
              Exclusion_Matrix <- unique(Exclusion_Matrix)
            }
          }
          cat("\r")
          flush.console()
          cat("Writing rules for",as.character(unique(X$species)),"compound number",i,"of",nrow(run),". Number of Rules created:",nrow(Exclusion_Matrix)-1,"...")
        }
      }
      cat("Done")
      Final_Exclusion_Matrix <- Exclusion_Matrix[-1,]

      if(NROW(Final_Exclusion_Matrix)==0){
        cat("\n No rules created. Compound class is to small or any lacks noise to screen. Returning all points as Yes")
        return[return$match_ID %in% run$match_ID,"lpSolve"] <- "Yes"
      }else{

        if (NCOL(Final_Exclusion_Matrix)==1) {
          rows <- 1
          Final_Exclusion_Matrix <- t(matrix(Final_Exclusion_Matrix))
        }else{
          rows <- nrow(Final_Exclusion_Matrix)
        }


        #time to screen
        dir <- rep("<=", rows) # all constraints '<='

        rhs <- rep(1, rows) # all right hand sides = 1

        #all.bin for binary. Set Solution number high to get all solutions.


        cat("\nApplying lpSolve algorythm...")
        gc()
        sol <- lp("max", Binary_String, Final_Exclusion_Matrix, dir, rhs,all.bin = TRUE,num.bin.solns = 100)
        gc()
        cat(" Done")
        numcols <- nrow(run)
        numsols <- sol$num.bin.solns

        solutions <- matrix(head(sol$solution, numcols*numsols), nrow=numsols, byrow=TRUE)

        bar <- data.frame(colSums(solutions))
        run$Type<-rep(0,nrow(run))
        run[which(bar==0),"Type"] <- 'No'
        run[which(bar!=nrow(solutions) & bar!=0 ),"Type"] <- 'Maybe'
        run[which(bar==nrow(solutions)),"Type"] <- 'Yes'
        return[return$match_ID %in% run$match_ID,"lpSolve"] <- run[order(run$match_ID),"Type"]
      }
    #}
    return(return)
  }

  ### Format our input in a 'run' dataframe

  done <- LOBpeaklist

  done$lpSolve <- rep(NA,length(done$match_ID))

  LOBpeaklist <- LOBpeaklist[which(LOBpeaklist$degree_oxidation==0),]

  if (is.null(choose_class)==FALSE) {
    if(any(!(choose_class %in% unique(LOBpeaklist$species)))){
      stop("Chosen 'choose_class' does not appear in the 'species' column of data.frame.")
    }else{
      LOBpeaklist <- LOBpeaklist[which(LOBpeaklist$species==choose_class),]}
  }else{
    LOBpeaklist <- LOBpeaklist[which(is.na(LOBpeaklist$FA_total_no_DB)==FALSE),]
  }

for (k in 1:length(unique(LOBpeaklist$species))) {

    LOBrun <- LOBpeaklist[which(LOBpeaklist$species== unique(LOBpeaklist$species)[k]),]

    # Put what we need in a dataframe, only include RtF data if it exists
    if(is.null(LOBrun$Flag)==FALSE){
      PRErun  <- data.frame(LOBrun$match_ID,
                            LOBrun$compound_name,
                            LOBrun$LOBdbase_mz,
                            LOBrun$peakgroup_rt,
                            LOBrun$FA_total_no_C,
                            LOBrun$FA_total_no_DB,
                            LOBrun$species,
                            LOBrun$DNPPE_Factor,
                            LOBrun$Flag,
                            LOBrun$DBase_DNPPE_RF)
      #Re-name our column names
      colnames(PRErun) <- c("match_ID",
                            "compound_name",
                            "LOBdbase_mz",
                            "peakgroup_rt",
                            "FA_total_no_C",
                            "FA_total_no_DB",
                            "species",
                            "DNPPE_Factor",
                            "Flag",
                            "DBase_DNPPE_RF")
    }else{
      PRErun  <- data.frame(LOBrun$match_ID,
                            LOBrun$compound_name,
                            LOBrun$LOBdbase_mz,
                            LOBrun$peakgroup_rt,
                            LOBrun$FA_total_no_C,
                            LOBrun$FA_total_no_DB,
                            LOBrun$species)
      #Re-name our column names
      colnames(PRErun) <- c("match_ID",
                            "compound_name",
                            "LOBdbase_mz",
                            "peakgroup_rt",
                            "FA_total_no_C",
                            "FA_total_no_DB",
                            "species")
    }

    #If we have elected to use MS2 data, limit our options based on that
    Final_Switch <- FALSE
    if (use_ms2 == TRUE){
      if(any(PRErun$Flag %in% "5%_rtv" | PRErun$Flag %in% "ms2v")){
        Final_Switch <- TRUE
        cat("\nPerforming Prescreen of MS2 data for use later for",as.character(unique(LOBpeaklist$species)[k]))

        #find ms2v and %5v points from the data set
        ms2v <- PRErun[which(PRErun$Flag == "ms2v"),]
        rtv <- PRErun[which(PRErun$Flag == "5%_rtv"),]
        ms2v <- rbind(ms2v,rtv)

        #Screen for points within that that dont fit and plot the results
        ms2v_screened <- screen(ms2v)
        ms2v_screened_noNA <- ms2v_screened[which(is.na(ms2v_screened$lpSolve)==FALSE & ms2v_screened$species == unique(LOBpeaklist$species)[k]),]

        if(plot_data == TRUE){
        showplots(ms2v_screened_noNA,extra = "RtF Confirmed Only")
          }

        #Take only the yes points
        use2screen <- ms2v_screened_noNA[ms2v_screened_noNA$lpSolve == "Yes",]

        cat("\nWill use",nrow(use2screen),"verified points for screening.")
        cat("\n")

        #For each class eliminate everything that doesn't fit with the yes points

        run <- use2screen
        final_cut <- integer()
        for (i in 1:nrow(run)) {

          #Get our row
          subject <- run[i,]
          Exclusion_Table <- PRErun

          # Make a table to store our exclusion info
          Exclusion_Table$Exclude <- rep(FALSE,nrow(Exclusion_Table))

          #lets add more info about the peaks to our exclusion table based on info in it
          Exclusion_Table$C_num_diff <- Exclusion_Table$FA_total_no_C - subject$FA_total_no_C
          Exclusion_Table$DB_num_diff <- Exclusion_Table$FA_total_no_DB - subject$FA_total_no_DB
          Exclusion_Table$diff_factor <- Exclusion_Table$C_num_diff/Exclusion_Table$DB_num_diff

          #Lets sort the compounds run above and below our point in terms of rt
          lower_rt <- Exclusion_Table[which(Exclusion_Table$peakgroup_rt < subject$peakgroup_rt),]
          higher_rt <- Exclusion_Table[which(Exclusion_Table$peakgroup_rt > subject$peakgroup_rt),]

          #Now find ones that break the rules for lower and higher and set Exclude to TRUE in the Exclusion_Table

          #Exclude the lower
          lower_match_ID <- lower_rt[lower_rt$FA_total_no_C>=subject$FA_total_no_C & lower_rt$FA_total_no_DB<=subject$FA_total_no_DB,"match_ID"]

          lower_diff <- lower_rt[which(lower_rt$diff_factor>0 & lower_rt$diff_factor<=1 & lower_rt$C_num_diff<0),]
          lower_diff_exclude <- lower_diff[which(lower_diff$C_num_diff==-1 & lower_diff$DB_num_diff==-1),]
          if (nrow(lower_diff_exclude)>0){
            lower_diff <- lower_diff[!(lower_diff$match_ID %in% lower_diff_exclude$match_ID),]
          }

          final_cut <- c(lower_diff$match_ID,lower_match_ID,final_cut)


          #Exclude the higher
          higher_match_ID <- higher_rt[higher_rt$FA_total_no_C<=subject$FA_total_no_C & higher_rt$FA_total_no_DB>=subject$FA_total_no_DB,"match_ID"]

          higher_diff <- higher_rt[which(higher_rt$diff_factor>0 & higher_rt$diff_factor<=1 & higher_rt$C_num_diff>0),]
          higher_diff_exclude <- higher_diff[which(higher_diff$C_num_diff==1 & higher_diff$DB_num_diff==1),]
          if (nrow(higher_diff_exclude)>0){
            higher_diff <- higher_diff[!(higher_diff$match_ID %in% higher_diff_exclude$match_ID),]
          }

          final_cut <- c(higher_diff$match_ID,higher_match_ID,final_cut)

          cat("\r")
          flush.console()
          cat("Eliminating points that do not fit. Point number",i,"of",nrow(run),". Number of points excluded:",length(unique(final_cut)),"...")
        }
        cat("Done!")
        cat("\n")
        ms2_excluded_matchids <- PRErun[PRErun$match_ID %in% final_cut,"match_ID"]
        PRErun <- PRErun[!(PRErun$match_ID %in% final_cut),]

    }else{
      cat("\nNo points within 5% of a retention time window. Moving to next class")
      next()
    }
  }


  #Screen everything
  cat("\nPerforming screening of complete dataset for",as.character(unique(LOBpeaklist$species)[k]))
  screened <- screen(PRErun)
  if(Final_Switch==TRUE){
    screened[screened$match_ID %in% ms2_excluded_matchids,"lpSolve"] <- "No"
  }
  screened_plot <- screened[which(is.na(screened$lpSolve)==FALSE & screened$species == unique(LOBpeaklist$species)[k]),]

  if(plot_data==TRUE){
  showplots(screened_plot,extra = "Final")
  }

  # if (save.files==TRUE){
  #   ggsave(filename = paste0(as.character(run$species), "_LP_solve.tiff"),
  #          plot = last_plot(),
  #          device = "tiff",
  #          width = 22, height = 17)
  # }
  cat("\n")
  if (use_ms2==TRUE & Final_Switch == TRUE) {
    test <- unique(screened[which(screened$match_ID %in% use2screen$match_ID),"lpSolve"])
    if("No" %in% test){
      cat("\n")
      warning("Not all MS2v peaks included in solution.")
      done[done$match_ID %in% screened_plot$match_ID,"lpSolve"] <- screened_plot[order(screened_plot$match_ID),"lpSolve"]
    }
    if("Maybe" %in% test){
      cat("\n")
      warning("Not all MS2v peaks included in solution.")
      done[done$match_ID %in% screened_plot$match_ID,"lpSolve"] <- screened_plot[order(screened_plot$match_ID),"lpSolve"]
    }
    cat("\n")
    cat("All MS2v peaks included in solution!")
    done[done$match_ID %in% screened_plot$match_ID,"lpSolve"] <- screened_plot[order(screened_plot$match_ID),"lpSolve"]
  }else{
    done[done$match_ID %in% screened_plot$match_ID,"lpSolve"] <- screened_plot[order(screened_plot$match_ID),"lpSolve"]
    cat("Class Screened without MS2 data.")
  }
cat("\n")
}
  return(done)
}
hholm/LOB_tools documentation built on Oct. 27, 2019, 4:14 a.m.