R/filterDataSet.R

Defines functions filterDataSet

Documented in filterDataSet

#' @title Filter Data set
#' @author André Vidas Olsen
#' @description filterDataSet selects relevants columns from a data set and
#' remove species if the name does not exist in the global database.
#' @param data data formatted by the use of the mergeDataSet function from
#' LipidQ.
#' @param endogene_lipid_db the endogene lipid database
#' @param ISTD_lipid_db the ISTD lipid database
#' @param userSpecifiedColnames the column names template file containing user
#' specified column names for the input data.
#' @return a filtered data set
#' @export
#' @examples
#' # load endo & ISTD databases as well as user specified column names file.
#' endogene_lipid_db <- read.table(system.file("extdata/dataTables/checks",
#'  "endogene_lipid_db.csv", package = "lipidQ"), stringsAsFactors = FALSE,
#'  header = TRUE, sep = ",")
#'
#' ISTD_lipid_db <- read.table(system.file("extdata/dataTables/checks",
#'  "ISTD_lipid_db.csv", package = "lipidQ"), stringsAsFactors = FALSE,
#'  header = TRUE, sep = ",")
#'
#' userSpecifiedColnames <- read.table(system.file("extdata/LipidQ_DataBase",
#'  "userSpecifiedColnames.csv", package = "lipidQ"),
#'  stringsAsFactors = FALSE, header = TRUE, sep = ",")
#'
#' # load is sorted mergedDataSet made by using the mergeDataSets() function
#' mergedDataSetsIsSorted <- read.table(system.file("extdata/dataTables/checks",
#'  "mergedDataSets.csv", package = "lipidQ"),
#'  stringsAsFactors = FALSE, header = TRUE, sep = ",")
#'
#' # filter the data set
#' filteredDatasets <- filterDataSet(data = mergedDataSetsIsSorted,
#'  endogene_lipid_db = endogene_lipid_db, ISTD_lipid_db = ISTD_lipid_db,
#'  userSpecifiedColnames = userSpecifiedColnames)
filterDataSet <- function(data, endogene_lipid_db, ISTD_lipid_db,
                          userSpecifiedColnames = NULL){

  # merge endogene_lipid_db and ISTD_lipid_db together
  database <- merge_endo_and_ISTD_db(endogene_lipid_db, ISTD_lipid_db)

  # get colnames for data
  dataColnames <- checkColnames(userSpecifiedColnames)

  # for each row, check if either the NAME or SPECIE column begins with a
  # [SPACE] and remove this [SPACE] if true.
  database[,dataColnames$SUM_COMPOSITION] <- ifelse(substring(
    database[, dataColnames$SUM_COMPOSITION],1,1) == " ",
    substring(database[, dataColnames$SUM_COMPOSITION], 2),
    database[, dataColnames$SUM_COMPOSITION])




  #### select relevant columns
  data_tmp <- data[,c(dataColnames$SUM_COMPOSITION, dataColnames$PPM,
        dataColnames$MASS_TO_CHARGE,dataColnames$SPECIE_COMPOSITION, "MODE",
        dataColnames$C_CHAIN, dataColnames$DOUBLE_BOND, dataColnames$OH_GROUP)]
  MS1x_tmp <- data[,c(colnames(data)[grep(paste0("^", dataColnames$MS1x),
                                          colnames(data))])]

  # find user specified columns of MS2ix column names -> MS2ix_userCols
  MS2ix_cols <- dataColnames[grep("MS2",colnames(dataColnames))]

  MS2ix_userCols <- c()
  for(userCol in as.character(MS2ix_cols)){
    MS2ix_userCols <-
      append(MS2ix_userCols, c(colnames(data)[grep(userCol, colnames(data))]))
  }
  MS2ix_userCols <- unique(MS2ix_userCols)

  MS2ix_tmp <- data[,MS2ix_userCols]
  data <- cbind(data_tmp,MS1x_tmp, MS2ix_tmp)



  #### Filtering based on 1/0 columns in database

  # select user specified columns of MS2ix columns from database
  MS2ix_userCols_database <- c()
  for(userCol in as.character(MS2ix_cols)){
    MS2ix_userCols_database <-
      append(MS2ix_userCols_database,
      colnames(database)[grep(userCol,colnames(database))])
  }

  # find all class names in database
  classNames <- unique(database[,dataColnames$SUM_COMPOSITION])


  # for each class in database, choose all species in data
  for(className in classNames){
    # select relevant columns (based on the 1/0's in the database)

    col_index <-
      (database[database[,dataColnames$SUM_COMPOSITION] == className,
      MS2ix_userCols_database] == 1)[1,] # the [1,] ensures that only the first
    # row of columns is chosen (if there are multiple rows with the same
    # className in the SUM_COMPOSITION column)

    selectedColNames <- MS2ix_userCols_database[col_index]



    if(length(selectedColNames) > 0){ # avoid error by ensuring that there
      # exists some selected columns.

      # use selectedColNames to select all relevant columns for each sample in
      # data
      for(k in 1:(ncol(MS1x_tmp))){


        if(k <= 9){


          selectedColRows <-
            subset(data,data[,dataColnames$SUM_COMPOSITION] == className,
            select = paste0(selectedColNames, "_0",k))
        }else{
          selectedColRows <-
            subset(data, data[,dataColnames$SUM_COMPOSITION] == className,
            select = paste0(selectedColNames, "_",k))

        }


        if(nrow(selectedColRows) > 0){ # only check condition on column-row
          # values if selectedColRows actually contains rows
          for(i in 1:nrow(selectedColRows)){
            # check that all relevant columns (selected by 1/0 in the database)
            # has a value > 0 for a given row. If at least one has a value <= 0,
            # then this row == 0 for all columns.
            if(any(selectedColRows[i,] <= 0)){
              selectedColRows[i,] <- 0
            }
          }
          # set MS1x to 0 if all rows for the 1. columns in a given class has
          # values <= 0. (The remaining columns are not neccesary to check,
          # since checks and modifications of all columns for each row have been
          # made).
          if(all(selectedColRows[,1] <= "0")){
            data[data[,dataColnames$SUM_COMPOSITION] == className,
                 colnames(MS1x_tmp)[k]] <- 0
          }
        }
      }
    }
  }


  # create QUAN_SCAN column to data consisting of the QUAN_SCAN column in
  # database.
  data$QUAN_SCAN <-
    database$QUAN_SCAN[match(data[,dataColnames$SUM_COMPOSITION],
    database[,dataColnames$SUM_COMPOSITION])]


  # create SPECIE_COMPOSITION.GLOBAL column to data consisting of the
  # SPECIE_COMPOSITION column in database.
  data[, paste0(dataColnames$SPECIE_COMPOSITION, ".GLOBAL")] <-
    database[match(data[,dataColnames$SUM_COMPOSITION],
    database[,dataColnames$SUM_COMPOSITION]), dataColnames$SPECIE_COMPOSITION]


  #### remove specie names that are not included in database
  GLOBAL.SUM_COMPOSITION.CHECK <-
    database[match(data[,dataColnames$SUM_COMPOSITION],
    database[,dataColnames$SUM_COMPOSITION]), dataColnames$SUM_COMPOSITION] #
  # transfer SUM_COMPOSITION col from database to data

  data <- data[!is.na(GLOBAL.SUM_COMPOSITION.CHECK),] # remove all rows whose
  # names were not found in database


  #### only include rows that have been monitored with the specified
  #### MODE (POS/NEG) in the database column
  data <-
    data[data[,"MODE"] == database[match(data[, dataColnames$SUM_COMPOSITION],
    database[,dataColnames$SUM_COMPOSITION]), "QUAN_MODE"],]


  #### create SPECIE_COMPOSITION.ALL col: consists of all species within specie
  #### name seperated by "|", e.g. DAG 16:1-16:1|DAG 18:1-14:1
  data[,paste0(dataColnames$SPECIE_COMPOSITION,".ALL")] <- NA
  nameList <- unique(data[,dataColnames$SUM_COMPOSITION])
  for(name in nameList){ # for each specie name, find all species and insert
    # them into SPECIE_COMPOSITION.ALL seperated by "|"
    specie_tmp <-data[data[,dataColnames$SUM_COMPOSITION] == name,
                      dataColnames$SPECIE_COMPOSITION]

    specie_tmp <- paste(specie_tmp, collapse = '|')

    data[which(data[,dataColnames$SUM_COMPOSITION] == name),
         paste0(dataColnames$SPECIE_COMPOSITION, ".ALL")] <- specie_tmp
  }


  #### remove duplicates

  # find potential value > 0 for each SUM_COMPOSITION in each MS1x column and
  # set this value as the default for this class name (if it exists), so that
  # they appear after removal of duplicates. (MS1x always have the same
  # value > 0)
  classNames <- unique(data[,dataColnames$SUM_COMPOSITION])
  for(MS1x in colnames(MS1x_tmp)){
    for(className in classNames){
      data[data[,dataColnames$SUM_COMPOSITION] == className, MS1x] <-
        max(data[data[,dataColnames$SUM_COMPOSITION] == className, MS1x])
    }
  }

  # remove all duplicates of SUM_COMPOSITION.
  data <- data[!duplicated(data[,dataColnames$SUM_COMPOSITION]),]

  # replace NA and "" with "none" in SPECIE_COMPOSITION.GLOBAL
  data[, paste0(dataColnames$SPECIE_COMPOSITION, ".GLOBAL")] <-
    ifelse(is.na(data[, paste0(dataColnames$SPECIE_COMPOSITION, ".GLOBAL")]) |
    data[, paste0(dataColnames$SPECIE_COMPOSITION, ".GLOBAL")] == "", "none",
    data[, paste0(dataColnames$SPECIE_COMPOSITION, ".GLOBAL")])


  return(data)
}
ELELAB/lipidQ documentation built on Feb. 24, 2020, 12:54 a.m.