R/PreprocessData.R
In MetID: Network-based prioritization of putative metabolite IDs

Documented in get_cleaned

#' Preprocess input file.
#'
#' Reads the table of query peaks and putative identifications, normalises the
#' KEGG / PubChem ID columns, looks up the InChIKey of every putative ID and,
#' within each query peak, merges identifications that share an InChIKey.
#'
#' @param filename a data frame already loaded into R (when \code{type} is 'data.frame'), or the
#'             name of the file which the data are to be read from (when \code{type} is 'csv' or 'txt').
#'             The data must have columns named exactly 'query_m.z' (query mass of peaks),
#'             'exact_m.z' (exact mass of putative IDs), 'kegg_id' (IDs of putative IDs from KEGG
#'             Database) and 'pubchem_cid' (CIDs of putative IDs from PubChem Database); otherwise
#'             the function stops with an error. A peak ID column 'metid' is not required: it is
#'             computed from 'query_m.z' (rows with the same query mass get the same ID) and
#'             overwrites any 'metid' column already present.
#' @param type string indicating the type of the input: 'data.frame' (the default) for a data frame
#'             which is already loaded into R, 'csv' for a file read with \code{read.csv}, or 'txt'
#'             for a file read with \code{read.table}.
#' @param na a character vector of strings which are to be interpreted as NA values.
#' @param sep a character value which separates multiple IDs in kegg_id or pubchem_cid field, if there
#'            are multiple IDs.
#' @return get_cleaned returns a list containing the following components:
#'        \item{df}{a data frame which is the input data with the columns 'inchikey' and 'metid' added.}
#'        \item{clean_data}{a data frame with columns metid, qmass, emass, kid, cid and inchikey, from
#'                          which the rows without a usable PubChem CID have been removed.}
#'        \item{mass}{a data frame with unique query peak, along with query mass.}
#'        \item{ID}{a data frame with unique putative IDs, along with PubChem ID, KEGG ID, exact mass.}
#'        \item{index_na}{a vector of indexes of the input rows whose PubChem CID is NA.}
get_cleaned <- function(filename, type = c('data.frame','csv','txt'), na, sep){

  ## Accept different types of files
  # `type` defaults to a length-3 vector; testing it directly inside `if ()`
  # is an error in current R, so it is first resolved to a single choice
  # (the first one, 'data.frame', when the argument is left at its default).
  type <- tryCatch(
    match.arg(type, c('data.frame', 'csv', 'txt')),
    error = function(e){
      stop('Please provide a data frame in R or a csv/text file!', call. = FALSE)
    }
  )
  df <- switch(
    type,
    'data.frame' = filename,
    'csv' = read.csv(filename, header = TRUE, stringsAsFactors = FALSE, na.strings = na),
    'txt' = read.table(filename, header = TRUE, stringsAsFactors = FALSE, na.strings = na)
  )

  # Check column names of data
  required <- c("query_m.z", "exact_m.z", "kegg_id", "pubchem_cid")
  if (!all(required %in% colnames(df))){
    stop('Please provide features needed, with corresponding column names!')
  }

  ## Load data
  qmass <- df$query_m.z
  emass <- df$exact_m.z
  # Add metid column: rows sharing a query mass share a peak ID, numbered in
  # order of first appearance.
  metid <- match(qmass, unique(qmass), nomatch = 0)
  # Format id columns
  kid <- format_id(df$kegg_id, na = na, sep = sep)
  cid <- format_id(df$pubchem_cid, na = na, sep = sep)
  # When several CIDs are listed for one row only the first one is kept.
  cid <- as.numeric(stringr::word(cid, 1))
  # Get inchikey
  inchikey <- get_inchikey(cid)
  df$inchikey <- inchikey
  df$metid <- metid

  ## Format data
  df_formated <- data.frame(metid, qmass, emass, kid, cid, inchikey, stringsAsFactors = FALSE)
  # remove NAs
  missing_cid <- is.na(cid)
  index_empty <- which(missing_cid)
  df_formated <- df_formated[!missing_cid, , drop = FALSE]

  # Combine rows with same inchikey, one query peak at a time. The pieces are
  # bound once at the end instead of growing a data frame inside a loop, and
  # nothing is combined when no row survived the NA filter.
  if (nrow(df_formated) > 0){
    per_peak <- lapply(unique(df_formated$metid), function(peak){
      combine_inchikey(df_formated[df_formated$metid == peak, , drop = FALSE])
    })
    df_combined <- do.call(rbind, per_peak)
  } else {
    df_combined <- df_formated
  }

  ## get mass data -- with m query_m.z
  mass <- df_combined[!duplicated(df_combined$metid), c("metid", "qmass")]

  ## get ID data -- with c identifications
  ID <- df_combined[!duplicated(df_combined$cid), c("metid", "kid", "cid", "emass")]

  return(list(df = df, clean_data = df_formated, mass = mass, ID = ID, index_na = index_empty))
}



################################## helper functions ##################################

format_id <- function(id, na, sep){
  # This function formats id columns.
  #
  # id  : vector of raw ID fields (one element per row); may contain NA.
  # na  : character vector of strings that stand for a missing value; every
  #       occurrence of each of them is removed from the fields.
  # sep : the string that separates multiple IDs inside one field.
  #
  # Returns a character vector of the same length as `id` in which the IDs of
  # a field are separated by exactly one space, with no blanks inside an ID and
  # no leading/trailing whitespace. Missing fields become ''.
  #
  # `na` and `sep` are matched literally, not as regular expressions, so
  # values such as '|', '.' or 'N/A' behave as written.
  id <- as.character(id)
  id[is.na(id)] <- ''

  # Empty / NA markers cannot be searched for and are skipped. Longer markers
  # are removed first so that a marker contained in another one does not
  # leave a remainder behind.
  na <- as.character(na)
  na <- na[!is.na(na) & nzchar(na)]
  na <- na[order(nchar(na), decreasing = TRUE)]
  for (marker in na){
    id <- gsub(marker, '', id, fixed = TRUE)
  }

  # Split every field on the separator; an unusable separator leaves the
  # field as a single token.
  if (length(sep) == 1 && !is.na(sep) && nzchar(sep)){
    tokens <- strsplit(id, sep, fixed = TRUE)
  } else {
    tokens <- as.list(id)
  }

  # Remove blanks inside each token, drop the empty tokens produced by
  # leading, trailing or repeated separators, and join what is left.
  vapply(tokens, function(parts){
    parts <- trimws(gsub(' ', '', parts, fixed = TRUE))
    paste(parts[nzchar(parts)], collapse = ' ')
  }, FUN.VALUE = character(1), USE.NAMES = FALSE)
}

get_inchikey <- function(ids){
  # This function returns, for every PubChem CID in `ids`, the first 14
  # characters of its InChIKey, or NA when the CID is NA or is not listed.
  #
  # The lookup table is the object named `InchiKey`, fetched by name at call
  # time; it is not defined in this file. Its columns `CID` and `InchiKey`
  # are used.
  # NOTE(review): presumably this is a dataset shipped with the package.
  inchi_table <- get("InchiKey")

  # One vectorised lookup instead of one table scan per id. match() uses the
  # first matching row, so a CID listed more than once in the table no longer
  # aborts the lookup.
  hit <- match(ids, inchi_table$CID)
  # A missing id never matches, even if the table itself contains an NA CID.
  hit[is.na(ids)] <- NA_integer_

  substr(as.character(inchi_table$InchiKey)[hit], 1, 14)
}

combine_inchikey <- function(compound){
  # This function combines identifications with same InchiKey.
  #
  # compound : data frame of identifications with at least the columns
  #            `inchikey` and `kid`.
  #
  # Returns one row per distinct InchiKey, in order of first appearance. Each
  # returned row is the first row of its group, except that `kid` holds the
  # non-empty `kid` values of the whole group joined by single spaces.
  #
  # Rows whose InchiKey is NA are kept as they are and never merged: comparing
  # against NA with `==` would yield NA and select all-NA rows, and an unknown
  # key gives no evidence that two identifications are the same.
  if (nrow(compound) == 0){
    return(compound)
  }

  keys <- as.character(compound$inchikey)
  # Index of the first row carrying the same key; every NA-key row points to
  # itself so that it forms its own group.
  first_row <- match(keys, keys)
  unknown <- is.na(keys)
  first_row[unknown] <- which(unknown)
  keep <- which(first_row == seq_along(keys))

  combined <- compound[keep, , drop = FALSE]
  # Group order is fixed by `keep`, so it lines up with the rows of `combined`.
  kid_groups <- split(as.character(compound$kid), factor(first_row, levels = keep))
  combined$kid <- vapply(kid_groups, function(kids){
    trimws(paste(kids[!is.na(kids) & nzchar(kids)], collapse = ' '))
  }, FUN.VALUE = character(1), USE.NAMES = FALSE)
  return(combined)
}