R/library_reader.R

Defines functions standarlize_library readMSP2 readMGF2 load_object library_reader

Documented in library_reader

#' Read and extract an existing library
#'
#' The function creates or extracts a list object containing dataframe of metadata and a list of spectra from library object 
#'
#' @param raw_library Library file name (character) or object generated by library_generator. If character, the file extension must be mgf, msp or rdata
#' 
#' @return
#' \itemize{
#'   \item{sp:}{ List of all extracted spectra. Each spectrum is a data matrix with two columns: m/z and intensity} 
#'   \item{metadata:}{ Data frame containing metadata of extracted scans.}
#' }
#' 
#' @importFrom tools file_ext
#' @importFrom MSnbase fData readMgfData
#' @importFrom plyr rbind.fill
#' 
#' @export
#' 
library_reader<-function(raw_library){
  
  options(stringsAsFactors = FALSE)
  options(warn=-1)
  
  library_complete = NULL
  library_consensus = NULL
  library_network = NULL
  
  ########################
  ### Check input format:#
  ########################
  
  if (is.character(raw_library)){
    input_format = tolower(file_ext(raw_library))
    if (!input_format %in% c("msp", "rdata", "mgf")){
      stop("The input library must be mgf, msp or RData format!")
    }
  }
  
  if (is.null(raw_library)){
    stop("Please provide file name or a library object as library input!") 
  }
  
  if (is.list(raw_library)){
    
    input_format = "list"
    
    if ("SELECTED" %in% names(raw_library)){raw_library = raw_library$SELECTED}
    tmp_labels = names(raw_library)
    
    if (!("complete" %in% tmp_labels) || !("consensus" %in% tmp_labels) & !("network" %in% tmp_labels)){
      if ("metadata" %in% tmp_labels & "sp" %in% tmp_labels){
        raw_library = list(complete = raw_library, consensus = raw_library, network = NULL)
      } else {
        stop("Format of the library is not valid")
      }
    }
  }
  
  ##################
  ### Read inputs:##
  ##################
  
  if (input_format=="list"){
    library_complete = raw_library$complete
    library_consensus = raw_library$consensus
    library_network = raw_library$network
  }
  
  if (input_format=="mgf"){library_complete = readMGF2(raw_library)}
  
  if (input_format=="rdata"){
    raw_library = load_object(raw_library)
    library_complete = raw_library$complete
    library_consensus = raw_library$consensus
    library_network = raw_library$network
  }
  
  if (input_format=="msp"){library_complete = readMSP2(raw_library)}
  
  #############################################
  ### Clean and standardize complete library:##
  #############################################
  
  #library_complete = standarlize_library(library_complete)
  #if (!is.null(library_consensus)){library_consensus = standarlize_library(library_consensus)}
  
  ####################
  ### Output inputs:##
  ####################
  
  input_library = list(complete = library_complete, consensus = library_consensus, network=library_network)
  
  return(input_library)
}

######################
### Internal function:
######################

load_object <- function(file) {
  tmp <- new.env()
  load(file = file, envir = tmp)
  tmp[[ls(tmp)[1]]]
}

readMGF2<-function(con){
  
  ### Reading from spectral library
  
  db = readMgfData(con, verbose = FALSE)
  metadata = fData(db)
  N = nrow(metadata)
  
  # From a MSnBase object to a list of spectra m/z intensity
  spectrum_list=list()

  for (i in 1:N){
    spectrum_list[[i]]=cbind(db[[i]]@mz,db[[i]]@intensity)
  }
  
  ### Return new results:

  library = list()
  if (N>0){
    library$sp = spectrum_list
    library$metadata = metadata
  }
  return(library)
}

readMSP2<-function(con){

  ### Reading from spectral library

  msp <- file(con,open="r")
  msp_lines <- readLines(msp,warn = F)
  msp_lines = msp_lines[msp_lines!=""]
  close(msp)
  
  begin_ind = which(grepl("Name:",msp_lines))
  #end_ind = which(grepl("Num Peaks:",msp_lines))
  
  N = length(begin_ind)
  begin_ind = c(begin_ind, length(msp_lines)+1)
  
  metadata = c() # New metadata
  splist = list()
  
  for (i in 1:N){
    
    # Find metadata ranges:
    checked = FALSE
    ki = begin_ind[i]
    while (!checked){
      ki = ki + 1
      checked = !is.na(as.numeric(strsplit(msp_lines[ki], " ")[[1]][1]))
    }
    end_ind = ki - 1
    
    metadata_ind = begin_ind[i]:end_ind
    metadata_range = msp_lines[metadata_ind]
    NI = length(metadata_ind)
    
    new_metadata_names = sapply(metadata_range, function(x) strsplit(x,": ")[[1]][1])
    new_metadata_names = as.character(new_metadata_names)
    new_metadata = sapply(metadata_range, function(x) strsplit(x,": ")[[1]][2])
    new_metadata = as.data.frame(t(new_metadata))
    colnames(new_metadata) = new_metadata_names
    
    metadata = rbind.fill(metadata, new_metadata)
    
    spectrum_ind = (end_ind+1):(begin_ind[i+1]-1)
    nb_peak = length(spectrum_ind)
    spectrum_range = msp_lines[spectrum_ind]
    mzlist = sapply(spectrum_range,function(x) strsplit(x," ")[[1]][1])
    intlist = sapply(spectrum_range,function(x) strsplit(x," ")[[1]][2])
    spectrum = cbind(as.numeric(mzlist),as.numeric(intlist))
    splist[[i]] = spectrum
  }
  
  ### Return results

  rownames(metadata) = NULL
  metadata= data.frame(metadata)
  
  return(list(metadata = metadata, sp = splist))
}

standarlize_library<-function(library0){
  
  metadata = library0$metadata
  sp = library0$sp
  temp_col = toupper(colnames(metadata))

  # ID:
  ind1 = which(temp_col=="ID")
  ind2 = which(temp_col=="NAME")
  if (length(ind1)==0 & length(ind2)==1){colnames(metadata)[ind2] = "ID"}
  if (!("ID" %in% colnames(metadata))){stop("Metadata must contain a column ID!")}
  metadata$ID = as.character(metadata$ID)

  #PEPMASS:
  ind = which(temp_col=="PRECURSORMZ")
  if (length(ind)==1){colnames(metadata)[ind] = "PEPMASS"}  
  if ("PEPMASS" %in% colnames(metadata)){
    temp_mz = round(as.numeric(metadata$PEPMASS),4)
    temp_mz[is.na(temp_mz)]="N/A"
    metadata$PEPMASS = temp_mz
  }

  #RT:
  ind = which(temp_col=="RTINSECONDS")
  if (length(ind)==1){
    colnames(metadata)[ind] = "RT"
    metadata$RT = as.numeric(metadata$RT)/60 # From second to minute
  }
  if ("RT" %in% colnames(metadata)){
    temp_rt = round(as.numeric(metadata$RT),2)
    temp_rt[is.na(temp_rt)]="N/A"
    metadata$RT = temp_rt
  }

  #MSLEVEL:
  ind = which(temp_col=="SPECTRUM_TYPE")
  if (length(ind)==1){colnames(metadata)[ind] = "MSLEVEL"}
  if ("MSLEVEL" %in% colnames(metadata)){
    temp = metadata$MSLEVEL
    temp[temp=="MS2"] = 2
    temp[temp=="MS1"] = 1
    temp = as.numeric(temp)
    temp[is.na(temp)]="N/A"
    metadata$MSLEVEL = temp
  }
  
  # ADDUCT:
  ind = which(temp_col=="PRECURSOR_TYPE")
  if (length(ind)==1){colnames(metadata)[ind] = "ADDUCT"}
  
  # IONDMODE:
  ind = which(temp_col=="ION_MODE")
  if (length(ind)==1){colnames(metadata)[ind] = "IONMODE"}
  if ("IONMODE" %in% colnames(metadata)){
    temp = as.character(metadata$IONMODE)
    temp[temp=="P"] = "Positive"
    temp[temp=="N"] = "Negative"
    temp[which(!(temp %in% c("Positive", "Negative")))] = "N/A"
    metadata$IONMODE = temp
  } 
  
  # SCANS:
  ind = which(temp_col %in% "SCAN")
  if (length(ind)==1){colnames(metadata)[ind] = "SCANS"}
  if (!("SCANS" %in% colnames(metadata))){metadata$SCANS = 1:nrow(metadata)}
  if (NA %in% as.numeric(metadata$SCANS)){metadata$SCANS = 1:nrow(metadata)}
  
  # COMMENTS:
  ind = which(temp_col=="COMMENTS")
  if (length(ind)==1){metadata = metadata[,-ind]}
  
  # sp
  
  if (!is.list(sp)){sp = list(sp)}
  sp = lapply(sp, function(x) data.matrix(x[,1:2,drop=FALSE]))

  library1 = list(metadata = metadata, sp = sp)
  return(library1)
}
daniellyz/MergeION2 documentation built on Jan. 26, 2024, 6:24 a.m.