R/convert_mzid.R

Defines functions convert_mzid Convert_by_usr Ind_mod_fix1 Ind_mod_fix Ind_modtype_err AddModLabel Add_label

#' @author Liya Ming
#' @title convert_mzid
#' @description Convertion of mzid format to the input file required by function mms2plot. 
#' @export convert_mzid
#' @param path_user_table File path of the table that contains the information of 
#'        user-specified modification.The columns in the table represent respectively
#'        the paths of the raw file (format.mzML), the first two letters of the 
#'        modification of interest, the mass shift of the modification 
#'        (e.g.The mass shift of oxidation is 15.994915),and the acceptable mass error.
#' @param dir_mzid The full path of searching result mzid format
#'        (e.g.result of comet).
#' @param label.by.Raw_seq_charge a logical value about the way of adding lables,
#'        if FALSE, the PSMs from the same .raw(.mzML or .mzXML) file and were 
#'        identified as the same sequence will be labled as the same number, 
#'        whether or not the number of charges is the same;If TRUE,only psms 
#'        have the same number of charges will be labled the same number,
#'        the psms labled the same number will be outputed to the same PDF 
#'        file when generating mirrored or aligned spectra using mms2plot.
#' @return the input file required by mms2plot.
#' @import stringr
#' @import mzID
#' @importFrom tools file_path_sans_ext
#' @importFrom dplyr select
#' @importFrom data.table fread
#' @note See vignettes for more details
#' @examples
#' ###
#' general_path = system.file( package = "mms2plot",dir = "extdata" )
#' setwd(general_path)
#' ######
#' #Mzid files generated by comet
#' path_prot = 'mzid_convert/acc_genename.txt'
#' path_user_table = 'mzid_convert/user_table_test.txt'
#' dir_mzid = 'mzid_convert/mzid_comet'
#' lf = list.files(dir_mzid,full.names = T)
#' a=mapply(unzip,lf[grep('\\.zip',lf)],exdir = dir_mzid,USE.NAMES = F)
#' # mms2plot_input = convert_mzid(path_user_table,dir_mzid,label.by.Raw_seq_charge)#for test
#' ###
#' #Mzid files generated by MSGFPlus
#' path_prot = 'mzid_convert/acc_genename.txt'
#' path_user_table = 'mzid_convert/user_table_test.txt'
#' dir_mzid = 'mzid_convert/mzid_MSGFPlus'
#' lf = list.files(dir_mzid,full.names = T)
#' a=mapply(unzip,lf[grep('\\.zip',lf)],exdir = dir_mzid,USE.NAMES = F)
#' # mms2plot_input = convert_mzid(path_user_table,dir_mzid,label.by.Raw_seq_charge)#for test

#rm(list=ls())

options(stringsAsFactors = FALSE)
options(digits = 15)


######Path of protein information:
test = F

if(test){
  general_path = 'M:/Software/Git/new/inst/extdata'
}else{
  general_path = system.file( package = "mms2plot",dir = "extdata" )
}

#setwd(general_path)

path_prot = paste(general_path,'mzid_convert/acc_genename.txt',sep = '/')

###########******FUNCTION******#########
####***AddModLabel:add labels by raw_sequence***#########
Add_label=function(i,df,keys){
  df_sub=df[keys == unique(keys)[i],]
  df_sub$label = i
  return(df_sub)
}

###***AddModLabel:add modification symbols into sequence***####
AddModLabel = function(i,seqs,poss,modlabel){
  seq = seqs[i]
  pos = as.numeric(poss[[i]])
  split = unlist(strsplit(seq,''))
  mod.res = split[pos]
  split[pos] = paste0(mod.res,modlabel)
  modseq = paste(split,collapse = '')
  return(modseq)
}

#Get indicies beyond the mw range of user-specified modification and fixed modification 
Ind_modtype_err =  function(MW,flo,cei,c_flo,c_cei){
  mw = as.numeric(MW)
  ind_within = which(flo < mw & mw < cei | c_flo < mw & mw < c_cei)
}

#get the mw and position within range of fixed modification
Ind_mod_fix =  function(MW,c_flo,c_cei){
  mw = as.numeric(MW)
  ind_within = which(c_flo < mw & mw < c_cei)
}

#paste mws and positions
Ind_mod_fix1 = function(MW_POS,c_flo,c_cei){
  mw = as.numeric(unlist(stringr::str_extract_all(MW_POS,"[^\\(]\\d+\\.?\\d+[^\\)]")))
  pos = stringr::str_remove_all(string = unlist(stringr::str_extract_all(MW_POS,'\\(\\d+\\)')),pattern = '[\\(\\)]')
  ind_within = which(c_flo < mw & mw < c_cei)
  mw_1 = mw[-ind_within]
  pos_1 = pos[-ind_within]
  pas = paste(paste(mw_1,paste0('(',pos_1,')')),collapse=';')
  return(pas)
}

Convert_by_usr <- function(i,table,paths_mzid){
  # i is the row number of table
  # table is user_table which is a dataframe
  user_table = table[i,]
  raw_path = user_table$fullfilepath
  rawName = basename(tools::file_path_sans_ext(raw_path))
  #>>>>>>
  
  if(!file.exists(raw_path)){
    lf=list.files(dirname(raw_path),full.names = T)
    if(file.exists(paste(dirname(raw_path),paste0(rawName,'.zip'),sep = '/'))){
      unzip(paste(dirname(raw_path),paste0(rawName,'.zip'),sep = '/'),exdir = dirname(raw_path))
    }else{
      stop(paste0('The raw file does NOT exist in the path: ',
                  dirname(raw_path)),'! Please check the folder...')
    }
  }
  
  path_mzid_user = paths_mzid[grep(paste0(rawName,'.mzid'),basename(paths_mzid),fixed = T)]
  ######******MZID --> flatten file
  mzid = mzID::mzID(path_mzid_user)
  fla = mzID::flatten(mzid)
  fla = fla[with(fla,rank == 1),]#comet
  # browser()
  # #msgfplus:
  inv.col = c("start","end","pre","post","accession","description")
  if(length(which(inv.col %in% colnames(fla))) > 0){
    fla = fla[!duplicated(subset(fla,select = -which(colnames(fla) %in% inv.col))),]
  }
  # browser()
  ######******filter modification mass shift in flatten file
  mod = subset(fla,fla$modified == TRUE)
  unmod = subset(fla,fla$modified == FALSE)
  ###***range of mw,only for user_table with single row
  accu_usr = user_table$delta
  mw_usr = user_table$mass_shift
  mw_floor = as.numeric(mw_usr) - as.numeric(accu_usr)
  mw_ceiling = as.numeric(mw_usr) + as.numeric(accu_usr)
  fix_c_mw = 57.021464
  c_fixed_floor = as.numeric(fix_c_mw) - as.numeric(accu_usr)
  c_fixed_ceiling = as.numeric(fix_c_mw) + as.numeric(accu_usr)
  ######******Extract all of mw and midified positions
  mw_pos=mod$modification#mass shift and position
  extra_mw = stringr::str_extract_all(mw_pos,"[^\\(]\\d+\\.?\\d+[^\\)]")
  extra_pos = mapply(FUN = stringr::str_remove_all,
                     string = stringr::str_extract_all(mw_pos,'\\(\\d+\\)'),pattern = '[\\(\\)]')
  #####***Save mw within range [mw-variance,mw+variance]
  #remove mw not within modification specified by user and fixed modification
  inds_correct_ls = lapply(FUN = Ind_modtype_err,X = extra_mw,
                           flo = mw_floor,cei = mw_ceiling,
                           c_flo = c_fixed_floor,c_cei = c_fixed_ceiling)
  count_mw = unlist(lapply(extra_mw,length))
  count_correct = unlist(lapply(inds_correct_ls,length))
  inds_correct = which(count_mw == count_correct)
  mod_user_1 = mod[inds_correct,]#subset within mw range and fixed modification mw range
  mw_pos_1=mod_user_1$modification
  extra_mw_1 = stringr::str_extract_all(mw_pos_1,"[^\\(]\\d+\\.?\\d+[^\\)]")
  inds_fix_within = lapply(FUN=Ind_mod_fix,X = extra_mw_1,c_flo = c_fixed_floor,c_cei = c_fixed_ceiling)
  count_fix = unlist(lapply(inds_fix_within,length))
  inds_fix = which(count_fix != 0)
  mod_fix_within = mod_user_1[inds_fix,]#subset containing fixed modification
  mod_nonfix = mod_user_1[-inds_fix,]#subset DOES NOT contain fixed modification
  #####***Save mw within range [mw-variance,mw+variance]
  mw_pos_2=mod_fix_within$modification
  extra_mw_2 = stringr::str_extract_all(mw_pos_2,"[^\\(]\\d+\\.?\\d+[^\\)]")
  count_mw_2 = unlist(lapply(extra_mw_2,length))
  #When a peptide has only fixed modification,
  #the modification column will be changed to NA, modified to FALSE, and merged into the unmod subset
  ind_only_fix = which(count_fix[count_fix != 0] == count_mw_2)
  mod_only_fix = mod_fix_within[ind_only_fix,]#subset has only fixed modification
  mod_only_fix$modified = FALSE
  mod_only_fix$modification = NA
  unmod_1 = rbind(mod_only_fix,unmod)#merge into unmod subset
  mod_mul_types = mod_fix_within[-ind_only_fix,]#subset containing fixed and variable modifications simultaneously
  mw_pos_clean = unlist(lapply(X=mod_mul_types$modification,c_flo=c_fixed_floor,c_cei=c_fixed_ceiling,FUN=Ind_mod_fix1))
  mod_mul_types$modification = mw_pos_clean
  mod_1 = rbind(mod_mul_types,mod_nonfix)#merge into mod subset
  ######***Save pairs/groups***######
  seq_intsect = intersect(unmod_1$pepseq,mod_1$pepseq)
  ind_sub = which(mod_1$pepseq %in% seq_intsect)
  mod_1_sub = mod_1[ind_sub,]
  unmod_sub = unmod_1[unmod_1$pepseq %in% seq_intsect,]
  ######******Extract mw and modified position from flatten file******######
  mw_pos_mod_1 = mod_1_sub$modification
  MWs = stringr::str_extract_all(mw_pos_mod_1,"[^\\(]\\d+\\.?\\d+[^\\)]")
  POSs = mapply(FUN = stringr::str_remove_all,
                string = stringr::str_extract_all(mw_pos_mod_1,'\\(\\d+\\)'),pattern = '[\\(\\)]')
  ######******Get modified sequence with modification labels such as "(ox)"******######
  SEQ = mod_1_sub$pepseq
  modlabel = paste0('(',tolower(user_table$modification),')')
  Modified_seq = mapply(AddModLabel,seq(nrow(mod_1_sub)),MoreArgs = list(seqs=SEQ,poss=POSs,modlabel=modlabel))#modified sequences
  ######******Get the dataframe required by mms2plot******######
  unmod_sub$`Modified sequence` = unmod_sub$pepseq
  mod_1_sub$`Modified sequence` = Modified_seq
  unmod_sub$Modifications = 'Unmodified'
  mod_1_sub$Modifications = user_table$modification
  bind_mod_unmod = rbind(mod_1_sub,unmod_sub)
  bind_mod_unmod$`Raw file` = raw_path
  
  if('description' %in% colnames(bind_mod_unmod)){
    cols_select = c('Raw file','acquisitionnum','Modifications','pepseq',
                    'Modified sequence','chargestate','description')
    df_mms2plot = dplyr::select(bind_mod_unmod,cols_select)
    colnames(df_mms2plot) = c('Raw file','Scan number','Modifications','Sequence',
                              'Modified sequence','Charge','description')
    GNs=stringr::str_remove(string=stringr::str_extract(df_mms2plot$description,'GN=\\w+'),
                            pattern = 'GN=')
    df_mms2plot = dplyr::select(df_mms2plot,-description)
    df_mms2plot$`Gene Names` = GNs
  }else{
    cols_select = c('Raw file','acquisitionnum','Modifications','pepseq',
                    'Modified sequence','chargestate','accession')
    df_mms2plot = dplyr::select(bind_mod_unmod,cols_select)
    colnames(df_mms2plot) = c('Raw file','Scan number','Modifications','Sequence',
                              'Modified sequence','Charge','Protein')
    acc.names = stringr::str_remove_all(stringr::str_extract(df_mms2plot$Protein,'\\|\\w+\\|'),'\\|')
    prot_acc_gn = data.table::fread(path_prot,header=T)
    GNs = prot_acc_gn$gene_name[match(acc.names,prot_acc_gn$accession)]
    df_mms2plot$`Gene Names` = GNs
    df_mms2plot = dplyr::select(df_mms2plot,-Protein)
  }
  return(df_mms2plot)
}

####>>>>>>  Main function:convert_mzid  <<<<<<####
convert_mzid = function(path_user_table,dir_mzid,label.by.Raw_seq_charge){
  user_table = data.table::fread(path_user_table,header = T)#read user table
  paths_mzid = list.files(dir_mzid,recursive = T,full.names = T)
  result_df = lapply(FUN=Convert_by_usr,X=seq(nrow(user_table)),
                     table=user_table,
                     paths_mzid=paths_mzid)
  df = do.call(rbind,result_df)
  df_mod = df[with(df,Modifications == 'Unmodified'),]
  df_unmod = df[with(df,Modifications != 'Unmodified'),]
  if(label.by.Raw_seq_charge){
    key_unmod = unique(paste(df_unmod$`Raw file`,df_unmod$Sequence,df_unmod$Charge))
    key_mod = unique(paste(df_mod$`Raw file`,df_mod$Sequence,df_mod$Charge))
    key_intersect = intersect(key_unmod,key_mod)
    keep_unmod = df_unmod[with(df_unmod,paste(`Raw file`,`Sequence`,`Charge`) %in% key_intersect),]
    keep_mod = df_mod[with(df_mod,paste(`Raw file`,`Sequence`,`Charge`) %in% key_intersect),]
    bind_groups = rbind(keep_mod,keep_unmod)
    keys = paste(bind_groups$`Raw file`,bind_groups$Sequence,bind_groups$Charge)
  }else{
    key_unmod = unique(paste(df_unmod$`Raw file`,df_unmod$Sequence))
    key_mod = unique(paste(df_mod$`Raw file`,df_mod$Sequence))
    key_intersect = intersect(key_unmod,key_mod)
    keep_unmod = df_unmod[with(df_unmod,paste(`Raw file`,`Sequence`) %in% key_intersect),]
    keep_mod = df_mod[with(df_mod,paste(`Raw file`,`Sequence`) %in% key_intersect),]
    bind_groups = rbind(keep_mod,keep_unmod)
    keys = paste(bind_groups$`Raw file`,bind_groups$Sequence)
  }
  df_sub_ls = lapply(seq(length(key_intersect)),Add_label,df = bind_groups,keys = keys)
  df_labeled = do.call(rbind,df_sub_ls)
  return(df_labeled)
}

###***Run the main function:convert_mz id
LeahMing/mms2plot documentation built on May 1, 2020, 4:01 p.m.