# R/PrepareInuputFiles.R
#
# Defines functions: PrepareInuputFiles
#
# Documented in: PrepareInuputFiles

#' @title Prepare input file in standard formats of quantification tools
#' @description Reads the result file exported by a proteomic quantification
#' tool (18 tools are recognised), identifies the tool from the layout of the
#' table and reshapes the data into a sample-by-protein table whose first
#' column holds the class label of each sample.
#' @param acquisitionmethods Input the corresponding "number" of acquisition techniques as follows:
#' If set 1, the user chooses to process the data based on SWATH-MS.
#' If set 2, the user chooses to process the data based on Peak Intensity.
#' If set 3, the user chooses to process the data based on Spectral Counting.
#' @param rawdataset Path of the raw dataset directly obtained from the
#' software. The file may be comma-, semicolon- or tab-delimited; the
#' separator is detected from its first two lines.
#' @param lable Path of the label file. It is read with a header line; the
#' first column holds the sample names exactly as they appear in the raw
#' dataset and the second column holds the label of each sample.
#' @return A table with one row per sample listed in the label file. For the
#' long-format SWATH-MS exports (Spectronaut, OpenSWATH, Skyline) this is a
#' data frame whose first column is \code{Lable}; for all other tools it is a
#' character matrix whose first column is \code{label}. The remaining columns
#' hold one protein each. An error is raised when the file layout is not
#' recognised.
#' @importFrom tidyr spread
#' @usage PrepareInuputFiles(acquisitionmethods, rawdataset, lable)
#' @export PrepareInuputFiles
#' @examples
#' library(EVALFQ)
#' \donttest{data_s <- PrepareInuputFiles(acquisitionmethods=2,
#' rawdataset = "MaxQuant_proteinGroups_LFQ.txt", lable = "MaxQuant_LFQ_Label.txt")}
PrepareInuputFiles <- function(acquisitionmethods, rawdataset, lable) {

  method <- as.character(acquisitionmethods)
  if (length(method) != 1L || is.na(method) ||
      !method %in% c("1", "2", "3")) {
    stop("'acquisitionmethods' must be 1 (SWATH-MS), 2 (Peak Intensity) ",
         "or 3 (Spectral Counting).", call. = FALSE)
  }

  # Read a delimited text file, choosing among ",", ";" and tab the separator
  # that splits the first `n_probe` lines into the most fields.
  read_table <- function(path, n_probe) {
    probe <- readLines(path, n = n_probe)
    seps <- c(",", ";", "\t")
    n_fields <- vapply(
      seps,
      function(s) length(unlist(strsplit(probe, s, fixed = TRUE))),
      integer(1)
    )
    utils::read.csv(path, header = TRUE, sep = seps[[which.max(n_fields)]],
                    stringsAsFactors = FALSE, check.names = FALSE)
  }

  dataa <- read_table(rawdataset, 2L)
  label_table <- read_table(lable, 1L)
  if (ncol(label_table) < 2L) {
    stop("The label file must contain at least two columns: ",
         "sample names and labels.", call. = FALSE)
  }
  sample_names <- label_table[, 1]
  sample_labels <- label_table[, 2]
  cols <- colnames(dataa)

  # TRUE when every regular expression in `patterns` matches a column name.
  has_all <- function(patterns) {
    all(vapply(patterns, function(p) any(grepl(p, cols)), logical(1)))
  }

  announce <- function(tool) {
    message("the Resulting Data File Generated by the Quantification Software: ",
            tool)
  }

  # Wide tables (one column per sample, one row per protein): pick the sample
  # columns named in the label file, put the labels on top and transpose so
  # that samples become rows.
  wide_result <- function(tab, protein_ids) {
    idx <- match(sample_names, colnames(tab))
    if (anyNA(idx)) {
      stop("Sample(s) listed in the label file were not found among the ",
           "data columns: ",
           paste(sample_names[is.na(idx)], collapse = ", "), call. = FALSE)
    }
    # drop = FALSE keeps a data frame even when a single sample is selected.
    stacked <- rbind(label = as.character(sample_labels),
                     tab[, idx, drop = FALSE])
    out <- t(stacked)
    # Names are set on the matrix (not as data-frame row names) so duplicated
    # or missing protein identifiers do not abort the conversion.
    colnames(out) <- c("label", as.character(protein_ids))
    out
  }

  # Wide tables where the identifier and sample columns are found by pattern.
  from_columns <- function(id_pattern, sample_pattern, ...) {
    keep <- c(grep(id_pattern, cols), grep(sample_pattern, cols, ...))
    tab <- dataa[, keep, drop = FALSE]
    wide_result(tab, tab[, 1])
  }

  # Long tables already spread to one row per sample (first column = sample).
  long_result <- function(wide) {
    rownames(wide) <- wide[, 1]
    wide <- wide[, -1, drop = FALSE]
    idx <- match(sample_names, rownames(wide))
    if (anyNA(idx)) {
      warning("Sample(s) listed in the label file were not found in the ",
              "raw dataset: ",
              paste(sample_names[is.na(idx)], collapse = ", "), call. = FALSE)
    }
    cbind(Lable = sample_labels, wide[idx, , drop = FALSE])
  }

  # Scaffold reports: the table starts two rows above the row whose first
  # cell is "#", and the last two rows of the file are dropped.
  scaffold_result <- function(value_header) {
    first <- match("#", dataa[, 1])
    if (is.na(first)) {
      stop("Could not locate the '#' header row of the Scaffold report.",
           call. = FALSE)
    }
    tab <- dataa[(first - 2):(nrow(dataa) - 2), , drop = FALSE]
    id_col <- which("Accession Number" == tab[3, ])
    value_cols <- which(value_header == tab[1, ])
    tab <- tab[, c(id_col, value_cols), drop = FALSE]
    colnames(tab) <- unlist(as.list(tab[3, ]))
    tab <- tab[-(1:3), , drop = FALSE]
    wide_result(tab, tab[, 1])
  }

  detect_swath <- function() {
    res <- NULL
    if (has_all(c("R.Condition", "R.Replicate", "R.FileName",
                  "EG.MinProfileQvalue", "FG.TotalPeakArea", "FG.Charge",
                  "EG.ProteinId"))) {
      long <- dataa[, c("R.FileName", "EG.ProteinId",
                        "FG.NormalizedTotalPeakArea")]
      res <- long_result(spread(data = long, key = EG.ProteinId,
                                value = FG.NormalizedTotalPeakArea))
      announce("Spectronaut")

    } else if (has_all(c("transition_group_id", "run_id", "filename", "decoy",
                         "Intensity", "ProteinName", "total_xic"))) {
      long <- dataa[, c("filename", "ProteinName", "Intensity")]
      res <- long_result(spread(data = long, key = ProteinName,
                                value = Intensity))
      announce("OpenSWATH")

    } else if (has_all(c("ReplicateName", "FileName", "ProteinName",
                         "PrecursorCharge", "IsDecoy", "TotalArea",
                         "annotation_QValue"))) {
      long <- dataa[, c("ReplicateName", "ProteinName", "TotalArea")]
      res <- long_result(spread(data = long, key = ProteinName,
                                value = TotalArea))
      announce("Skyline")

    } else if (has_all(c("rotein Key", "_Prob", "_Peptides", "_PSMs",
                         "_Top6Top6Freq"))) {
      res <- from_columns("rotein Key", "_Top6Top6Freq")
      announce("DIA-UMPIRE")

    } else if (has_all(c("Protein", "Peptide", "Precursor MZ",
                         "Precursor Charge", "Fragment MZ", "Fragment Charge",
                         "Ion Type"))) {
      # The pattern may hit several columns; the first one carries the IDs.
      id_idx <- grep("Protein", cols)
      res <- wide_result(dataa, dataa[, id_idx[1]])
      announce("PeakView")
    }
    res
  }

  detect_peak_intensity <- function() {
    res <- NULL
    norm_idx <- match("Normalized abundance", cols)
    raw_idx <- match("Raw abundance", cols)

    if ("Protein IDs" %in% cols && "Majority protein IDs" %in% cols &&
        any(grepl("LFQ intensity.", cols))) {
      res <- wide_result(dataa, dataa[, 1])
      announce("MaxQuant")

    } else if (!is.na(norm_idx) && !is.na(raw_idx) &&
               isTRUE(dataa[2, 1] == "Accession") &&
               isTRUE(dataa[2, 2] == "Peptide count")) {
      # The real column names sit in the second data row of the export.
      tab <- dataa[-1, c(1, raw_idx:ncol(dataa)), drop = FALSE]
      colnames(tab) <- unlist(as.list(tab[1, ]))
      tab <- tab[-1, , drop = FALSE]
      res <- wide_result(tab, tab[, 1])
      announce("Progenesis QI")

    } else if (grepl("Protein Group", cols[1]) &&
               grepl("Protein ID", cols[2]) &&
               grepl("Accession", cols[3]) &&
               "#Peptides" %in% cols && "#Unique" %in% cols &&
               any(grepl("Area ", cols))) {
      res <- wide_result(dataa, dataa[, "Accession"])
      announce("PEAKS")

    } else if (ncol(dataa) >= 2L &&
               grepl("Samples report created on", cols[1]) &&
               any(grepl("Scaffold: Version: Scaffold", dataa[, 2]))) {
      res <- scaffold_result("Quantitative Value (Total Precursor Intensity)")
      announce("Scaffold")

    } else if (has_all(c("e-Value", "Score Type", "Protein Id",
                         "Total Intensity"))) {
      res <- from_columns("Protein Id", "Total Intensity")
      announce("Proteios SE")

    } else if (has_all(c("Accession", "Description", "# Peptides", "# PSMs",
                         "# Unique Peptides", "# Protein Groups", "# AAs",
                         "Abundances"))) {
      res <- from_columns("Accession", "Abundances:")
      announce("Thermo Proteome Discoverer")

    } else if (has_all(c("#QPep", "#IPep", "Coverage ")) &&
               length(c(grep("accession", cols),
                        grep("Accession", cols))) > 0L) {
      id_idx <- c(grep("accession", cols), grep("Accession", cols))
      res <- wide_result(dataa, dataa[, id_idx[1]])
      announce("MFPaQ")

    } else if (has_all(c("Protein", "Peptide", "Charge", "rt", "mz", "width",
                         "intensity"))) {
      id_idx <- grep("Protein", cols)
      res <- wide_result(dataa, dataa[, id_idx[1]])
      announce("OpenMS")
    }
    res
  }

  detect_spectral_count <- function() {
    res <- NULL
    if (has_all(c("Protein IDs", "Majority protein IDs", "Unique peptides")) &&
        # fixed = TRUE: the "+" must be matched literally.
        any(grepl("Razor + unique peptides", cols, fixed = TRUE)) &&
        any(grepl("MS/MS count ", cols, ignore.case = TRUE))) {
      res <- from_columns("Protein IDs", "MS/MS count ", ignore.case = TRUE)
      announce("Maxquant")

    } else if (ncol(dataa) >= 2L &&
               grepl("Samples report", cols[1]) &&
               any(grepl("Scaffold: Version: Scaffold", dataa[, 2])) &&
               any(grepl("Total Spectrum Count", dataa))) {
      res <- scaffold_result("Total Spectrum Count")
      announce("Scaffold")

    } else if (has_all(c("PROTID", "_NUMPEPSUNIQ", "_NUMSPECSTOT", "_PW"))) {
      res <- from_columns("PROTID", "_NUMSPECSTOT")
      announce("Abacus")

    } else if (any(grepl("Census version", cols[2])) &&
               any(grepl("H", dataa[, 1])) &&
               any(grepl("P", dataa[, 1])) &&
               any(grepl("S", dataa[, 1]))) {
      # The second row mentioning "PLINE" is used as the header row.
      title <- grep("PLINE", dataa[, 2])
      header <- as.character(unlist(dataa[title[2], , drop = FALSE],
                                    use.names = FALSE))
      count_idx <- grep("SPEC_COUNT", header)
      protein_rows <- grep("P", dataa[, 1])
      tab <- dataa[protein_rows, c(2, count_idx), drop = FALSE]
      colnames(tab) <- c("PLINE", header[count_idx])
      res <- wide_result(tab, tab[, 1])
      announce("Census")

    } else if (has_all(c("accession", "SPEC_COUNT:"))) {
      res <- from_columns("accession", "SPEC_COUNT:")
      announce("MFPaQ")

    } else if (has_all(c("total # pepptides", "ensembl", "swissprot"))) {
      res <- from_columns("swissprot", "total # pepptides")
      announce("ProteinProphet")

    } else if (has_all(c("ACCESSION", "PEPTIDES_COUNT"))) {
      res <- from_columns("ACCESSION", "PEPTIDES_COUNT")
      announce("IRMa-hEIDI")

    } else if (has_all(c("Proteins", "Num of Spectra"))) {
      res <- from_columns("Proteins", "Num of Spectra")
      announce("DTASelect")
    }
    res
  }

  res <- switch(method,
    "1" = detect_swath(),
    "2" = detect_peak_intensity(),
    "3" = detect_spectral_count()
  )

  if (is.null(res)) {
    stop("The raw dataset was not recognised as the output of any supported ",
         "quantification software for the chosen acquisition method.",
         call. = FALSE)
  }
  res
}
# idrblab/EVALFQ documentation built on Sept. 29, 2022, 6:34 p.m.