R/MaxquantDataconvert.R

Defines functions .uniprot_database ID_match P.G.extract MaxQprotein MaxQdataconvert

Documented in ID_match MaxQdataconvert MaxQprotein P.G.extract

#one-step to extract Maxquant quantification information and convert
#20190418version
#20190517
MaxQdataconvert <- function(pgfilename, IDname = "Majority.protein.IDs",
                            IDtype = c("MaxQ","none"), CONremove = TRUE,
                            justID = TRUE, status1 = TRUE, ENTRY1 = TRUE,
                            db1.path = NULL, db2.path = NULL,
                            out.folder = NULL, blast.path = NULL,
                            savecsvpath = NULL, csvfilename = NULL,
                            verbose = 1, ...)
{
  if (!file.exists(pgfilename)) stop("pgfilename is not exist.")
  my_fasta <- read.delim(pgfilename, stringsAsFactors = FALSE)
  IDtype <- match.arg(IDtype, c("MaxQ","none"))
  data <- MaxQprotein(my_fasta, IDname = IDname,
                      IDtype = IDtype, remove = CONremove,
                      verbose = verbose, ...);
  NEXTtoIDmatch = FALSE;
  if (IDtype == "MaxQ"){
    inf <- try(P.G.extract(data$protein_IDs, justID = justID,
                           status1 = status1, ENTRY1 = ENTRY1,
                           verbose = verbose-1));
    if (!inherits(inf,"try-error")) { #220516 #200627 200804 !any(class(inf) %in% "try-error")
      inf <- data.frame(inf[ ,1:3], stringsAsFactors = FALSE);
      colnames(inf) <- c("db.type", "ori.ID", "ENTRY.NAME");
      NEXTtoIDmatch = TRUE;
      }
  }


  if (IDtype == "none") {
    #inf <- my_fasta[ ,IDname];
    inf <- data$protein_IDs;
    inf <- gsub("(.*?);.*","\\1",inf);
    inf <- gsub(".*(tr|sp)\\|(.*)\\|.*", "\\2", inf);
    if (!is.null(db2.path)) {
      if(file.exists(db2.path)){
        my_sequences <- try(.uniprot_database(input_file = db2.path))
        if (!inherits(my_sequences,"try-error")) { #class(my_sequences) != "try-error"
          pos <- match(inf, my_sequences$pro_info$Uniprot.ID);
          inf <- my_sequences$pro_info[pos, 1:3];
          con <- which(is.na(inf[ ,2]));
          #if (length(con) != 0) inf[con, 2] <- my_fasta[con, IDname];
          if (length(con) != 0) inf[con, 2] <- data$protein_IDs[con];
          if (CONremove & length(con)!= 0) {
            inf <- inf[-con, ];
            data$protein_IDs <- data$protein_IDs[-con]
            data$intensity <- data$intensity[-con, ];
          }
          rownames(inf) <- 1:nrow(inf);
          colnames(inf) <- c("db.type", "ori.ID", "ENTRY.NAME");
          NEXTtoIDmatch = TRUE;
          if (length(con) == length(inf)) {
            inf <- data$protein_IDs;
            if(verbose > 0) warning("protein ID is no match for db2.");
            NEXTtoIDmatch = FALSE;
          }
        }
      }
    }
  }

  if (NEXTtoIDmatch) {
    data_match <- try(ID_match(data = inf,
                               db1.path = db1.path,
                               db2.path = db2.path,
                               out.folder=out.folder,
                               blast.path=blast.path,
                               verbose = verbose));
    if (!inherits(data_match,"try-error")) { #220516 class(data_match) != "try-error"
      inf <- data.frame(data_match, db.type = inf$db.type,
                        protein_IDs = data$protein_IDs,
                        stringsAsFactors = FALSE);
    } else warning("ID match is not execute.");
  } else {
    data_match <- try(ID_match(data = inf), silent = TRUE);
    warning("ID match is not execute.")
  }

  if(IDtype == "none")
    data <- list(inf = inf, intensity = data$intensity) else
      data <- list(inf = inf, intensity = data$intensity,
                   iBAQ = data$iBAQ, LFQ = data$LFQ);
  if(NEXTtoIDmatch & ! inherits(data_match,"try-error")) { #220516 class(data_match) != "try-error"
    if (is.character(savecsvpath) & is.character(csvfilename)){
      my_fasta_match <- data.frame(inf, data$LFQ);
      if (!dir.exists(savecsvpath)) dir.create(savecsvpath);
      csvfilename <- paste0(savecsvpath,"/",csvfilename);
      write.csv(my_fasta_match, file = csvfilename);
    } else if (is.character(savecsvpath)) {
      if (verbose > 0) warning("wrong csvfilename");
    } else if (verbose > 0) warning("Don't save csv file.")
  }
  data
}


#read Maxquant proteingroups data, remove contaminant and reverse protein
#20190517 add verbose
MaxQprotein <- function(proteinGroups, IDname = "Majority.protein.IDs",
                        IDtype = "MaxQ", remove = TRUE,
                        QuanCol = NULL, verbose = 1){
  removeConRev <- function(infile){
    ##  remove contaminant, reverse proteinID
    if (is.element("Potential.contaminant", colnames(infile)) &
        is.element("+", unique(infile$Potential.contaminant))) {
      infile <- infile[-which(infile$Potential.contaminant %in% "+"), ]
    }
    if (is.element("Reverse", colnames(infile)) &
        is.element("+", unique(infile$Reverse))) {
      infile <- infile[-which(infile$Reverse %in% "+"), ]
    }
    infile
  }
  if (length(IDname) != 1 & is.vector(IDname) )
    stop ("IDname only allow one column name.")
  if (is.na(IDname)) stop ("IDname is NA.")
  IDtype <- match.arg(IDtype, c("MaxQ","none"));
  if (remove & IDtype == "MaxQ") {
    proteinGroups <- removeConRev(proteinGroups);
    if(verbose > 0) message('* + Contaminant, + Reverse, proteins are removed.')
    }
  colname <- colnames(proteinGroups);
  if (is.element(IDname, colname))
    protein_IDs <- proteinGroups[, IDname] else
      stop(paste("no", IDname));
  if (IDtype == "MaxQ"){
    intensity <- grep("Intensity..*", colname);
    if (length(intensity) == 0)
      intensity <- NULL else
        intensity <- proteinGroups[, intensity];
    iBAQ <- grep("iBAQ..*", colname);
    if (length(iBAQ) == 0)
      iBAQ <- NULL else
        iBAQ <- proteinGroups[, iBAQ];
    LFQ <- grep("LFQ.intensity..*", colname);
    if (length(LFQ) == 0)
      LFQ <- NULL else
        LFQ <- proteinGroups[, LFQ];
    data <- list(protein_IDs = protein_IDs, intensity = intensity, iBAQ = iBAQ, LFQ = LFQ)
  }
  if (IDtype == "none"){
    if (is.null(QuanCol)) {
      IDpos <- which(colname == IDname);
      intensity <- proteinGroups[, -IDpos];
    }
    if (is.numeric(QuanCol)) {
      if (max(QuanCol) <= ncol(proteinGroups) & min(QuanCol) >= 1)
        intensity <- proteinGroups[, QuanCol] else
          stop ("QuanCol is wrong.
Some of the numbers are larger than proteinGroups column number or less than 1.")
    }
    if (is.character(QuanCol)) {
      if (all(is.element(QuanCol, colname)))
        intensity <- proteinGroups[, QuanCol] else
          stop ("QuanCol is wrong.")
    }
    data <- list(protein_IDs = protein_IDs, intensity = intensity)
  }
  data
}


#extract protein groups information
#20190105version
#20190517 add verbose
#20220505 add parameter sp1 and onlysp extract sp ID or not
P.G.extract <- function(inf, ncol = 4,
                        justID = FALSE, status1 = TRUE,
                        ENTRY1 = TRUE, ID1 = TRUE,
                        sp1 = TRUE, onlysp = FALSE,
                        verbose = 0) {
  n.gene <- length(inf)
  if (n.gene == 0) stop("wrong inf.")
  inf_matrix <- matrix(rep("", ncol*n.gene), nrow = n.gene, ncol = ncol)
  for (i in 1:n.gene){
    gene <- unlist(strsplit(inf[i], split = ";"))
    status <- NULL; ENTRY <- NULL;
    possp <- grep("sp",gene); #220505
    if(onlysp & length(possp) != 0) gene <- gene[possp];#220505 only keep sp ID
    if(sp1) pos <- grep("sp",gene)[1] else pos = 1;#220505
    if(is.na(pos)) pos = 1; #220505 when no sp ID, choose the first one.
    seprow1 <- unlist(strsplit(gene[pos], split = "\\|"));
    if (status1) status <- seprow1[1];
    #status<-gsub("(tr|sp)\\|.*","\\1",gene[1])
    if (ENTRY1) ENTRY <- seprow1[3];
    #ENTRY<-gsub("tr|sp\\|.*\\|(.*)","\\1",gene[1])
    if (justID) gene <- gsub(".*(tr|sp)\\|(.*)\\|.*", "\\2", gene);
    if(ID1) gene <- c(seprow1[2],gene[-pos]) else  gene <- c(gene[pos],gene[-pos]);#220505
    gene <- c(status, gene[1], ENTRY, gene[-1]);
    if (ncol >= length(gene))
      inf_matrix[i, 1:length(gene)] <- gene else {
        inf_matrix[i,] <- gene[1:ncol];
        if(verbose > 0) warning (paste("The", i, "proteins have more than",
                                       ncol, "protein ID"));
        }
  }
  inf_matrix
}

################################
#need install blast+   blast.path="e:/blast/ncbi-blast-2.7.1+/bin/"
#out.folder and db1.path should be in the same folder path
#path should have no special character
#db1.path,db2.path,out.folder are both need the complete path
#input should have colname: ori.ID,ENTRY.NAME  output have 4 cols:ori.ID,ENTRY.NAME,new.ID,match.type
#db1.path is transfered species fasta file, db2.path is original species fasta file
#blast+ download website:ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
#blast+ version need 2.7.1
#evalue lower means more rigorous;verbose:0 no verbose:1 have
#190601 reset the default working directory
#190605 add alternative solution without use blast+
#200602 fix a bug in step 2
#220505 add GN info

ID_match <- function(data, db1.path = NULL, db2.path = NULL,
                     out.folder = NULL, blast.path = NULL,
                     evalue = 0.1, verbose = 1)
{

  question <- function (...)#190605
  {
    yes <- c("Yes", "Definitely", "Sure",
               "I agree", "Absolutely")
    no <- c("No", "Nope", "Not Sure")
    cat(paste0(..., collapse = ""))
    qs <- c(sample(yes, 1), sample(no, 1))
    menu(qs) == 1
  }
  #extract local database
  if (!requireNamespace("Biostrings", quietly = TRUE)) {
    stop("Biostrings in Bioconductor needed for this function to work. Please install it.",
         call. = FALSE)
  }
  old.wd <- getwd(); on.exit(setwd(old.wd)); #190601
  if (is.null(db1.path)) stop ("db1.path is null.");
  if (is.null(db2.path)) stop ("db2.path is null.");
  if (is.null(out.folder)) stop ("out.folder path is null.");
  if (!dir.exists(out.folder)) dir.create(out.folder);
  if (!file.exists(db2.path)) stop ("db2.path is wrong.");
  if (!file.exists(db1.path)) stop ("db1.path is wrong.");
  db1.folder <- gsub("(.*/+).*","\\1", db1.path);
  db1.folder <- c(db1.folder, gsub("(.*)/","\\1", db1.folder));
  #db1.folder <- gsub(paste0("(",out.folder,").*"), "\\1", db1.path);
  if (all(out.folder != db1.folder)) stop ("db1.path is not in the out.folder");
  Alignment = FALSE; #190605
  Continue = TRUE; #200626
  #190605
  if (is.null(blast.path)) {
    message("blast.path is null.\n Use pairwiseAlignment function, it will take lots of time.");
    Alignment <- TRUE; #200626 remove the question function
    #cat("blast.path is null.\n");
  } else if (!dir.exists(blast.path)) {
    message("blast.path is not exists.\n");
    #cat("blast.path is not exists.\n");
    if (question("Do you want to do ID_match without using blast+ software?"))
      {if(question("It will take large amouts of time. Are you sure?"))
        Alignment <- TRUE else stop("Please give the correct blast.path and re-run the function.");
    } else { if (question("Continue to run?"))
      Continue = FALSE else stop("Please give the correct blast.path and re-run the function.");
    }#200626
  }#190605
  my_sequences <- .uniprot_database(input_file = db1.path);
  if(inherits(my_sequences,"try-error"))#220516 #190605 class(my_sequences) == "try-error"
    stop("db1.path file is not the correct format.") else{ #190605
      database1 <- my_sequences$pro_info;
      database1_seq <- my_sequences$pro_seq;
      if (verbose > 0) message("db1 extraction is completed")
    }
  my_sequences <- .uniprot_database(input_file = db2.path);
  if(inherits(my_sequences,"try-error")) #220516 #190605
    stop("db1.path file is not the correct format.") else { #190605
      database2 <- my_sequences$pro_info;
      database2_seq <- my_sequences$pro_seq;
      if (verbose > 0) message("db2 extraction is completed")
    }
  #190605 add error information
  if (sum(colnames(data) %in% c("ori.ID")) * sum(colnames(data) %in% c("ENTRY.NAME")) != 1 )
    stop("data should have column: ori.ID and ENTRY.NAME.")


  #1.ENTRY.NAME match between data and database1, no matched IDs saved in datano1,match.type is 1
  #delete species information in ENTRY.NAME.
  protein_name_despecies <- function(data){
    pro_name <- gsub("(.*)_.*", "\\1", as.character(data$ENTRY.NAME), perl = TRUE);
    data$ENTRY.NAME <- pro_name;
    data;
  }
  #data database1 despecies
  data_despecies <- protein_name_despecies(data);
  database1_despecies <- protein_name_despecies(database1);
  #order is ENTRY.NAME,ori.ID,new.ID
  data_IDmatch <- merge(data_despecies[ , c('ori.ID', 'ENTRY.NAME')],
                        database1_despecies[ , c('Uniprot.ID', 'ENTRY.NAME',"GN")],
                      by='ENTRY.NAME',all.x=TRUE,sort = FALSE);#20220505 add GN info
  names(data_IDmatch)[names(data_IDmatch) == 'Uniprot.ID'] <- 'new.ID';
  #match.type is 1
  match.type <- "1";
  data_IDmatch <- data.frame(data_IDmatch, match.type, stringsAsFactors = FALSE);
  #data_IDmatch$match.type[is.na(data_IDmatch$new.ID)]<-NA;
  data_IDmatch <- merge(data[ , c('ori.ID', 'ENTRY.NAME')],
                      data_IDmatch[ ,-1],
                      by = 'ori.ID', sort = FALSE);
  #data_IDmatch$new.ID<-as.character(data_IDmatch$new.ID);#factor to character
  datano1 <- data_IDmatch[is.na(data_IDmatch$new.ID), ];
  if (verbose > 0) message(paste0("ENTRY.NAME match is completed, ", length(na.omit(data_IDmatch$new.ID))," IDs are matched. Match.type is 1."))#220505

  #2.gn match, no matched IDs in datano2, match.type is 2
  if (nrow(datano1) != 0){
    #add GN information
    data_withgn <- merge(datano1[,-4], database2[ ,c('Uniprot.ID', 'GN')],
                       by.x = 'ori.ID',by.y = 'Uniprot.ID',sort = FALSE);#220505 add GN
    nogn <- which(data_withgn$GN == "") #181210
    if(length(nogn) != 0 ) data_withgn <- data_withgn[-nogn, ] #181210 #200602
    # extract GN information
    data_GN <- gsub("(.*)","^\\1", as.character(data_withgn$GN), perl = TRUE);
    #limitation maxium matched protein ID number is 14
    GN <- matrix(nrow = length(data_GN), ncol = 14);
    ID <- NULL;
    #GN match: if no match, delete the last character and match again
    #delete at most 4 characters, if match more than 1 ID, do pairwise
    for (j in 1:length(data_GN))
    {
      gn <- NULL;
      ##maxium delete character:4
      data_GN[j] <- gsub("\\(|\\)|\\.|\\*|\\[|\\]|\\{|\\}|/","",data_GN[j]) #210528 remove regexp meta char
      max <- nchar(data_GN[j],type = "width"); #210528 fixed        #max <- nchar(data_GN[j]);
      for (i in 1:5)
      {
        ##match first, if no then delete character
        if (length(gn) == 0){
          data_GN[j] <- substr(data_GN[j], start = 1, stop = max+1-i);
          gn <- grep(data_GN[j], database1$GN,
                     ignore.case = TRUE, perl = TRUE, value = TRUE);
          ##if delete 4 character and no match,then gn<-NA
          if (i == 5 && length(gn) == 0) gn <- NA;
          if (i == 5 && length(gn) > 14) gn <- NA;
        }
        ##if match, how many ID have been matched.
        else{
          if (length(gn) > 14) gn<-NA;
          break;
        }
      }
      GN[j, 1:length(gn)] <- gn;
      id <- as.character(database1$Uniprot.ID[database1$GN %in% gn]);
      ##if ID# more than 1, pairwise
      if (length(id) != 1 && length(id) != 0){
        similar_seq <- as.character(database1_seq[names(database1_seq) %in% id]);
        name <- data_withgn$ori.ID[j];
        ori_seq <- as.character(database2_seq[names(database2_seq) == name]);
        #remove the unregular amino acid
        aa <- unlist(strsplit(unname(ori_seq),"")); #190605 #200626 use base::strsplit instead of Biostrings::strsplit
        aa <- aa[aa %in% c("A","C","D","E","F","G","H","I",
                           "K","L","M","N","P","Q","R","S",
                           "T","V","W","Y")]; #190605
        aa <- paste0(aa,collapse = "");   #190605
        ori_seq <- aa;  #190605
        scorem <- 0;
        type2_results <- list();
        for (k in 1:length(similar_seq)) {#190605
          pwalign <- try(Biostrings::pairwiseAlignment(as.character(ori_seq),
                                                       as.character(similar_seq[k]),
                                                       type = "overlap",
                                                       substitutionMatrix = "BLOSUM62",
                                                       gapOpening = 9.5,
                                                       gapExtension = 0.5,
                                                       scoreOnly = TRUE), silent = TRUE);
          if (inherits(pwalign,"try-error")) {#220516 class(pwalign) == "try-error"
            aa <- unlist(strsplit(unname( as.character(similar_seq[k]) ),"")); #200626
            aa <- aa[aa %in% c("A","C","D","E","F","G","H","I",
                               "K","L","M","N","P","Q","R","S",
                               "T","V","W","Y")];
            aa <- paste0(aa,collapse = "")
            align_seq <- aa;
            pwalign <- try(Biostrings::pairwiseAlignment(as.character(ori_seq),
                                                         align_seq,
                                                         type = "overlap",
                                                         substitutionMatrix = "BLOSUM62",
                                                         gapOpening = 9.5,
                                                         gapExtension = 0.5,
                                                         scoreOnly = TRUE), silent = TRUE);
            if (inherits(pwalign,"try-error"))#220516
            {pwalign = 0;
            warning("Please email the warning to the author. Thank you!")
            }
          }
          scorem[k] <- pwalign;
        }#190605
        type2_results[[name]] <- data.frame(rep(name, length(similar_seq)),
                                            similar_seq,
                                            scorem);
        ID[j] <- id[which.max(scorem)];
      }
      ##if ID# is 1 then record
      else if(length(id) != 0) ID[j] <- id
      else {id <- NA; ID[j] <- id;}
    }
    #no matched ID in datano2
    new.ID <- ID;
    match.type <- rep('2', length(new.ID));#181210
    data_withgn <- data.frame(data_withgn[,c(-3,-4,-5)],
                              new.ID,
                              GN = data_withgn$GN,
                              match.type,
                              stringsAsFactors = FALSE);#181210 #220505 add GN
    datano1[datano1$ori.ID %in% data_withgn$ori.ID, ] <- data_withgn;#181210
    #match.type<-rep('2',length(datano1$ori.ID)); #181210
    #datano1<-data.frame(datano1[,c(-3,-4)],new.ID,match.type,stringsAsFactors = FALSE);#181210
    datano1$match.type[is.na(datano1$new.ID)] <- NA;
    datano2 <- datano1[is.na(datano1$new.ID), ];
    if(verbose > 0) message(paste0("GN similar match is completed, ", length(na.omit(new.ID))," IDs are matched. Match.type is 2."))#220505

    data_IDmatch[is.na(data_IDmatch$new.ID), ] <- datano1;
    #3.blast match.type is 3
    #test#if(length(datano2$ori.ID)==length(data$ori.ID)){
    if (length(datano2$ori.ID) != 0) {
      ID <- NULL;
      data_ori.ID <- as.character(datano2$ori.ID);
      if (!Alignment & Continue) { #190605
        log.fil <- file.path(out.folder, "log.txt");
        db.fil <- file.path(out.folder, "blastDB");
        command <- paste("makeblastdb -logfile", log.fil,
                         "-dbtype prot -out", db.fil, "-in", db1.path);
        setwd(blast.path)
        system(command)
        Continue <- TRUE; #190605
        #190605
        if ( !file.exists(log.fil) ) {
          warning("blast.path have no blast+ software.\n");
          Continue <- FALSE;
          if (question("Do you want to do ID_match without using blast+ software?"))
            if(question("It will take large amouts of time. Are you sure?"))
              Alignment <- TRUE;
        } else if (!file.exists(paste0(db.fil,".psq"))) {
          file.remove( log.fil, paste( log.fil, ".perf", sep=""));
          warning("blast+ software cannot extract db1.bath information.\n")
          #cat("Please email the error to author. Thank you!\n");
          #cat("The author Email is liukefu19@163.com\n")
          stop("Please email the error to author. Thank you!\n The author Email is liukefu19@163.com");
        }
        if (Continue) {
          type3_results <- list();
          for (j in 1:length(data_ori.ID)) {
            name <- data_ori.ID[j];
            ori_seq <- as.character(database2_seq[names(database2_seq) == name]);
            write(ori_seq, file = file.path(out.folder, "input.fasta"));
            input <- paste( "-query ", file.path(out.folder, "input.fasta"), sep = "" );
            dbase <- paste( "-db ", db.fil, sep = "" );
            output <- paste( "-out ",
                             file.path( out.folder, "blast_result.txt" ),
                             sep = "" );
            # command <- paste( "blastp -matrix BLOSUM45 -evalue", 0.001, "-num_threads", 1,
            #                  "-outfmt 6", input, dbase, output )
            command <- paste( "blastp -evalue", evalue,
                              "-outfmt 6", input, dbase, output );
            system(command)
            if(length(scan(file.path(out.folder, "blast_result.txt"),
                           what = character(), quiet = TRUE))== 0)
              ID[j] <- NA
            else{
              similar_seq <- read.table(file.path( out.folder, "blast_result.txt" ));
              similar_seq[,1] <- data_ori.ID[j];
              type3_results[[name]] <- similar_seq;
              similar_ID <- as.character(similar_seq[1, 2]);
              similar_ID <- unlist(strsplit(similar_ID, split = "\\|"));
              ID[j] <- similar_ID[2];
            }
          }
          file.remove( paste( db.fil, ".pin", sep="" ) );
          file.remove( paste( db.fil, ".phr", sep="" ) );
          file.remove( paste( db.fil, ".psq", sep="" ) );
          file.remove( log.fil, paste( log.fil, ".perf", sep=""));
          file.remove(file.path( out.folder, "input.fasta"),
                      file.path( out.folder, "blast_result.txt"));
        }
      }#190605
      #190605
      if (Alignment) {
        Continue = FALSE; #200626
        type3_results <- list();
        for (j in 1:length(data_ori.ID)) {
          name <- data_ori.ID[j];
          ori_seq <- as.character(database2_seq[names(database2_seq) == name]);
          aa <- unlist(strsplit(unname(ori_seq),"")); #200626
          aa <- aa[aa %in% c("A","C","D","E","F","G","H","I",
                             "K","L","M","N","P","Q","R","S",
                             "T","V","W","Y")];
          aa <- paste0(aa,collapse = "");
          ori_seq <- aa;
          scorem <- 0;
          for (k in 1:length(database1_seq)) {
            pwalign <- try(Biostrings::pairwiseAlignment(as.character(ori_seq),
                                                         as.character(database1_seq[k]),
                                                         type = "overlap",
                                                         substitutionMatrix = "BLOSUM62",
                                                         gapOpening = 9.5,
                                                         gapExtension = 0.5,
                                                         scoreOnly = TRUE), silent = TRUE);
            if (inherits(pwalign,"try-error")) {#220516
              aa <- unlist(strsplit(unname(as.character(database1_seq[k])),"")); #200626
              aa <- aa[aa %in% c("A","C","D","E","F","G","H","I",
                                 "K","L","M","N","P","Q","R","S",
                                 "T","V","W","Y")];
              aa <- paste0(aa,collapse = "")
              align_seq <- aa;
              pwalign <- try(Biostrings::pairwiseAlignment(as.character(ori_seq),
                                                           align_seq,
                                                           type = "overlap",
                                                           substitutionMatrix = "BLOSUM62",
                                                           gapOpening = 9.5,
                                                           gapExtension = 0.5,
                                                           scoreOnly = TRUE), silent = TRUE);
              if (inherits(pwalign,"try-error"))#220516
              {pwalign = 0;
              warning("Please email the warning to the author. Thank you!")
              }
            }
            scorem[k] <- pwalign;
          }
          ID[j] <- names(database1_seq)[which.max(scorem)];
        }
      }
      if (Alignment | Continue) { #190605
        new.ID <- ID;
        GN3 <- database1$GN[match(new.ID,database1$Uniprot.ID)];#220505 add GN
        match.type <- "3";
        datano2 <- data.frame(datano2[,c(-3,-4,-5)],
                              new.ID,
                              GN = GN3,
                              match.type,
                              stringsAsFactors = FALSE);#220505 add GN
        #
        data_IDmatch[is.na(data_IDmatch$new.ID), ] <- datano2;
        if (verbose > 0) message(paste0("seq blast is completed, ", length(na.omit(new.ID))," IDs are matched. Match.type is 3.")) #220505
      } else message("match.type 3 is not run.")#190605
    }
  }
  #220505 add message about how many ID are matched.
  datano3 <- data_IDmatch[is.na(data_IDmatch$new.ID), ];
  if(nrow(datano3) == 0 & verbose > 0)
    message("ID match step is finished.\n all IDs are matched.") else if (verbose > 0) {
      datano3$GN <- data_withgn$GN[match(datano3$ori.ID,data_withgn$ori.ID)];
      datano3$match.type <- ""
      data_IDmatch[is.na(data_IDmatch$new.ID), ] <- datano3;
      message(paste0("ID match step is finished.\n", length(na.omit(data_IDmatch$new.ID))," IDs are matched. ", nrow(datano3)," IDs are not matched."))#220505
    }

  data_IDmatch
}



#Uniprot database fasta file(type2 suited for human, mouse, Monkey )
##version2.0 no factor format
#190605 add error when the file is not correct.
.uniprot_database <- function(input_file, type = 1) {
  if (!requireNamespace("Biostrings", quietly = TRUE)) {
    stop("Biostrings in Bioconductor needed for this function to work. Please install it.",
         call. = FALSE)
  }
  # read fasta file
  my_fasta <- try(Biostrings::readAAStringSet(input_file), silent = TRUE); #190605
  if(!inherits(my_fasta,"try-error")) { #220516 #190605 class(my_fasta) != "try-error"
    protein_inf <- names(my_fasta);
    #seperate information based by "|"
    if (type == 1) {
      #protein_inf<-unlist(strsplit(protein_inf,split="_HUMAN.*|_MOUSE.*|_MACMU.*|_MACFA.*"))
      protein_inf <- unlist( strsplit( protein_inf, split = "_.*"))
      protein_inf <- data.frame(matrix( unlist( strsplit( protein_inf, split="\\|")),
                                        ncol = 3,byrow = TRUE),
                                stringsAsFactors = FALSE)
      names(protein_inf)<- c("status", "Uniprot.ID", "ENTRY-NAME");
      GN <- gsub(".*GN=(.*) PE=.*", "\\1",
                 as.character(names(my_fasta)), perl = TRUE)
      GN <- gsub(".*\\|.*", "", GN, perl = TRUE)
      protein_inf<-data.frame(protein_inf,GN,stringsAsFactors = FALSE);
    }
    if(type == 2){
      protein_inf <- data.frame(matrix(unlist(strsplit(protein_inf, split = "\\|")),
                                       ncol = 3,byrow = TRUE),
                                stringsAsFactors = FALSE);
      names(protein_inf) <- c("status", "Uniprot.ID", "ENTRY-NAME");
      #extract entryname and GN
      entryname <- gsub("(.*_HUMAN).*|(.*_MOUSE).*|(.*_MACMU).*|(.*_MACFA).*",
                        "\\1\\2", as.character(protein_inf[,3]), perl = TRUE);
      GN <- gsub(".*GN=(.*) PE=.*", "\\1",
                 as.character(protein_inf[ ,3]), perl = TRUE);
      protein_inf[ ,3] <- entryname;
      protein_inf <- data.frame(protein_inf, GN, stringsAsFactors = FALSE);
    }
    my_id_sequence <- Biostrings::readAAStringSet(input_file);
    names(my_id_sequence) <- as.character(protein_inf$Uniprot.ID);
    list(pro_info = protein_inf, pro_seq = my_id_sequence);
  } else my_fasta; #190605
}

Try the DDPNA package in your browser

Any scripts or data that you put into this service are public.

DDPNA documentation built on May 17, 2022, 5:05 p.m.