In julia-wrobel/DepLabData: data

Writing out identifier lists for additional organisms (from the ID-mapping tab files downloaded from the UniProt FTP site).

library(data.table)

# Directory that receives the per-organism ID lists.
path <- "/Users/nickgiangreco/GitHub/Proteomic_Correlation_Shiny/DepLab/inst/extdata/ID_lists/"

# UniProt ID-mapping dumps to process. The element name is the organism label
# used for the output file (<label>.id.txt); the value is the downloaded dump.
idmapping_files <- c(
  mouse = "~/Downloads/MOUSE_10090_idmapping.dat",
  rat   = "~/Downloads/RAT_10116_idmapping.dat",
  worm  = "~/Downloads/CAEEL_6239_idmapping.dat"
)

# Every organism gets identical treatment: read the three-column dump, keep
# the "Gene_Name" rows, and write the first and third columns (named `uni`
# and `name` below) as a headerless, unquoted, tab-separated file.
for (organism in names(idmapping_files)) {
  tab <- fread(idmapping_files[[organism]], sep = "\t", header = FALSE)
  setnames(tab, c("uni", "id", "name"))

  subtab <- tab[id == "Gene_Name"]

  # Select by column name with `with = FALSE`: unlike the positional
  # `subtab[, c(1, 3)]`, this does not depend on the data.table version
  # (older releases evaluate a numeric `j` as a plain vector).
  subsubtab <- subtab[, c("uni", "name"), with = FALSE]

  write.table(subsubtab, paste0(path, organism, ".id.txt"),
              quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
}

Tabulating the distribution of the first character of the IDs for each organism.

# Directory holding the per-organism ID lists (<organism>.id.txt).
path <- "/Users/nickgiangreco/GitHub/Proteomic_Correlation_Shiny/DepLab/inst/extdata/ID_lists/"

# Full path of the ID list belonging to one organism label.
id_file <- function(organism) paste0(path, organism, ".id.txt")

# First character of every entry in the first column of an ID table.
first_char <- function(ids) substr(ids[[1]], 1, 1)

# NOTE(review): read.table() is used with its default quote/comment handling;
# if any list contains ', " or # in a field, `quote = ""` and
# `comment.char = ""` may be needed -- verify against the files.

# human
h <- read.table(id_file("human"), sep = "\t")
fchar <- first_char(h)
table(fchar)

# yeast
y <- read.table(id_file("yeast"), sep = "\t")
fchar <- first_char(y)
table(fchar)

# mouse
m <- read.table(id_file("mouse"), sep = "\t")
fchar <- first_char(m)
table(fchar)

# rat
r <- read.table(id_file("rat"), sep = "\t")
fchar <- first_char(r)
table(fchar)

# worm
w <- read.table(id_file("worm"), sep = "\t")
fchar <- first_char(w)
table(fchar)

Implementation of support for other organisms in the cleaning function (`cleaning_MQ`).

#' Filter the rows of a MaxQuant protein table
#'
#' Successively subsets `mq.df` on its `Protein.IDs` column. Because every
#' filter is applied to the result of the previous one, combining options can
#' leave nothing behind (a warning is raised when the result is empty).
#'
#' @param mq.df A data frame with a `Protein.IDs` column (presumably the
#'   output of `reading_MQ()`; verify against the caller).
#' @param remove.contaminants If `TRUE`, drop rows whose `Protein.IDs`
#'   contain "CON".
#' @param remove.decoys If `TRUE`, drop rows whose `Protein.IDs` contain
#'   "REV".
#' @param poi Organism selector or `NULL` (no organism filter). `"yeast"`
#'   keeps rows whose `Protein.IDs` start with Y or Q; any other value keeps
#'   rows containing a UniProt-style accession.
#' @param spikeIn `NULL`, `"trypsin"` (keeps the contaminant entry P00761),
#'   or a character vector of IDs accepted by `check_nomenclature()`. Cannot
#'   be combined with `poi`.
#' @return The filtered data frame.
cleaning_MQ <- function(mq.df, remove.contaminants = TRUE,
                        remove.decoys = TRUE,
                        poi = NULL, spikeIn = NULL){
  # Currently, this function will subset the data.frame more and more, thus
  # multiple filtering options may clash. E.g., if the data.frame is already
  # filtered to only contain trypsin-related entries, it will most likely not
  # find anything related to a Uniprot search for non-trypsin proteins.
  if (nrow(mq.df) == 0) {
    warning("The input to cleaning_MQ is empty.")
  }

  mq.out <- mq.df

  if(!remove.contaminants && !remove.decoys && is.null(poi) && is.null(spikeIn)){
    warning("Note that none of the offered filtering options is set. 
            The in-going data frame should be the same as the out-going one.")
  }

  if(remove.contaminants){
    mq.out <- subset(mq.out, !grepl("CON", mq.out$Protein.IDs))
  }

  if(remove.decoys){
    mq.out <- subset(mq.out, !grepl("REV", mq.out$Protein.IDs))
  }

  # organism-specific filtering
  if(!is.null(poi)){

    if(poi == "yeast"){

      mq.out <- subset(mq.out, grepl("^[YQ]+", Protein.IDs))

    }else{

      # every non-yeast organism is matched through its UniProt accession;
      # the massive regex in the middle is from TrEMBL (http://www.uniprot.org/help/accession_numbers)
      mq.out <- subset(mq.out, grepl("([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})", Protein.IDs))

    }
  }

  # extracting spiked-in proteins
  if(!is.null(spikeIn)){

    if(!is.null(poi)){
      stop("The option to retrieve spike-in entries is not compatible with
           retrieving yeast or human entries. Set `poi = NULL`.")
    }

    if(remove.contaminants){
      warning("Extracting the results for spike-ins while at the same time 
              removing contaminants will probably not yield the desired results
              (if anything). Recommended settings: remove.contaminants = FALSE,
              remove.decoys = TRUE, poi = NULL")
    }

    if(all(spikeIn == "trypsin")){

      # trypsin is looked up via accession P00761 among the contaminant entries
      mq.out <- subset(mq.out, grepl("P00761$|P00761[^a-zA-Z0-9]", Protein.IDs) &
                         grepl("CON", Protein.IDs))
    }else{
      ID.check <- check_nomenclature(spikeIn)
      if(!all(ID.check)){stop("The ID(s) you supplied to `spikeIn =` do(es) 
            not meet the UniProt or yeast gene nomenclature criteria.")}
      # match each ID either at the end of the field or followed by a
      # non-alphanumeric character, so that longer IDs sharing the prefix
      # are not picked up
      reg.1 <- paste(paste0(spikeIn, "$"), collapse = "|")
      reg.2 <- paste(paste0(spikeIn, "[^a-zA-Z0-9]"), collapse = "|")
      reg.combi <- paste(reg.1, reg.2, sep = "|")
      mq.out <- subset(mq.out, grepl(reg.combi, Protein.IDs))
    }
  }

  # done cleaning
  if (nrow(mq.out) == 0) {
    warning("None of the entries in the MaxQuant output survived the cleaning. 
            Check that you selected the correct organism for the data that you uploaded.")
  }

  mq.out
}

# Example: read the human test table installed with the DepLab package, then
# clean it with contaminant and decoy removal and the "human" organism filter.
mq.h <- reading_MQ(
  system.file("extdata", "test_data", "proteinGroups_human.txt",
              package = "DepLab")
)
mq.h.clean <- cleaning_MQ(
  mq.h,
  remove.contaminants = TRUE,
  remove.decoys = TRUE,
  poi = "human"
)