library("data.table",quietly=TRUE,verbose=FALSE)
#Read big files
library("Biostrings",quietly=TRUE,verbose=FALSE)
#Read fasta files
library("limma",quietly=TRUE,verbose=FALSE)
#make some pretty Venn Diagram
library("ggplot2",quietly=TRUE,verbose=FALSE)
#make some pretty plot
library("viridis",quietly=TRUE,verbose=FALSE)
#make pretty intersection genomic range
library("GenomicRanges",quietly=TRUE,verbose=FALSE)
#Pretty colors
library("DT",quietly=TRUE,verbose=FALSE)
#make some fancy HTML table for HTML output
library("treemap")
#make some graph
library("DiagrammeR")
#make some histogram
library("plotly")

library("knitr",quietly=TRUE,verbose=FALSE)
library("kableExtra",quietly=TRUE,verbose=FALSE)

ProteinBank<-readAAStringSet(params$ProteinBank)
#The file that have been used for the mass spec search
Pep<-fread(params$Pep)
#The peptide Search Using MaxQuant 
Rescue<-read.table(params$Rescue,sep="\t",header=TRUE,quote="\"")
#The fasta file of the MrnaEntries that did not have a perfect match in the Whole Uniprot

SEQUENCES_NAMES<-names(ProteinBank)
#Name of the fasta entries in UNIPROT
#cat(params$RefRegexp,"\n")
#cat(params$mRNARegexp,"\n")
#Just to check out the regexp and how it will be interpreted by R
UNIPROT_BOOL<-intersect(
    grep(",Blast=",
         SEQUENCES_NAMES,
         invert=TRUE),
    grep(params$RefRegexp,
         SEQUENCES_NAMES)
  )
#The References names in the Used protein bank
STRINGTIE_BOOL<-grep(params$mRNARegexp,SEQUENCES_NAMES)
#The Mrna names in the Used protein bank

#Classic Venn Diagram trick
#All the names 2 columns, vennCounts, Diagram 
SequencesMatrix<-matrix(0,
  ncol=2,
  nrow=length(SEQUENCES_NAMES)
)
rownames(SequencesMatrix)<-SEQUENCES_NAMES
colnames(SequencesMatrix)<-c("Reference","mRNA")
SequencesMatrix[UNIPROT_BOOL,"Reference"]<-1
SequencesMatrix[STRINGTIE_BOOL,"mRNA"]<-1
SequencesCounts<-vennCounts(SequencesMatrix)


# protein names that came from mRNA 
PROTEINBANK_MRNA_NAMES<-names(ProteinBank)[grep(params$mRNARegexp,
                                                     names(ProteinBank))]
# protein names that came from Swissprot
# i.e. either swissprot canonical that are unperfectly present or isoform that had a blast HIT
SWISSPROT.UNMATCHED<-names(ProteinBank)[
  grep(
    "^sp",
    names(ProteinBank)
    )
  ]
# Names of the mRNA that have a perfect match with an entry in UNIPROT
PerfectMatch<-PROTEINBANK_MRNA_NAMES[grep(",UNIPROT=",
                                              PROTEINBANK_MRNA_NAMES)]
# Names of the mRNA that have a perfect match with an entry in UNIPROT
# that is not a TREMBL
# or and swissprot isoform
PerfectMatch.SP.Canonical<-PerfectMatch[intersect(grep("-[0-9]+\\|",
                                                       PerfectMatch,
                                                       invert=TRUE),
                                                  grep("tr\\|",
                                                       PerfectMatch,
                                                       invert=TRUE))]
#Isoform perfect match
PerfectMatch.SP.Isoform<-PerfectMatch[intersect(grep("-[0-9]+\\|",
                                                       PerfectMatch),
                                                grep("tr\\|",
                                                       PerfectMatch,
                                                       invert=TRUE))]
#TrEMBL perfect match
PerfectMatch.TR<-PerfectMatch[grep("tr\\|",PerfectMatch)]

#names of mRNA that do not have a perfect match but a blast hit in UNIPROT
BLAST.Hits<-PROTEINBANK_MRNA_NAMES[grep(",Blast=",
                                              PROTEINBANK_MRNA_NAMES)]
BLAST.Hits.SP.Canonical<-BLAST.Hits[intersect(grep("-[0-9]+\\|",
                                                       BLAST.Hits,
                                                       invert=TRUE),
                                                  grep(",Blast=tr\\|",
                                                       BLAST.Hits,
                                                       invert=TRUE))]
BLAST.Hits.SP.Isoform<-BLAST.Hits[grep("-[0-9]+\\|",
                                       BLAST.Hits)]
BLAST.Hits.TR<-BLAST.Hits[grep(",Blast=tr\\|",
                               BLAST.Hits)]

#Those who do not match anything neither in UNIPROT nor BLAST
Unknown<-PROTEINBANK_MRNA_NAMES[intersect(grep(",UNIPROT=",
                                              PROTEINBANK_MRNA_NAMES,
                                              invert=TRUE),
                                         grep(",Blast=",
                                              PROTEINBANK_MRNA_NAMES,
                                              invert=TRUE))]


#Dataframe long format
ProteinBank.Counts<-data.frame(
  Match=c(rep("Perfect match",3),
          rep("Blast hit",3),
          "unknown"),
  Form=c(rep(c("Canonical",
               "Isoform",
               "TrEMBL"),2),
         "unknown"),
  Effectif=c(length(PerfectMatch.SP.Canonical),
             length(PerfectMatch.SP.Isoform),
             length(PerfectMatch.TR),
             length(BLAST.Hits.SP.Canonical),
             length(BLAST.Hits.SP.Isoform),
             length(BLAST.Hits.TR),
             length(Unknown))
)

#The names of the proteins that are related to UNIPROT either associated to an mRNA or reinjected.
UNIPROT_NAMES<-names(ProteinBank)[
    grep(params$RefRegexp,names(ProteinBank))
  ]
UP_MATCH<-grep(",UNIPROT=",UNIPROT_NAMES)
UP_BLAST<-grep(",Blast=",UNIPROT_NAMES)

UP_Access_ID<-unlist(lapply(UNIPROT_NAMES,
                     function(x){
                       unlist(strsplit(
                         x,
                         split="\\|"))[2]
                       }
                     ))
UNIPROT_Presence.Matrix<-matrix(0,
                                nrow=length(UP_Access_ID),
                                ncol=2)
rownames(UNIPROT_Presence.Matrix)<-UP_Access_ID
colnames(UNIPROT_Presence.Matrix)<-c("Match","Blast")
UNIPROT_Presence.Matrix[UP_MATCH,"Match"]<-1
UNIPROT_Presence.Matrix[UP_BLAST,"Blast"]<-1
UNIPROT_Presence.DF<-data.frame(UNIPROT_Presence.Matrix)
UNIPROT_Presence.DF$Uniprot_Entry<-UP_Access_ID
UNIPROT_Presence.collapsed.DF<-aggregate(
  UNIPROT_Presence.DF[
    ,
    c("Match","Blast")
    ],
  by=list(UNIPROT_Presence.DF$Uniprot_Entry),
  FUN=sum)
UNIPROT_Presence.collapsed.counts<-vennCounts(
  UNIPROT_Presence.collapsed.DF[
    ,c("Match","Blast")])

Protein bank file: r params$ProteinBank, obtained from the r params$Type protocol.
Peptide file: r params$Pep, MaxQuant peptide file using the Protein bank file.

Protein bank file {.tabset}

Rationale for the protein bank construction.

The idea is to build a protein bank that is the closest to the mRNA rna-seq results. In this present study we will focus mainly on Isoform. From these isoforms and also UNIPROT knowlegde, we can produce a protein bank composed of:

##
#Info file is supposed to be a 3 columns tsv
# is composed of differents logs spawning accros the whole pipeline
# the first column the flag is "INFO" to be able to grep the line
# the column 2 and 3 called Variable and value are similar to a long data frame 
#

Info<-read.table(params$AllLogs,
                 header=FALSE,
                 sep="\t",
                 col.names = c("FLAG","Variable","value"))

#Some pretty colors
GraphPalette<-viridis(4,alpha=0.6)
mRNAColor<-GraphPalette[1]
UniprotColor<-GraphPalette[2]
ActionColor<-GraphPalette[3]
ProteinBankColor<-GraphPalette[4]

#Get the numbers
N_mRNA<-as.numeric(as.vector(Info[Info$Variable=="mRNA_length","value"]))
N_ORFS<-as.numeric(as.vector(Info[Info$Variable=="All_ORFs_non_disjoint","value"]))
N_no_match<-as.numeric(as.vector(Info[Info$Variable=="LORFs_without_UNIPROT_perfect_match","value"]))
N_UNIPROT<-as.numeric(as.vector(Info[Info$Variable=="UNIPROT_length","value"]))
N_SP_whithout_perfect_match<-as.numeric(as.vector(Info[Info$Variable=="Swissprot_canonical_without_mRNA","value"]))
N_canonical<-as.numeric(as.vector(Info[Info$Variable=="Swissprot","value"]))
N_isoform<-as.numeric(as.vector(Info[Info$Variable=="Isoforms","value"]))
N_TrEMBL<-as.numeric(as.vector(Info[Info$Variable=="TrEMBL","value"]))
N_ProteinBank<-as.numeric(as.vector(Info[Info$Variable=="Protein_Bank","value"]))
N_BLAST_hit<-as.numeric(as.vector(Info[Info$Variable=="Blast_Hits","value"]))
N_unknown<-length(Unknown)
N_match<-length(UP_MATCH)
N_Isoform_TrEBML_BLAST_hit<-as.numeric(as.vector(Info[Info$Variable=="Uniprot_add_suspect","value"]))


#DiagrammeR stuff
# presentation https://rich-iannone.github.io/DiagrammeR/graphviz_and_mermaid.html
# options https://graphviz.gitlab.io/_pages/doc/info/attrs.html
# From what I understood
#  GrViz will interpret some text to produce the graph in a HTML widget
#   - To include the numbers, we make a string with every informations, the numbers are formated to be easier to read
#  shape, fontname, style, fontsize, width, height,fillcolor and label are self explainatory
# overlap : Not so sure but supposed to be nodes overlap
# rankdir : General orientation of the graph LR Left to Right
# splines : how the edges should be drawn
#
# keywords: graph, node, edge
# [] attributes/values "," separated
# nodes are ; separated
# edge are space separated
#  node "From" name -> node "To" name
# 
# Ranks part
# in the end of the graph declaration 
# the ranks are some kind of mapping of the graph
# for example
#    { rank=same; A; B; C; 2; } 
#   means that nodes A B C and 2 share the same Left Right alignement
#
# width 768 to fit the final html file 
# height 432 to keep a 16:9 ratio
#
grViz(paste("
digraph boxes_and_circles {

  # a 'graph' statement
  graph [overlap = true,
        fontsize =24,
        rankdir = LR,
        splines=ortho]

  # several 'node' statements
  node [shape = box,
        fontname = Helvetica,
        style = filled,
        fontsize =24.0]
  E[label=\"mRNA\nn: ",
  formatC(N_mRNA,big.mark=" ",format="d"),
  "\",fillcolor=\"",
  mRNAColor,
  "\",width=2,height=3];
  F[label=\"All ORFs\nn: ",
  formatC(N_ORFS,big.mark=" ",format="d"),
  "\",fillcolor=\"",
  mRNAColor,
  "\",width=2,height=3];
  I[label=\"mRNA\nwithout a ORF\nthat match\nperfectcly\nn: ",
  formatC(N_no_match,big.mark=" ",format="d"),
  "\",fillcolor=\"",
  mRNAColor,
  "\"];
  J[label=\"Longuest\nORF\nper mRNA\nn: ",
  formatC(N_no_match,big.mark=" ",format="d"),
  "\",fillcolor=\"",
  mRNAColor,
  "\"];

  A[label=\"Swissprot\ncanonical\nn: ",
  formatC(N_canonical,big.mark=" ",format="d"),
  "\",fillcolor=\"",
  UniprotColor,
  "\",width=3];
  B[label=\"Swissprot\nisoform\nn: ",
  formatC(N_isoform,big.mark=" ",format="d"),
  "\",fillcolor=\"",UniprotColor,"\",width=3];
  C[label=\"TrEMBL\nn: ",
  formatC(N_TrEMBL,big.mark=" ",format="d"),
  "\",fillcolor=\"",UniprotColor,"\",width=3];
  D[label=\"UNIPROT\nn: ",
  formatC(N_UNIPROT,big.mark=" ",format="d"),
  "\",fillcolor=\"",UniprotColor,"\",width=3,height=3];
  M[label=\"Protein bank\nfor\nMaxQuant\nn: ",
  formatC(N_ProteinBank,big.mark=" ",format="d"),
  "\",fillcolor=\"",ProteinBankColor,"\",height=9]

  K[label=\"C: LORF\nwith\na blast hit\nn: ",
  formatC(N_BLAST_hit,big.mark=" ",format="d"),
  "\",fillcolor=\"",mRNAColor,"\",width=3.5];
  L[label=\"A: Unknown\nLORF\nn: ",
  formatC(N_unknown,big.mark=" ",format="d"),
  "\",fillcolor=\"",mRNAColor,"\",width=3.5];
  H[label=\"D: mRNA\nwith a ORF\nthat\nmatch perfectly\nn: ",
  formatC(N_match,big.mark=" ",format="d"),
  "\",fillcolor=\"",UniprotColor,"\",width=3.5];
  G[label=\"B: Isoform\nTrEmbl\nBlast HIT\nn: ",
  formatC(N_Isoform_TrEBML_BLAST_hit,big.mark=" ",format="d"),
  "\",fillcolor=\"",UniprotColor,"\",width=3.5];
  N[label=\"E: Swissprot Canonical\nwithout\na perfect Match\nn: ",
  formatC(N_SP_whithout_perfect_match,big.mark=" ",format="d"),
  "\",fillcolor=\"",UniprotColor,"\",width=3.5];

  node [shape = plaintext,fillcolor=\"",ActionColor,"\"];
  1[label=\"Concat\nkeep only\nunique\nsequence\",height=3]; 2[label=\"Find ORFs\"];
  3[label=\"Perfect\nMatch\",height=4]; 4[label=\"Select\"]; 5[label=\"BLAST\",height=4]

  # several 'edge' statements
  edge [len=2.0]
  E->2 2->F F->3 
  3->I 4->J J->5 
  I->4 
  A->1 B->1 C->1 1->D 
  D->3 
  D->5 5->K
  5->G 5->L L->M G->M K->M H->M N->M
  3->H
  3->N

  { rank=same; A; B; C; 2; }
  { rank=same; F; 1; }
  { rank=same; D; 3; }
  { rank=same; I;}
  { rank=same; L; K; G; H; N}
}"),width=768,height=432)

Protein bank {.tabset}

Width

Length histogram of the protein of the bank.

#plotly histogram
plot_ly(x =~ width(ProteinBank),
  type="histogram")

Venn diagram

The protein bank file contains r formatC(length(ProteinBank),big.mark=" ",format="d") proteins.
The reference sequences are characterized by the "r paste(params$RefRegexp)" regular expression.
The mRNA sequences are characterized by the "r paste(params$mRNARegexp)" regular expression.

vennDiagram(SequencesCounts,
            main="Perfect match",
            names=c("From UNIPROT","From mRNA"))

mRNA Treemap

The treemap represent the r formatC(length(STRINGTIE_BOOL),big.mark=" ",format="d") proteins from the mRNAs of the protein bank. They are divided in 3 categories:

treemap::treemap(ProteinBank.Counts,
        index=c("Match","Form"),
        vSize = "Effectif",
        vColor="Form",
        type="index")

Uniprot representation

From all the UNIPROT entries in the database, how do they interact with our mRNAs? Do they match perfectly some of our mRNA or do they have a BLAST hit?

vennDiagram(UNIPROT_Presence.collapsed.counts,
            "Uniprot Coverage")

Mass spectrometry {.tabset}

The protein bank obtained from the mRNA pipeline is then used for proteomic analysis. The spectrum have been searched with MaxQuant.

Peptide match

We focus here on the leading razor protein. The leading razor protein is the protein that MaxQuant consider as the most likely when a peptide match several proteinor isoforms, based on the numer of peptide matching the protein.

This is the repartition of the leading razor proteins:

#Get the contaminant
ContamsInMS<-unique(Pep[grep("(CON|REV)__",`Leading razor protein`),`Leading razor protein`])
#Discard the contaminant
ProteinsInMS.NoContam<-unique(Pep[grep("(CON|REV)__",`Leading razor protein`,invert=TRUE),`Leading razor protein`])
#How many
N.ContamsInMS<-length(ContamsInMS)
#The idea is that if the entry is only present in SwissProt it will be featured as the first on the list, it doesn't matter if it match any TR or any oter swissprot isoform
ProteinsInMS.NoContam.SwissprotOnly<-ProteinsInMS.NoContam[grep("^sp\\|",ProteinsInMS.NoContam)]
N.Uniprot<-length(ProteinsInMS.NoContam.SwissprotOnly)

#Leading Razor Protein that match MRNA
ProteinsInMS.NoContam.Mrna<-ProteinsInMS.NoContam[grep("^strngt",ProteinsInMS.NoContam)]

#Of those who matched the mrna how many have a perfect match in the Uniprot database
ProteinsInMS.NoContam.Mrna.Both<-ProteinsInMS.NoContam.Mrna[grep("UNIPROT=",ProteinsInMS.NoContam.Mrna)]
N.Both<-length(ProteinsInMS.NoContam.Mrna.Both)

#Of those who match the Mrna how many do not match anything in UNIPROT 
ProteinsInMS.NoContam.Mrna.MrnaOnly<-ProteinsInMS.NoContam.Mrna[grep("UNIPROT=",ProteinsInMS.NoContam.Mrna,invert=TRUE)]
N.Mrna<-length(ProteinsInMS.NoContam.Mrna.MrnaOnly)

#Some Unpretty representation of our Leading Razon Protein Situation
df <- data.frame(group=factor(c("Contaminants",
    "Only in Uniprot",
    "Both in UNIPROT and MRNA",
    "Only in MRNA"),
    levels=c("Contaminants",
      "Only in Uniprot",
      "Both in UNIPROT and MRNA",
      "Only in MRNA")),
    Effectif=c(N.ContamsInMS,
    N.Uniprot,
    N.Both,
    N.Mrna))
datatable(df,rownames = FALSE)

g<-ggplot(df, aes(x = "", y = Effectif, fill = group)) +
  geom_col(width = 1) +
  scale_fill_manual(values = viridis(4)) +
  labs(title = "Two pass/Leading Razon protein")+
  theme_bw()
print(g)

Leading Razor Protein

Of all the r length(unique(ProteinsInMS.NoContam)), what kind of uniprot entry are they and where do they come from?

Uniprot entry:

Origin:

ProteinsInMS.NoContam.Swissprot<-ProteinsInMS.NoContam[grep("^sp\\|",ProteinsInMS.NoContam)]

ProteinsInMS.NoContam.Swissprot.canonical<-ProteinsInMS.NoContam.Swissprot[
  grep("^sp\\|[^-]*\\|",
       ProteinsInMS.NoContam.Swissprot)]
ProteinsInMS.NoContam.Swissprot.Iso<-ProteinsInMS.NoContam.Swissprot[
  grep("^sp\\|[^-]*\\|",
       ProteinsInMS.NoContam.Swissprot,
       invert=TRUE)]
ProteinsInMS.NoContam.TR<-ProteinsInMS.NoContam[
  grep("^tr\\|",
       ProteinsInMS.NoContam)]



ProteinsInMS.NoContam.mRNA<-ProteinsInMS.NoContam[grep("^st",ProteinsInMS.NoContam)]
ProteinsInMS.NoContam.mRNA.PerfectMatch<-ProteinsInMS.NoContam.mRNA[grep("UNIPROT=",ProteinsInMS.NoContam.mRNA)]
ProteinsInMS.NoContam.mRNA.PerfectMatch.canonical<-ProteinsInMS.NoContam.mRNA.PerfectMatch[
  grep(
    "UNIPROT=sp\\|[^-]*\\|",                                                                  
    ProteinsInMS.NoContam.mRNA.PerfectMatch
       )]
ProteinsInMS.NoContam.mRNA.PerfectMatch.Iso<-ProteinsInMS.NoContam.mRNA.PerfectMatch[
  grep(
    "UNIPROT=sp\\|[^-]*-[^-]*\\|",                                                                  
    ProteinsInMS.NoContam.mRNA.PerfectMatch
       )]
ProteinsInMS.NoContam.mRNA.PerfectMatch.TR<-ProteinsInMS.NoContam.mRNA.PerfectMatch[
  grep(
    "UNIPROT=tr\\|",                                                                  
    ProteinsInMS.NoContam.mRNA.PerfectMatch
       )]


ProteinsInMS.NoContam.mRNA.LookAlike<-ProteinsInMS.NoContam.mRNA[
  grep("Blast=",
       ProteinsInMS.NoContam.mRNA)]
ProteinsInMS.NoContam.mRNA.LookAlike.canonical<-ProteinsInMS.NoContam.mRNA.LookAlike[
  grep(
    "Blast=sp\\|[^-]*\\|",                                                                  
    ProteinsInMS.NoContam.mRNA.LookAlike
       )]
ProteinsInMS.NoContam.mRNA.LookAlike.Iso<-ProteinsInMS.NoContam.mRNA.LookAlike[
  grep(
    "Blast=sp\\|[^-]*-[^-]*\\|",                                                                  
    ProteinsInMS.NoContam.mRNA.LookAlike
       )]
ProteinsInMS.NoContam.mRNA.LookAlike.TR<-ProteinsInMS.NoContam.mRNA.LookAlike[
  grep(
    "Blast=tr\\|",                                                                  
    ProteinsInMS.NoContam.mRNA.LookAlike
       )]

ProteinsInMS.NoContam.mRNAOnly<-ProteinsInMS.NoContam.mRNA[
  intersect(
    grep("UNIPROT=",
         ProteinsInMS.NoContam.mRNA,
         invert=TRUE),
    grep("Blast=",
         ProteinsInMS.NoContam.mRNA,
         invert=TRUE)
  )
]

LeadingRazor.DF<-data.frame(Origin=c(rep("Reinjected",3),
                                     rep("mRNAPerfectMatch",3),
                                     rep("mRNABlast",3),
                                     "mRNA"
                                     ),
                            Database=c(
                              rep(
                                rep(
                                  c("canonical",
                                    "isoform",
                                    "TrEMBL")
                                ),
                                3
                              ),"unknown"),
                            Effectif=rep(
                              0,
                              10
                              )
                            )

LeadingRazor.DF[LeadingRazor.DF$Origin=="Reinjected" &
                  LeadingRazor.DF$Database=="canonical",
                "Effectif"]<-length(ProteinsInMS.NoContam.Swissprot.canonical)
LeadingRazor.DF[LeadingRazor.DF$Origin=="Reinjected" &
                  LeadingRazor.DF$Database=="isoform",
                "Effectif"]<-length(ProteinsInMS.NoContam.Swissprot.Iso)
LeadingRazor.DF[LeadingRazor.DF$Origin=="Reinjected" &
                  LeadingRazor.DF$Database=="TrEMBL",
                "Effectif"]<-length(ProteinsInMS.NoContam.TR)

LeadingRazor.DF[LeadingRazor.DF$Origin=="mRNAPerfectMatch" &
                  LeadingRazor.DF$Database=="canonical",
                "Effectif"]<-length(ProteinsInMS.NoContam.mRNA.PerfectMatch.canonical)
LeadingRazor.DF[LeadingRazor.DF$Origin=="mRNAPerfectMatch" &
                  LeadingRazor.DF$Database=="isoform",
                "Effectif"]<-length(ProteinsInMS.NoContam.mRNA.PerfectMatch.Iso)
LeadingRazor.DF[LeadingRazor.DF$Origin=="mRNAPerfectMatch" &
                  LeadingRazor.DF$Database=="TrEMBL",
                "Effectif"]<-length(ProteinsInMS.NoContam.mRNA.PerfectMatch.TR)

LeadingRazor.DF[LeadingRazor.DF$Origin=="mRNABlast" &
                  LeadingRazor.DF$Database=="canonical",
                "Effectif"]<-length(ProteinsInMS.NoContam.mRNA.LookAlike.canonical)
LeadingRazor.DF[LeadingRazor.DF$Origin=="mRNABlast" &
                  LeadingRazor.DF$Database=="isoform",
                "Effectif"]<-length(ProteinsInMS.NoContam.mRNA.LookAlike.Iso)
LeadingRazor.DF[LeadingRazor.DF$Origin=="mRNABlast" &
                  LeadingRazor.DF$Database=="TrEMBL",
                "Effectif"]<-length(ProteinsInMS.NoContam.mRNA.LookAlike.TR)

LeadingRazor.DF[LeadingRazor.DF$Origin=="mRNA" &
                  LeadingRazor.DF$Database=="unknown",
                "Effectif"]<-length(ProteinsInMS.NoContam.mRNAOnly)
datatable(LeadingRazor.DF,rownames = FALSE)
treemap(LeadingRazor.DF,
        index=c("Database","Origin"),
        vSize = "Effectif",
        type="index",
        align.labels = list(c("center","center"),c("left","top")))

Isoforms

We isolated the leading razor protein identified as isoforms with at least a proteotypic peptide, meaning a peptide that match only one protein.

How to read the table

The name correspond to the Fasta entry in the ProteinBank that is the most parcimonious hypothesis able to explain all listed peptides.

The name field contains a HTML link to the UNIPROT online sequence entry.

The N_Peptides contains the number of peptides that match the Protein.

The table have hidden fields accessible through the green button on the left before each name. It will unhide some supplementary informations about the protein. You can hide these informations through the red button. These supplementary informations consists of:

The MatchProtein contains the names of all the proteins that could have been match with theses peptides.

The N_MatchProtein contains the number of proteins match that could have been match with theses peptides.

The PEPTIDES tables contains all the informations relatives to the peptides.

The SequenceInHTML is a fasta friendly representation of the protein sequence. The red parts are the peptides matchs on the sequence. They might be some overlaping peptides due to misscleavage.

Isoforms<-c(ProteinsInMS.NoContam.mRNA.PerfectMatch.Iso,
ProteinsInMS.NoContam.Swissprot.Iso)
if(length(Isoforms)){
Isoforms.Proteotypic<-Isoforms[Isoforms %in% Pep$Proteins]
Pep.Iso<-data.frame(Pep[Pep$`Leading razor protein` %in% Isoforms.Proteotypic,])
ShortNames<-c("Leading.razor.protein",
              "Proteins",
              "Start.position",
              "End.position",
              "PEP",
              "Sequence")
Pep.Iso.Short<-Pep.Iso[,ShortNames]
colnames(Pep.Iso.Short)[6]<-"PeptideSequence"
ProteinBank.Iso<-ProteinBank[gsub(" .*","",names(ProteinBank)) %in% Isoforms.Proteotypic]
IsoSequencesOfInterest<-ProteinBank[gsub(" .*","",names(ProteinBank)) %in% Isoforms.Proteotypic]
IsoSequencesOfInterest.DF<-data.frame(
  sequence=as.vector(IsoSequencesOfInterest),
  shortName=gsub(" .*",
                 "",
                 names(IsoSequencesOfInterest)),
  Name=names(IsoSequencesOfInterest))
rownames(IsoSequencesOfInterest.DF)<-NULL
Pep.Iso.Short.CoordPart<-aggregate(
    Pep.Iso.Short[,c("Start.position",
                           "End.position",
                           "PEP",
                           "Proteins",
                     "PeptideSequence")],
    by=list(Pep.Iso.Short$Leading.razor.protein),
    function(x){
      paste0(x,
        collapse="/")
    }
  )
  Pep.Iso.data.table<-merge(Pep.Iso.Short.CoordPart,
                                 IsoSequencesOfInterest.DF,
                                 by.x="Group.1",
                                 by.y="shortName")
  Pep.Iso.data.table$Name<-lapply(X = as.vector(Pep.Iso.data.table$Name),
                                  function(x){
                                    Infos<-unlist(strsplit(x,"\\|"))
                                    if(length(Infos)>=3){
                                      LOCAL_UNIPROT_ENTRY<-gsub("-[^-]*","",Infos[2])
                                      return(paste("<a href=\"https://www.uniprot.org/uniprot/",
                                                   LOCAL_UNIPROT_ENTRY,
                                                    "#sequences\" target=\"_blank\">",
                                                   x,
                                                   "</a>",sep=""))

                                    }else{
                                      return(x)
                                    }
                                  })
  Pep.Iso.data.table$SequenceInHTML<-apply(
    Pep.Iso.data.table[,c("sequence",
                          "Start.position",
                          "End.position")],
    1,
    function(x){
      AAs<-unlist(
        strsplit(
          as.vector(
            x["sequence"]
          ),
          split="")
      )
      Starts<-as.numeric(
        unlist(
          strsplit(
            x["Start.position"],
            "/")
        )
      )
      Stops<-as.numeric(
        unlist(
          strsplit(
            x["End.position"],
            "/")
        )
      )
      Intervals<-reduce(IRanges(Starts,Stops))
      AAs[start(Intervals)]<-paste("<font color=red>",
                                   AAs[start(Intervals)],
                                   sep="")
      AAs[end(Intervals)]<-paste(AAs[end(Intervals)],
                                 "</font>",
                                 sep="")

      Blocks<-base::seq(10,length(AAs),10)
      if(length(AAs)>60){
        NewLines<-base::seq(60,length(AAs),60)
        Blocks<-Blocks[!Blocks %in% NewLines]
        AAs[NewLines]<-paste(AAs[NewLines],"</br>",sep="")
      }
      AAs[Blocks]<-paste(AAs[Blocks],"&nbsp;",sep="")
      AAs[1]<-paste("<div style=\"font-family:monospace;font-weight: bold\">",
        AAs[1],
        sep="")
      AAs[length(AAs)]<-paste(AAs[length(AAs)],
        "</div>",
        sep="")
      AAs<-paste(AAs,collapse="")
      return(AAs)
    })
 Pep.Iso.data.table$PEPTIDES<-apply(Pep.Iso.data.table[,
                                    c("Start.position",
                                      "End.position",
                                      "PEP",
                                      "Proteins",
                                      "PeptideSequence")],
                           1,
                           function(x){
                             Starts<-strsplit(x=x["Start.position"],
                                              split="/")
                             End<-strsplit(x=x["End.position"],
                                           split="/")
                             PEP<-strsplit(x=x["PEP"],
                                           split="/")
                             Sequences<-strsplit(x=x["PeptideSequence"],
                                                 split="/")

                             Proteins<-strsplit(x=x["Proteins"],
                                           split="/")
                             Proteins<-lapply(Proteins,
                                              function(y){
                                                y<-gsub(";",
                                                     " ",
                                                     y)
                                                y<-gsub(",[^ ]*",
                                                     "",
                                                     y)
                                              y<-gsub("sp\\||tr\\|",
                                                     "",
                                                     y)

                                                y<-gsub("\\|[^ ]*",
                                                     "",
                                                     y)
                                                return(y)
                                              }
                             )
                             TABLE<-data.frame(Start=Starts,
                                               End=End,
                                               Pep=PEP,
                                               Proteins=Proteins,
                                               Sequences=Sequences)
                             colnames(TABLE)<-c("Start",
                                                "End",
                                                "PEP",
                                                "Proteins",
                                                "Sequences")
                             rownames(TABLE)<-NULL
                             TABLE<-TABLE[
                               order(
                                 as.numeric(
                                   as.vector(
                                   TABLE$Start
                                   )
                                   )

                                 ),]
                             scroll_box(
                               kable_styling(
                                 kable(TABLE,row.names = FALSE)
                               ),
                               fixed_thead = T,
                               height = "150px"
                             )
                           })
 Pep.Iso.data.table$MatchProtein<-lapply(
   Pep.Iso.data.table[,
            "Proteins"],
   function(y){
     y<-gsub("/",
             " ",
             y)
     y<-gsub(";",
             " ",
             y)
     y<-gsub(",[^ ]*",
             "",
             y)
     y<-gsub("sp\\||tr\\|",
             "",
             y)

     y<-gsub("\\|[^ ]*",
             "",
             y)
     y<-unique(unlist(strsplit(y," ")))
     y<-paste(y,collapse = " ")
     return(y)
   }
 )
 Pep.Iso.data.table$N_Peptides<-unlist(
    lapply(Pep.Iso.data.table$Start.position,
      function(x){
        return(length(unlist(strsplit(as.vector(x),"/"))))
      }
    )
  )
 Pep.Iso.data.table$N_MatchProtein<-lapply(
   Pep.Iso.data.table[,
            "Proteins"],
   function(y){
     y<-gsub("/",
             " ",
             y)
     y<-gsub(";",
             " ",
             y)
     y<-unique(unlist(strsplit(y," ")))
     return(length(y))
   }
 )
  Pep.Iso.data.table[,c("sequence",
                          "Start.position",
                          "End.position",
                        "PEP",
                        "Proteins","Group.1")]<-NULL
  Pep.Iso.data.table<-Pep.Iso.data.table[,c("Name",
                                            "N_Peptides",
                                            "MatchProtein",
                                            "N_MatchProtein",
                                            "PEPTIDES",
                                            "SequenceInHTML")]
  Pep.Iso.data.table<-Pep.Iso.data.table[order(Pep.Iso.data.table$N_Peptides,
                                               decreasing = TRUE),]
  DT::datatable(Pep.Iso.data.table,
                rownames=FALSE,
                escape=FALSE,
                extensions = 'Responsive'
                )
  }

Pep score

r formatC(dim(Pep)[1],big.mark=" ") peptides have been identified with r formatC(sum(Pep[,"MS/MS Count"]),big.mark=" ") MS/MS.

 PEP.PEP<-unlist(Pep[,"PEP"])
 names(PEP.PEP)<-NULL
 PEP.PEP[PEP.PEP<=1e-300]<-1e-300
 hist(-log(PEP.PEP,10),
  breaks=seq(0,300,1),
  xlab="-log_10(PEP)",
  ylab="Frequency",
  main="PEP score repartition - histogram")
 plot(ecdf(-log(PEP.PEP,10)),
  xlab="-log_10(PEP)",
  main="PEP score repartition - empirical cumulative distribution")

From mRNAs {.tabset}

This part is focus on the protein of the bank that cames from the mRNA that do not have a perfect match.

Even though they do not have a perfect match in UNIPROT they might have a blast hit on UNIPROT.

In that case the Bold part of the sequence correspond to the matching part of the protein sequence.

If you hover with the mouse cursor over the bold part, the blast informations should appear.

The underlined part corresponds to blast missmatch to the UNIPROT entry or potential SAAV. NB the used genome for generating the protein sequence is the Reference genome and may not reflect the actual polymorphism of our samples.

The yellow highlight part correspond to an insertion to the UNIPROT entry.

The green highlight part correspond to a deletion to the UNIPROT entry. The missing part is between the two highlighted Amino Acid. On hover, the missing part should appear.

The red letters corresponds to the peptide match on the protein.

Peptides and Proteins

  #The Blast File
  #Might have to discard this part about QuerySeqId
  Rescue$QuerySeqId<-gsub(",.*","",Rescue$QuerySeqId)
  #Should be interesting to sort it before discard duplicate
  Rescue<-Rescue[order(Rescue$BitScore,decreasing=TRUE),]
  Rescue<-Rescue[!duplicated(Rescue$QuerySeqId),]

  #Create a data frame for the protein bank
  ProteinBank.DF<-data.frame(Protein=as.vector(ProteinBank),
    Name=names(ProteinBank))
  rownames(ProteinBank.DF)<-NULL
  #Due to some miss used of semi colon We need to handle things to be able to match it
  #Will need to remove it when everything is done
  ProteinBank.DF$Name<-gsub("[ ;].*$","",ProteinBank.DF$Name)
  ProteinBank.DF$Name<-gsub(",.*","",ProteinBank.DF$Name)
  #Smaller set
  Pep.Short<-Pep[,c("Sequence",
    "Leading razor protein",
    "Start position",
    "End position",
    "PEP",
    "Proteins",
    "Sequence")]
  Pep.Short<-data.frame(Pep.Short)
  #Missused of semi colon
  #Will need to remove it when everything is done
  Pep.Short$Leading.razor.protein<-gsub(",.*$",
    "",
    Pep.Short$Leading.razor.protein)
  Pep.Short.NoNA<-Pep.Short[!is.na(Pep.Short$Start.position),]
  Pep.GR<-makeGRangesFromDataFrame(Pep.Short.NoNA,keep.extra.columns = FALSE,
                                   seqnames.field = "Leading.razor.protein",
                                   start.field = "Start.position",
                                   end.field = "End.position")
  Pep.GR<-reduce(Pep.GR)
  #Big data frame
  # The Leading razor protein will be used as peptide identificator
  # 1 line per PSM
  #  - redundance of information about Protein identification
  #ProteinBank.DF$Name<-gsub(",.*","",ProteinBank.DF$Name)
  #Pep.Short$Leading.razor.protein<-gsub(",.*","",Pep.Short$Leading.razor.protein)
  ProteinBank.DF<-merge(ProteinBank.DF,
    Pep.Short,
    by.x = "Name",
    by.y = "Leading.razor.protein",
    sort = FALSE)
  ProteinsInMS.NoContam.Mrna.MrnaOnly.NameInPeptide<-gsub(",.*","",ProteinsInMS.NoContam.Mrna.MrnaOnly)
  # I will focus on the positive input of the proteogenomic approach so
  # I will only need the information reagarding my leading razor protein
  # Of these leading razor protein I just want those who are from the mRNA
  # at this step the data frame is still PSM centered
  MrnaOnlyPeptideBank<-ProteinBank.DF[ProteinBank.DF$Name %in% ProteinsInMS.NoContam.Mrna.MrnaOnly.NameInPeptide,]
  MrnaOnlyPeptideBank<-MrnaOnlyPeptideBank[order(MrnaOnlyPeptideBank$Start.position,MrnaOnlyPeptideBank$Name),]
  #Aggregate everythingby protein
  # 1 line = 1 Protein
  # 1 Protein -> 1,n PSM
  MrnaOnly.NamePart<-unique(MrnaOnlyPeptideBank[,c("Name","Protein")])
  MrnaOnlyPeptideBank$Proteins<-gsub(";Origin=3'",
                                     "",
                                     MrnaOnlyPeptideBank$Proteins)
  if(dim(MrnaOnlyPeptideBank)[1]!=0){
  MrnaOnly.CoordPart<-aggregate(
    MrnaOnlyPeptideBank[,c("Start.position",
                           "End.position",
                           "PEP",
                           "Proteins",
                           "Sequence")],
    by=list(MrnaOnlyPeptideBank$Protein),
    function(x){
      paste0(x,
        collapse="/")
    }
  )

  # Just to known how many peptide match this protein
  MrnaOnly.CoordPart$N_Peptides<-unlist(
    lapply(MrnaOnly.CoordPart$Start.position,
      function(x){
        return(length(unlist(strsplit(as.vector(x),"/"))))
      }
    )
  )
  MrnaOnly<-merge(MrnaOnly.NamePart,
    MrnaOnly.CoordPart,
    by.x="Protein",
    by.y="Group.1")
  #MrnaOnly is at this moment just a digest of the pep file
  #we will add the information concerning the Blast results
  MrnaOnly<-merge(MrnaOnly,
    Rescue,
    by.x="Name",
    by.y="QuerySeqId",
    all.x=TRUE)

  MrnaOnly.Informations<-matrix(FALSE,nrow=length(ProteinsInMS.NoContam.Mrna.MrnaOnly.NameInPeptide),ncol=8)
  rownames(MrnaOnly.Informations)<-as.vector(MrnaOnly$Name)
  colnames(MrnaOnly.Informations)<-c("SAAV",
                                     "Insertion",
                                     "Deletion",
                                     "SAAV_In_Peptide",
                                     "Insertion_In_Peptide",
                                     "Deletion_In_Peptide",
                                     "Peptide_Not_In_Blast",
                                     "Not_In_Blast")

  # Will produce a HTML ready to print fasta sequence
  # 
  #
  #

  #
  #
  #
  #DEBUG part
  #case strngt.5186.1,start=37400,stop=37897,frame=1,flag=LonguestORF
  # case<-"strngt.22916.1,start=29892,stop=30719,frame=2,flag=LonguestORF"
  # MrnaOnly<-MrnaOnly[MrnaOnly$Name==case,]
  # Rescue<-Rescue[Rescue$QuerySeqId==case,]
  # MrnaOnly<-as.matrix(MrnaOnly)
  # x<-as.vector(MrnaOnly[1,])
  # names(x)<-colnames(MrnaOnly)

  MrnaOnly$SequenceInHTML<-apply(MrnaOnly,1,function(x,SpectralGRanges){
      ProteinSequence<-unlist(strsplit(as.vector(x["Protein"]),""))
      Starts<-as.numeric(unlist(strsplit(x["Start.position"],"/")))
      Stops<-as.numeric(unlist(strsplit(x["End.position"],"/")))
      Intervals<-reduce(IRanges(Starts,Stops))


      ProteinSequence[start(Intervals)]<-paste("<font color=red>",
        ProteinSequence[start(Intervals)],
        sep="")
      ProteinSequence[end(Intervals)]<-paste(ProteinSequence[end(Intervals)],
        "</font>",
        sep="")

      if(!is.na(x["SubjectSeqId"])){
        if(sum(c(Starts<as.numeric(as.vector(x["QueryStart"])),
               Stops>as.numeric(as.vector(x["QueryEnd"]))))>0){
                    MrnaOnly.Informations[as.vector(x["Name"]),"Peptide_Not_In_Blast"]<<-TRUE
        }
        QuerySequence<-as.vector(x["QuerySequence"])
        SubjectSequence<-as.vector(x["SubjectSequence"])
        Query<-unlist(strsplit(QuerySequence,""))
        Subject<-unlist(strsplit(SubjectSequence,""))

        DeletionsInQuery<-which(Query=="-")
        if(length(DeletionsInQuery)>0){
          DeletionsInQuery<-reduce(IRanges(DeletionsInQuery,
                                           DeletionsInQuery))
          Deletions.Sequences<-substring(SubjectSequence,
                                         first=start(DeletionsInQuery),
                                         last=end(DeletionsInQuery))
          ShiftInducedByDeletions<-c(0,cumsum(width(DeletionsInQuery)))
          ShiftInducedByDeletions<-ShiftInducedByDeletions[-length(ShiftInducedByDeletions)]
          DeletionsInQuery<-shift(DeletionsInQuery,-ShiftInducedByDeletions)
          EdgesOfTheDeletionsInProtein<-IRanges(start(DeletionsInQuery)-1,
                                                start(DeletionsInQuery))
          EdgesOfTheDeletionsInProtein<-shift(EdgesOfTheDeletionsInProtein,
                as.numeric(as.vector(x["QueryStart"]))-1)
          ProteinSequence[start(EdgesOfTheDeletionsInProtein)]<-paste("<mark style=\"background-color:green;padding:0;border-width:0\" title=\"Deletion: ",
                                                                      ProteinSequence[start(EdgesOfTheDeletionsInProtein)],
                                                                      "|",
                                                                      Deletions.Sequences,
                                                                      "|",
                                                                      ProteinSequence[end(EdgesOfTheDeletionsInProtein)],
                                                                      "->",
                                                                      ProteinSequence[start(EdgesOfTheDeletionsInProtein)], ProteinSequence[end(EdgesOfTheDeletionsInProtein)],
                                                                      "\">", ProteinSequence[start(EdgesOfTheDeletionsInProtein)],
                                                                      sep="")
          ProteinSequence[end(EdgesOfTheDeletionsInProtein)]<-paste(ProteinSequence[end(EdgesOfTheDeletionsInProtein)],
                                                                    "</mark>",
                                                                    sep="")
          #shift(EdgesOfTheDeletionsInProtein,1)
          Deletions.GR<-GRanges(seqnames=as.vector(x["Name"]),
                                ranges = EdgesOfTheDeletionsInProtein)
          MrnaOnly.Informations[as.vector(x["Name"]),"Deletion"]<<-TRUE
          DeletionsInPeptides.GR<-GenomicRanges::intersect(SpectralGRanges,
                                                           Deletions.GR)
          if(length(DeletionsInPeptides.GR)>0 &
             sum(width(DeletionsInPeptides.GR)==2)>=1
             ){
            MrnaOnly.Informations[as.vector(x["Name"]),"Deletion_In_Peptide"]<<-TRUE
          }
          Subject<-Subject[-which(Query=="-")]
          Query<-Query[-which(Query=="-")]
        }

        #
        #

        Insertions<-which(Subject=="-")+as.numeric(as.vector(x["QueryStart"]))-1
        if(length(Insertions)>0){
          Insertions.IR<-reduce(IRanges(Insertions,
                                        Insertions))
          ProteinSequence[start(Insertions.IR)]<-paste("<mark style=\"background-color:yellow;padding:0;border-width:0\" title=\"Insertion\">",
                                                       ProteinSequence[start(Insertions.IR)],
                                                       sep="")
          ProteinSequence[end(Insertions.IR)]<-paste(ProteinSequence[end(Insertions.IR)],
                                                     "</mark>",sep="")
          Subject[which(Subject=="-")]<-Query[which(Subject=="-")]
          Insertions.GR<-GRanges(seqnames=as.vector(x["Name"]),
                                ranges = ranges(Insertions.IR))
          MrnaOnly.Informations[as.vector(x["Name"]),"Insertion"]<<-TRUE
          InsertionInPeptides.GR<-GenomicRanges::intersect(SpectralGRanges,Insertions.GR)
          if(length(InsertionInPeptides.GR)>0){
            MrnaOnly.Informations[as.vector(x["Name"]),"Insertion_In_Peptide"]<<-TRUE
          } 
        } 
        ProteinSequence[as.numeric(as.vector(x["QueryStart"]))]<-paste("<b title=\"",
          as.vector(x["SubjectSeqId"]),
          " From:",as.vector(x["SubjectStart"]),
          " To:",as.vector(x["SubjectEnd"]),
          " Of:",as.vector(x["SubjectLength"]),
          " Mismatch:",as.vector(x["Mismatch"]),
          " Gaps:",as.vector(x["Gaps"]),
          "\">",ProteinSequence[as.numeric(as.vector(x["QueryStart"]))],
          sep="")
        ProteinSequence[as.numeric(as.vector(x["QueryEnd"]))]<-paste(ProteinSequence[as.numeric(as.vector(x["QueryEnd"]))],
        "</b>",
        sep="")
        SubjectNoGap.Sequence<-paste(Subject,collapse="")
      Diffs<-which(Query!=Subject)
      if(length(Diffs)>0){
        Diffs<-reduce(IRanges(Diffs,Diffs))
        Mismatch.Sequences<-substring(SubjectNoGap.Sequence,first=start(Diffs),last=end(Diffs))
        Diffs<-shift(Diffs,as.numeric(as.vector(x["QueryStart"]))-1)
        ProteinSequence[start(Diffs)]<-paste("<u color=red title=\"", Mismatch.Sequences,"\">",ProteinSequence[start(Diffs)],sep="")
        ProteinSequence[end(Diffs)]<-paste(ProteinSequence[end(Diffs)],"</u>",sep="")
        Diffs.GR<-GRanges(seqnames=as.vector(x["Name"]),
                                ranges = ranges(Diffs))
          MrnaOnly.Informations[as.vector(x["Name"]),"SAAV"]<<-TRUE
          DiffsInPeptides.GR<-GenomicRanges::intersect(SpectralGRanges,Diffs.GR)
          if(length(DiffsInPeptides.GR)>0){
            MrnaOnly.Informations[as.vector(x["Name"]),"SAAV_In_Peptide"]<<-TRUE
          }
        }
      }else{
        MrnaOnly.Informations[as.vector(x["Name"]),"Not_In_Blast"]<<-TRUE
      }




      Blocks<-seq(10,length(ProteinSequence),10)
      if(length(ProteinSequence)>60){
        NewLines<-seq(60,length(ProteinSequence),60)
        Blocks<-Blocks[!Blocks %in% NewLines]
        ProteinSequence[NewLines]<-paste(ProteinSequence[NewLines],"</br>",sep="")
      }
      ProteinSequence[Blocks]<-paste(ProteinSequence[Blocks],"&nbsp;",sep="")
      ProteinSequence[1]<-paste("<div style=\"font-family:monospace\">",
        ProteinSequence[1],
        sep="")
      ProteinSequence[length(ProteinSequence)]<-paste(ProteinSequence[length(ProteinSequence)],
        "</div>",
        sep="")
      ProteinSequence<-paste(ProteinSequence,collapse="")
      return(ProteinSequence)
    },
    Pep.GR
  )
  MrnaOnly$Start.position<-gsub("/",
    " ",
    MrnaOnly$Start.position)
  MrnaOnly$End.position<-gsub("/",
    " ",
    MrnaOnly$End.position)
  MrnaOnly$PEP<-gsub("/",
    " ",
    MrnaOnly$PEP)
  MrnaOnly$Proteins<-gsub("/",
    " ",
    MrnaOnly$Proteins)
  MrnaOnly$Sequence<-gsub("/",
    " ",
    MrnaOnly$Sequence)


  MrnaOnly$Protein<-NULL
  #MrnaOnly$Name<-gsub(","," ",MrnaOnly$Name)
  #MrnaOnly$Name<-gsub("=",":",MrnaOnly$Name)  
  MrnaOnly<-MrnaOnly[order(MrnaOnly$N_Peptides,
    decreasing=TRUE),]
  MrnaOnly[,c("QueryLength",
              "QueryStart",
              "QueryEnd",
              "QuerySequence",
              "SubjectSeqId",
              "SubjectStart",
              "SubjectEnd",
              "SubjectLength",
              "SubjectSequence",
              "Length",
              "Mismatch",
              "Gaps",
              "Evalue",
              "BitScore")]<-NULL
  MrnaOnly$PEPTIDES<-apply(MrnaOnly[,
                                    c("Start.position",
                                      "End.position",
                                      "PEP",
                                      "Proteins",
                                      "Sequence")],
                           1,
                           function(x){
                             Starts<-strsplit(x=x["Start.position"],
                                              split=" ")
                             End<-strsplit(x=x["End.position"],
                                           split=" ")
                             PEP<-strsplit(x=x["PEP"],
                                           split=" ")
                             Proteins<-strsplit(x=x["Proteins"],
                                           split=" ")
                             Sequences<-strsplit(x=x["Sequence"],
                                           split=" ")
                             Proteins<-lapply(Proteins,
                                              function(y){
                                                y<-gsub(";",
                                                     " ",
                                                     y)
                                                y<-gsub(",[^ ]*",
                                                     "",
                                                     y)
                                              y<-gsub("sp\\||tr\\|",
                                                     "",
                                                     y)

                                                y<-gsub("\\|[^ ]*",
                                                     "",
                                                     y)
                                                return(y)
                                              }
                             )
                             TABLE<-data.frame(Start=Starts,
                                               End=End,
                                               Pep=PEP,
                                               Proteins=Proteins,
                                               Sequence=Sequences)
                             colnames(TABLE)<-c("Start",
                                                "End",
                                                "PEP",
                                                "Proteins",
                                                "Sequences")
                             TABLE<-TABLE[
                               order(
                                 as.numeric(
                                   as.vector(
                                   TABLE$Start
                                   )
                                   )
                                 ),]
                             scroll_box(
                               kable_styling(
                                 kable(TABLE)
                               ),
                               fixed_thead = T,
                               height = "150px"
                             )
                           })
  MrnaOnly$MatchProtein<-lapply(
    MrnaOnly[,
             "Proteins"],
    function(y){
      y<-gsub(";",
              " ",
              y)
      y<-gsub(",[^ ]*",
              "",
              y)
      y<-gsub("sp\\||tr\\|",
              "",
              y)

      y<-gsub("\\|[^ ]*",
              "",
              y)
      y<-unique(unlist(strsplit(y," ")))
      y<-paste(y,collapse = " ")
      return(y)
    }
  )
  MrnaOnly$N_MatchProtein<-lapply(
    MrnaOnly[,
             "Proteins"],
    function(y){
      y<-gsub(";",
              " ",
              y)
      y<-gsub(",[^ ]*",
              "",
              y)
      y<-gsub("sp\\||tr\\|",
              "",
              y)

      y<-gsub("\\|[^ ]*",
              "",
              y)
      y<-unique(unlist(strsplit(y," ")))
      return(length(y))
    }
  )
  MrnaOnly[,c("Start.position",
                                      "End.position",
                                      "PEP",
                                      "Proteins")]<-NULL
  MrnaOnly<-MrnaOnly[,c("Name",
                        "N_Peptides",
                        "N_MatchProtein",
                        "MatchProtein",
                        "PEPTIDES",
                        "SequenceInHTML")]
  datatable(MrnaOnly,
    escape=FALSE,
    rownames=FALSE,
    extensions = 'Responsive',
    options=list(columnDefs=list(list(className="none",targets=5))))

}else{

  cat("No proteins from mRNA whitout perfect match with UNIPROT")
  MrnaOnly<-MrnaOnly.NamePart
  MrnaOnly.Informations<-matrix(FALSE,nrow=length(ProteinsInMS.NoContam.Mrna.MrnaOnly.NameInPeptide),ncol=8)
  rownames(MrnaOnly.Informations)<-as.vector(MrnaOnly$Name)
  colnames(MrnaOnly.Informations)<-c("SAAV",
                                     "Insertion",
                                     "Deletion",
                                     "SAAV_In_Peptide",
                                     "Insertion_In_Peptide",
                                     "Deletion_In_Peptide",
                                     "Peptide_Not_In_Blast",
                                     "Not_In_Blast")
}
  MrnaOnly.Informations<-data.frame(MrnaOnly.Informations)

SAAV

if(dim(MrnaOnly)[1]!=0 || sum(MrnaOnly.Informations$SAAV)!=0){
datatable(MrnaOnly[MrnaOnly$Name %in% rownames(MrnaOnly.Informations)[MrnaOnly.Informations$SAAV],],
          escape=FALSE,
          rownames=FALSE,
          extensions = 'Responsive',
          options=list(columnDefs=list(list(className="none",targets=5))))
}else{
if(dim(MrnaOnly)[1]==0) cat("No proteins from mRNA from Protein\n")
if(sum(MrnaOnly.Informations$SAAV)==0) cat("No proteins with SAAV\n")
}

SAAV in peptides

if(dim(MrnaOnly)[1]!=0 || sum(MrnaOnly.Informations$SAAV_In_Peptide)){
datatable(MrnaOnly[MrnaOnly$Name %in% rownames(MrnaOnly.Informations)[MrnaOnly.Informations$SAAV_In_Peptide],],
          escape=FALSE,
          rownames=FALSE,
          extensions = 'Responsive',
          options=list(columnDefs=list(list(className="none",targets=5))))
}else{
if(dim(MrnaOnly)[1]==0) cat("No proteins from mRNA from Protein\n")
if(sum(MrnaOnly.Informations$SAAV)==0) cat("No proteins with peptides in SAAV\n")
}

Insertion

if(dim(MrnaOnly)[1]!=0 || sum(MrnaOnly.Informations$Insertion)){
datatable(MrnaOnly[MrnaOnly$Name %in% rownames(MrnaOnly.Informations)[MrnaOnly.Informations$Insertion],],
          escape=FALSE,
          rownames=FALSE,
          extensions = 'Responsive',
          options=list(columnDefs=list(list(className="none",targets=5))))
}else{
if(dim(MrnaOnly)[1]==0) cat("No proteins from mRNA from Protein\n")
if(sum(MrnaOnly.Informations$SAAV)==0) cat("No proteins with Insertion\n")
}

Insertion in peptides

if(dim(MrnaOnly)[1]!=0 || sum(MrnaOnly.Informations$Insertion_In_Peptide)){
datatable(MrnaOnly[MrnaOnly$Name %in% rownames(MrnaOnly.Informations)[MrnaOnly.Informations$Insertion_In_Peptide],],
          escape=FALSE,
          rownames=FALSE,
          extensions = 'Responsive',
          options=list(columnDefs=list(list(className="none",targets=5))))
}else{
if(dim(MrnaOnly)[1]==0) cat("No proteins from mRNA from Protein\n")
if(sum(MrnaOnly.Informations$SAAV)==0) cat("No proteins with peptide in Insertion\n")
}

Deletion

if(dim(MrnaOnly)[1]!=0 || sum(MrnaOnly.Informations$Deletion)){
datatable(MrnaOnly[MrnaOnly$Name %in% rownames(MrnaOnly.Informations)[MrnaOnly.Informations$Deletion],],
          escape=FALSE,
          rownames=FALSE,
          extensions = 'Responsive',
          options=list(columnDefs=list(list(className="none",targets=5))))
}else{
if(dim(MrnaOnly)[1]==0) cat("No proteins from mRNA from Protein\n")
if(sum(MrnaOnly.Informations$Deletion)==0) cat("No proteins with Deletion\n")
}

Deletion in peptides

if(dim(MrnaOnly)[1]!=0 || sum(MrnaOnly.Informations$Deletion_In_Peptide)){
datatable(MrnaOnly[MrnaOnly$Name %in% rownames(MrnaOnly.Informations)[MrnaOnly.Informations$Deletion_In_Peptide],],
          escape=FALSE,
          rownames=FALSE,
          extensions = 'Responsive',
          options=list(columnDefs=list(list(className="none",targets=5))))
}else{
if(dim(MrnaOnly)[1]==0) cat("No proteins from mRNA from Protein\n")
if(sum(MrnaOnly.Informations$Deletion_In_Peptide)==0) cat("No proteins with peptide in Deletion\n")
}

Peptide not in BLAST

if(dim(MrnaOnly)[1]!=0 || sum(MrnaOnly.Informations$Deletion_In_Peptide)){
datatable(MrnaOnly[MrnaOnly$Name %in% rownames(MrnaOnly.Informations)[MrnaOnly.Informations$Peptide_Not_In_Blast],],
          escape=FALSE,
          rownames=FALSE,
          extensions = 'Responsive',
          options=list(columnDefs=list(list(className="none",targets=5))))
}else{
if(dim(MrnaOnly)[1]==0) cat("No proteins from mRNA from Protein\n")
if(sum(MrnaOnly.Informations$Peptide_Not_In_Blast)==0) cat("No proteins with peptide outside of blast match part\n")
}

mRNA Not in BLAST

if(dim(MrnaOnly)[1]!=0 || sum(MrnaOnly.Informations$Not_In_Blast)){
datatable(MrnaOnly[MrnaOnly$Name %in% rownames(MrnaOnly.Informations)[MrnaOnly.Informations$Not_In_Blast],],
          escape=FALSE,
          rownames=FALSE,
          extensions = 'Responsive',
          options=list(columnDefs=list(list(className="none",targets=5))))
}else{
if(dim(MrnaOnly)[1]==0) cat("No proteins from mRNA from Protein\n")
if(sum(MrnaOnly.Informations$Not_In_Blast)==0) cat("No proteins whithout blast match\n")
}
cat("# Infos {.tabset}

## Parameters
")
PARAMETERS<-unlist(params)
PARAMETERS.DF<-data.frame(PARAMETER=names(PARAMETERS),VALUE=PARAMETERS)
datatable(PARAMETERS.DF,rownames = FALSE)
cat("## Session
")
sessionInfo()
ProteinBank.Names<-names(ProteinBank)
SP_Canonical<-ProteinBank.Names[
  intersect(
    grep("^sp",ProteinBank.Names),
    grep("-[0-9]*\\|",ProteinBank.Names,invert=TRUE))]
SP_Canonical.Short<-gsub(" .*","",SP_Canonical)
BlastNames<-ProteinBank.Names[
  grep("Blast=",ProteinBank.Names)]
BlastHitNames<-gsub(".*Blast=","",BlastNames)
SP_Canonical.NoMrna.Short<-SP_Canonical.Short[!SP_Canonical.Short %in% BlastHitNames]
SP_Canonical.NoMrna.InMS.Short<-SP_Canonical.NoMrna.Short[SP_Canonical.NoMrna.Short %in% ProteinsInMS.NoContam]
SP_Canonical.NoMrna.InMS.Short<-gsub(
  "\\|.*",
  "",
  gsub(
    "sp\\|",
    "",
    SP_Canonical.NoMrna.InMS.Short
  )
)
write.table(data.frame(UNIPROT_ID=SP_Canonical.NoMrna.InMS.Short),
            "PURE_CANONICAL.NomRNA.UNIPROT_ID.txt",
            row.names = FALSE,
            quote=FALSE,
            sep="\t")
ProteinBank.Names<-names(ProteinBank)
CommonUniprot<-ProteinBank.Names[grep("(UNIPROT|Blast)=",ProteinBank.Names)]
CommonUniprot<-gsub("^[^\\|]*\\|",
                    "",
                    CommonUniprot)
CommonUniprot<-gsub("\\|.*",
                    "",
                    CommonUniprot)

write.table(data.frame(UNIPROT_ID=CommonUniprot),
            "PRESENT_SPECTRO_RNASEQ.UNIPROT_ID.txt",
            row.names = FALSE,
            quote=FALSE,
            sep="\t")


47Lies/isoformnspectRe documentation built on May 29, 2021, 3 p.m.