knitr::opts_chunk$set(
  #collapse = TRUE,
  comment = "#>",
  fig.width = 4,
  fig.height = 4,
  message = FALSE,
  warning = FALSE,
  tidy.opts = list(
    keep.blank.line = TRUE,
    width.cutoff = 150
    ),
  options(width = 150),
  eval = TRUE
)

Introduction and creation of five-channel data

In previous vignettes we investigated the use of all nine TMT-MS2 fractions when performing CPA. In this vignette, we investigate the performance of CPA using only the five classical differential centrifugation fractions N, M, L, P, and S. We derive the five classical fractions by removing the Nyc fractions and combining L1 and L2 to make the L fraction. We load the protlocassign fraction and summarize the full nine-fraction data as follows:

library(protlocassign)
data(protNSA_AT5tmtMS2)
dim(protNSA_AT5tmtMS2)

Next, we convert the data from the original NSA data to the Acup-transformed data, which is the appropriate transformation for combining fractions.

data(totProtAT5)
protAcup_AT5tmtMS2_data <- AcupFromNSA(protNSA_AT5tmtMS2[,1:6],
                                       NstartMaterialFractions=6, 
                                       totProt=totProtAT5[1:6])

We complete the conversion to five-channel data as follows:

# take the first six fractions:
protAcupSixTemp <- protAcup_AT5tmtMS2_data[,1:6]  
# add L1 and L2 together, and put into L1; then re-name L1 as L
# make L1 the sum of L1 and L2:
protAcupFiveTemp <- within(protAcupSixTemp, L1 <- L1+L2) 
# L is now the name of third columm:
names(protAcupFiveTemp)[3] <- "L"  
# drop L2 column (column 4)
protAcup <- protAcupFiveTemp[, -4]  
head(round(protAcup,3))

We also must create a vector totProt with five values, corresponding to N, M, L (=L1+L2), P, S. Here are the original values of total protein in the nine channels:

round(totProtAT5, 3)

Now we complete the calculation:

totProtTemp <- totProtAT5
totProtTemp[3] <- totProtAT5[3] + totProtAT5[4]
names(totProtTemp)[3] <- "L"
# remove L2 (component 4) and Nyc values (columns 7, 8, 9)
totProt <- totProtTemp[c(-4, -7, -8, -9)]   
round(totProt, 3)

Transformations of the five-channel data and reference profiles

Next we can convert the five-channel Acup data to NSA and also to RSA:

protNSA<- NSAfromAcup(Acup=protAcup, NstartMaterialFractions=5, 
                      totProt=totProt)
head(round(protNSA,3))

protRSA <- RSAfromNSA(NSA=protNSA,NstartMaterialFractions = 5, 
                      totProt = totProt)

We may obtain the three transformations of the reference profiles as follows:

data(markerListJadot)
refLocationProfilesNSA <- locationProfileSetup(profile=protNSA, 
                            markerList=markerListJadot, numDataCols=5)
refLocationProfilesAcup <- AcupFromNSA(refLocationProfilesNSA, 
                            NstartMaterialFractions=5, totProt=totProt)
refLocationProfilesRSA <- RSAfromNSA(refLocationProfilesNSA, 
                            NstartMaterialFractions=5, totProt=totProt)

round(refLocationProfilesNSA, 3)
round(refLocationProfilesRSA, 3)
round(refLocationProfilesAcup, 3)

We then generate profile plots:

loc.list <- rownames(refLocationProfilesRSA)
n.loc <- length(loc.list)
par(mfrow=c(4,2))
for (i in 1:n.loc) {
  markerProfilePlot(refLoc=loc.list[i], profile=protRSA,
                    markerList=markerListJadot,
                    refLocationProfiles=refLocationProfilesRSA, 
                    ylab="RSA")
}

Mixture simulations using five-channel data

One can generate any desired mixture and transformation by adjusting values of i and j.

i=1  #Cyto
j=2  #ER
mixProtiProtj12Acup <- proteinMix(refLocationProfilesAcup, Loc1=i, Loc2=j)
mixProtiProtj12RSA <- RSAfromAcup(Acup=mixProtiProtj12Acup, 
                            NstartMaterialFractions=5, totProt=totProt)
mixProtiProtjCPAfromRSA12 <- fitCPA(profile=mixProtiProtj12RSA,
                            refLocationProfiles=refLocationProfilesRSA,
                            numDataCols=5)

i=1  #Cyto
j=3  #Golgi
mixProtiProtj13Acup <- proteinMix(refLocationProfilesAcup, Loc1=i, Loc2=j)
mixProtiProtj13RSA <- RSAfromAcup(Acup=mixProtiProtj13Acup, 
                                  NstartMaterialFractions=5, totProt=totProt)
mixProtiProtjCPAfromRSA13 <- fitCPA(profile=mixProtiProtj13RSA,
                                 refLocationProfiles=refLocationProfilesRSA,
                                 numDataCols=5)
i=1  #Cyto
j=4  #Lyso
mixProtiProtj14Acup <- proteinMix(refLocationProfilesAcup, Loc1=i, Loc2=j)
mixProtiProtj14RSA <- RSAfromAcup(Acup=mixProtiProtj14Acup, 
                            NstartMaterialFractions=5, totProt=totProt)
mixProtiProtjCPAfromRSA14 <- fitCPA(profile=mixProtiProtj14RSA,
                                refLocationProfiles=refLocationProfilesRSA,
                                numDataCols=5)

i=1  #Cyto
j=5  #Mito
mixProtiProtj15Acup <- proteinMix(refLocationProfilesAcup, Loc1=i, Loc2=j)
mixProtiProtj15RSA <- RSAfromAcup(Acup=mixProtiProtj15Acup, 
                            NstartMaterialFractions=5, totProt=totProt)
mixProtiProtjCPAfromRSA15 <- fitCPA(profile=mixProtiProtj15RSA,
                                refLocationProfiles=refLocationProfilesRSA,
                                numDataCols=5)

i=1  #Cyto
j=6  #Nuc
mixProtiProtj16Acup <- proteinMix(refLocationProfilesAcup, Loc1=i, Loc2=j)
mixProtiProtj16RSA <- RSAfromAcup(Acup=mixProtiProtj16Acup, 
                            NstartMaterialFractions=5, totProt=totProt)
mixProtiProtjCPAfromRSA16 <- fitCPA(profile=mixProtiProtj16RSA,
                                refLocationProfiles=refLocationProfilesRSA,
                                numDataCols=5)
i=1  #Cyto
j=7  #Perox
mixProtiProtj17Acup <- proteinMix(refLocationProfilesAcup, Loc1=i, Loc2=j)
mixProtiProtj17RSA <- RSAfromAcup(Acup=mixProtiProtj17Acup, 
                            NstartMaterialFractions=5, totProt=totProt)
mixProtiProtjCPAfromRSA17 <- fitCPA(profile=mixProtiProtj17RSA,
                                refLocationProfiles=refLocationProfilesRSA,
                                numDataCols=5)

i=1  #Cyto
j=8  #PM
mixProtiProtj18Acup <- proteinMix(refLocationProfilesAcup, Loc1=i, Loc2=j)
mixProtiProtj18RSA <- RSAfromAcup(Acup=mixProtiProtj18Acup, 
                            NstartMaterialFractions=5, totProt=totProt)
mixProtiProtjCPAfromRSA18 <- fitCPA(profile=mixProtiProtj18RSA,
                             refLocationProfiles=refLocationProfilesRSA,
                            numDataCols=5)

Now we can plot all simulations for mixing Cyto with each of the other seven compartments. Note that as before, the x-coordinate represents the theoretical distribution based on simulation parameters and the y-coordinate represents the predicted values based on CPA. The assignment errors are shown in parentheses.

We see that CPA performs well except for Cyto-Lyso and Cyto-PM:

#library(pracma)   # for trapz function
par(mfrow=c(2,4))

mixturePlot(mixProtiProtjCPA=mixProtiProtjCPAfromRSA12,
            NstartMaterialFractions=5, Loc1=1, Loc2=2,
            errorReturn = TRUE, subTitle="RSA")


mixturePlot(mixProtiProtjCPA=mixProtiProtjCPAfromRSA13,
            NstartMaterialFractions=5, Loc1=1, Loc2=3,
            errorReturn = TRUE, subTitle="RSA")

mixturePlot(mixProtiProtjCPA=mixProtiProtjCPAfromRSA14,
            NstartMaterialFractions=5, Loc1=1, Loc2=4,
            errorReturn = TRUE, subTitle="RSA")

mixturePlot(mixProtiProtjCPA=mixProtiProtjCPAfromRSA15,
            NstartMaterialFractions=5, Loc1=1, Loc2=5,
            errorReturn = TRUE, subTitle="RSA")

mixturePlot(mixProtiProtjCPA=mixProtiProtjCPAfromRSA16,
            NstartMaterialFractions=5, Loc1=1, Loc2=6,
            errorReturn = TRUE, subTitle="RSA")

mixturePlot(mixProtiProtjCPA=mixProtiProtjCPAfromRSA17,
            NstartMaterialFractions=5, Loc1=1, Loc2=7,
            errorReturn = TRUE, subTitle="RSA")


mixturePlot(mixProtiProtjCPA=mixProtiProtjCPAfromRSA18,
            NstartMaterialFractions=5, Loc1=1, Loc2=8,
            errorReturn = TRUE, subTitle="RSA")

As described in the previous vignette, we can look at a heatmap of the errors for the different pairwise mixtures and transformations. As a reminder, these values are presented as a 2 by 3 array, with the three transformations as columns (RSA, left; NSA, center; Acup, right). The top row uses the original values (identity transformation), and the bottom row uses a log2 transformation of these values. The prediction errors are listed in each box with larger errors indicated by darker colors. CPA on mixtures involving Cyto and Lyso or PM have the highest error rates. The errors are reduced for these mixtures when conducting CPA on log-transformed NSA profiles but this has adverse effects on the fit for other compartments (see below, Appendix).

#install.packages(c("plot.matrix", "viridis", "grid", "gridExtra"))

par(mfrow=c(1,1))
errorMatAll <- mixtureHeatMap(Acup=refLocationProfilesAcup, 
                        totProt=totProt, NstartMaterialFractions=5)

We can also either print out overall errors (i.e., sum of all errors for each combination of data transformation method), or, alternatively, generate a heatmap of the overall errors.

errorMatAll
#library(plot.matrix)
#library("viridis")
op <- par(mar=c(4,4,1,1))  # save default parameters in variable op

col <- rev(viridis::magma(16))
library(plot.matrix)
plot(errorMatAll, col=col, breaks=seq(0, 15, 1), key=NULL, main="",
     axis.col=NULL, axis.row=NULL, 
     xlab="RSA                        NSA                       Acup", 
     ylab="Log2 Linear", digits=2, cex=2, cex.lab=1.1)
par(op) # restore default parameters

Restricted constrained proportional assignment

From the examples above, our modeling shows that fitCPA does not report the correct assignment for some of the simulated binary mixtures from a five-fraction differential centrifugation experiment. To investigate this further, we have conducted modeling where we restrict the CPA algorithm to only assign weights to specified subsets of compartments. In the following code, we examine the effect of restricting CPA to fit a simulated protein with a 50:50 distribution between Cyto and Lyso. We have already created this earlier in this vignette as the sixth entry in the data frame mixProtiProtj14RSA and create a new one-row data frame that only contains this simulated protein.

data.frame(rownames(mixProtiProtj14RSA))
mixCyto50Lyso50RSA <- mixProtiProtj14RSA[6,]

In our first example, we force the algorithm to only assign positive weights to Cyto and Lyso, holding the remaining weights fixed at zero. We do this using the fitCPA function where the optional parameter ind.vary specifies the row numbers for the compartments whose proportions are allowed to vary (Cyto=1, Lyso=4) with the remaining compartment proportions fixed at zero. The option minVal=TRUE causes the value of the sum of squares goodness of fit (value) to be printed. From the output we see that the CPA routine with these constraints correctly reports the distribution of the simulated protein.

mixCyto50Lyso50CPAfromRSAforced <- fitCPA(profile= mixCyto50Lyso50RSA,
                                refLocationProfiles=refLocationProfilesRSA,
                                numDataCols=5, ind.vary=c(1,4), minVal=TRUE)
round(mixCyto50Lyso50CPAfromRSAforced, digits=4)

As a second example, we allow positive weights to only two of the eight compartments at a time as described above, cycling through all possible pairs of compartments. Here we introduce a new function, fCPAsubsets, that automates this procedure, with an argument nCPAcomparts that specifies the number of compartments allowed to vary in each cycle. Note that when we specify nCPAcomparts=2 for eight compartments, there are 8 choose 2 = 28 possible pairs. The output is in a data frame that contains each of the assignments and goodness of fit values, sorted by the parameter value. Here, the top CPA assignment has a goodness of fit value markedly better than the others and gives the correct assignment between Cyto and Lyso.

mixCyto50Lyso50CPAfromRSApairs <- fCPAsubsets(profile= mixCyto50Lyso50RSA,
                                  refLocationProfiles=refLocationProfilesRSA,
                                  numDataCols=5,nCPAcomparts=2)
round(mixCyto50Lyso50CPAfromRSApairs, digits=4)

It is also possible to consider other fixed combinations. For example, nCPAcomparts=3 would cycle through 8 choose 3 = 56 combinations where three compartment proportions are allowed to vary with the remaining five fixed at zero. As a final example we consider all 2^8 - 1 = 255 possible subsets of compartments and conduct CPA using each of these subsets. In the code below, we successively set nCPAcomparts equal to 1, 2, 3…, 8 and create a data frame that contains the estimated proportions and goodness of fit parameter values. We then sort by the parameter value and observe that many CPA assignments are nearly as good as the top assignment.

mixCyto50Lyso50CPAfromRSAallSubsets<- NULL
for (kk in 1:8) {
   temp  <- fCPAsubsets(profile=mixCyto50Lyso50RSA,
                              refLocationProfiles=refLocationProfilesRSA,
                              numDataCols=5, nCPAcomparts=kk)
   mixCyto50Lyso50CPAfromRSAallSubsets <- 
                       rbind(mixCyto50Lyso50CPAfromRSAallSubsets, temp)
}
# sort dataframe by the variable “value” by creating an intermediate 
#   list of indices "ord_fit".
ord_fit <- order(mixCyto50Lyso50CPAfromRSAallSubsets$value)
mixCyto50Lyso50CPAfromRSAallSubsetsOrd <- 
         mixCyto50Lyso50CPAfromRSAallSubsets[ord_fit,]
round(head(mixCyto50Lyso50CPAfromRSAallSubsetsOrd), 3)

We may plot the goodness of fit statistic value and the CPA assignment proportions for Cyto and Lyso versus model number as follows, with the best fitting models first. This plot shows that the goodness of fit statistic is extremely low for a large number of models:

modelVals <- 1:255
par(mfrow=c(1,1))  # reset the plot space
plot(mixCyto50Lyso50CPAfromRSAallSubsetsOrd$value ~ modelVals, 
     xlab="Model id", ylab="Goodness of fit value", las=1)

In the following plot we show the CPA estimates for Cyto and Lyso for all models. Even for the best fitting models (with id numbers 1 to about 90), the CPA estimates vary quite a lot.

par(mfrow=c(1,1))  # reset the plot space
plot(mixCyto50Lyso50CPAfromRSAallSubsetsOrd$Lyso ~ modelVals, xlab="Model id", 
     ylab="CPA proortion", type="n", ylim=c(0, 1.1), las=1)

points(mixCyto50Lyso50CPAfromRSAallSubsetsOrd$Lyso ~ modelVals, pch=4, 
       col="darkgreen", cex=1.5)
points(mixCyto50Lyso50CPAfromRSAallSubsetsOrd$Cyto ~ modelVals, pch=1, col="red")
legend(x="topleft", legend=c("Cyto proportion", "Lyso proportion"),
       pch=c( 1, 4), col=c("red", "darkgreen"))

From this it is clear that the model is over-parameterized with multiple models that fit nearly as well yet give inaccurate CPA estimates. Further study of the properties of estimation methods when multiple models fit equally well is warranted.

Appendix: Plots of CPA assignments of all two-compartent simulated proteins using linear and log RSA, NSA, and Acup data

errorAllRSAlinear <- mixturePlotPanel(refLocationProfilesAcup=
                                refLocationProfilesAcup,
                                totProt=totProt, NstartMaterialFractions=5, 
                                errorReturn = TRUE,
                                fitType="RSA", log2Transf=FALSE)

The mixing simulations indicate that for mixtures involving Lyso or PM, there is a poor fit.

Analysis of pairwise simulations using log2-transformed RSA profiles as well as NSA and Acup profiles (with and without log2 transformations) are displayed below. Overall, the best results are obtained using log-transformed NSA data. Here, we see that pairwise mixtures involving Lyso and PM are improved compared to the fits with untransformed RSA profiles but many other combinations involving Nuc and ER are worsened:

errorAllRSAlog2 <- mixturePlotPanel(refLocationProfilesAcup=
                                    refLocationProfilesAcup,
                                    totProt=totProt, 
                                    NstartMaterialFractions=5, 
                                    errorReturn = TRUE,
                                    fitType="RSA", log2Transf=TRUE)


errorAllNSAlinear <- mixturePlotPanel(refLocationProfilesAcup=
                                    refLocationProfilesAcup,
                                    totProt=totProt, 
                                    NstartMaterialFractions=5, 
                                    errorReturn = TRUE,
                                    fitType="NSA", log2Transf=FALSE)

errorAllNSAlog2 <- mixturePlotPanel(refLocationProfilesAcup=
                                    refLocationProfilesAcup,
                                    totProt=totProt, 
                                    NstartMaterialFractions=5, 
                                    errorReturn = TRUE,
                                    fitType="NSA", log2Transf=TRUE)

errorAllAcupLinear <- mixturePlotPanel(refLocationProfilesAcup=
                                    refLocationProfilesAcup,
                                    totProt=totProt, 
                                    NstartMaterialFractions=5, 
                                    errorReturn = TRUE,
                                    fitType="Acup", log2Transf=FALSE)

errorAllAcupLog2 <- mixturePlotPanel(refLocationProfilesAcup=
                                    refLocationProfilesAcup,
                                    totProt=totProt, 
                                    NstartMaterialFractions=5, 
                                    errorReturn = TRUE,
                                    fitType="Acup", log2Transf=TRUE)

If desired, these plots of pairwise mixtures may also be saved as pdf files as described in Vignette 4.

Finally, we present the pairwise and overall errors for log2 RSA (errors for identity RSA are reported above), identity and log2 NSA, and identity and log2 Acup:

errorAllRSAlinear 
sum(errorAllRSAlinear[,3])

errorAllRSAlog2
sum(errorAllRSAlog2[,3])

errorAllNSAlinear
#write.csv(errorAllNSAlinear, file="errorAllNSAlinear5chan.csv", row.names=FALSE)
sum(errorAllNSAlinear[,3])

errorAllNSAlog2
sum(errorAllNSAlog2[,3])

errorAllAcupLinear
sum(errorAllAcupLinear[,3])

errorAllAcupLog2
sum(errorAllAcupLog2[,3])

These can also be saved as tables as described in Vignette 4.



mooredf22/protlocassign0p1p1 documentation built on Feb. 7, 2022, 1:55 a.m.