Some of our existing datasets contain sub-cellular compartments (classes) with very few markers, which makes it difficult to use these data to test our algorithms. We wish to include as many classes as possible in our tests so as to cover as much sub-cellular diversity as possible.
Currently, all markers for the datasets andy2011, dunkley2006, E14TG2a and tan2009 have been defined by the biologists who conducted the experiments. These have been defined through manual searching of the literature and consulting databases.
In an attempt to gather more markers and boost the above existing marker sets, we have pulled GO terms from the cellular compartment namespace which have been experimentally defined and which have a unique term. These extra terms have been added as markers to the original marker lists and then plotted using plot2D
to check their validity against the structure of the data.
The code below demonstrates the above procedure.
andy2011
- Human HEK 293

library("pRoloc2")
library("pRolocdata")
data(andy2011)

## Keep the original feature names in fData, then index the features by
## accession number.
fData(andy2011)$UniprotID <- featureNames(andy2011)
featureNames(andy2011) <- fData(andy2011)$Accession.No.

## Back up the current markers before replacing them with `pd.markers`.
## The script previously assigned `markers <- markers.original`, which was
## immediately overwritten by the next line (and removed `markers` outright
## when `markers.original` did not exist), so nothing was ever backed up.
## An existing backup column is never clobbered.
if (!"markers.original" %in% fvarLabels(andy2011)) {
  fData(andy2011)$markers.original <- fData(andy2011)$markers
}
fData(andy2011)$markers <- as.character(fData(andy2011)$pd.markers)

## Set parameters
andy2011params <- setAnnotationParams()

## Get GO annotations with EXP evidence only and keep the features that are
## annotated with exactly one term (column 1: feature id, column 4: GO term).
getGoLoc <- pRoloc2::getGO(andy2011, evidence = "EXP", params = andy2011params)
idN <- names(which(table(getGoLoc[, 1]) == 1))
goLoc <- getGoLoc[match(idN, getGoLoc[, 1]), , drop = FALSE]
cc <- goLoc[, 4]
names(cc) <- goLoc[, 1]
allCl <- names(table(cc))
allN <- featureNames(andy2011)
# fData(andy2011)$go <- rep("unknown", nrow(andy2011))
fcol <- "markers"

## Row indices (into `all_names`) of the features whose unique GO CC term
## matches `pattern`.
##   pattern   regular expression searched for in `all_terms`
##   cc        GO term per feature, named by feature id
##   all_terms candidate GO terms
##   all_names feature names of the dataset
##   pick      optional positions of the matched terms to keep (manual
##             curation); positions that do not exist are ignored
## The matched terms are printed for inspection. Returns integer(0) when no
## term or no feature matches, rather than failing.
go_marker_rows <- function(pattern, cc, all_terms, all_names, pick = NULL) {
  terms <- grep(pattern, all_terms, value = TRUE)
  if (!is.null(pick)) {
    terms <- terms[pick]
  }
  terms <- terms[!is.na(terms)]
  print(terms)
  ids <- unlist(lapply(terms, function(term) names(cc)[which(cc == term)]))
  rows <- match(ids, all_names)
  rows[!is.na(rows)]
}

## Write `label` into feature variable `fcol` for `rows` and return the
## updated object; an empty `rows` leaves the object untouched.
set_go_markers <- function(object, rows, fcol, label) {
  if (length(rows) > 0) {
    fData(object)[rows, fcol] <- rep(label, length(rows))
  }
  object
}

## We now look for terms in `allCl` that match each sub-cellular compartment
## and add these as potential markers (curation will be further downstream).
## The order matters: a later compartment overwrites an earlier one for any
## feature matched by both. `pick` values are positions in the matched GO
## terms chosen by manual inspection and depend on the annotation retrieved.
compartments <- list(
  list(pattern = "endop",      label = "ER"),                        # 1
  list(pattern = "Golgi",      label = "Golgi"),                     # 2
  list(pattern = "lysoso",     label = "Lysosome"),                  # 3
  list(pattern = "mitochond",  label = "Mitochondrion"),             # 4
  list(pattern = "plasma mem", label = "PM"),                        # 5
  list(pattern = "nucleus",    label = "Nucleus"),                   # 6
  list(pattern = "protea",     label = "Proteasome"),                # 7
  list(pattern = "cytoske",    label = "Cytoskeleton"),              # 8
  list(pattern = "ribo",       label = "Ribosome (60S)", pick = 1),  # 9
  list(pattern = "perox",      label = "Peroxisome"),                # 11
  list(pattern = "nucleo",     label = "Nucleolus", pick = 2),       # 12
  list(pattern = "cytosol",    label = "Cytosol", pick = 1),         # 13
  list(pattern = "cytoplasm",  label = "Cytoplasm", pick = 1),       # 14
  list(pattern = "chro",       label = "Chromosome"),                # 15
  list(pattern = "endosom",    label = "Endosome")                   # 16
)
for (cmp in compartments) {
  rows <- go_marker_rows(cmp[["pattern"]], cc, allCl, allN,
                         pick = cmp[["pick"]])
  andy2011 <- set_go_markers(andy2011, rows, fcol, cmp[["label"]])
}

## Looking for ribosomal proteins
## Examine all featureNames that contain RS for 40S subunit
rs <- grep(pattern = "RS", x = featureNames(andy2011))
featureNames(andy2011)[rs]
## ... and RL (NOTE(review): presumably the 60S subunit - confirm)
rl <- grep(pattern = "RL", x = featureNames(andy2011))
featureNames(andy2011)[rl]
## Add any that are ribosome
Annotations in the fData
column go
are then examined via plot2D
and then added to pd.markers
. With this new list of markers we run a round of classification using an SVM and examine any new members of the existing clusters by querying GO once more, as well as other databases and the literature.
## Optimise the SVM parameters on the boosted marker set, then use the
## optimisation result to classify the features.
svm_params <- svmOptimisation(andy2011, fcol = "markers")
andy2011 <- svmClassification(andy2011, assessRes = svm_params)

## Now examine new members of sub-compartments
## curate
dunkley2006
- Arabidopsis thaliana

As above, we follow the same procedure, i.e.:
Use getGO
to pull unique GO CC terms with experimental evidence
Add to current pd.markers
Examine via plot2D
to check compliance with data structure and remove any that are not.
Perform a round of classification with this new marker list and examine new entries as potential markers by comparing with databases and literature.
data(dunkley2006)
# data(dunkley2006params)
dunkley2006params <- setAnnotationParams()

## Get GO.CC information for proteins in dunkley only with EXP evidence
ann <- pRoloc2::getGO(dunkley2006, evidence = "EXP",
                      params = dunkley2006params)

## Now only keep features annotated with exactly one term. The id vector is
## no longer called `unique`, which shadowed base::unique().
unique_ids <- names(which(table(ann[, 1]) == 1))
ann <- ann[match(unique_ids, ann[, 1]), , drop = FALSE]

## Add labels to dunkley.
## Back up the current markers before replacing them with `pd.markers`.
## The script previously assigned `markers <- markers.original`, which was
## immediately overwritten (and removed `markers` when `markers.original`
## did not exist), so nothing was backed up. An existing backup is kept.
if (!"markers.original" %in% fvarLabels(dunkley2006)) {
  fData(dunkley2006)$markers.original <- fData(dunkley2006)$markers
}
## Coerce to character so that new class labels can be written even if
## `pd.markers` is stored as a factor (as done for andy2011).
fData(dunkley2006)$markers <- as.character(fData(dunkley2006)$pd.markers)

## Column 4 holds the GO term, column 1 the feature id.
go.cc <- ann[, 4]
names(go.cc) <- ann[, 1]
dunkley2006 <- addMarkers(dunkley2006, markers = go.cc,
                          mcol = "go.cc.unique")

## Each stanza prints the current labels of the candidates before they are
## overwritten. Numeric positions below (7, 9:10, -c(1, 2, 9, 15)) are manual
## curation choices and depend on the annotation retrieved.

## Mitochondrion
mt <- which(fData(dunkley2006)$go.cc.unique == "mitochondrion")
fData(dunkley2006)$markers[mt]
fData(dunkley2006)$markers[mt] <- rep("Mitochondrion", length(mt))

## Vacuole
va <- grep("vacuol", fData(dunkley2006)$go.cc.unique)
fData(dunkley2006)$markers[va]
fData(dunkley2006)$markers[va] <- rep("vacuole", length(va))

## Plastid: only the 7th "plastid" hit is kept
pt <- grep("plastid", fData(dunkley2006)$go.cc.unique)
fData(dunkley2006)$markers[pt]
fData(dunkley2006)$markers[pt[7]] <- rep("Plastid", length(pt[7]))

## Chloroplast hits 7, 9 and 10 are also labelled as Plastid
cl <- grep("chloroplast", fData(dunkley2006)$go.cc.unique)
cl_keep <- cl[c(7, 9:10)]
fData(dunkley2006)$markers[cl_keep]
featureNames(dunkley2006)[cl_keep]
fData(dunkley2006)$markers[cl_keep] <- rep("Plastid", length(cl_keep))

## Plasma membrane: only currently unlabelled features, minus the manually
## rejected ones
pm <- grep("plasma membrane", fData(dunkley2006)$go.cc.unique)
fData(dunkley2006)$markers[pm]
.x <- which(fData(dunkley2006)$markers[pm] == "unknown")
pm <- pm[.x]
pm <- pm[-c(1, 2, 9, 15)]
fData(dunkley2006)$markers[pm]
featureNames(dunkley2006)[pm]
fData(dunkley2006)$markers[pm] <- rep("PM", length(pm))

## Check for compliance with structure
plot2D(dunkley2006, "markers")

## Further curation
## Now run svm to boost further
opt <- svmOptimisation(dunkley2006, fcol = "markers")
dunkley2006 <- svmClassification(dunkley2006, assessRes = opt)
## Now examine new assignments and add any new markers found within these
## clusters.
library("organelledb")
data("E14TG2aS1")

## Move the shipped markers aside; the curated list is added below.
fData(E14TG2aS1)$markers0 <- fData(E14TG2aS1)$markers
fData(E14TG2aS1)$markers <- NULL

## Save the original feature names, then index the features by Uniprot.ID.
## The script previously also set featureNames to Uniprot.ID *before* this
## backup, so `UniprotName` ended up as a copy of Uniprot.ID instead of the
## original feature names.
fData(E14TG2aS1)$UniprotName <- featureNames(E14TG2aS1)
featureNames(E14TG2aS1) <- fData(E14TG2aS1)$Uniprot.ID

## Load marker list as curated by Claire M Mulvey (CMM) and Andy
## Christoforou (AC). load() is expected to create the object `mrk`.
load("e14mrk.rda")
E14TG2aS1 <- addMarkers(E14TG2aS1, mrk)
## Coerce to character so that new class labels can be written even if the
## markers are stored as a factor.
fData(E14TG2aS1)$markers <- as.character(fData(E14TG2aS1)$markers)

## Add markers from the original 'markers0' and move to 'markers'
## These were the original markers added by AC, check.
## Each stanza prints the current labels before overwriting them.
cy <- which(fData(E14TG2aS1)$markers0 == "CYT")
fData(E14TG2aS1)$markers[cy]
fData(E14TG2aS1)$markers[cy] <- rep("Cytosol", length(cy))

er <- which(fData(E14TG2aS1)$markers0 == "ERT")
fData(E14TG2aS1)$markers[er]
fData(E14TG2aS1)$markers[er] <- rep("Endoplasmic reticulum", length(er))

ly <- which(fData(E14TG2aS1)$markers0 == "LYS")
fData(E14TG2aS1)$markers[ly]
ly <- ly[-2]  # manual curation: the second LYS entry is not used
fData(E14TG2aS1)$markers[ly]
fData(E14TG2aS1)$markers[ly] <- rep("Lysosome", length(ly))

mt <- which(fData(E14TG2aS1)$markers0 == "MIT")
fData(E14TG2aS1)$markers[mt]
fData(E14TG2aS1)$markers[mt] <- rep("Mitochondrion", length(mt))

# peroxisome already a marker, nuc markers already labelled 'chromatin'
pm <- which(fData(E14TG2aS1)$markers0 == "PLM")
fData(E14TG2aS1)$markers[pm]
fData(E14TG2aS1)$markers[pm] <- rep("Plasma membrane", length(pm))
# no new ribo

## Get GO loc for EXP evidence only and keep the features annotated with
## exactly one term (column 1: feature id, column 4: GO term).
E14TG2aS1params <- setAnnotationParams()
go.E14 <- getGO(E14TG2aS1, evidence = "EXP", params = E14TG2aS1params)
idN <- names(which(table(go.E14[, 1]) == 1))
goLoc <- go.E14[match(idN, go.E14[, 1]), , drop = FALSE]
cc <- goLoc[, 4]
names(cc) <- goLoc[, 1]
allCl <- names(table(cc))
allN <- featureNames(E14TG2aS1)
fData(E14TG2aS1)$go <- rep("unknown", nrow(E14TG2aS1))
fcol <- "go"

## Row indices (into `all_names`) of the features whose unique GO CC term
## matches `pattern`.
##   pattern   regular expression searched for in `all_terms`
##   cc        GO term per feature, named by feature id
##   all_terms candidate GO terms
##   all_names feature names of the dataset
##   pick      optional positions of the matched terms to keep (manual
##             curation); positions that do not exist are ignored
## The matched terms are printed for inspection. Returns integer(0) when no
## term or no feature matches, rather than failing.
go_marker_rows <- function(pattern, cc, all_terms, all_names, pick = NULL) {
  terms <- grep(pattern, all_terms, value = TRUE)
  if (!is.null(pick)) {
    terms <- terms[pick]
  }
  terms <- terms[!is.na(terms)]
  print(terms)
  ids <- unlist(lapply(terms, function(term) names(cc)[which(cc == term)]))
  rows <- match(ids, all_names)
  rows[!is.na(rows)]
}

## Write `label` into feature variable `fcol` for `rows` and return the
## updated object; an empty `rows` leaves the object untouched.
set_go_markers <- function(object, rows, fcol, label) {
  if (length(rows) > 0) {
    fData(object)[rows, fcol] <- rep(label, length(rows))
  }
  object
}

## The order matters: a later compartment overwrites an earlier one for any
## feature matched by both. `pick` values are positions in the matched GO
## terms chosen by manual inspection and depend on the annotation retrieved.
## No markers were found for: 8. Cytoskeleton, 11. Peroxisome, 15. Chromatin.
compartments <- list(
  list(pattern = "endop",      label = "Endoplasmic reticulum"),        # 1
  list(pattern = "Golgi",      label = "Golgi", pick = 1:2),            # 2
  list(pattern = "lysoso",     label = "Lysosome"),                     # 3
  list(pattern = "mitochond",  label = "Mitochondrion", pick = 3),      # 4
  list(pattern = "plasma mem", label = "Plasma membrane"),              # 5
  list(pattern = "nucleus",    label = "Nucleus"),                      # 6
  list(pattern = "protea",     label = "Proteasome"),                   # 7
  list(pattern = "ribo",       label = "60S Ribosome", pick = 1),       # 9
  list(pattern = "nucleo",     label = "Nucleus - Nucleolus", pick = 1), # 12
  list(pattern = "cytosol",    label = "Cytosol", pick = 1),            # 13
  list(pattern = "cytoplasm",  label = "Cytoplasm", pick = 1),          # 14
  list(pattern = "actin",      label = "Actin cytoskeleton"),           # 16
  list(pattern = "matrix",     label = "Extracellular matrix")          # 17
)
for (cmp in compartments) {
  rows <- go_marker_rows(cmp[["pattern"]], cc, allCl, allN,
                         pick = cmp[["pick"]])
  E14TG2aS1 <- set_go_markers(E14TG2aS1, rows, fcol, cmp[["label"]])
}

## Now look at fcol = "go" in plot2D and see if correlate with LOPIT
## Remove any outliers and check markers fit the structure of the data.
tan2009r1
- Drosophila melanogaster

Again, as above, we pull unique GO CC terms to see if we can boost the number of markers for each sub-cellular compartment in pd.markers
. We then examine these new additions using plot2D
to check they correlate well with the existing data structure reflective of the experiment. We only find 21 new proteins we can use as markers.
data(tan2009r1)

## Keep the original feature names in fData, then index the features by
## accession number.
fData(tan2009r1)$FBgn <- featureNames(tan2009r1)
featureNames(tan2009r1) <- fData(tan2009r1)$AccessionNo

## Get GO.CC information for proteins in tan2009r1 only with EXP evidence
## and keep the features annotated with exactly one term
## (column 1: feature id, column 4: GO term).
tan2009params <- setAnnotationParams()
go.tan <- getGO(tan2009r1, evidence = "EXP", params = tan2009params)
idN <- names(which(table(go.tan[, 1]) == 1))
goLoc <- go.tan[match(idN, go.tan[, 1]), , drop = FALSE]
cc <- goLoc[, 4]
names(cc) <- goLoc[, 1]
allCl <- names(table(cc))
allN <- featureNames(tan2009r1)
## Work on a copy of pd.markers; coerce to character so that class labels
## can be written even if `pd.markers` is stored as a factor.
fData(tan2009r1)$markers.new <- as.character(fData(tan2009r1)$pd.markers)
fcol <- "markers.new"

## Row indices (into `all_names`) of the features whose unique GO CC term
## matches `pattern`.
##   pattern   regular expression searched for in `all_terms`
##   cc        GO term per feature, named by feature id
##   all_terms candidate GO terms
##   all_names feature names of the dataset
##   pick      optional positions of the matched terms to keep (manual
##             curation); positions that do not exist are ignored
## The matched terms are printed for inspection. Returns integer(0) when no
## term or no feature matches, rather than failing.
go_marker_rows <- function(pattern, cc, all_terms, all_names, pick = NULL) {
  terms <- grep(pattern, all_terms, value = TRUE)
  if (!is.null(pick)) {
    terms <- terms[pick]
  }
  terms <- terms[!is.na(terms)]
  print(terms)
  ids <- unlist(lapply(terms, function(term) names(cc)[which(cc == term)]))
  rows <- match(ids, all_names)
  rows[!is.na(rows)]
}

## Write `label` into feature variable `fcol` for `rows` and return the
## updated object; an empty `rows` leaves the object untouched.
set_go_markers <- function(object, rows, fcol, label) {
  if (length(rows) > 0) {
    fData(object)[rows, fcol] <- rep(label, length(rows))
  }
  object
}

## 1. ER
.id <- go_marker_rows("endop", cc, allCl, allN)
tan2009r1 <- set_go_markers(tan2009r1, .id, fcol, "ER")

## 2. Cytoskeleton (inspection only, nothing added)
grep("cytoskel", allCl, value = TRUE)

## 3. Golgi
.id <- go_marker_rows("Golgi", cc, allCl, allN)
tan2009r1 <- set_go_markers(tan2009r1, .id, fcol, "Golgi")

## 4. Lysosome (inspection only, nothing added)
.id <- go_marker_rows("lysos", cc, allCl, allN)
fData(tan2009r1)[.id, fcol]

## 5. mitochondrion: only the second matching GO term is used (manual
## curation; depends on the annotation retrieved)
.id <- go_marker_rows("mitoc", cc, allCl, allN, pick = 2)
tan2009r1 <- set_go_markers(tan2009r1, .id, fcol, "mitochondrion")

## 6. nucleus
.id <- go_marker_rows("nucleus", cc, allCl, allN)
fData(tan2009r1)[.id, fcol]
tan2009r1 <- set_go_markers(tan2009r1, .id, fcol, "Nucleus")

## 7. Peroxisome (inspection only, nothing added)
grep("perox", allCl, value = TRUE)

## 8. PM
.id <- go_marker_rows("plasma mem", cc, allCl, allN)
fData(tan2009r1)[.id, fcol]
tan2009r1 <- set_go_markers(tan2009r1, .id, fcol, "PM")

## 9. Proteasome
.id <- go_marker_rows("proteasome", cc, allCl, allN)
fData(tan2009r1)[.id, fcol]
tan2009r1 <- set_go_markers(tan2009r1, .id, fcol, "Proteasome")

## 10/11. ribosomes
.id <- go_marker_rows("ribo", cc, allCl, allN)
fData(tan2009r1)[.id, fcol]
featureNames(tan2009r1)[.id]
## Manual curation: the 6th hit is labelled 40S, all others 60S.
## NOTE(review): this position depends on the annotation retrieved and
## assumes at least six ribosomal hits - confirm before re-running.
.id.40s <- .id[6]
.id.60s <- .id[-6]
fData(tan2009r1)[.id.40s, fcol] <- rep("Ribosome 40S", length(.id.40s))
fData(tan2009r1)[.id.60s, fcol] <- rep("Ribosome 60S", length(.id.60s))

## Now plot and remove any outliers (interactive: points are picked on the
## plot). The reset previously targeted an undefined object `curatedTan`;
## it must be applied to tan2009r1 itself.
.torm <- plot2D(tan2009r1, fcol = "markers.new", identify = TRUE)
fData(tan2009r1)$markers.new[.torm] <- rep("unknown", length(.torm))

## rename columns in fData
fData(tan2009r1)$markers.orig <- fData(tan2009r1)$markers
fData(tan2009r1)$markers <- fData(tan2009r1)$markers.new
fData(tan2009r1)$markers.new <- NULL
fvarLabels(tan2009r1)
# save(tan2009r1, file = "../sandbox/curatedDatasets/curatedTan2009r1.rda")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.