inst/doc/v05-pRoloc-transfer-learning.R

## ----style, echo = FALSE, results = 'asis'------------------------------------
BiocStyle::markdown()

## ----env, include=FALSE, echo=FALSE, cache=FALSE------------------------------
library("knitr")
opts_chunk$set(stop_on_error = 1L)
suppressPackageStartupMessages(library("MSnbase"))
suppressWarnings(suppressPackageStartupMessages(library("pRoloc")))
suppressPackageStartupMessages(library("pRolocdata"))
suppressPackageStartupMessages(library("class"))
set.seed(1)
setStockcol(NULL)

## ----loadpkg------------------------------------------------------------------
library("pRoloc")

## ----loaddata-----------------------------------------------------------------
library("pRolocdata")
data(andy2011)

## ----ap-----------------------------------------------------------------------
ap <- setAnnotationParams(inputs =
                              c("Human genes",
                                "UniProtKB/Swiss-Prot ID"))
ap

## ----pdata--------------------------------------------------------------------
data(andy2011)
head(featureNames(andy2011))

## ----andgoset-----------------------------------------------------------------
andygoset <- makeGoSet(andy2011)
andygoset
exprs(andygoset)[1:7, 1:4]

## ----testandsamefeats, echo=FALSE---------------------------------------------
stopifnot(all.equal(featureNames(andy2011), featureNames(andygoset)))

## ----hparprep, eval=TRUE------------------------------------------------------
andyhpa <- andy2011
fvarLabels(andyhpa)[1] <- "accession" ## for left_join matching
## convert protein accession numbers to ensembl gene identifiers

library("biomaRt")
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

filter <- "uniprotswissprot"
attrib <- c("uniprot_gn_symbol", "uniprotswissprot", "ensembl_gene_id")
bm <- getBM(attributes = attrib,
            filters = filter,
            values = fData(andyhpa)[, "accession"],
            mart = mart)
head(bm)

## HPA data
library("hpar")

## using old version for traceability
setHparOptions(hpadata = "hpaSubcellularLoc14")
hpa <- getHpa(bm$ensembl_gene_id)
hpa$Reliability <- droplevels(hpa$Reliability)
colnames(hpa)[1] <- "ensembl_gene_id"

hpa <- dplyr::left_join(hpa, bm)
hpa <- hpa[!duplicated(hpa$uniprotswissprot), ]

## match HPA/LOPIT
colnames(hpa)[7] <- "accession"
fd <- dplyr::left_join(fData(andyhpa), hpa)
rownames(fd) <- featureNames(andyhpa)
fData(andyhpa) <- fd
stopifnot(validObject(andyhpa))

## Let's get rid of features without any hpa data
lopit <- andyhpa[!is.na(fData(andyhpa)$Main.location), ]

## ----hpadata, eval=TRUE-------------------------------------------------------
## HPA localisation
hpalocs <- c(as.character(fData(lopit)$Main.location),
             as.character(fData(lopit)$Other.location))
hpalocs <- hpalocs[!is.na(hpalocs)]
hpalocs <- unique(unlist(strsplit(hpalocs, ";")))

makeHpaSet <- function(x, score2, locs = hpalocs) {
    hpamat <- matrix(0, ncol = length(locs), nrow = nrow(x))
    colnames(hpamat) <- locs
    rownames(hpamat) <- featureNames(x)
    for  (i in 1:nrow(hpamat)) {
        loc <- unlist(strsplit(as.character(fData(x)[i, "Main.location"]), ";"))
        loc2 <- unlist(strsplit(as.character(fData(x)[i, "Other.location"]), ";"))
        score <- score2[fData(x)[i, "Reliability"]]
        hpamat[i, loc] <- score
        hpamat[i, loc2] <- score
    }
    new("MSnSet", exprs = hpamat,
        featureData = featureData(x))
}

hpaset <- makeHpaSet(lopit,
                     score2 = c(Supportive = 1, Uncertain = 0))
hpaset <- filterZeroRows(hpaset)
dim(hpaset)
exprs(hpaset)[c(1, 6, 200), 1:3]

## ----tabdelim-----------------------------------------------------------------
ppif <- system.file("extdata/tabdelimited._gHentss2F9k.txt.gz", package = "pRolocdata")
ppidf <- read.delim(ppif, header = TRUE, stringsAsFactors = FALSE)
head(ppidf)

## ----ppiset-------------------------------------------------------------------
uid <- unique(c(ppidf$X.node1, ppidf$node2))
ppim <- diag(length(uid))
colnames(ppim) <- rownames(ppim) <- uid

for (k in 1:nrow(ppidf)) {
    i <- ppidf[[k, "X.node1"]]
    j <- ppidf[[k, "node2"]]
    ppim[i, j] <- ppidf[[k, "combined_score"]]
}

ppim[1:5, 1:8]

## ----ppiset2------------------------------------------------------------------
andyppi <- andy2011
featureNames(andyppi) <- sub("_HUMAN", "", fData(andyppi)$UniProtKB.entry.name)
cmn <- intersect(featureNames(andyppi), rownames(ppim))
ppim <- ppim[cmn, ]
andyppi <- andyppi[cmn, ]

ppi <- MSnSet(ppim, fData = fData(andyppi),
              pData = data.frame(row.names = colnames(ppim)))
ppi <- filterZeroCols(ppi)

## ----mclasses, echo=FALSE-----------------------------------------------------
data(andy2011) ## load clean LOPIT data
## marker classes for andy2011
m <- unique(fData(andy2011)$markers.tl)
m <- m[m != "unknown"]

## ----andypca, echo=FALSE, fig.cap = "PCA plot of `andy2011`. The multivariate protein profiles are summarised along the two first principal components. Proteins of unknown localisation are represented by empty grey points. Protein markers, which are well-known residents of specific sub-cellular niches are colour-coded and form clusters on the figure."----
setStockcol(paste0(getStockcol(), "80"))
plot2D(andy2011, fcol = "markers.tl")
setStockcol(NULL)
addLegend(andy2011, fcol = "markers.tl",
          where = "topright", bty = "n", cex = .7)

## ----thetas0, echo=TRUE-------------------------------------------------------
head(thetas(3, by = 0.5))
dim(thetas(3, by = 0.5))

## ----thetas1, echo=TRUE-------------------------------------------------------
dim(thetas(5, length.out = 4))

## ----thetaandy----------------------------------------------------------------
## marker classes for andy2011
m <- unique(fData(andy2011)$markers.tl)
m <- m[m != "unknown"]
th <- thetas(length(m), length.out=4)
dim(th)

## ----thetaopt0, eval=FALSE----------------------------------------------------
#  topt <- knntlOptimisation(andy2011, andygoset,
#                            th = th,
#                            k = c(3, 3),
#                            fcol = "markers.tl",
#                            times = 50)

## ----thetaopt, eval=TRUE------------------------------------------------------
set.seed(1)
i <- sample(nrow(th), 12)
topt <- knntlOptimisation(andy2011, andygoset,
                          th = th[i, ],
                          k = c(3, 3),
                          fcol = "markers.tl",
                          times = 5)
topt

## ----getParam-----------------------------------------------------------------
getParams(topt)

## ----besttheta----------------------------------------------------------------
(bw <- experimentData(andy2011)@other$knntl$thetas)

## ----tlclass------------------------------------------------------------------
andy2011 <- knntlClassification(andy2011, andygoset,
                                bestTheta = bw,
                                k = c(3, 3),
                                fcol = "markers.tl")

## ----tlpreds------------------------------------------------------------------
andy2011 <- getPredictions(andy2011, fcol = "knntl")

## ----andypca2, fig.width=6, fig.height=6, fig.cap = "PCA plot of `andy2011` after transfer learning classification. The size of the points is proportional to the classification scores."----
setStockcol(paste0(getStockcol(), "80"))
ptsze <- exp(fData(andy2011)$knntl.scores) - 1
plot2D(andy2011, fcol = "knntl", cex = ptsze)
setStockcol(NULL)
addLegend(andy2011, where = "topright",
          fcol = "markers.tl",
          bty = "n", cex = .7)

## ----sessioninfo, echo=FALSE--------------------------------------------------
sessionInfo()

Try the pRoloc package in your browser

Any scripts or data that you put into this service are public.

pRoloc documentation built on Nov. 8, 2020, 6:26 p.m.