PCAN.R
In PCAN: Phenotype Consensus ANalysis (PCAN)

## -----------------------------------------------------------------------------
library(PCAN)
## Some functions can be parallelized.
## They use the bpmapply function from the BiocParallel library.
## Follow instructions provided in the BiocParallel manual to
## configure your parallelization backend.
## options(MulticoreParam=quote(MulticoreParam(workers=4)))

## ---- echo=FALSE--------------------------------------------------------------
data(traitDef, package="PCAN")
disId <- "612285"
disName <- traitDef[
    which(traitDef$id==disId & traitDef$db=="OMIM"),
    "name"
]

## ---- echo=FALSE, results='asis'----------------------------------------------
data(hpByTrait, hpDef, package="PCAN")
hpOfInterest <- hpByTrait[
    which(hpByTrait$id==disId & hpByTrait$db=="OMIM"),
    "hp"
]
knitr::kable(
    hpDef[match(hpOfInterest, hpDef$id),],
    row.names=FALSE,
    col.names=c("HP", "Name")
)

## ---- echo=FALSE--------------------------------------------------------------
data(geneDef, package="PCAN")
genId <- "57545"
genSymb <- geneDef[
    match(genId, geneDef$entrez),
    "symbol"
]

## -----------------------------------------------------------------------------
data(geneByTrait, package="PCAN")
head(geneByTrait, n=3)
dim(geneByTrait)

## -----------------------------------------------------------------------------
data(traitDef, package="PCAN")
head(traitDef, n=3)

## -----------------------------------------------------------------------------
data(geneDef, package="PCAN")
head(geneDef, n=3)

## -----------------------------------------------------------------------------
geneByTrait <- geneByTrait[
    which(geneByTrait$id!=disId | geneByTrait$entrez!=genId),
]
dim(geneByTrait)

## -----------------------------------------------------------------------------
data(hpByTrait, package="PCAN")
head(hpByTrait, n=3)

## -----------------------------------------------------------------------------
data(hpDef, package="PCAN")
head(hpDef, n=3)

## -----------------------------------------------------------------------------
data(hp_descendants, hp_ancestors, package="PCAN")
lapply(head(hp_descendants, n=3), head)
lapply(head(hp_ancestors, n=3), head)

## ---- echo=FALSE--------------------------------------------------------------
rootHpName <- "Phenotypic abnormality"
rootHpId <- hpDef$id[which(hpDef$name==rootHpName)]

## -----------------------------------------------------------------------------
geneByHp <- unique(merge(
    geneByTrait,
    hpByTrait,
    by="id"
)[,c("entrez", "hp")])
head(geneByHp, n=3)

## -----------------------------------------------------------------------------
genDis <- traitDef[
    match(
        geneByTrait[which(geneByTrait$entrez==genId), "id"],
        traitDef$id
    ),
]
genDis

genHpDef <- hpDef[
    match(
        geneByHp[which(geneByHp$entrez==genId), "hp"],
        hpDef$id
    ),
]
genHp <- genHpDef$id
dim(genHpDef)
head(genHpDef)

## -----------------------------------------------------------------------------
genHpDef[which(genHpDef$id %in% hpOfInterest),]

## -----------------------------------------------------------------------------
info <- unstack(geneByHp, entrez~hp)
ic <- computeHpIC(info, hp_descendants)

## ---- echo=FALSE--------------------------------------------------------------
hist(
    ic,
    breaks=100, col="grey",
    main="Distribution of Information Content",
    xlab="IC base on genes associated to HP"
)

## -----------------------------------------------------------------------------
hp1 <- "HP:0000518"
hp2 <- "HP:0030084"
hpDef[which(hpDef$id %in% c(hp1, hp2)), 1:2]
calcHpSim(hp1, hp2, IC=ic, ancestors=hp_ancestors)

## -----------------------------------------------------------------------------
hp1 <- "HP:0002119"
hp2 <- "HP:0001305"
hpDef[which(hpDef$id %in% c(hp1, hp2)), 1:2]
calcHpSim(hp1, hp2, IC=ic, ancestors=hp_ancestors)

## -----------------------------------------------------------------------------
compMat <- compareHPSets(
    hpSet1=genHp, hpSet2=hpOfInterest,
    IC=ic,
    ancestors=hp_ancestors,
    method="Resnik",
    BPPARAM=SerialParam()
)
dim(compMat)
head(compMat)

## -----------------------------------------------------------------------------
hpSetCompSummary(compMat, method="bma", direction="symSim")

## -----------------------------------------------------------------------------
## Compute semantic similarity between HP of interest and all HP terms
## This step is time consumming and can be parallelized.
## Use the BPPARAM parameter to specify your own 
## back-end with appropriate number of workers.
hpGeneResnik <- compareHPSets(
    hpSet1=names(ic), hpSet2=hpOfInterest,
    IC=ic,
    ancestors=hp_ancestors,
    method="Resnik",
    BPPARAM=SerialParam()
)
## Group the results by gene
hpByGene <- unstack(geneByHp, hp~entrez)
hpMatByGene <- lapply(
    hpByGene,
    function(x){
        hpGeneResnik[x, , drop=FALSE]
    }
)
## Compute the corresponding scores
resnSss <- unlist(lapply(
    hpMatByGene,
    hpSetCompSummary,
    method="bma", direction="symSim"
))
## Get the score of the gene candidate
candScore <- resnSss[genId]
candScore

## -----------------------------------------------------------------------------
candRank <- sum(resnSss >= candScore)
candRank
candRelRank <- candRank/length(resnSss)
candRelRank

## ---- echo=FALSE--------------------------------------------------------------
hist(
    resnSss,
    breaks=100, col="grey",
    ylim=c(0,300),
    xlab=expression(Sim[sym]),
    ylab="Number of genes",
    main=paste(
        "Distribution of symmetric semantic similarity scores\nfor all the",
        length(resnSss), "genes"
    )
)
polygon(
    x=c(candScore, 10, 10, candScore),
    y=c(-10, -10, 1000, 1000),
    border="#BE0000",
    col="#BE000080",
    lwd=3
)
text(
    x=candScore, y=mean(par()$usr[3:4]),
    paste0(
        candRank, " genes (",
        signif(candRank*100/length(resnSss), 2), "%)\n",
        "show a symmetric semantic\n",
        "similarity score greater than\n",
        "the gene candidate for\n",
        "for the HPs under focus."
    ),
    pos=4,
    cex=0.6
)

## -----------------------------------------------------------------------------
data(hsEntrezByRPath, rPath, package="PCAN")
head(rPath, n=3)
lapply(head(hsEntrezByRPath, 3), head)

## -----------------------------------------------------------------------------
data(hqStrNw, package="PCAN")
head(hqStrNw, n=3)

## -----------------------------------------------------------------------------
candPath <- names(hsEntrezByRPath)[which(unlist(lapply(
    hsEntrezByRPath,
    function(x) genId %in% x
)))]
rPath[which(rPath$Pathway %in% candPath),]

## -----------------------------------------------------------------------------
rPathRes <- hpGeneListComp(
    geneList=hsEntrezByRPath[[candPath]],
    ssMatByGene = hpMatByGene,
    geneSSScore = resnSss
)

## -----------------------------------------------------------------------------
length(rPathRes$scores)
sum(!is.na(rPathRes$scores))

## ---- echo=FALSE--------------------------------------------------------------
hist(
    resnSss,
    breaks=100, col="grey",
    ylim=c(0,5),
    xlab=expression(Sim[sym]),
    ylab="Density",
    main=paste(
        "Distribution of symmetric semantic similarity scores\nfor all the",
        length(resnSss), "genes"
    ),
    probability=TRUE
)
toAdd <- hist(
    rPathRes$scores,
    breaks=100,
    plot=FALSE
)
for(i in 1:length(toAdd$density)){
    polygon(
        x=toAdd$breaks[c(i, i+1, i+1, i)],
        y=c(0, 0, rep(toAdd$density[i], 2)),
        col="#BE000040",
        border="#800000FF"
    )
}
legend(
    "topright",
    paste0(
        "Genes belonging to the ", candPath," pathway:\n'",
        rPath[which(rPath$Pathway %in% candPath),"Pathway_name"],
        "'\nand with a symmetric semantic similarity score (",
        sum(!is.na(rPathRes$scores)),
        "/",
        length(rPathRes$scores),
        ")\n",
        "p-value: ", signif(rPathRes$p.value, 2)
    ),
    pch=15,
    col="#BE000040",
    bty="n",
    cex=0.6
)

## -----------------------------------------------------------------------------
pathSss <- rPathRes$scores[which(!is.na(rPathRes$scores))]
names(pathSss) <- geneDef[match(names(pathSss), geneDef$entrez), "symbol"]
par(mar=c(7.1, 4.1, 4.1, 2.1))
barplot(
    sort(pathSss),
    las=2,
    ylab=expression(Sim[sym]),
    main=rPath[which(rPath$Pathway %in% candPath),"Pathway_name"]
)
p <- c(0.25, 0.5, 0.75, 0.95)
abline(
    h=quantile(resnSss, probs=p),
    col="#BE0000",
    lty=c(2, 1, 2, 2),
    lwd=c(2, 2, 2, 1)
)
text(
    rep(0,4),
    quantile(resnSss, probs=p),
    p,
    pos=3,
    offset=0,
    col="#BE0000",
    cex=0.6
)
legend(
    "topleft",
    paste0(
        "Quantiles of the distribution of symmetric semantic similarity\n",
        "scores for all the genes."
    ),
    lty=1,
    col="#BE0000",
    cex=0.6
)

## -----------------------------------------------------------------------------
geneLabels <- geneDef$symbol[which(!duplicated(geneDef$entrez))]
names(geneLabels) <- geneDef$entrez[which(!duplicated(geneDef$entrez))]
hpLabels <- hpDef$name
names(hpLabels) <- hpDef$id
hpGeneHeatmap(
    rPathRes,
    genesOfInterest=genId,
    geneLabels=geneLabels,
    hpLabels=hpLabels,
    clustByGene=TRUE,
    clustByHp=TRUE,
    palFun=colorRampPalette(c("white", "red")),
    goiCol="blue",
    main=rPath[which(rPath$Pathway %in% candPath),"Pathway_name"]
)

## -----------------------------------------------------------------------------
neighbors <- unique(c(
    hqStrNw$gene1[which(hqStrNw$gene2==genId)],
    hqStrNw$gene2[which(hqStrNw$gene1==genId)],
    genId
))
neighRes <- hpGeneListComp(
    geneList=neighbors,
    ssMatByGene = hpMatByGene,
    geneSSScore = resnSss
)

## ---- echo=FALSE--------------------------------------------------------------
hist(
    resnSss,
    breaks=100, col="grey",
    ylim=c(0,10),
    xlab=expression(Sim[sym]),
    ylab="Density",
    main=paste(
        "Distribution of symmetric semantic similarity scores\nfor all the",
        length(resnSss), "genes"
    ),
    probability=TRUE
)
toAdd <- hist(
    neighRes$scores,
    breaks=100,
    plot=FALSE
)
for(i in 1:length(toAdd$density)){
    polygon(
        x=toAdd$breaks[c(i, i+1, i+1, i)],
        y=c(0, 0, rep(toAdd$density[i], 2)),
        col="#BE000040",
        border="#800000FF"
    )
}
legend(
    "topright",
    paste0(
        "Genes interacting with ",
        geneDef[which(geneDef$entrez==genId),"symbol"],
        " (", genId, ")",
        "\nand with a symmetric semantic similarity score (",
        sum(!is.na(neighRes$scores)),
        "/",
        length(neighRes$scores),
        ")\n",
        "p-value: ", signif(neighRes$p.value, 2)
    ),
    pch=15,
    col="#BE000040",
    bty="n",
    cex=0.6
)

## ---- echo=FALSE--------------------------------------------------------------
neighSss <- neighRes$scores[which(!is.na(neighRes$scores))]
names(neighSss) <- geneDef[match(names(neighSss), geneDef$entrez), "symbol"]
opar <- par(mar=c(7.1, 4.1, 4.1, 2.1))
barplot(
    sort(neighSss),
    las=2,
    ylab=expression(Sim[sym]),
    main=paste0(
        "Genes interacting with ",
        geneDef[which(geneDef$entrez==genId),"symbol"],
        " (", genId, ")"
    )
)
p <- c(0.25, 0.5, 0.75, 0.95)
abline(
    h=quantile(resnSss, probs=p),
    col="#BE0000",
    lty=c(2, 1, 2, 2),
    lwd=c(2, 2, 2, 1)
)
text(
    rep(0,4),
    quantile(resnSss, probs=p),
    p,
    pos=3,
    offset=0,
    col="#BE0000",
    cex=0.6
)
legend(
    "topleft",
    paste0(
        "Quantiles of the distribution of symmetric semantic similarity\n",
        "scores for all the genes."
    ),
    lty=1,
    col="#BE0000",
    cex=0.6
)

## ---- echo=FALSE--------------------------------------------------------------
hpGeneHeatmap(
    neighRes,
    genesOfInterest=genId,
    geneLabels=geneLabels,
    hpLabels=hpLabels,
    clustByGene=TRUE,
    clustByHp=TRUE,
    palFun=colorRampPalette(c("white", "red")),
    goiCol="blue",
    main=rPath[which(rPath$Pathway %in% candPath),"Pathway_name"]
)

## ---- echo=FALSE--------------------------------------------------------------
sessionInfo()