#' affinityPropagation
#'
#' @param apdata: a list of input data
#' $ output.dir
#' $ genesetInfo
#' $ genesetIds
#' @param sim method for compute similarity between gene set:
#' default is 'Jaccard'
#' @importFrom apcluster apcluster
#' @importFrom apcluster aggExCluster
#' @importFrom jsonlite unbox toJSON
#' @import proxy
affinityPropagation <- function(ap.data, sim="Jaccard"){
output.dir <- ap.data$output.dir
genesetIds <- ap.data$genesetIds
genesetInfo <- ap.data$genesetInfo
md5val <- ap.data$md5val
# compute the similiarity and input preference vector
ret <- simFunc(genesetInfo, genesetIds, sim)
sim.mat <- ret$sim.mat
ip.vec <- ret$ip.vec
ap.result <- apcluster(sim.mat, p=ip.vec)
# create json file for cytoscape
json.data <- list()
vis.data <- list()
clusters <- ap.result@clusters
exemplars <- ap.result@exemplars
# run affinity propagation among exemplars if there are more
# than 10 clusters
clusters2 <- NULL
exemplars2 <- NULL
ap.result2 <- NULL
sim.mat2 <- NULL
min.num.clusters <- 10
genesetIds2 <- genesetIds[names(exemplars)]
genesetInfo2 <- genesetInfo[names(exemplars)]
if(length(clusters) > min.num.clusters){
ret2 <- simFunc(genesetInfo2, genesetIds2, sim)
sim.mat2 <- ret2$sim.mat
ap.result2 <- apcluster(sim.mat2)
clusters2 <- ap.result2@clusters
exemplars2 <- ap.result2@exemplars
}
cnt <- 1
node.df <- data.frame(name=character(), platform=character(),
size=numeric(), color=numeric())
# get the max, min value of leNum for all nodes
all.leNum <- sapply(genesetInfo, function(x) { return(x$leNum) })
all.NES <- sapply(genesetInfo, function(x) { return(x$NES) })
all.platforms <- sapply(genesetInfo, function(x) { return(x$platform) })
all.platforms <- unique(all.platforms)
maxLeNum <- max(all.leNum)
minLeNum <- min(all.leNum)
maxNES <- max(all.NES)
minNES <- min(all.NES)
# nodes
for(i in seq_along(clusters)){
nodes <- names(clusters[[i]])
cur.cnt <- 1
for(j in seq_along(nodes)){
dat <- list()
dat[["id"]] <- unbox(nodes[j])
if(nodes[j] %in% names(exemplars)){
dat[["is_exemplar"]] <- unbox(1)
} else{
dat[["is_exemplar"]] <- unbox(0)
}
if(!is.null(exemplars2)){
if(nodes[j] %in% names(exemplars2)){
dat[["is_exemplar"]] <- unbox(2)
}
}
dat[["signP"]] <- unbox(genesetInfo[[nodes[j]]][["signP"]])
dat[["NES"]] <- unbox(genesetInfo[[nodes[j]]][["NES"]])
dat[["nSize"]] <- unbox(computeNodeDim(maxLeNum, minLeNum, genesetInfo[[nodes[j]]][["leNum"]]))
dat[["color"]] <- unbox(computeNodeColor(maxNES, minNES, genesetInfo[[nodes[j]]][["NES"]]))
dat[["platform"]] <- unbox(which(all.platforms == genesetInfo[[nodes[j]]][["platform"]]))
vis.data[[cnt]] <- list(data=dat)
cnt <- cnt+1
node.df <- rbind(node.df, list(name=nodes[j], platform=dat[["platform"]],
size=dat[["nSize"]], color=dat[["color"]]), stringsAsFactors=FALSE)
cur.cnt <- cur.cnt+1
}
}
node_file <- file.path(output.dir, paste0("ap_", md5val,"_nodelist.txt"))
write.table(node.df, node_file, col.names = TRUE, sep='\t',quote=FALSE, row.name=FALSE)
# edges
edge.cnt <- 1
edge.df <- data.frame(node1=character(), node2=character())
for(j in seq_along(exemplars)){
cur.exemplar <- names(exemplars[j])
cur.cluster <- names(clusters[[j]])
for(k in seq_along(cur.cluster)){
if(cur.exemplar != cur.cluster[k]){
# edge level:
# level 1: between an exemplar and a node
# level 2: between exemplar of exemplar and original exemplar
# for large networks, we may only show level 2 edges
vis.data[[cnt]] <- list(data=list(id=unbox(paste0("e",edge.cnt)), level=unbox(1),
source=unbox(cur.exemplar), target=unbox(cur.cluster[k])))
edge.df <- rbind(edge.df, list(node1=cur.exemplar, node2=cur.cluster[k]), stringsAsFactors=FALSE)
cnt <- cnt+1
edge.cnt <- edge.cnt+1
}
}
}
if(!is.null(exemplars2)){
for(j in seq_along(exemplars2)){
cur.exemplar <- names(exemplars2[j])
cur.cluster <- names(clusters2[[j]])
for(k in seq_along(cur.cluster)){
if(cur.exemplar != cur.cluster[k]){
# edge level:
# level 1: between an exemplar and a node
# level 2: between exemplar of exemplar and original exemplar
# for large networks, we may only show level 2 edges
vis.data[[cnt]] <- list(data=list(id=unbox(paste0("e",edge.cnt)), level=unbox(2),
source=unbox(cur.exemplar), target=unbox(cur.cluster[k])))
edge.df <- rbind(edge.df, list(node1=cur.exemplar, node2=cur.cluster[k]), stringsAsFactors=FALSE)
cnt <- cnt+1
edge.cnt <- edge.cnt+1
}
}
}
}
json.data[["vis"]] <- vis.data
json_file <- file.path(output.dir, paste0("ap_",md5val,".json"))
write(toJSON(json.data), json_file)
edge_file <- file.path(output.dir, paste0("ap_", md5val,"_edgelist.txt"))
write.table(edge.df, edge_file, col.names = FALSE, sep='\t',quote=FALSE, row.name=FALSE)
# create a zip file for download
zip_file <- file.path(output.dir, paste0("ap_",md5val,".zip"))
zip(zip_file, c(node_file, edge_file, json_file), flags="-j")
return(list(json=basename(json_file), zip=basename(zip_file)))
}
simFunc <- function(genesetInfo, genesetIds, method="Jaccard"){
# first find out the union of sets, sorted
all.genes <- sort(unique(unlist(genesetIds)))
overlap.mat <- sapply(genesetIds, function(x) {as.integer(all.genes %in% x)})
# proxy::simil
# https://cran.r-project.org/web/packages/proxy/vignettes/overview.pdf
sim.mat <- as.matrix(simil(overlap.mat, by_rows=FALSE, method=method))
sim.mat[is.na(sim.mat)] <- 1
# if there is no overlap, set the similarity to -Inf
sim.mat[sim.mat == 0] <- -Inf
# check sim.mat to see if it is identical for each pair
if(max(sim.mat)==min(sim.mat)){
# this will generate error, so randomy add some noise to off diagonal elements
mat.siz <- dim(sim.mat)[1]
rand.m <- matrix(rnorm(mat.siz*mat.siz,0,0.01),mat.siz)
# make it symmetric
rand.m[lower.tri(rand.m)] = t(rand.m)[lower.tri(rand.m)]
sim.mat <- sim.mat + rand.m
# make diagonal all 1
diag(sim.mat) <- 1
}
# set the input preference (IP) for each geneset
# give higher IP to geneset with larger -logP (remove sign)
# IP <- maxScore for geneset with largest -logP value
# IP <- minScore for geneset with smallest -logP value
# other genesets will have linearly interpolated IP value
all.minus.logP <- sapply(genesetInfo, function(x){
return(abs(x$signP))
})
max.minus.logP <- max(all.minus.logP)
min.minus.logP <- min(all.minus.logP)
#minScore <- min(sim.mat)
minScore <- 0
#maxScore <- max(sim.mat)
tmp.sim.mat <- sim.mat
tmp.sim.mat[!is.finite(tmp.sim.mat)] <- NA
# get the median excluding -Inf
maxScore <- median(tmp.sim.mat, na.rm=TRUE)
if(abs(max.minus.logP-min.minus.logP) < .Machine$double.eps^0.5){
ip.vec <- NA
} else{
ip.vec <- sapply(genesetInfo, function(x){
return(minScore+(maxScore-minScore)*(abs(x$signP)-min.minus.logP)/(max.minus.logP-min.minus.logP))
})
}
return(list(sim.mat=sim.mat, ip.vec=ip.vec))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.