library(GEOquery)
library(org.Hs.eg.db)
library(jsonlite)
library(RCurl)
gse <- "GSE72492"
gse <- getGEO(gse)
eset <- gse[[1]]
meta <- pData(eset)
ids <- as.numeric(meta$`npod id:ch1`)
# For nonspecific filtering use genefilterL::nsFilter with bioconductor annotation package or
# manually use the following steps.
# With expression matrix, filter out features lacking entrez IDs
probes <- row.names(eset)
gpl <- getGEO(annotation(eset))
# GPL files can vary somewhat, so check column names:
# colnames(Table(gpl))
gpl <- Table(gpl)[, c("ID", "GENE")]
entrez <- gpl$GENE[match(probes, gpl$ID)]
xm <- exprs(eset)
xm <- xm[!is.na(entrez), ]
entrez <- entrez[!is.na(entrez)]
# Look up symbols with entrez IDs
gene <- select(org.Hs.eg.db, as.character(entrez), c("SYMBOL", "ENTREZID"), "ENTREZID")
# A number of Entrez IDs do not have symbols because they may be discontinued/merged IDs
need_update <- unique(gene[is.na(gene$SYMBOL), "ENTREZID"])
query <- paste0("http://mygene.info/v3/gene/", need_update, "?fields=id,symbol")
results <- lapply(query, function(url) try(fromJSON(getURL(url))))
mappings <- lapply(results, function(x) if(is.null(x$error)) c(id = x[["_id"]], symbol = x[["symbol"]])
else c(id = NA, symbol = NA))
mappings <- as.data.frame(cbind(need_update, do.call(rbind, mappings)), stringsAsFactors = F)
gene[gene$ENTREZID %in% mappings$need_update, ] <- mappings[na.omit(match(gene$ENTREZID, mappings$need_update)), 2:3]
xm <- xm[!is.na(gene$ENTREZID), ]
gene <- gene[!is.na(gene$ENTREZID), ]
# IQR filtering -------------------------------------------------------------------
IQRs <- apply(xm, 1, IQR, na.rm = T)
# Optional -- not done for dataset used in app --------------------------------
# summary(IQRs)
# fIQR <- IQRs > 0.50 # using value for first quartile
# xm <- xm[fIQR, ]
# geneIDs <- geneIDs[fIQR]
# IQRs <- IQRs[fIQR]
#------------------------------------------------------------------------------
# If more than one probe for entrez gene, use highest variance:
table(duplicated(gene$ENTREZID))
whichMaxVar <- function(ids, IQRs) {
dupx <- table(ids)
dupx2 <- names(dupx)[dupx > 1] # ids with more than reporters
dupx2 <- sapply(dupx2,
function(x) {
i <- which(ids %in% x) # indices of all reporters for id
i <- i[which.max(IQRs[i])] # index with max
return(i)
}
)
keep <- vector("integer", length(dupx))
keep[dupx == 1] <- match(names(dupx)[dupx == 1], ids)
keep[dupx > 1] <- dupx2
names(keep) <- names(dupx)
return(keep)
}
keep <- whichMaxVar(gene$ENTREZID, IQRs)
xm <- xm[keep, ]
rownames(xm) <- names(keep) # new rownames will be Entrez ID
colnames(xm) <- as.character(ids)
xm.t <- t(xm)
# xm.t is exported
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.