inst/data-raw/process/PMID26983959_Kodama-2016/process.R

library(GEOquery)
library(org.Hs.eg.db)
library(jsonlite)
library(RCurl)

gse <- "GSE72492"
gse <- getGEO(gse) 
eset <- gse[[1]]
meta <- pData(eset)
ids <- as.numeric(meta$`npod id:ch1`)

# For nonspecific filtering use genefilterL::nsFilter with bioconductor annotation package or
# manually use the following steps.

# With expression matrix, filter out features lacking entrez IDs
probes <- row.names(eset)
gpl <- getGEO(annotation(eset))
# GPL files can vary somewhat, so check column names:
# colnames(Table(gpl))
gpl <- Table(gpl)[, c("ID", "GENE")]
entrez <- gpl$GENE[match(probes, gpl$ID)]

xm <- exprs(eset)
xm <- xm[!is.na(entrez), ]
entrez <- entrez[!is.na(entrez)]

# Look up symbols with entrez IDs
gene <- select(org.Hs.eg.db, as.character(entrez), c("SYMBOL", "ENTREZID"), "ENTREZID")

# A number of Entrez IDs do not have symbols because they may be discontinued/merged IDs
need_update <- unique(gene[is.na(gene$SYMBOL), "ENTREZID"])

query <- paste0("http://mygene.info/v3/gene/", need_update, "?fields=id,symbol")

results <- lapply(query, function(url) try(fromJSON(getURL(url))))
mappings <- lapply(results, function(x) if(is.null(x$error)) c(id = x[["_id"]], symbol =  x[["symbol"]]) 
                   else c(id = NA, symbol = NA))
mappings <- as.data.frame(cbind(need_update, do.call(rbind, mappings)), stringsAsFactors = F)
gene[gene$ENTREZID %in% mappings$need_update, ] <- mappings[na.omit(match(gene$ENTREZID, mappings$need_update)), 2:3]

xm <- xm[!is.na(gene$ENTREZID), ]
gene <- gene[!is.na(gene$ENTREZID), ]

# IQR filtering -------------------------------------------------------------------
IQRs <- apply(xm, 1, IQR, na.rm = T)

# Optional -- not done for dataset used in app --------------------------------
# summary(IQRs)
# fIQR <- IQRs > 0.50 # using value for first quartile
# xm <- xm[fIQR, ]
# geneIDs <- geneIDs[fIQR]
# IQRs <- IQRs[fIQR]
#------------------------------------------------------------------------------

# If more than one probe for entrez gene, use highest variance:
table(duplicated(gene$ENTREZID))
whichMaxVar <- function(ids, IQRs) {
  dupx <- table(ids)
  dupx2 <- names(dupx)[dupx > 1] # ids with more than reporters
  dupx2 <- sapply(dupx2, 
                  function(x) { 
                    i <- which(ids %in% x) # indices of all reporters for id
                    i <- i[which.max(IQRs[i])] # index with max
                    return(i)
                  }
  )
  keep <- vector("integer", length(dupx))
  keep[dupx == 1] <- match(names(dupx)[dupx == 1], ids)
  keep[dupx > 1] <- dupx2
  names(keep) <- names(dupx)
  return(keep) 
}

keep <- whichMaxVar(gene$ENTREZID, IQRs)

xm <- xm[keep, ]
rownames(xm) <- names(keep) # new rownames will be Entrez ID
colnames(xm) <- as.character(ids)
xm.t <- t(xm)

# xm.t is exported
avucoh/nPOD documentation built on April 1, 2020, 5:24 p.m.