library(affyio)
library(affy)
if (Sys.info()["sysname"] == "Windows") {
suppressWarnings(setInternet2(use = FALSE))
}
# Will load from the local file system a list of esets matching the criteria specified
# in expt.annot.
# If from_raw is requested, but only from_processed exists for that GSE.ID, then
# the from_processed data will be returned (with appropriate warnings). However,
# if from_raw is requested and a from_raw output exists for a different expt.annot
# configuration, then no eset will be returned for that GSE.ID. This way we can
# be specific about the data that we process from raw (since there may be a few
# ways of processing it), but if raw data is not available then we can always get
# back the processed data. If allow.multiple.matches.per.file==F then in cases
# where multiple esets in the same file (ie, same GSE/GPL combination) match
# the given expt.annot then none will be returned and a warning will be generated.
# This will allow us to identify cases where we're essentially writing duplicate
# or ambiguously annotated esets.
LoadEset <- function(GSE.ID, eset.folder = "~/esets",
expt.annot=list(data.source = "from_raw",
brainarray=T),
verbose=T,
allow.multiple.matches.per.file=F) {
# if eset.folder is NA, then return an empty list
if (is.na(eset.folder)) {
return(invisible(list()))
}
RData.files <- list.files(path = eset.folder,
pattern = "\\.RData$")
# for each GSE.ID given, look for a corresponding .RData file. If it exists
# then open it. Any esets in the file that match the expt.annot are added to
# the list. No warning is given for cases where the eset is not available
esets <- list()
for (cur.GSE.ID in GSE.ID) {
matching.files <- grep(cur.GSE.ID, RData.files, value=T)
# loop through all matches, load the file
for (cur.file in matching.files) {
tmp.env <- new.env()
load(file.path(eset.folder, cur.file), envir = tmp.env)
# first see if we're asking for from_raw and only from_processed exists.
# If this is the case, then add the from_processed, throw a warning, and
# move to the next file. If it's not the case, then try to match on all
# relevant expt.annot fields
loaded.data.source.types <- sapply(tmp.env$esets,
function(cur.eset) {
notes(cur.eset)$data.source
})
loaded.only.from_processed <- (length(loaded.data.source.types)==1 &&
all(loaded.data.source.types=="from_processed"))
if (expt.annot$data.source=="from_raw" & loaded.only.from_processed) {
msg <- paste0("Eset processed from raw data is not available for ",
cur.GSE.ID, " in ", cur.file, ". Loading from_processed eset.")
cat(msg, "\n")
warning(msg)
cur.file.expt.annot <- expt.annot
cur.file.expt.annot$data.source <- "from_processed"
} else {
cur.file.expt.annot <- expt.annot
}
cur.file.esets <- list()
# see if any of the esets match the given criteria
for (cur.tmp.eset in tmp.env$esets) {
# for those expt.annot fields that are captured in the current eset (ie
# those that are relevant and were inserted by the data processing function)
# look for a match
tmp.expt.annot <- cur.file.expt.annot[names(cur.file.expt.annot) %in% names(notes(cur.tmp.eset))]
expt.annot.matches <-
identical(notes(cur.tmp.eset)[names(tmp.expt.annot)],
tmp.expt.annot)
if (all(expt.annot.matches)) {
cur.file.esets <- c(cur.file.esets, list(cur.tmp.eset))
}
}
# if allow.multiple.matches.per.file==F and we found more than one match
# throw a warning and don't load them into the eset that is returned.
if (!allow.multiple.matches.per.file & length(cur.file.esets)>1) {
warning(paste0("Multiple matching esets were found in '", cur.file,
"'. None were loaded because allow.multiple.matches.per.file=F"))
} else {
esets <- c(esets, cur.file.esets)
}
rm(tmp.env)
}
}
names(esets) <- lapply(esets, GetEsetName)
return(invisible(esets))
}
SaveEset <- function(cur.eset, eset.folder) {
cur.eset.name <- GetEsetName(cur.eset)
save.file <- file.path(eset.folder, paste0(cur.eset.name, ".RData"))
if (file.exists(save.file)) {
load(save.file)
} else {
esets <- list()
}
# see if any of the loaded esets have the same notes. Ignore original.pData
# since this _may_ be modified under some circumstances (like when samples
# are ommited)
loaded.eset.to.replace <-
sapply(esets,
function(cur.loaded.eset) {
cur.loaded.eset.notes <- notes(cur.loaded.eset)
cur.eset.notes <- notes(cur.eset)
# don't compare original pData or status fields...
cur.loaded.eset.notes$original.pData <-
cur.eset.notes$original.pData <-
NULL
cur.loaded.eset.notes$status <-
cur.eset.notes$status <-
NULL
identical(cur.loaded.eset.notes,
cur.eset.notes)
})
if (any(loaded.eset.to.replace)) {
stopifnot(sum(loaded.eset.to.replace)==1)
esets[loaded.eset.to.replace] <- cur.eset
} else {
esets <- c(esets, cur.eset)
}
save(esets, file=save.file)
return(invisible(NULL))
}
GetEsetName <- function(cur.eset) {
# get names out of the notes(cur.eset)@$name slot
return(notes(cur.eset)$name)
}
# This function will either load the esets corresponding to GSE.ID from local files
# (if they're there already and if overwrite.existing==F), or will go download
# the data from GEO and process it according to expt.annot if desired. The
# annotation files are automatically updated for any data downloaded from GEO,
# but not for any data already in local files.
##what other argument options are available for expt.annot?
GetEset <- function(GSE.ID, eset.folder = "S:/Groups/R and D Group Documents/GEO_data/esets/",
# download.missing=T,
overwrite.existing=F,
cache.folder = normalizePath("~/../Downloads"),
expt.annot=list(data.source = "from_processed"),
annot.csv.folder="S:/Groups/R and D Group Documents/GEO_data/sample_annotations/",
feature.annotation.path="S:/Groups/R and D Group Documents/GEO_data/feature_annotations/",
verbose=T) {
library("GEOquery")
if (!("data.source" %in% names(expt.annot))) {
stop("'expt.annot' must contain an element named 'data.source'.")
}
if (!(expt.annot$data.source %in% c("from_processed", "from_raw"))) {
stop("expt.annot$data.source must be 'from_processed' or 'from_raw'.")
}
# if we're loading processed data, strip away any other annotations because
# they won't apply
if (expt.annot$data.source=="from_processed") {
expt.annot <- expt.annot["data.source"]
}
esets <- list()
for (cur.GSE.ID in GSE.ID) {
if (!overwrite.existing) {
cur.esets <- LoadEset(cur.GSE.ID,
eset.folder = eset.folder,
expt.annot=expt.annot)
# if we managed to load from file we can skip to the next GSE.ID
if (length(cur.esets)>0) {
# we update annotations here so esets loaded from file will still have their
# annotations updated
cur.esets <- lapply(cur.esets,
UpdateAnnotations,
annot.csv.folder=annot.csv.folder)
cur.esets <- lapply(cur.esets,
map.features.to.EGID,
feature.annotation.path=feature.annotation.path,
cache.folder=cache.folder)
esets <- c(esets, cur.esets)
if (verbose) {cat("Loaded ", sub("from_", "", expt.annot$data.source),
" data for ", cur.GSE.ID, " from file.\n", sep="")}
next
}
}
if (expt.annot$data.source=="from_processed") {
if (verbose) {cat("Downloading processed data for ", cur.GSE.ID, ".\n", sep="")}
# call function from geoQuery package to get processed data from GEO
cur.esets <- getGEO_retry(cur.GSE.ID, destdir=cache.folder, GSEMatrix=T, getGPL=F)
# label each as coming from processed data
cur.esets <- lapply(cur.esets,
function(cur.eset) {
notes(cur.eset)$data.source <- "from_processed"
return(cur.eset)
})
cur.esets <-
lapply(cur.esets,
function(cur.eset) {
cur.exprs <- exprs(cur.eset)
if (max(cur.exprs, na.rm = T)>30) {
cat("Assuming expression values are not logged. Logging them now.\n")
if (any(cur.exprs[!is.na(cur.exprs)] <= 0)) {
cat("Intensities less than or equal to zero are replaced with half of the smalles positive intensity.\n")
cur.exprs[!is.na(cur.exprs) & cur.exprs <=0] <- min(cur.exprs[!is.na(cur.exprs) & cur.exprs>0])/2
}
cur.exprs <- log2(cur.exprs)
exprs(cur.eset) <- cur.exprs
}
return(cur.eset)
})
# cur.esets will be a list of eset objects, one for each platform present
# in this geo entry. Rename them based on the platform
platforms <- sapply(cur.esets,
function(x) as.character(annotation(x)))
names(cur.esets) <- paste0(cur.GSE.ID, "_", platforms)
# store the eset name, and other processing information, in the eset notes
for (cur.eset.name in names(cur.esets)) {
cur.eset <- cur.esets[[cur.eset.name]]
notes(cur.eset)$name <- cur.eset.name
notes(cur.eset)$GSE.ID <- cur.GSE.ID
notes(cur.eset)$platform <- as.character(annotation(cur.eset))
notes(cur.eset)$data.source <- "from_processed"
notes(cur.eset)$original.pData <- pData(cur.eset)
cur.esets[[cur.eset.name]] <- cur.eset
}
} else if (expt.annot$data.source=="from_raw") {
# if we want to process the raw data, first get the pre-processed data (this is
# where the annotations will come from)
cur.preprocessed.esets <- GetEset(cur.GSE.ID,
eset.folder = eset.folder,
overwrite.existing=overwrite.existing,
cache.folder = cache.folder,
expt.annot = list(data.source="from_processed"),
annot.csv.folder=annot.csv.folder,
feature.annotation.path=feature.annotation.path,
verbose=verbose)
if (verbose) {cat("Downloading and processing raw data for ", cur.GSE.ID, ".\n", sep="")}
# now get processed data (will use pData from cur.preprocessed.esets)
cur.esets <- lapply(cur.preprocessed.esets,
ProcessRawGEOData,
cache.folder,
expt.annot=expt.annot,
verbose=verbose)
# note that the relevant notes fields for each eset are added by
# ProcessRawGEOData() (for raw data processing).
}
# we update annotations here so annotations get updated before we save to file
cur.esets <- lapply(cur.esets,
UpdateAnnotations,
annot.csv.folder=annot.csv.folder)
# map features to EGID if possible
cur.esets <- lapply(cur.esets,
map.features.to.EGID,
feature.annotation.path=feature.annotation.path,
cache.folder=cache.folder)
# save to file here instead of outside the loop so that when an eset is loaded
# from an .RData file we aren't wasting time writing it back (because next()
# is used to go to the next iteration in the loop)
if (!is.na(eset.folder)) {
lapply(cur.esets,
SaveEset,
eset.folder)
}
esets <- c(esets, cur.esets)
}
return(invisible(esets))
}
getGEO_retry <- function(..., nretry=10, tdelay=3) {
for (cur.try in 1:nretry) {
cur.eset <-
tryCatch({
getGEO(...)
}, error=function(e) {
e
})
if (!("simpleError" %in% class(cur.eset))) {
break
}
Sys.sleep(tdelay)
print("RETRYING")
}
if (("simpleError" %in% class(cur.eset))) {
stop(cur.eset)
}
return(cur.eset)
}
map.features.to.EGID <- function(cur.eset, platform=NULL, feature.annotation.path="~/Datasets/Feature annotation files/", annotation.file=NULL, cache.folder="~/../Downloads") {
# browser()
# if we already have EGID in the feature annotations, then return immediately
if ("EGID" %in% colnames(fData(cur.eset))) {
return(cur.eset)
}
if (is.null(platform)) {
platform <- annotation(cur.eset)
}
# open annotation file from common location
if (is.null(annotation.file)) {
annotation.file <- file.path(feature.annotation.path, paste0(platform, ".csv"))
}
if (!file.exists(annotation.file)) {
annotation.file <- getGPLFile(cur.GPL=platform,
feature.annotation.path=feature.annotation.path,
cache.folder=cache.folder)
if (is.null(annotation.file)) {
warning("Features are not mapped to EGIDs for ",
GetEsetName(cur.eset))
return(cur.eset)
}
}
annot <- read.csv(annotation.file, stringsAsFactors=F)
if (!("EGID" %in% colnames(annot))) {
warning("EGID column does not exist in feature annotation file ",
annotation.file, ". Cannot map features to EGIDs.")
return(cur.eset)
}
if ("PROBE_NAME" %in% colnames(fData(cur.eset))) {
eset.features <- fData(cur.eset)$PROBE_NAME
} else {
eset.features <- rownames(fData(cur.eset))
}
fname.col <- which.max(sapply(annot,
function(cur.col.vals) {
sum(eset.features %in% cur.col.vals)
}))
if (!all(eset.features %in% annot[[fname.col]])) {
num.in.annot <- sum(eset.features %in% annot[[fname.col]])
warning("Only ", num.in.annot, " of ", nrow(cur.eset),
" features are found in the feature annotation file ",
annotation.file,
". ", nrow(cur.eset) - num.in.annot,
" features will be discarded.")
cur.eset <- cur.eset[eset.features %in% annot[[fname.col]], ]
eset.features <- eset.features[eset.features %in% annot[[fname.col]]]
}
fData(cur.eset)$EGID <- annot$EGID[match(eset.features, annot[[fname.col]])]
return(cur.eset)
}
getGPLFile <- function(cur.GPL, feature.annotation.path, cache.folder = normalizePath("~/../Downloads")) {
GPL.obj <- tryCatch({
getGEO_retry(cur.GPL, destdir=cache.folder)
},
error=function(err) {
print(paste("Error: ", err))
cat("A error occurred while downloading '", cur.GPL, "'.\n", sep="")
return(NULL)
})
if (is.null(GPL.obj)) return(NULL)
feat.annot.file <- file.path(feature.annotation.path,
paste0(cur.GPL, ".csv"))
updated.date <- Meta(GPL.obj)$last_update_date
feat.annot <- GEOquery::Table(GPL.obj)
if (nrow(feat.annot)==0) {return(NULL)} # this happens for GPL9052 - not sure why
feat.annot$last_updated_date <- updated.date
EGID.col <- grep("EGID|ENTREZ_GENE_ID|Entrez_Gene_ID", colnames(feat.annot))
if (length(EGID.col)!=1) {
cat("Unable to determine EGID column from ", cur.GPL, ". Please manually modify ",
feat.annot.file, " by labeling the EGID column.\n", sep="")
write.csv(feat.annot,
file=feat.annot.file,
row.names=F)
return(NULL)
}
feat.annot$EGID <- feat.annot[[EGID.col]]
write.csv(feat.annot,
file=feat.annot.file,
row.names=F)
return(feat.annot.file)
}
UpdateAnnotations <- function(cur.eset, annot.csv.folder) {
cur.eset.name <- notes(cur.eset)$name
annot.update.func.name <- paste0("UpdateAnnotations_", cur.eset.name)
annot.update.csv <- file.path(annot.csv.folder,
paste0(cur.eset.name, ".csv"))
# set the tmp.eset to NULL so we can easily tell if we call a function to update
# the annotations
tmp.eset <- NULL
if (exists(annot.update.func.name)) {
tmp.eset <- eval(parse(text=paste0(annot.update.func.name,
"(cur.eset)")))
if (file.exists(annot.update.csv)) {
warning(paste0("Using function ", annot.update.func.name, "() instead of",
" file ", annot.update.csv, " to update annotations."))
}
} else if (file.exists(annot.update.csv)) {
tmp.eset <- UpdateAnnotations_CSV(cur.eset, annot.update.csv)
}
if (!is.null(tmp.eset)) {
# make sure that the number of rows in pData(tmp.eset) and
# notes(tmp.eset)$original.pData are the same as the number of columns/samples
# in tmp.eset
if (nrow(pData(tmp.eset)) != nrow(notes(tmp.eset)$original.pData)) {
warning(paste0(annot.update.func.name, "() did not maintain the same number",
" of rows in pData and original.pData. Annotations were",
" not updated"))
} else if(nrow(pData(tmp.eset)) != ncol(tmp.eset)) {
warning(paste0(annot.update.func.name, "() did not maintain the same number",
" of samples in pData the eset. Annotations were",
" not updated"))
} else {
cur.eset <- tmp.eset
}
# if "supplementary_file" in original.pData but not in pData(), add to pData
# - this is important for processRawGeoData() because it will try to get
# sup files from pData, not original.pData, so that if one of the filenames
# is wrong in original.pData it can be corrected and the correct name will
# be used.
if (("supplementary_file" %in% names(notes(cur.eset)$original.pData)) &
!("supplementary_file" %in% names(pData(cur.eset)))) {
pData(cur.eset)$supplementary_file <-
notes(cur.eset)$original.pData$supplementary_file
}
# if "geo_accession" in original.pData but not in pData(), add to pData
# - this is important for processRawGeoData() because it will try to get
# geo_accession from pData, not original.pData
if (("geo_accession" %in% names(notes(cur.eset)$original.pData)) &
!("geo_accession" %in% names(pData(cur.eset)))) {
pData(cur.eset)$geo_accession <-
notes(cur.eset)$original.pData$geo_accession
}
} else {
cat("\nNo annotation processing file/function exists for ", cur.eset.name, ".",
" Saving phenoData to ", annot.update.csv, "\n",sep="")
cur.eset.name <- GetEsetName(cur.eset)
write.csv(pData(cur.eset),
file=annot.update.csv,
row.names=F)
}
return(cur.eset)
}
# # generic function for downloading data files from the internet
# downloadData <- function(path.to.data, name, folder=NULL, overwrite=FALSE, md5=NULL) {
#
# # TODO
# # put in update functionality - how to determine file size?
# # determine whether data is zipped or tar.gz and unpack it
# # how to associate annotation with this file?
#
# if(!is.null(md5) && file.exists(md5)){ # error without shortcircuit
# md5 <- read.delim2(md5)
# }
#
# # check if file exists - if it does then read it and download data
# if(file.exists(path.to.data)){
# print("Assuming input is a text file of URLs to download")
# # read text file
# urls <- read.delim2(path.to.data)
#
# for(i in 1:length(urls)){
# fname <- unlist(strsplit(url[i], "/", fixed=TRUE))
# fname <- fname[length(fname)]
#
# # if we should not overwrite file, then check to see if it exists in the current location
# if(!overwrite){
# logic <- file.exists(paste0(ifelse(is.null(folder), getwd(), folder), "/", fname))
# if(logic) stop("Stopping because overwrite=FALSE and the file already exists")
# }
# download.file(url[i], destfile=paste0(ifelse(is.null(folder), getwd(), folder), "/", fname))
#
# if(!is.null(md5)){
# stopifnot(md5sum(paste0(ifelse(is.null(folder), getwd(), folder), "/", fname, ))==md5[i])
# }
# }
# } else {
# print("Assuming input is URL to download")
# fname <- unlist(strsplit(path.to.data, "/", fixed=TRUE))
# fname <- fname[length(fname)]
#
# # if we should not overwrite file, then check to see if it exists in the current location
# if(!overwrite){
# logic <- file.exists(paste0(ifelse(is.null(folder), getwd(), folder), "/", fname))
# if(logic) stop("Stopping because overwrite=FALSE and the file already exists")
# }
#
# download.file(path.to.data, destfile=paste0(ifelse(is.null(folder), getwd(), folder), "/", fname))
#
# if(!is.null(md5)){
# stopifnot(md5sum(paste0(ifelse(is.null(folder), getwd(), folder), "/", fname, ))==md5)
# }
# }
# return(NULL)
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.