# Specify a directory where GSE downloaded files are stored
#GEOtemp <- "/home/axel/my_code/nursa/geometric2/required_files/GEOtemp"
# Input file
#GSELIST <- "test2.txt"
# Location/name for output file
#REVIEW1 <- "sample_review.txt"
#' Annotation Step 1
#' @param GEOtemp directory where downloaded GEO files are stored, e.g.: "/home/axel/my_code/nursa/geometric2/data/GEOtemp"
#' @param GSELIST a file with one GSE id per row
#' @param REVIEW1 filename for output
step_1 <- function(GEOtemp="/home/axel/my_code/nursa/geometric2/required_files/GEOtemp", GSELIST= "test2.txt", REVIEW1="sample_review.txt"){
## Download GSEs
#Feed in the ids of GSEs that passed Filter #1 (based on description). The following steps pull the metadata create a standardized annotation table for the samples.
gses <- readLines(GSELIST)
# Download the GSE
getSamples <- function(gse) {
eset <- getGEO(GEO = gse, destdir = GEOtemp)[[1]]
pData(eset)
}
# if processing many, limit messages to avoid cluttering display
results <- suppressMessages(lapply(gses, function(x) try(getSamples(x))))
names(results) <- gses
####
# the next session is temporarily commented out
####
#Sometimes downloading a GEO will fail (maybe 1-5% of GSEs?) because there was a parsing error or some other problem. GEOquery verbose output but not always helpful, so better to check with:
#ok <- allOK(results)
#table(ok)
#cat("GSEs that failed:", names(results)[!ok])
#results <- results[ok] # Go back to failures later, proceed with OK
## Create and populate annotation table
#Infer some annotation information for each GSE using some simple rules.
#This is only the starting point as data is imperfectly extracted; manual annotation follows for the exported table.
# Experiment Type (1-Channel or 2-Channel; more specific assignments, i.e. whether there was dye swap, happens in human review)
inferXPType <- function(DT) {
default <- rep("1C", nrow(DT))
ch2 <- grep("source_name_ch2", names(DT))
if(length(ch2)) return(rep("2C", nrow(DT)))
return(default)
}
ixpType <- sapply(results, inferXPType)
# BSM -- Molecule -- note: perhaps have a list of all known tlr ligands to match against
inferBSM <- function(DT) {
x1 <- rep("", nrow(DT))
bsmcol <- grep("agent|stimulation", names(DT))
if(length(bsmcol) == 1) {
info <- DT[bsmcol]
return(info)
}
return(x1)
}
iBSM <- sapply(results, inferBSM)
# BSMDCD -- Time
inferDCD <- function(DT) {
x1 <- rep("", nrow(DT))
# First find a column that specifically contains treatment info
tcol <- grep("^treatment:ch1|time", names(DT))
if(length(tcol)) {
info <- do.call(paste, DT[tcol])
} else { # Use information in sample title
info <- DT$title
}
m <- regexpr("[0-9]+ ?(d|h|D|H)|(D|d)(ay) ?[0-9]+", info)
x1[m != -1] <- regmatches(info, m)
# Sometimes time is stated as "overnight", which usually means "12 h"
x1[grepl("overnight", info)] <- "12h"
x1 <- tolower(gsub(" ", "", x1))
return(x1)
}
iDCD <- sapply(results, inferDCD)
# BSMDCD2 -- Dose/concentration
inferDCD2 <- function(DT) {
x1 <- rep("", nrow(DT))
dosecol <- grep("dose|concentration|amount|treatment", names(DT))
dosecol <- c(dosecol, which(sapply(DT, function(x) grepl("treatment_protocol", x[1])) == T))
if(length(dosecol)) {
info <- do.call(paste, DT[dosecol])
m <- gregexpr("[0-9.]+ ?(u|n|m|[?])(g|M)?(\\/[A-Za-z]{2})?", info)
dose <- sapply(regmatches(info, m), paste, collapse = ";")
dose <- gsub("?", "u", dose, fixed = T)
x1 <- dose
}
return(x1)
}
iDCD2 <- sapply(results, inferDCD2)
# BiosampName -- to do: a more sophisticated implementation could also match to biosample ID
#biosamps <- readLines("biosamps.txt") ## This is a list derived from "allbiosamples"
inferBiosamp <- function(DT) {
# First find the obvious columns containing cell line/type
cellcol <- grep("^cell.*ch1", colnames(DT))
if(length(cellcol)) {
cell <- do.call(paste, DT[cellcol])
return(cell)
} else {
tissuecol <- grep("^tissue.*ch1", colnames(DT))
if(length(tissuecol)) {
tissue <- do.call(paste, DT[tissuecol])
return(tissue)
} else { # Use the source_name_ch1,treatment_protocol_ch1 columns otherwise
bs <- lapply(biosamps, function(b) grepl(b, paste(DT$source_name_ch1, DT$treatment_protocol_ch1)))
names(bs) <- biosamps
bs <- bs[sapply(bs, sum) >= 1 ]
if(!length(bs)) return(rep("?", nrow(DT)))
bs <- bs[order(sapply(bs, sum), decreasing = T)]
nterm <- ifelse(length(bs) > 1, 2, 1)
x1 <- lapply(1:nterm, function(i) ifelse(bs[[i]], names(bs)[[i]], ""))
x1 <- do.call("paste", x1)
x1 <- trimws(x1)
return(x1)
}
}
}
iBiosamp <- sapply(results, inferBiosamp)
# Control?
inferCTRL <- function(DT) {
s <- DT$title
ctrl <- as.numeric(grepl("wt|wildtype|control|ctrl|healthy|media|medium|veh|unstim|untreated|mock", s, ignore.case = T))
}
iCTRL <- sapply(results, inferCTRL)
# Subset table for export with selected columns. Join data in one table and fill in additional columns.
export <- rbindlist(results, use.names = T, fill = T)
keepcols <- grep("title|geo_accession|source_name|description|platform|characteristics|treatment", names(export))
export <- export[, keepcols, with = F]
export[, GSE := unlist(mapply(rep, names(results), sapply(results, nrow)))]
newcols1 <- c("RefDye", "Comment", "Group", "Batch", "Node", "NodeFunction", "BSM")
export[, (newcols1) := ""]
export[, c("Ignore") := 0]
export[, c("XP") := 1]
# Inferred variables
export[, xpType := unlist(ixpType)]
export[, isCTRL := unlist(iCTRL)]
export[, BioSampName := unlist(iBiosamp)]
export[, BSMDCD := unlist(iDCD)]
export[, BSMDCD2 := unlist(iDCD2)]
myorder <- c("GSE", "xpType", "RefDye", "Comment", "Group", "Ignore", "XP", "isCTRL", "Batch", "Node", "NodeFunction", "BSM", "BSMDCD", "BSMDCD2", "BioSampName",
"title", "geo_accession")
neworder <- c(myorder, names(export)[!names(export) %in% myorder])
setcolorder(export, neworder)
head(export)
write.table(export, REVIEW1, sep = "\t", row.names = F, quote = F)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.