# Extract scan date from Affymetrix CEL file.
#
# Useful for defining sample batches for \code{ComBat}. No longer
# used by crossmeta, which discovers nuissance variables using \code{sva}.
#
# @param cel_paths Charactor vector specifying full paths to CEL files.
#
# @seealso \code{\link{ComBat}}
# @return Factor vector of CEL scan dates.
cel_dates <- function(cel_paths) {
scan_dates <- c()
for (i in seq_along(cel_paths)) {
datheader <- affxparser::readCelHeader(cel_paths[i])$datheader
scan_date <- gsub(".*([0-9]{2}/[0-9]{2}/[0-9]{2}).*", "\\1", datheader)
scan_dates[i] <- scan_date
}
return (as.factor(scan_dates))
}
# Affymetrix loader for load_plat.
#
# Used by load_plat to load an eset for each GPL platform in a GSE.
#
# @param eset Expression set obtained by load_plat call to getGEO.
# @param gse_dir String specifying path to GSE folder.
#
# @seealso \code{\link{load_plat}}.
# @return Annotated eset with scan_date in pData slot.
load_affy_plat <- function (eset, gse_name, gse_dir, ensql) {
try(Biobase::fData(eset)[Biobase::fData(eset) == ""] <- NA)
try(Biobase::fData(eset)[] <- lapply(Biobase::fData(eset), as.character))
sample_names <- Biobase::sampleNames(eset)
pattern <- paste(sample_names, ".*CEL$", collapse = "|", sep = "")
cel_paths <- tryCatch(
list.files(gse_dir, pattern, full.names = TRUE, ignore.case = TRUE),
# list.files fails if too many files
error = function(c) {
n <- length(sample_names)
p1 <- paste(sample_names[1:(n/2)], ".*CEL$", collapse = "|", sep = "")
p2 <- paste(sample_names[(n/2+1):n], ".*CEL$", collapse = "|", sep = "")
pth1 <- list.files(gse_dir, p1, full.names = TRUE, ignore.case = TRUE)
pth2 <- list.files(gse_dir, p2, full.names = TRUE, ignore.case = TRUE)
return(c(pth1, pth2))
}
)
# if multiple with same GSM, take first
gsm_names <- stringr::str_extract(cel_paths, "GSM[0-9]+")
cel_paths <- cel_paths[!duplicated(gsm_names)]
abatch <- tryCatch(
{
raw_abatch <- affy::ReadAffy(filenames = cel_paths)
return(affy::rma(raw_abatch))
},
warning = function(warn) {
if (grepl("oligo", warn$message)) {
# is the warning to use oligo/xps?
# throw error (handled below by trying oligo)
stop(warn)
} else {
# if not, use affy
raw_abatch <- affy::ReadAffy(filenames = cel_paths)
return(affy::rma(raw_abatch))
}
},
error = function(err) {
message(
'\naffy::ReadAffy failed:\n',
'----------------------'
)
message(err$message)
# is the error a corrupted CEL?
if (grepl('corrupted', err$message)) {
# exclude corrupted and try again
corrupted <- stringr::str_extract(err$message, 'GSM\\d+')
message('Excluding corrupted sample: ', corrupted)
cel_paths <- cel_paths[!grepl(corrupted, cel_paths)]
raw_abatch <- affy::ReadAffy(filenames = cel_paths)
return(affy::rma(raw_abatch))
} else {
res <- use_oligo(cel_paths)
return(res)
}
}
)
# rename samples in abatch
Biobase::sampleNames(abatch) <- stringr::str_extract(Biobase::sampleNames(abatch), "GSM[0-9]+")
# transfer exprs from abatch to eset (maintaining eset sample order)
sample_order <- Biobase::sampleNames(eset)[Biobase::sampleNames(eset) %in% Biobase::sampleNames(abatch)]
eset <- eset[, sample_order]
abatch <- abatch[, sample_order]
Biobase::assayData(eset) <- Biobase::assayData(abatch)
# transfer merged fdata
Biobase::fData(eset) <- merge_fdata(Biobase::fData(eset), Biobase::fData(abatch))
# add SYMBOL annotation
eset <- symbol_annot(eset, gse_name, ensql)
return(eset)
}
use_oligo <- function(cel_paths) {
message(
'\nTrying oligo::read.celfiles instead:\n',
'-----------------------------------'
)
res <- tryCatch(
oligo::read.celfiles(cel_paths),
# handle some incorrect annotation package names
error = function(err) {
if (grepl('pd.huex.1.0.st.v1', err$message))
return(oligo::read.celfiles(cel_paths, pkgname = 'pd.huex.1.0.st.v2'))
if (grepl('pd.hugene.2.0.st.v1', err$message))
return(oligo::read.celfiles(cel_paths, pkgname = 'pd.hugene.2.0.st'))
if (grepl('pd.mogene.2.0.st.v1', err$message))
return(oligo::read.celfiles(cel_paths, pkgname = 'pd.mogene.2.0.st'))
return(err)
})
if (methods::is(res, 'error')) {
message('oligo::read.celfiles failed.\n')
stop(res)
}
return (oligo::rma(res))
}
# Merge feature data from eset and raw data.
#
# Merges feature data from eset GSEMatrix and raw data.
#
# Data.frames are merged on feature names. Result has same row names as raw
# feature data. NAs are added where eset feature data is missing a feature
# in raw data.
#
# @param efdat data.frame with eset feature data (fData).
# @param dfdat data.frame with raw feature data (varies).
#
# @return Data.frame with all columns present in efdat and dfdat. Row names
# are same as dfdat.
merge_fdata <- function(eset_fdata, abatch_fdata) {
# merge feature data from raw data and eset
abatch_fdata$ID <- row.names(abatch_fdata)
abatch_fdata <- merge(abatch_fdata, eset_fdata, by = "ID", all.x = TRUE, sort = FALSE)
row.names(abatch_fdata) <- make.unique(abatch_fdata$ID)
abatch_fdata[] <- lapply(abatch_fdata, as.character)
return(abatch_fdata)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.