#' curateExpdata
#'
#' This function will take the name lists generated by nameChange and apply them
#' to an expression data set. The function will process locally downloaded GEO
#' expression files, or even just accept a GEO accession number and do the work
#' for you.
#'
#' @param expname This is the name of your data set; if local is T, it is a GEO
#' file (any type containing exp data). If local is F, this can be a GEO
#' accession number.
#' @param namelist This is a name list file as generated by nameChange.
#' @param local This is a logical variable; if T, the function looks for a local
#' file. If F, the function uses the expname as a GEO accession number and
#' attempts to download the data.
#' @param pct This is the cutoff for gene data; if a gene has expression values
#' for less than this percentage of samples, it is excluded.
#' @param pullmeta This allows you to get the metadata for the samples from the
#' GSE file; default is false.
#'
#' @return Returns an expression frame containing genes as rows and samples as
#' columns. If pullmeta = T, will be a list with the first entry being the
#' expression frame and the second being the metadata from a GSE file.
#' @export
#' @importFrom magrittr %>%
#' @examples
#'
curateExpdata <- function(expname, namelist, local = F, pct = 0.8, pullmeta = F){
#read and curate platform
if(local == T){alldata <- GEOquery::getGEO(file = expname)
}else{alldata <- GEOquery::getGEO(GEO = expname, destdir = getwd())}
#pull expression matrix
expdata <- Biobase::exprs(alldata[[1]])
#make name matrix
names <- as.data.frame(rownames(expdata), stringsAsFactors = F)
#make lookup table for namelist
namedict <- as.vector(namelist[,2])
names(namedict) <- namelist[,1]
#find new GeneIDs for probes
names[,2] <- sapply(names[,1], function(x) {unname(namedict[x])})
#remove non-mapping probes
names <- names[which(!is.na(names[,2])),]
rownames(names) <- names[,1]
names[,1] <- NULL
#merge names and data
expdata <- merge(names, expdata, by = "row.names")
#make everything numeric
expdata[,3:ncol(expdata)] <- sapply(expdata[,3:ncol(expdata)], function(x) as.numeric(x))
#average value for all probes per gene
suppressWarnings(output <- expdata %>% dplyr::group_by(expdata$V2) %>%
dplyr::summarize_all(mean, na.rm=TRUE))
output <- as.data.frame(output)
#set row names, remove blank rows, export
rownames(output) <- output[,1]
output <- output[,-(1:3)]
#remove rows with insufficient gene coverage
output$coverage <- apply(output, 1, function(x) {sum(!is.na(x))/ncol(output)})
output <- output[which(output$coverage >= pct),]
output$coverage <- NULL
#If metadata is wanted, pull metadata and make list
if(pullmeta == T){
metadata <- Biobase::pData(Biobase::phenoData(alldata[[1]]))
newout <- list()
newout[[1]] <- output
newout[[2]] <- metadata
output <- newout
}
output
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.