In montilab/BS831: Boston University - Genomics Data Mining

In this module, we illustrate how to download gene expression datasets from GEO.

To this end, we will download and pre-process the gene expression data from the LOAD (Late Onset Alzheimer Disease) study in [Zhang et al., Cell 2013] . The dataset consists of gene expression data of postmortem brain tissues from three brain regions (PC: prefrontal cortex; VC: visual cortex; and CB: cerebellum) from 129 LOAD patients and 101 healthy controls for a total of 690 profiles.

knitr::opts_chunk$set(message=FALSE, warning=FALSE, eval=TRUE)
devtools::load_all(".")
library(GEOquery)
library(Biobase)
library(dplyr)
overwrite <- FALSE

We start by loading some required packages and functions.

library(BS831)
library(GEOquery)
library(Biobase)
library(dplyr)
overwrite <- FALSE

Download data from GEO

We start by downloading the data from GEO. We will then show two approaches to adding gene annotation to the expression matrix, one based on the use of already available annotation available on GEO, and another one based on "de-novo" annotation based on the biomaRt package.

## Download expression data from GEO and store in temporary directory
tmp_dir <- tempdir()
LOAD <- GEOquery::getGEO(GEO="GSE44772", GSEMatrix=TRUE, destdir=tmp_dir)
LOAD <- LOAD[[1]] # getGEO returns a list, so we extract the first element
print(LOAD)       # display summary info for the object

Use pre-existing Gene Annotation

We first show how to use already available gene annotation information. To this end, we manually downloaded to the data/ subfolder the GPL4372.annot file from geo, then ran the following command.

## read.delim can read compressed files
fdata <- read.delim(
  file=file.path(system.file("extdata", package="BS831"), "GPL4372.annot"),
  skip=27,row.names=1
)
print(colnames(fdata))

There are many more annotation columns than needeed, thus we extract some relevant columns, and since we are at it, we also properly sort the rows, and rename the columns, for easier handling.

## simplifying fdata and matching to expression matrix
fdata <- dplyr::inner_join(
  data.frame(rowname=featureNames(LOAD)),      # matching expression and 
  tibble::rownames_to_column(fdata)) %>%       # ..fdata by rownames
  tibble::column_to_rownames() %>%
  dplyr::rename(gene_symbol="Gene.symbol",     # renaming columns
                gene_title="Gene.title",       # ..
                GeneID="Gene.ID") %>%          # ..
  dplyr::select(gene_symbol,gene_title,GeneID) # selecting columns

## always be super-cautious, double- and triple-check
if ( any(featureNames(LOAD)!=rownames(fdata)) ) stop( "row mismatch" )

## finally, update the gene annotation in the ExpressionSet
LOAD1 <- LOAD
fData(LOAD1) <- fdata

## save data
if (overwrite) {
  saveRDS(LOAD1, file=file.path(OMPATH,"data/LOAD1.RDS"))
}

Gene Annotation based on `biomaRt`

We will use the EntrezGeneID column in the fData to retrieve gene symbols and descriptions with biomaRt.

## check the annotation columns in fData
print(colnames(fData(LOAD)))

## we first restrict to rows w/ non-empty EntrezGeneID annotation
LOAD <- LOAD[!is.na(fData(LOAD)[,"EntrezGeneID"]),]; nrow(LOAD)

## notice that there are replicate entries, but we will deal with them later
print( length(unique(fData(LOAD)[,"EntrezGeneID"])) )

## use biomaRt databased to retrieve the relevant annotation (type
## `?useMart` and `?getBM` for details)
mart <- biomaRt::useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
martMap <- biomaRt::getBM(attributes=c("entrezgene_id","hgnc_symbol","description"),
                          filters="entrezgene_id",
                          values=fData(LOAD)[,"EntrezGeneID"],
                          mart=mart)

## remove entries with empty gene symbol
nrow(martMap) # before
martMap <- dplyr::filter(martMap, hgnc_symbol!="")
nrow(martMap) # after

## notice that there are some duplicated entrez IDs and gene symbols
sum(base::duplicated(martMap[,"entrezgene_id"]))     
sum(base::duplicated(martMap[,"hgnc_symbol"]))

## we adopt the simple (perhaps too simple) approach of taking the
## first occurence of each EntrezID
nrow(martMap)
martMap <- martMap %>%
  dplyr::distinct(entrezgene_id,.keep_all=TRUE)
nrow(martMap)

## now there are no more replicated EntrezID (but there are still replicate gene symbols)
sum(base::duplicated(martMap[,"entrezgene_id"]))     
sum(base::duplicated(martMap[,"hgnc_symbol"]))

## let us now match gene annotation and expression data
matchIdx <- match.nona( martMap[,"entrezgene_id"], fData(LOAD)[,"EntrezGeneID"] )
LOAD2 <- LOAD[matchIdx,]
if ( any(martMap[,"entrezgene_id"]!=fData(LOAD2)[,"EntrezGeneID"]) ) stop( "row mismatch" )
fData(LOAD2) <- martMap

## save data
if (overwrite) {
  saveRDS( LOAD2, file=file.path(OMPATH,"data/LOAD2.RDS") )
}

Merge Replicated Entries

This next step is not necessary, as one might want to keep the multiple chip probes associated with the same gene symbol separate until the end of the analysis. However, if desired, one can merge the rows corresponding to multiple probes.

Here, we define a simple function to collapse multiple rows by either median or mean, using tidyverse functions.

collapseByFun <- function(eset,rowid,method=c("median","mean"))
{
  method <- match.arg(method)

  tbl <- data.frame(key=fData(eset)[,rowid],exprs(eset)) %>%
    dplyr::filter(key!="") %>%
    group_by(key) %>% 
    summarize_all({{method}}) %>%
    tibble::column_to_rownames("key")

  pdata <- pData(eset)[match.nona(colnames(tbl),sampleNames(eset)),]
  fdata <- fData(eset)[match.nona(rownames(tbl),fData(eset)[,rowid]),] %>%
    tibble::remove_rownames() %>%
    tibble::column_to_rownames({{rowid}})

  ExpressionSet(assayData=as.matrix(tbl),
                phenoData=AnnotatedDataFrame(pdata),
                featureData=AnnotatedDataFrame(fdata))

}

Previously, we had defined a similar function based on the R functions melt and dcast defined in the package reshape2. We will leave it to you to decide whether it is more or less intuitive. Efficiency-wise, they are comparable.

library(reshape2)
collapseByMedian <- function(eset, rowid)
{
    ## library(reshape2)

    ## remove unmapped probe sets
    genes <- fData(eset)[, rowid]
    rows.mapped <- !is.na(genes) & genes != ""
    eset <- eset[rows.mapped,]
    genes <- fData(eset)[, rowid]

    ## collapse by median value among duplicate probes
    df <- data.frame(exprs(eset), genes = genes)
    df.melt <- melt(df, id.vars = "genes")
    df.median.collapsed <- dcast(df.melt, genes ~ variable, median)

    ## reassemble collapsed eset
    fdat <- fData(eset)
    if ( any(is.na(roworder <- match(df.median.collapsed[,'genes'],fdat[,rowid]))) )
        stop( "something wrong" )
    fdat.collapsed <- fdat[roworder,]
    eset <- ExpressionSet(assayData=as.matrix(df.median.collapsed[, colnames(eset)]),
                          phenoData=AnnotatedDataFrame(pData(eset)),
                          featureData=AnnotatedDataFrame(fdat.collapsed))
    return(eset)
}

We now apply collapseByFun to both LOAD1 (n=r sum(base::duplicated(fData(LOAD1)[,"gene_symbol"])) duplicates), and LOAD2 (n=r sum(base::duplicated(fData(LOAD2)[,"hgnc_symbol"])) duplicates). Notice that running it on a 600+ sample size is somewhat slow.

nrow(LOAD1)
LOAD1.collapsed <- collapseByFun(LOAD1, rowid="gene_symbol",method="median")
nrow(LOAD1.collapsed)
if (overwrite) {
  saveRDS(LOAD1.collapsed,file=file.path(OMPATH,"data/LOAD1.collapsed.RDS"))
}

nrow(LOAD2)
LOAD2.collapsed <- collapseByFun(LOAD2, rowid="hgnc_symbol")
nrow(LOAD2.collapsed)
if (overwrite) {
  saveRDS(LOAD2.collapsed,file=file.path(OMPATH,"data/LOAD2.collapsed.RDS"))
}