knitr::opts_chunk$set(echo = TRUE)
options(width = 100)

Helper functions

Following the definitions provided here...

library(HDF5Array)
library(SingleCellExperiment)

getGenomes <- function(path) {
  genomes <- attributes(h5dump(path, load = FALSE))$names
  genomes
}

getRowData <- function(path, genome = NULL) {
  if (is.null(genome)) 
    genome <- getGenomes(path)[1]
  data.frame(
    ID = as.character(h5read(path, paste0(genome,"/genes"))),
    Symbol=as.character(h5read(path, paste0(genome,"/gene_names"))),
    stringsAsFactors=FALSE
  )
}

getColData <- function(path, genome = NULL) {
  if (is.null(genome)) 
    genome <- getGenomes(path)[1]

  barcode <- as.character(h5read(path, paste0(genome,"/barcodes")))

  data.frame(
    Barcode=barcode,
    stringsAsFactors=FALSE
  )
}

Downloading the h5 file

(accessed November 6, 2018)

url <- "https://s3.amazonaws.com/preview-ica-expression-data/ica_cord_blood_h5.h5"
path <- basename(url)
if(!file.exists(path))
  download.file(url, path)

Loading the data and preprocessing

Saving rowData and colData, used when constructing the SingleCellExperiment object

saveRDS(getRowData(path), "ica_cord_blood_rowData.rds")
saveRDS(getColData(path), "ica_cord_blood_colData.rds")

Converting into a HDF5Matrix object

mygenome <- getGenomes("ica_cord_blood_h5.h5")
tenxmat <- TENxMatrix(path, group = mygenome)
dim(tenxmat)
pryr::object_size(tenxmat)

Writing out the object as a rectangular h5 file

options(DelayedArray.block.size=1e9) # 1GB block size.
mat.out <- writeHDF5Array(
  tenxmat,
  file="ica_cord_blood_rectangular.h5",
  name="counts",
  chunkdim=beachmat::getBestChunkDims(dim(tenxmat))
)

Generating and saving the SingleCellExperiment object

sce_cordblood <- 
  SingleCellExperiment(
    list(counts = HDF5Array("ica_cord_blood_rectangular.h5", "counts")), 
    rowData = getRowData(path), 
    colData = getColData(path)
  )

rownames(sce_cordblood) <- rowData(sce_cordblood)$ID
colnames(sce_cordblood) <- colData(sce_cordblood)$Barcode
sce_cordblood

saveRDS(sce_cordblood, file = "sce_HCA_cordblood.rds")

Session info

sessionInfo()


federicomarini/HCAData documentation built on May 5, 2024, 1:24 p.m.