In BUStools/TENxBUSData: Single cell dataset from 10x in BUS format

Below is how the 100-cell dataset was prepared. First, we process the FASTQ files to generate the gz file that you download. Then we load the data from the gz file. The other datasets were prepared the same way.

Download data

First, download the FASTQ files:

# Archives of raw FASTQ files, one per dataset (hgmm100, hgmm1k, pbmc1k,
# neuron10k).
urls <- c(
  "http://cf.10xgenomics.com/samples/cell-exp/2.1.0/hgmm_100/hgmm_100_fastqs.tar",
  "http://cf.10xgenomics.com/samples/cell-exp/3.0.0/hgmm_1k_v3/hgmm_1k_v3_fastqs.tar",
  "http://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v3/pbmc_1k_v3_fastqs.tar",
  "http://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/3.0.0/neuron_10k_v3/neuron_10k_v3_fastqs.tar"
)

# Local file name = last path component of each URL. Deriving it keeps the two
# vectors from drifting out of sync when a URL is added or changed.
file_names <- basename(urls)

# download.file() gives up after getOption("timeout") seconds (60 by default),
# which is too short for archives of this size. Raise it for the downloads and
# restore the previous value afterwards.
old_opts <- options(timeout = max(3600, getOption("timeout")))

for (i in seq_along(urls)) {
  # Skip archives that are already present.
  if (!file.exists(file_names[i])) {
    tryCatch(
      # mode = "wb": tar archives are binary; the default text mode corrupts
      # them on Windows.
      download.file(urls[i], destfile = file_names[i], mode = "wb"),
      error = function(e) {
        # Remove the partial file so the file.exists() guard above does not
        # mistake it for a complete download on the next run.
        unlink(file_names[i])
        stop(e)
      }
    )
  }
}

options(old_opts)

# Unpack every archive into the working directory.
for (f in file_names) {
  untar(f)
}

Build the `kallisto` index

Here we use kallisto (see https://pachterlab.github.io/kallisto/download for install instructions) to pseudoalign the reads to the transcriptome and to create the BUS file that is later converted to a sparse matrix. The first step is to build an index of the transcriptome. The hgmm datasets contain both human and mouse cells, so we need both the human and the mouse transcriptomes.

# Load BUSpaRse (presumably the package that provides dl_transcriptome()).
library(BUSpaRse)

# Human reference, written to ./ref_hs. The indexing commands further down read
# ref_hs/tx_filtered.fa from this directory.
# NOTE(review): gene_biotype_use = "cellranger" presumably limits the
# transcriptome to the gene biotypes Cell Ranger keeps -- confirm in the
# BUSpaRse documentation.
dl_transcriptome(species = "Homo sapiens", gene_biotype_use = "cellranger", 
                 out_path = "./ref_hs")
# Mouse reference, written to ./ref_mm with the same settings; the indexing
# commands read ref_mm/tx_filtered.fa.
dl_transcriptome(species = "Mus musculus", gene_biotype_use = "cellranger",
                 out_path = "./ref_mm")

Index for human and mouse mixed data
```{bash, cache=TRUE}
kallisto index -i hs_mm_tr_index.idx ref_hs/tx_filtered.fa ref_mm/tx_filtered.fa
```

Index for human
```{bash, cache = TRUE}
kallisto index -i ref_hs/hs_tr_index.idx ref_hs/tx_filtered.fa
```

Index for mouse
```{bash, cache = TRUE}
kallisto index -i ref_mm/mm_tr_index.idx ref_mm/tx_filtered.fa
```

## Run `kallisto bus`

For hgmm100 data
```{bash}
kallisto bus -i ./hs_mm_tr_index.idx -o ./out_hgmm100 -x 10xv2 -t8 \
./fastqs/hgmm_100_S1_L001_R1_001.fastq.gz ./fastqs/hgmm_100_S1_L001_R2_001.fastq.gz \
./fastqs/hgmm_100_S1_L002_R1_001.fastq.gz ./fastqs/hgmm_100_S1_L002_R2_001.fastq.gz \
./fastqs/hgmm_100_S1_L003_R1_001.fastq.gz ./fastqs/hgmm_100_S1_L003_R2_001.fastq.gz \
./fastqs/hgmm_100_S1_L004_R1_001.fastq.gz ./fastqs/hgmm_100_S1_L004_R2_001.fastq.gz \
./fastqs/hgmm_100_S1_L005_R1_001.fastq.gz ./fastqs/hgmm_100_S1_L005_R2_001.fastq.gz \
./fastqs/hgmm_100_S1_L006_R1_001.fastq.gz ./fastqs/hgmm_100_S1_L006_R2_001.fastq.gz \
./fastqs/hgmm_100_S1_L007_R1_001.fastq.gz ./fastqs/hgmm_100_S1_L007_R2_001.fastq.gz \
./fastqs/hgmm_100_S1_L008_R1_001.fastq.gz ./fastqs/hgmm_100_S1_L008_R2_001.fastq.gz
```

For hgmm1k data

# Read files for the hgmm1k sample. The pattern is anchored on the "_R1_" /
# "_R2_" tokens so that only read files are picked up. list.files() returns
# sorted names, so within each lane the R1 file precedes its R2 mate.
fns <- list.files("hgmm_1k_v3_fastqs", pattern = "_R[12]_", full.names = TRUE)
if (length(fns) == 0) {
  stop("No R1/R2 FASTQ files found in hgmm_1k_v3_fastqs")
}

# hgmm_1k_v3 is a 10x v3-chemistry sample, so the technology string must be
# 10xv3 (as for the other v3 samples); 10xv2 would assume the wrong
# barcode/UMI layout.
(command <- paste("kallisto bus -i ./hs_mm_tr_index.idx -o ./out_hgmm1k -x 10xv3 -t8",
                 paste(fns, collapse = " ")))

# system() returns the exit status of the command; stop on failure so later
# steps do not run on missing or incomplete output.
if (system(command) != 0) {
  stop("kallisto bus failed for hgmm1k")
}

For pbmc1k

# Read files for the pbmc1k sample. The pattern is anchored on the "_R1_" /
# "_R2_" tokens so that only read files are picked up. list.files() returns
# sorted names, so within each lane the R1 file precedes its R2 mate.
fns <- list.files("pbmc_1k_v3_fastqs", pattern = "_R[12]_", full.names = TRUE)
if (length(fns) == 0) {
  stop("No R1/R2 FASTQ files found in pbmc_1k_v3_fastqs")
}

# Pseudoalign against the human-only index using the 10x v3 technology string.
(command <- paste("kallisto bus -i ref_hs/hs_tr_index.idx -o ./out_pbmc1k -x 10xv3 -t8",
                 paste(fns, collapse = " ")))

# system() returns the exit status of the command; stop on failure so later
# steps do not run on missing or incomplete output.
if (system(command) != 0) {
  stop("kallisto bus failed for pbmc1k")
}

For neuron10k

# Read files for the neuron10k sample. The pattern is anchored on the "_R1_" /
# "_R2_" tokens so that only read files are picked up. list.files() returns
# sorted names, so within each lane the R1 file precedes its R2 mate.
fns <- list.files("neuron_10k_v3_fastqs", pattern = "_R[12]_", full.names = TRUE)
if (length(fns) == 0) {
  stop("No R1/R2 FASTQ files found in neuron_10k_v3_fastqs")
}

# Pseudoalign against the mouse-only index using the 10x v3 technology string.
(command <- paste("kallisto bus -i ref_mm/mm_tr_index.idx -o ./out_neuron10k -x 10xv3 -t8",
                 paste(fns, collapse = " ")))

# system() returns the exit status of the command; stop on failure so later
# steps do not run on missing or incomplete output.
if (system(command) != 0) {
  stop("kallisto bus failed for neuron10k")
}

Run `BUStools`

The `output.bus` file is binary. For R to parse it, we need to convert it into a sorted text file. The command-line tool `bustools` does this.

For each dataset in turn (hgmm100, hgmm1k, pbmc1k, neuron10k)

# Process the four kallisto output directories in the same order as before.
for sample in hgmm100 hgmm1k pbmc1k neuron10k; do
  out_dir="./out_${sample}"
  # Sort the binary BUS file (8 threads).
  bustools sort -o "${out_dir}/output.sorted" -t8 "${out_dir}/output.bus"
  # Write the sorted BUS file out as text.
  bustools text -o "${out_dir}/output.sorted.txt" "${out_dir}/output.sorted"
done

Compress data for S3 storage

All files necessary to construct a sparse matrix with BUSpaRse are stored in Amazon S3 as a gz file. Here is how that gz file is made:

# Output directories to archive. Anchoring the pattern on the "out_" prefix and
# keeping directories only stops unrelated entries whose names merely contain
# "out" (e.g. a log file ending in ".out") from producing bogus tar commands.
dirs <- list.files(pattern = "^out_")
(dirs <- dirs[dir.exists(dirs)])

# One tar command per directory, named by directory. The archive is called
# "<dataset>.tar.gz" (directory name without its "out_" prefix) and holds the
# four files listed below. vapply() guarantees a character vector even when
# `dirs` is empty.
(commands <- vapply(dirs, function(d) {
  paste("GZIP=-9 tar -cvzf", paste0(sub("^out_", "", d), ".tar.gz"),
        file.path(d, "output.sorted"), file.path(d, "output.sorted.txt"),
        file.path(d, "matrix.ec"), file.path(d, "transcripts.txt"))
}, character(1)))

# Run the commands one by one; system() returns the exit status, so stop as
# soon as one archive could not be created.
for (cmd in commands) {
  if (system(cmd) != 0) {
    stop("Command failed: ", cmd)
  }
}

# Shell command for the hgmm100 dataset alone: pack output.sorted.txt,
# matrix.ec, transcripts.txt and output.sorted from ./out_hgmm100 into
# hgmm100.tar.gz. GZIP=-9 supplies the -9 option to gzip for tar's -z step.
GZIP=-9 tar -cvzf hgmm100.tar.gz ./out_hgmm100/output.sorted.txt ./out_hgmm100/matrix.ec ./out_hgmm100/transcripts.txt ./out_hgmm100/output.sorted

The gz file is what you download. Then it can be loaded with the BUSpaRse package. See the vignette of BUSpaRse for how to load the data downloaded from this package.

BUStools/TENxBUSData documentation built on May 8, 2020, 4:20 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com