knitr::opts_chunk$set(echo = TRUE)

DF_cell_types.txt file

DF_cell_types.txt contains a \code{data.frame} with the matching between cell id (1st column) and cell-types (2nd column) from the data stored in the alevin-fry folder. The cell-types were obtained from the original study of Velasco et al. (2019), and are publicly available via meta_combined.txt file.

Below is the script used to create the DF_cell_types.txt object.

# load internal data to the package:
data_dir = system.file("extdata", package = "DifferentialRegulation")
# specify samples ids:
sample_ids = paste0("organoid", c(1:3, 16:18))
# set directories of each sample input data (obtained via alevin-fry):
base_dir = file.path(data_dir, "alevin-fry", sample_ids)
file.exists(base_dir)
# load USA sce:
path_to_counts = file.path(base_dir,"/alevin/quants_mat.mtx")
path_to_cell_id = file.path(base_dir,"/alevin/quants_mat_rows.txt")
path_to_gene_id = file.path(base_dir,"/alevin/quants_mat_cols.txt")

file.exists(path_to_counts)
file.exists(path_to_cell_id)
file.exists(path_to_gene_id)

library(DifferentialRegulation)
sce = load_USA(path_to_counts,
               path_to_cell_id,
               path_to_gene_id,
               sample_ids)

# load pre-computed cell-types, from the original study
md <- read.csv("../REAL data/discovery - brain human organoids/data/meta_combined.txt", sep = "\t")
md <- md[grepl("PGP1", md$Batch), ]
md$SEQ <- sapply(md$NAME, FUN = function (x) strsplit(x, split = "_")[[1]][3])

# keep organoids 1:3 and 16:18 only
md = md[ md$Organoid %in% c("1", "2", "3", "16", "17", "18"), ]
table(md$Organoid)
md$Organoid = as.numeric(md$Organoid)

md$cell_id = paste0("organoid", md$Organoid, ".", md$SEQ)
matches = match(colnames(sce), md$cell_id)

DF_cell_types = data.frame(cell_id  = md$cell_id[matches],
                           cell_type = md$CellType[matches],
                           row.names = NULL)
table(DF_cell_types$cell_type)

write.csv(DF_cell_types, file = "DF_cell_types.csv")

write.table(DF_cell_types, file = "DF_cell_types.txt", sep = "\t",
            row.names = FALSE, col.names = TRUE)

# check loaded object is identical:
DF_cell_types_reload = read.csv("DF_cell_types.txt", sep = "\t", header = TRUE)

alevin-fry folder

Data

We use a real droplet scRNA-seq dataset from Velasco et al. (2019), Nature 570 (7762): 523–27. In particular, we compare two groups of three samples, consisting of human brain organoids, cultured for 3 and 6 months. For computational reasons, we stored a subset of this dataset, in our package, consisting of 100 genes and 3,493 cells, belonging to two cell-types. Cell-type assignment was done in the original styudy (Velasco et al., 2019), and is publicly available in the meta_combined.txt file here.

Raw data can also be downloaded here.

Download data

Download the raw data.

wget https://sra-pub-src-1.s3.amazonaws.com/SRR8869247/Org1_possorted_genome_bam.bam.1 -O SRR8869247.bam && \ 
bamtofastq SRR8869247.bam SRR8869247 --nthreads=32 && rm -rf SRR8869247.bam
wget https://sra-pub-src-1.s3.amazonaws.com/SRR8869248/Org2_possorted_genome_bam.bam.1 -O SRR8869248.bam && \
bamtofastq SRR8869248.bam SRR8869248 --nthreads=32 && rm -rf SRR8869248.bam
wget https://sra-pub-src-1.s3.amazonaws.com/SRR8869249/Org3_possorted_genome_bam.bam.1 -O SRR8869249.bam && \
bamtofastq SRR8869249.bam SRR8869249 --nthreads=32 && rm -rf SRR8869249.bam
wget https://sra-pub-src-1.s3.amazonaws.com/SRR8869262/Org16_possorted_genome_bam.bam.1 -O SRR8869262.bam && \
bamtofastq SRR8869262.bam SRR8869262 --nthreads=32 && rm -rf SRR8869262.bam
wget https://sra-pub-src-1.s3.amazonaws.com/SRR8869263/Org17_possorted_genome_bam.bam.1 -O SRR8869263.bam && \
bamtofastq SRR8869263.bam SRR8869263 --nthreads=32 && rm -rf SRR8869263.bam
wget https://sra-pub-src-1.s3.amazonaws.com/SRR8869264/Org18_possorted_genome_bam.bam.1 -O SRR8869264.bam && \
bamtofastq SRR8869264.bam SRR8869264 --nthreads=32 && rm -rf SRR8869264.bam

Genome

Download the reference genome.

path <- "brain_human/01_annotation/"
fa_file <- "GRCh38.primary_assembly.genome.fa"
gtf_file <- "gencode.v37.primary_assembly.annotation.gtf.gz"

# download data
if (!file.exists(paste0(path, fa_file))) {
download.file(url = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/GRCh38.primary_assembly.genome.fa.gz",
destfile = paste0(path, fa_file, ".gz"))
gunzip(paste0(path, fa_file, ".gz"))
}

if (!file.exists(paste0(path, gtf_file))) {
download.file(url = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.primary_assembly.annotation.gtf.gz",
destfile = paste0(path, gtf_file))
}

Alignment via alevin-fry

Run make_splici_txome function, from roe R package.

# splici txome for human data
gtf_path <- file.path("brain_human/01_annotation/gencode.v37.annotation.gtf.gz")
genome_path = file.path("brain_human/01_annotation/GRCh38.primary_assembly.genome.fa")
read_length = 91
flank_trim_length = 5
output_dir = file.path("brain_human/01_annotation/splici/")

library(roe)

make_splici_txome(gtf_path = gtf_path, 
genome_path = genome_path, 
read_length = read_length, 
flank_trim_length = flank_trim_length, 
output_dir = output_dir)

Run alevin.

salmon index \
-t brain_human/01_annotation/splici/transcriptome_splici.fa \
-i brain_human/01_annotation/splici/index_folder \
--gencode -p 32

## ORGANOID 1
salmon alevin -l ISR -i brain_human/01_annotation/splici/index_folder \
-1 brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
-2 brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869247/s1_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
-o brain_human/02_alevin_for_fry/organoid1/ -p 32 --chromium --sketch

## ORGANOID 2
salmon alevin -l ISR -i brain_human/01_annotation/splici/index_folder \
-1 brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
-2 brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869248/s2_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
-o brain_human/02_alevin_for_fry/organoid2/ -p 32 --chromium --sketch

## ORGANOID 3
salmon alevin -l ISR -i brain_human/01_annotation/splici/index_folder \
-1 brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
-2 brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_H7WTJBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HFM7TBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HHT33BGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869249/s3_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
-o brain_human/02_alevin_for_fry/organoid3/ -p 32 --chromium --sketch

## ORGANOID 16
salmon alevin -l ISR -i brain_human/01_annotation/splici/index_folder \
-1 brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L002_R1_002.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L003_R1_002.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R1_002.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R1_002.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R1_002.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R1_002.fastq.gz \
-2 brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L002_R2_002.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L003_R2_002.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R2_002.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R2_002.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R2_002.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869262/s4_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R2_002.fastq.gz \
-o brain_human/02_alevin_for_fry/organoid16/ -p 32 --chromium --sketch

## ORGANOID 17
salmon alevin -l ISR -i brain_human/01_annotation/splici/index_folder \
-1 brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
-2 brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869263/s5_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
-o brain_human/02_alevin_for_fry/organoid17/ -p 32 --chromium --sketch

## ORGANOID 18
salmon alevin -l ISR -i brain_human/01_annotation/splici/index_folder \
-1 brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R1_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R1_001.fastq.gz \
-2 brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HMWNVBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L001_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L002_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L003_R2_001.fastq.gz \
brain_human/03_data/fastq/SRR8869264/s6_MissingLibrary_1_HTW2HBGX5/bamtofastq_S1_L004_R2_001.fastq.gz \
-o brain_human/02_alevin_for_fry/organoid18/ -p 32 --chromium --sketch

Run alevin-fry.

samples=(1 2 3 16 17 18)

for sample in ${samples[@]}
do
alevin-fry generate-permit-list \
-i brain_human/02_alevin_for_fry/organoid${sample} \
-o brain_human/05_alevin_fry/organoid${sample} \
-k -d fw

alevin-fry collate \
-r brain_human/02_alevin_for_fry/organoid${sample} \
-i brain_human/05_alevin_fry/organoid${sample} \
-t 32

alevin-fry quant \
-i brain_human/05_alevin_fry/organoid${sample} \
-o brain_human/05_alevin_fry/organoid${sample} \
-m brain_human/01_annotation/splici/transcriptome_splici_t2g_3col.tsv \
-t 32 --use-mtx -d -r cr-like-em
done

Selection of 100 genes only

To make the example portable, we selected 100 genes and cells from 2 cell types only (RG and Cycling).

rm(list = ls())

# specify 4 samples ids:
sample_ids = paste0("organoid", c(1:3, 16:18))
# set directories of each sample input data (obtained via alevin-fry):
base_dir = file.path("human alevin-fry", sample_ids, "alevin")

path_to_counts = file.path(base_dir,"quants_mat.mtx")
path_to_cell_id = file.path(base_dir,"quants_mat_rows.txt")
path_to_gene_id = file.path(base_dir,"quants_mat_cols.txt")
path_to_EC_counts = file.path(base_dir,"geqc_counts.mtx")
path_to_EC = file.path(base_dir,"gene_eqclass.txt.gz")

n_genes_keep = 100
for(i in seq_along(sample_ids)){
############################################################################################################
# quants_mat_rows.txt: cell id
############################################################################################################
library(data.table)
cell_id = fread(path_to_cell_id[[i]],
sep = " ", quote = "", header = FALSE)

# assign cell types
md <- read.csv("meta_combined.txt", sep = "\t")
md <- md[grepl("PGP1", md$Batch), ]
md$SEQ <- sapply(md$NAME, FUN = function (x) strsplit(x, split = "_")[[1]][3])

# keep organoids 1:3 and 16:18 only
md = md[ md$Organoid %in% c("1", "2", "3", "16", "17", "18"), ]
table(md$Organoid)
# rename organoids 16:18 as 4:6
md$Organoid[md$Organoid == "16"]  = "4"
md$Organoid[md$Organoid == "17"]  = "5"
md$Organoid[md$Organoid == "18"]  = "6"

md$Organoid = as.numeric(md$Organoid)
table(md$Organoid)

md = md[md$Organoid == i, ]

matches = match(cell_id$V1, md$SEQ)
head(cell_id$V1); head(md$SEQ[matches])
tail(cell_id$V1); tail(md$SEQ[matches])

cell_type <- md$CellType[matches]
rm(md); rm(matches)

sel_cells = cell_type %in% c("RG", "Cycling")
sel_cells[is.na(sel_cells)] = FALSE
table(sel_cells)

cell_id = cell_id[sel_cells,]

colnames(cell_id) = NULL

file_name = file.path("alevin-fry", sample_ids[i], "alevin/quants_mat_rows.txt")
fwrite(cell_id, file = file_name)

cell_id_reload = fread(file_name, 
sep = " ", quote = "", header = FALSE)

# OK, same sizer and content:
dim(cell_id_reload); dim(cell_id)
print(all(cell_id_reload == cell_id))

############################################################################################################
# quants_mat.mtx: USA estimated counts
############################################################################################################
library(Matrix)
counts = readMM(path_to_counts[[i]])

# sel cells from above:
counts = counts[sel_cells,]

n_genes = ncol(counts)/3

# first n_genes refer to S
# second n_genes refer to U
# third n_genes refer to A
spliced  = counts[, seq.int(1, n_genes) ]
unspliced = counts[, seq.int(n_genes+1, 2*n_genes)]
ambiguous = counts[, seq.int(2*n_genes+1, 3*n_genes)]

counts_keep = cbind(spliced[, seq_len(n_genes_keep) ], 
unspliced[, seq_len(n_genes_keep)],
ambiguous[, seq_len(n_genes_keep)])

file_name = file.path("alevin-fry", sample_ids[i], "alevin/quants_mat.mtx")
writeMM(counts_keep, file = file_name)

counts_reload = readMM(file_name)
# OK, same sizer and content:
dim(counts_reload); dim(counts_keep)
print(all(counts_keep == counts_reload))

############################################################################################################
# quants_mat_cols.txt: gene id
############################################################################################################
gene_id = fread(path_to_gene_id[[i]],
sep = " ", quote = "", header = FALSE)
n_genes_original = nrow(gene_id)/3

sel = seq_len(n_genes_keep)
gene_id_keep = gene_id[c(sel,
n_genes_original + sel,
2 * n_genes_original+sel)]
colnames(gene_id_keep) = NULL

file_name = file.path("alevin-fry", sample_ids[i], "alevin/quants_mat_cols.txt")
fwrite(gene_id_keep, file = file_name)

gene_id_reload = fread(file_name, 
sep = " ", quote = "", header = FALSE)

# OK, same sizer and content:
dim(gene_id_reload); dim(gene_id_keep)
print(all(gene_id_reload == gene_id_keep))

############################################################################################################
# ECs ids:
############################################################################################################
# load ECs: 
EC_genes = fread(path_to_EC[[i]],
sep = " ", quote = "", header = FALSE)[[1]]

# remove first 2 numbers:
EC_genes = EC_genes[-c(1,2)]  
# 1st number: number of transcripts (n_genes * 3)
# 2nd number: number of equivalence classes

# load EC counts
EC_counts = readMM(path_to_EC_counts[[i]])

# select cells
EC_counts_keep = EC_counts[sel_cells, ]

# remove ECs with 0 counts:
sel_non_zero_ECs = colSums(EC_counts_keep) > 0
EC_genes = EC_genes[sel_non_zero_ECs]

n_EC  = length(EC_genes)
# load EC info:
X = EC_genes[seq_len(n_EC)] # vector with all transcript ids
# split info separated by "\t":
X = strsplit(X,"\t",fixed=TRUE)
# turn character ids into numeric:
X = lapply(X, as.integer)

# ECs id, corresponds to rows in EC 
EC_id = 1 + vapply(X, function(x){
x[length(x)]
}, FUN.VALUE = integer(1))

# ECs gene ids, as in "gene_id" file.
EC_gene_id = lapply(X, function(x){
x[-length(x)]
})
rm(X)

# keep initial n_genes_keep genes, then NA (genes to be removed):
seq_to_rep = c(seq_len(n_genes_keep)-1, rep(NA, n_genes - n_genes_keep))
transform_ids = c(seq_to_rep, 
seq_to_rep + n_genes_keep,
seq_to_rep + 2*n_genes_keep)

# transform gene ids -> keep initial n_genes_keep genes, then NA (genes to be removed):
EC_gene_id = lapply(EC_gene_id, function(x){
transform_ids[x + 1]
})

# remove NAs (genes removed)
EC_gene_id = lapply(EC_gene_id, function(x){
x[!is.na(x)]
})

# keep only ECs with at least 1 element:
keep_ECs = sapply(EC_gene_id, length) > 0.5
EC_gene_id = EC_gene_id[keep_ECs]
EC_id = EC_id[keep_ECs]

# store this ECs to be saved
keep_EC_matrix = EC_id

# set EC_id to 0, ..., n_ECs - 1
EC_id = seq_along(keep_EC_matrix)-1

n_EC_keep = length(EC_id)

# merge gene ids and EC ids:
XX = lapply(seq_len(n_EC_keep), function(i){
paste( c( unlist(EC_gene_id[[i]]), EC_id[[i]]), collapse =  "\t")
})

# create new EC object
XX = c(n_genes_keep * 3, # n_genes kept
n_EC_keep,
XX)    # n_ECs kept

XX = data.table(XX)
colnames(XX) = NULL

file_name = file.path("alevin-fry", sample_ids[i], "alevin/gene_eqclass.txt.gz")
fwrite(XX, file = file_name)

EC_genes_reload = fread(file_name,
sep = " ", quote = "", header = FALSE)

# OK, same sizer and content:
dim(EC_genes_reload); dim(XX)
print(all(EC_genes_reload == XX))

############################################################################################################
# ECs counts:
############################################################################################################
# transform EC_id to new size of the matrix
EC_counts_keep = EC_counts_keep[, keep_EC_matrix]

file_name = file.path("alevin-fry", sample_ids[i], "alevin/geqc_counts.mtx")
writeMM(EC_counts_keep, file = file_name)

EC_counts_reload = readMM(file_name)

# OK, same sizer and content:
dim(EC_counts_reload); dim(EC_counts_keep)
print(all(EC_counts_keep == EC_counts_reload))

print(i)
}

salmon folder

The scripts to generate this simulated data can be found here. The full simulation is used in the mauscript, while here (for space reasons) we used the smaller subset simulation, bases on chromosome 22 only.



SimoneTiberi/DifferentialRegulation documentation built on Feb. 5, 2024, 8:48 a.m.