# data-raw/species_mixing_dge.R

# Download data from human and mouse species-mixing experiment from
# Macosko et al., 2015 and save as example dataset. Only a sample of the
# dataset with 100 STAMPS is used to save space

# download and extract dge data -----------------------------------------------

# Download the GEO supplementary archive of series GSE63269 into data-raw/.
# The tarball is a binary file, so a binary transfer is requested explicitly:
# download.file() defaults to text mode ("w"), which on Windows rewrites line
# endings and thereby corrupts the archive.
url <- "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63269/suppl/GSE63269_RAW.tar"
download.file(url, destfile = "./data-raw/GSE63269_RAW.tar", mode = "wb")

# names of the two per-species DGE files (human and mouse) that are needed
# from the tarball
dge_files <- c(
  "GSM1544799_SpeciesMix_HundredSTAMPs_HUMAN.digital_expression.txt.gz",
  "GSM1544799_SpeciesMix_HundredSTAMPs_MOUSE.digital_expression.txt.gz"
)

# unpack only those two archive members into the data-raw directory
untar(tarfile = "./data-raw/GSE63269_RAW.tar", files = dge_files,
      exdir = "./data-raw/")

# read every extracted DGE table into a data.frame; the resulting list holds
# the tables in the order of dge_files (human first, mouse second)
dge_raw <- lapply(dge_files, function(dge_file) {
  read.table(paste0("./data-raw/", dge_file), header = TRUE,
             stringsAsFactors = FALSE)
})

# reorder the columns of the second (mouse) table to follow the first (human)
# table, so that cell barcodes are in the same order in both data frames
dge_raw[[2]] <- dge_raw[[2]][, names(dge_raw[[1]])]

# create DigitalExpression summary file based on dge data ---------------------

# Per-cell summary statistics for each species. The first column of each DGE
# table is dropped and the remaining columns (one per cell barcode) are summed
# column-wise. Both results are matrices with one row per cell barcode and one
# column per element of dge_raw (1 = human, 2 = mouse).
# vapply() is used so that a mismatch in the number of cells between the two
# tables fails here instead of silently returning a list.

# number of detected genes: a gene is detected in a cell as soon as at least
# one transcript was observed, i.e. its count is > 0
n_genes <- vapply(dge_raw, FUN = function(x) colSums(x[, -1] > 0),
                  FUN.VALUE = numeric(ncol(dge_raw[[1]]) - 1L))

# total number of transcripts
n_txs <- vapply(dge_raw, FUN = function(x) colSums(x[, -1]),
                FUN.VALUE = numeric(ncol(dge_raw[[1]]) - 1L))

# assemble the per-cell summary table: one row per cell barcode holding the
# number of detected genes and transcripts for human (column 1 of n_genes and
# n_txs) and mouse (column 2)
ms_dge_summary <- data.frame(
  CELL_BARCODE = rownames(n_genes),
  NUM_GENES_HUMAN = n_genes[, 1],
  NUM_TRANSCRIPTS_HUMAN = n_txs[, 1],
  NUM_GENES_MOUSE = n_genes[, 2],
  NUM_TRANSCRIPTS_MOUSE = n_txs[, 2],
  row.names = NULL,
  stringsAsFactors = FALSE
)

# keep only cells with fewer than 60,000 human transcripts, which makes
# examples with mixed-species data easier to visualize
ms_dge_summary <- ms_dge_summary[
  ms_dge_summary[["NUM_TRANSCRIPTS_HUMAN"]] < 60000,
]

# create merged dge data ------------------------------------------------------

# prepend a SPECIES column to each table so that every row stays attributable
# to its species after merging
dge_raw[[1]] <- data.frame(SPECIES = "human", dge_raw[[1]],
                           check.names = FALSE, stringsAsFactors = FALSE)
dge_raw[[2]] <- data.frame(SPECIES = "mouse", dge_raw[[2]],
                           check.names = FALSE, stringsAsFactors = FALSE)

# stack the human and mouse tables row-wise into a single data frame
ms_dge_data <- rbind(dge_raw[[1]], dge_raw[[2]])

# restrict the merged data to the cells retained in ms_dge_summary, i.e. drop
# the cells that were filtered out for their human transcript count
filt_cells <- ms_dge_summary[["CELL_BARCODE"]]
ms_dge_data <- ms_dge_data[c("GENE", "SPECIES", filt_cells)]

# save data in RData object for use in package and clean up -------------------

# save both objects as package data
# NOTE(review): newer devtools releases moved use_data() to the usethis
# package; confirm which devtools version this script is meant to run with
devtools::use_data(ms_dge_summary)
devtools::use_data(ms_dge_data)

# remove the extracted DGE files and the downloaded tarball again
unlink(c(paste0("./data-raw/", dge_files), "./data-raw/GSE63269_RAW.tar"))
# argschwind/dropseqr documentation built on May 23, 2019, 4:24 p.m.