library(HUG) library(DropletUtils)
This document describes how I import and preprocess the scRNA-seq data and the other datasets used in the paper.
Paper: Single-cell genomics identifies cell type–specific molecular changes in autism
Source: https://cells.ucsc.edu/autism/rawMatrix.zip
Raw data files: barcodes.tsv stores the barcode information, genes.tsv stores the gene information, and matrix.mtx stores the count data in MatrixMarket format.
Technique: function read10xCounts from R package DropletUtils
# ---- Autism snRNA-seq data: import, gene filter, split, per-individual export ----
# 1. Read the 10x-format raw matrix and attach the per-cell metadata.
# 2. Filter genes and write one count matrix per (region, cluster) pair to
#    <data.dir>/ct_mtx/<region>_<cluster>.rds.
# 3. Pool five PFC clusters (cells in rows) and save the cells of four
#    individuals to <data.dir>/genes_Velmeshev_<id>_5cluster.rData, each file
#    holding one object named `data.save`.
data.dir = "Data/autism"
ct.dir = file.path(data.dir, "ct_mtx")

sce = read10xCounts(file.path(data.dir, "rawMatrix"), col.names = TRUE)
cell_info = fread(file.path(data.dir, "meta.tsv"))

# cbind() pairs metadata rows with cells purely by position.
# NOTE(review): assumes meta.tsv lists cells in the same order as the
# barcodes of the raw matrix -- confirm.
stopifnot(nrow(cell_info) == ncol(sce))
colData(sce) = cbind(colData(sce), cell_info)

# NOTE(review): nothing visible in this script adds a 6th rowData column or a
# `n_cells_by_counts` column; as far as I can tell read10xCounts() only returns
# ID/Symbol(/Type). Presumably a gene-annotation / QC-metrics step belongs
# before these two lines -- confirm before running.
names(rowData(sce))[6] = "strand_n"

# Keep genes whose n_cells_by_counts is at least 1% of the number of cells.
sce = sce[which(rowData(sce)$n_cells_by_counts >= 0.01 * ncol(sce)), ]

u.regions = unique(colData(sce)$region)
u.clusters = unique(colData(sce)$cluster)

# saveRDS() does not create missing directories, so make sure the target exists.
dir.create(ct.dir, showWarnings = FALSE, recursive = TRUE)

for (r1 in u.regions) {
  in.region = colData(sce)$region == r1
  for (cls1 in u.clusters) {
    in.cluster = colData(sce)$cluster == cls1
    ct1 = counts(sce[, which(in.region & in.cluster)])
    # A "/" in a cluster label would be read as a path separator.
    cls2 = gsub("/", "_", cls1)
    fnm = sprintf("%s_%s.rds", r1, cls2)
    saveRDS(ct1, file = file.path(ct.dir, fnm))
  }
}

# Clusters to pool; file names are "<name.post><cluster>.rds".
cluster.names = c("L2_3", "IN-VIP", "IN-SV2C", "IN-SST", "IN-PV")
name.post = "PFC_"

# Load each cluster as a dense cells-by-genes matrix and stack them once,
# instead of growing the result with rbind() inside a loop.
data.all = do.call(rbind, lapply(cluster.names, function(cls) {
  fnm = paste0(name.post, cls, ".rds")
  t(as.matrix(readRDS(file.path(ct.dir, fnm))))
}))
gene.num = ncol(data.all)

# Individuals to export.
# NOTE(review): cells are selected by searching for the ID inside the row
# names, which presumably embed the sample/individual ID -- confirm that no
# other ID contains one of these as a substring.
data.people = c("4341", "5278", "5531", "5958")
for (id in data.people) {
  index = grep(id, row.names(data.all), fixed = TRUE)
  # drop = FALSE keeps a matrix even when a single cell matches.
  data.save = data.all[index, , drop = FALSE]
  save(data.save,
       file = file.path(data.dir, paste0("genes_Velmeshev_", id, "_5cluster.rData")))
}
Paper: Seq-Well: A portable, low-cost platform for single-cell RNA-seq of low-input samples Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92495 Raw data file: GSE92495_RAW.tar/GSM2486331_HEK_SingleCells.txt Size: 1,453 cells * 24,187 genes Technique: Seq-Well based on
# ---- Seq-Well HEK data: import and save ----
# Reads the expression table, transposes it, derives a grouping factor from
# the row names, and saves both objects to Data/Gierahn2017_raw.rData.
dat.gierahn = read.table("Data/GSM2486331_HEK_SingleCells.txt")

# Transpose so that the file's columns become rows.
# NOTE(review): presumably the file is genes x cells, giving cells x genes
# here -- confirm.
dat.gierahn = Matrix(t(dat.gierahn))

# Group label = text before the first "_" of each row name. vapply() enforces
# one character value per row, whereas sapply() can silently change its
# return type on empty or ragged input.
ix.gierahn = as.factor(vapply(
  strsplit(rownames(dat.gierahn), "_"),
  function(parts) parts[1],
  character(1)
))

# Fraction of zero entries (shown only when the value is auto-printed).
mean(dat.gierahn == 0)

save(dat.gierahn, ix.gierahn, file = "Data/Gierahn2017_raw.rData")
# ---- PathwayCommons: build a symmetric logical gene-gene adjacency matrix ----
# Each row of the SIF file is read as (V1 = participant A, V2 = relation type,
# V3 = participant B). The result `conn.gene` is TRUE wherever two entries are
# linked by any relation type, in either direction, and is saved to
# Data/PathwayCommons12.rData.
conns = read.table("Data/PathwayCommons12.All.hgnc.sif",
                   header = FALSE, as.is = TRUE)

genes = unique(c(conns$V1, conns$V3))
rel = unique(conns$V2)
p = length(genes)

# Replace names by their position in `genes` so they can index the matrix.
conns$V1 = as.integer(factor(conns$V1, levels = genes))
conns$V3 = as.integer(factor(conns$V3, levels = genes))

# One p x p logical sparse matrix per relation type. Base lapply() returns the
# same unnamed list as the former foreach/%do% call, without needing the
# foreach package to be attached.
rels.gene = lapply(rel, function(k) {
  edges = conns[conns$V2 == k, ]
  sparseMatrix(i = edges$V1, j = edges$V3, x = TRUE,
               dims = c(p, p), dimnames = list(genes, genes))
})

# Symmetrize each relation, then take the union over all relation types.
conns.gene = lapply(rels.gene, function(x) x | t(x))
conn.gene = Reduce("|", conns.gene)

save(conn.gene, file = "Data/PathwayCommons12.rData")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.