In hwang655/HUG: Dependent Graphical Model Estimation

library(HUG)
library(DropletUtils)

This document describes how I import and preprocess the scRNA-seq data and the other data sets used in the paper.

scRNA-seq Data

Velmeshev Data

Paper: Single-cell genomics identifies cell type–specific molecular changes in autism. Source: https://cells.ucsc.edu/autism/rawMatrix.zip. Raw data files: barcodes.tsv stores the barcode information, genes.tsv stores the gene information, and matrix.mtx stores the count data in MatrixMarket format. Technique: the function read10xCounts from the R package DropletUtils.

# Import the raw 10x count matrix, attach the per-cell metadata, and keep
# only genes that pass the 1% threshold below.
data.dir = "Data/autism"
raw.dir  = file.path(data.dir, "rawMatrix")
meta.fnm = file.path(data.dir, "meta.tsv")

sce       = read10xCounts(raw.dir, col.names=TRUE)
# NOTE(review): fread() presumably comes from data.table, which is not loaded
# explicitly in this file -- confirm it is attached (e.g. through HUG).
cell_info = fread(meta.fnm)

# NOTE(review): the metadata is bound to the cells by position, so this
# presumably relies on meta.tsv listing cells in the same order as the
# barcodes -- confirm.
colData(sce) = cbind(colData(sce), cell_info)
names(rowData(sce))[6] = "strand_n"

# Keep genes whose n_cells_by_counts is at least 1% of the number of cells.
min.cells  = 0.01*ncol(sce)
keep.genes = which(rowData(sce)$n_cells_by_counts >= min.cells)
sce        = sce[keep.genes,]

# Distinct region and cluster labels, used to split the data afterwards.
u.regions  = unique(colData(sce)$region)
u.clusters = unique(colData(sce)$cluster)

# Write one count matrix per (region, cluster) pair to <data.dir>/ct_mtx,
# named "<region>_<cluster>.rds".
data.dir = "Data/autism"
out.dir  = file.path(data.dir, "ct_mtx")
# saveRDS() does not create missing directories, so make sure the target
# directory exists before the first write.
dir.create(out.dir, recursive=TRUE, showWarnings=FALSE)
for(r1 in u.regions){
  cond1 = colData(sce)$region == r1
  for(cls1 in u.clusters){
    cond2 = colData(sce)$cluster == cls1
    # which() ignores NA comparisons, so cells with a missing label are
    # never selected.
    ct1   = counts(sce[,which(cond1 & cond2)])
    # Cluster labels may contain "/", which cannot be part of a file name.
    cls2  = gsub("/", "_", cls1)
    fnm   = sprintf("%s_%s.rds", r1, cls2)
    saveRDS(ct1, file=file.path(out.dir, fnm))
  }
}
# Pool the cells of five PFC clusters and save one cells-by-genes matrix per
# individual. Each .rData file contains a single object named `data.save`.

# Cluster names in file-name form (any "/" already replaced by "_").
# Not called `names`, so base::names() is not shadowed.
cluster.names = c("L2_3","IN-VIP","IN-SV2C","IN-SST","IN-PV")
name.post = "PFC_"
ct.dir = "Data/autism/ct_mtx"

# Read every per-cluster matrix, transpose it so that cells are rows, and
# stack all clusters with a single rbind instead of growing the matrix
# inside a loop.
data.list = lapply(cluster.names, function(nm) {
  t(as.matrix(readRDS(file.path(ct.dir, paste0(name.post, nm, ".rds")))))
})
data.all = do.call(rbind, data.list)
# Number of genes (columns after the transpose).
gene.num = ncol(data.all)

# Individuals to export.
data.people = c("4341","5278", "5531","5958")
for (j in seq_along(data.people)){
  # NOTE(review): cells are assigned to an individual by substring match of
  # the ID in the row name; this presumably relies on the ID being embedded
  # in the cell name and not occurring inside another ID -- confirm.
  index = grep(data.people[j], row.names(data.all), fixed = TRUE)
  # drop = FALSE keeps a matrix even when exactly one cell matches.
  data.save = data.all[index, , drop = FALSE]
  save(data.save,file = paste0("Data/autism/genes_Velmeshev_",data.people[j],"_5cluster.rData"))
}

Gierahn Data

Paper: Seq-Well: A portable, low-cost platform for single-cell RNA-seq of low-input samples. Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92495. Raw data file: GSE92495_RAW.tar/GSM2486331_HEK_SingleCells.txt. Size: 1,453 cells × 24,187 genes. Technique: Seq-Well based on

# Read the HEK expression table and store its transpose as a Matrix object
# (presumably cells in rows afterwards -- the row names are parsed as cell
# identifiers below).
dat.gierahn = Matrix(t(read.table('Data/GSM2486331_HEK_SingleCells.txt')))

# Group label of each row: the part of the row name before the first "_".
id.parts   = strsplit(rownames(dat.gierahn), '_')
ix.gierahn = as.factor(vapply(id.parts, function(parts) parts[1], character(1)))

# Fraction of entries that are exactly zero.
mean(dat.gierahn == 0)

save(dat.gierahn, ix.gierahn, file='Data/Gierahn2017_raw.rData')

Benchmark Graph

# Build the benchmark gene-gene graph from the Pathway Commons SIF edge list.
# The columns are used as V1 = first gene, V2 = relation type, V3 = second gene.
conns = read.table('Data/PathwayCommons12.All.hgnc.sif', header=FALSE, as.is=TRUE)
genes = unique(c(conns$V1, conns$V3))
rel = unique(conns$V2)
p = length(genes)

# Replace each gene name by its integer position in `genes`.
conns$V1 = as.integer(factor(conns$V1, levels=genes))
conns$V3 = as.integer(factor(conns$V3, levels=genes))

# One p x p logical sparse adjacency matrix per relation type. Base lapply()
# is used so the block does not depend on the foreach package.
# NOTE(review): sparseMatrix() presumably comes from the Matrix package, which
# is not loaded explicitly in this file -- confirm it is attached.
rels.gene = lapply(rel, function(k) {
  edges = conns[conns$V2==k,]
  sparseMatrix(i=edges$V1, j=edges$V3, x=TRUE, dims=c(p, p), dimnames=list(genes, genes))
})
# Make every relation symmetric: an edge in either direction counts.
conns.gene = lapply(rels.gene, function(x) x | t(x))
# Union over all relation types.
conn.gene = Reduce('|', conns.gene)
save(conn.gene, file='Data/PathwayCommons12.rData')