Load packages

rm(list = ls())
library(Seurat)
library(leedonghyung)
library(SummarizedExperiment)
library(irlba)
library(Rtsne)
library(RColorBrewer)
library(pheatmap)
library(corrplot)
library(tmod)
library(tagcloud)
library(limma)
library(VennDiagram)
library(ggrepel)
library(data.table)

# normalize function
normalize <- function (counts) 
{
  normfactor <- colSums(counts)
  return(t(t(counts)/normfactor)*median(normfactor))
}
color.vec <- c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "black", "#a65628", "#f781bf", "#999999")

Filter genes based on over-dispersion

path <- ("~/Dropbox/CZI_project/Public_Data/Kang_2017/")
name <- "Kang.2017.all.cells"

ctrl.data <- as.data.frame(fread(paste0("gzcat ", path,"immune_control_expression_matrix.txt.gz")))
dim(ctrl.data)
rownames(ctrl.data) <- ctrl.data$V1
ctrl.data$V1 <- NULL
dim(ctrl.data)
#head(ctrl.data[,1:5])

stim.data <- as.data.frame(fread(paste0("gzcat ", path,"immune_stimulated_expression_matrix.txt.gz")))
dim(stim.data)
rownames(stim.data) <- stim.data$V1
stim.data$V1 <- NULL
dim(stim.data)
#head(stim.data[,1:5])

# Set up control object
ctrl <- CreateSeuratObject(raw.data = ctrl.data, project = "IMMUNE_CTRL", min.cells = 5)
ctrl@meta.data$stim <- "CTRL"
ctrl <- FilterCells(ctrl, subset.names = "nGene", low.thresholds = 500, high.thresholds = Inf)
ctrl <- NormalizeData(ctrl)
ctrl <- ScaleData(ctrl, display.progress = F)
# Set up stimulated object
stim <- CreateSeuratObject(raw.data = stim.data, project = "IMMUNE_STIM", min.cells = 5)
stim@meta.data$stim <- "STIM"
stim <- FilterCells(stim, subset.names = "nGene", low.thresholds = 500, high.thresholds = Inf)
stim <- NormalizeData(stim)
stim <- ScaleData(stim, display.progress = F)

# Gene selection for input to CCA
ctrl <- FindVariableGenes(ctrl, do.plot = F)
stim <- FindVariableGenes(stim, do.plot = F)
g.1 <- head(rownames(ctrl@hvg.info), 2000)
g.2 <- head(rownames(stim@hvg.info), 2000)
genes.use <- union(g.1, g.2)

# combine data
ctrl.data.sel <- ctrl.data[genes.use, rownames(ctrl@meta.data)]
stim.data.sel <- stim.data[genes.use, rownames(stim@meta.data)]
all.counts <- cbind(ctrl.data.sel, stim.data.sel)
dim(all.counts)
# overdispersed genes
#saveRDS(all.counts, file = "immune_control_stimulated_filtered_counts.Rds")

Genes Detected

# load data
all.counts <- readRDS("~/Dropbox/Kang.2017/IASVA/By_Celltype/Kang.2017.by.cell.CD8 T.overdispersed_raw_counts.Rds")
dim(all.counts)

# load in seurat annotations
imm.comb <- readRDS("~/Dropbox/Kang.2017/Kang.2017.PBMC.stimulated.Seurat.immune.combined.Rds")
imm.meta <- imm.comb@meta.data
imm.meta <- cbind(imm.meta, imm.comb@ident)
colnames(imm.meta)[ncol(imm.meta)] <- "celltype"
dim(imm.meta)
all.meta <- imm.meta
imm.meta <- imm.meta[imm.meta$celltype == "CD8 T",]
table(rownames(imm.meta) == colnames(all.counts))

# calculate lib size
#pdf(file = paste(name, cell_interest, "IASVA.genes.detected.hist.pdf", sep = "."))
Num_Detected_Genes <- colSums(all.counts > 0)
hist(Num_Detected_Genes, col = "dodgerblue4", breaks = 20, ylab = "Number of Cells")
summ <- summary(Num_Detected_Genes)
legend("topright", legend = paste(names(summ), summ))
abline(v = 500, col = "red", lty = 2)
legend("right", legend = paste("< 500 genes = ", length(which(Num_Detected_Genes < 500)), " Cells\n",
                                  "> 500 genes = ", length(which(Num_Detected_Genes > 500)), " Cells", sep = ""))
#dev.off()

Known Factors

Here, the only known factors are stimulation, celltype, and library size

# make vector of condition
Condition <- factor(imm.meta$stim)
CellType <- factor(imm.meta$celltype)
# normalize counts
counts.filt <- normalize(all.counts)
Geo_Lib <- colSums(log(counts.filt+1))
Num_Detected_Genes <- colSums(all.counts > 0)
# categorize lib size
Geo_Lib_Cat <- cut(Num_Detected_Genes, breaks = 4)
summary(Geo_Lib)
barplot(Geo_Lib, xlab="Cell", las=2, ylab = "Geometric Library Size")               
lcounts <- log(counts.filt + 1)
lcounts <- as.matrix(lcounts)
# PC1 and Geometric library size correlation
pc1 = irlba(lcounts - rowMeans(lcounts), 1)$v[,1] ## partial SVD
cor(Geo_Lib, pc1)
cor(Num_Detected_Genes, pc1)
cor(as.numeric(Condition), pc1)
cor(Num_Detected_Genes, Geo_Lib)

T-SNE prior to IASVA analysis

#tsne.res <- Rtsne(unique(t(lcounts)), dims = 2)
#saveRDS(tsne.res, file = paste(name, "tsne.preIASVA.Rds", sep = "."))

# load tsne results
tsne.res <- readRDS("~/Dropbox/Kang.2017/IASVA/By_Celltype/Kang.2017.by.cell.CD8 T.tsne.preIASVA.Rds")

# by celltype
par(mfrow=c(2,2))
#par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
plot(tsne.res$Y[,1:2], main="Pre IA-SVA", xlab="tSNE Dim1",
     ylab="tSNE Dim2", pch=20, col=color.vec[CellType], 
     bg=color.vec[CellType], cex = 0.5)
#par(xpd=TRUE)
legend(50,20, levels(CellType), border="white", fill=color.vec, bty="n", title = "Cell Type")
# by condition
#par(mar=c(5, 4, 4, 2), xpd=FALSE)
plot(tsne.res$Y[,1:2], main="Pre IA-SVA", xlab="tSNE Dim1",
     ylab="tSNE Dim2", pch=20, col=color.vec[Condition], 
     bg=color.vec[Condition], oma=c(4,4,6,12), cex = 0.5)
legend("topleft", levels(Condition), border="white", fill=color.vec, bty="n", title = "Condition")
# by genes detect
plot(tsne.res$Y[,1:2], main="Pre IA-SVA", xlab="tSNE Dim1",
     ylab="tSNE Dim2", pch=20, col=color.vec[Geo_Lib_Cat], 
     bg=color.vec[Geo_Lib_Cat], oma=c(4,4,6,12), cex = 0.5)
legend("topleft", levels(Geo_Lib_Cat), border="white", fill=color.vec, bty="n", title = "Genes Detected")

Run IASVA using num.sv = 10

# run iasva
mod <- model.matrix(~Num_Detected_Genes)
summ_exp <- SummarizedExperiment(assays = as.matrix(counts.filt))
pct_cutt <- 1
iasva.res<- fast_iasva(summ_exp, mod[,-1, drop = F], verbose=FALSE, pct.cutoff = pct_cutt, num.sv = 10)
# load results
#iasva.res <- readRDS("/Users/lawlon/Documents/CZI/Seurat_Public_Data/immune_alignment_expression_matrices/IASVA/By_Celltype/Kang.2017.by.cell.CD8 T.IASVA.1.output.Rds")
iasva.sv <- iasva.res$sv
#saveRDS(iasva.res, file = paste(name, "IASVA", pct_cutt, "output.Rds", sep = "."))
#markers_pct1 <- find_markers(Y = summ_exp, iasva.sv = iasva.sv)
#markers_pct1 <- readRDS("/Users/lawlon/Documents/CZI/Seurat_Public_Data/immune_alignment_expression_matrices/IASVA/By_Celltype/Kang.2017.by.cell.CD8 T.IASVA.1.markers.Rds")

# revised find markers function
#source("/Users/lawlon/iasva_markers_update/R/find_markers.R")
#all_markers <- .GlobalEnv$find_markers(Y = summ_exp, iasva.sv = iasva.sv)
# all_markers <- readRDS("/Users/lawlon/Documents/CZI/Seurat_Public_Data/immune_alignment_expression_matrices/IASVA/All_Cells/Kang.2017.all.cells.IASVA.by.SV.markers.Rds")

#saveRDS(markers_pct1, file = paste(name, "IASVA", pct_cutt, "markers.Rds", sep = "."))
# tsne on markers
# set.seed(100)
# if (ncol(markers_pct1) > 100) {
#   perplex <- 30
# } else {
#   perplex <- round(ncol(markers_pct1)/4)
# }
# tsne.res.iasva <- Rtsne(unique(t(log(markers_pct1+1))), dims = 2, perplexity = perplex)
# saveRDS(tsne.res.iasva, file = paste(name, cell_interest, stim_cond, "IASVA", pct_cutt, "tsne.markers.Rds", sep = "."))

Visualize SVs

ncol(iasva.sv)
# if (ncol(iasva.sv) > 5) {
#   pairs(iasva.sv[,1:5], main="IA-SVA", pch=20, col=color.vec[CellType],
#       bg=color.vec[CellType], cex=0.5, oma=c(4,4,6,12))
#   legend("right", levels(CellType), border="white", fill=color.vec, bty="n")
#   pairs(iasva.sv[,1:5], main="IA-SVA", pch=20, col=color.vec[Condition],
#       bg=color.vec[Condition], cex=0.5, oma=c(4,4,6,12))
#   legend("right", levels(Condition), border="white", fill=color.vec, bty="n")
#   pairs(iasva.sv[,1:5], main="IA-SVA", pch=20, col=color.vec[Geo_Lib_Cat],
#       bg=color.vec[Geo_Lib_Cat], cex=0.5, oma=c(4,4,6,12))
#   legend("right", levels(Geo_Lib_Cat), border="white", fill=color.vec, bty="n")
# } else {
  par(mar=c(5.1, 4.1, 4.1, 8.2), xpd=TRUE)
  pairs(iasva.sv, main="IA-SVA", pch=20, col=color.vec[Condition],
      bg=color.vec[Condition], cex=0.5, oma=c(4,4,6,12))
  legend("right", levels(Condition), border="white", fill=color.vec, bty="n")


# }

Use SVs separate cells. For example, SV1, SV4, SV8 are good candidates for further analysis.

Correlate SVS

iasva_vars <- cbind(iasva.sv, Geo_Lib, Condition)
corrplot(cor(iasva_vars))

Marker genes for SV1

SV1.marker.counts <- leedonghyung::find_markers(summ_exp, as.matrix(iasva.sv[,1]), rsq.cutoff=0.2)
dim(SV1.marker.counts)

anno.col <- data.frame(Condition = Condition, Genes_Detect = Geo_Lib_Cat, #CellType = CellType, 
                       SV1 = iasva.sv[,1]) #SV4= iasva.sv[,4])
rownames(anno.col) <- colnames(SV1.marker.counts)

pheatmap(log(SV1.marker.counts+1), show_colnames =FALSE, 
         clustering_method = "ward.D2",cutree_cols = 3,annotation_col = anno.col)
         #annotation_colors = anno.colors)

Marker genes for SV4

SV4.marker.counts <- leedonghyung::find_markers(summ_exp, as.matrix(iasva.sv[,4]), rsq.cutoff=0.2)
dim(SV4.marker.counts)

anno.col <- data.frame(Condition = Condition, Genes_Detect = Geo_Lib_Cat, #CellType = CellType, 
                       SV4 = iasva.sv[,4]) 
rownames(anno.col) <- colnames(SV4.marker.counts)

pheatmap(log(SV4.marker.counts+1), show_colnames =FALSE, 
         clustering_method = "ward.D2",cutree_cols = 3,annotation_col = anno.col)
         #annotation_colors = anno.colors)

Marker genes for SV8

SV8.marker.counts <- leedonghyung::find_markers(summ_exp, as.matrix(iasva.sv[,8]), rsq.cutoff=0.1)
dim(SV8.marker.counts)

anno.col <- data.frame(Condition = Condition, Genes_Detect = Geo_Lib_Cat, #CellType = CellType, 
                       SV8 = iasva.sv[,8]) 
rownames(anno.col) <- colnames(SV8.marker.counts)

pheatmap(log(SV8.marker.counts+1), show_colnames =FALSE, 
         clustering_method = "ward.D2",cutree_cols = 3,annotation_col = anno.col)
         #annotation_colors = anno.colors)

Marker genes for SV1 and SV4

SV1.SV4.marker.counts <- leedonghyung::find_markers(summ_exp, iasva.sv[,c(1,4)], rsq.cutoff=0.2)
dim(SV1.SV4.marker.counts)

anno.col <- data.frame(Condition = Condition, Genes_Detect = Geo_Lib_Cat, #CellType = CellType, 
                       SV1 = iasva.sv[,1], SV4= iasva.sv[,4])
rownames(anno.col) <- colnames(SV1.SV4.marker.counts)

pheatmap(log(SV1.SV4.marker.counts+1), show_colnames =FALSE, 
         clustering_method = "ward.D2",cutree_cols = 3,annotation_col = anno.col)
         #annotation_colors = anno.colors)

Modules enriched in each SV

all_markers <- find_markers2(summ_exp, iasva.sv[,c(1,4,8)], rsq.cutoff=0.1)

design <- model.matrix(~Condition)
fit <- eBayes(lmFit(counts.filt, design))
tt <- topTable(fit, coef=2, number=Inf, genelist = rownames(counts.filt))
head(tt, 10)

# iterative analysis of SVs an modules
all_sv_res <- list()
par(mfrow = c(3,1), mar = c(4,2,2,2))
for (j in 1:3) {
    sv_genes <- all_markers[[j]]
    # module analysis
    fg <- tt[sv_genes,]
    fg.ord <- fg[order(fg$logFC, decreasing = T),]
    res <- tmodHGtest(fg = fg.ord$ID, bg = tt$ID)
    res.ord <- res[order(res$E, decreasing = T),]
    # append results
    all_sv_res[[j]] <- res.ord
    names(all_sv_res)[j] <- names(all_markers)[j]
    if (nrow(res.ord) > 1) {
      # make plots
    bp1 <- barplot(res.ord$E, main = paste(names(all_markers)[j], " Module Enrichment", sep = ""), xlab = "", horiz = T,
                  cex.main = 2, cex.lab = 2)
    #barplot(res.ord$E, xlab = "Module Enrichment", main = paste(names(all_markers)[j], " Genes", sep = ""), horiz = T)
    par(xpd=TRUE)
    text(x = max(res.ord$E)/2, y = bp1[,1], labels = paste(res.ord$Title, " (", res$b, "/", res$B, ")", sep = ""), cex = 1, font = 2)
    }
}

Differential Expression Analysis

# create seurat object
cd8_norm <- imm.meta[imm.meta$celltype == "CD8 T" & imm.meta$stim == "CTRL",]
cd8_stim <- imm.meta[imm.meta$celltype == "CD8 T" & imm.meta$stim == "STIM",]

cd8.norm.ids <- gsub("\\.","-",rownames(cd8_norm))
cd8.stim.ids <- gsub("\\.","-",rownames(cd8_stim))

all.counts.raw <- cbind(ctrl.data[, cd8.norm.ids], stim.data[, cd8.stim.ids])
all.anns <- rbind(cd8_norm, cd8_stim)
all_dat <- CreateSeuratObject(raw.data = all.counts.raw, min.cells = 5, project = "CD8_Cells")
all_dat@meta.data$Stim <- all.anns$stim
all_dat <- NormalizeData(all_dat)
all_dat <- ScaleData(all_dat, display.progress = F)
# find markers
all_dat <- SetAllIdent(object = all_dat, id = "Stim")
stim_dge <- FindMarkers(object = all_dat, ident.1 = "STIM", ident.2 = "CTRL")
stim_dge_sig_up <- stim_dge[stim_dge$p_val_adj < 0.05 & stim_dge$avg_logFC > 0,]
stim_dge_sig_dw <- stim_dge[stim_dge$p_val_adj < 0.05 & stim_dge$avg_logFC < 0,]

## find markers based on raw data.
summ_exp_raw <- SummarizedExperiment(assays = as.matrix(all.counts.raw))
all_markers_diff <- find_markers2(summ_exp_raw, as.matrix(iasva.sv[,c(1)]), rsq.cutoff=0.1) ## For fair comparison.
sv1_genes <- all_markers_diff$All_Unique_Markers
length(sv1_genes)
# venn diagram of iasva markers
library(gtools)
# calc fold change of iasva markers (SV1)
dat_norm <- as.matrix(all_dat@data)
dat_norm_ctrl <- dat_norm[, cd8.norm.ids]
dat_norm_stim <- dat_norm[, cd8.stim.ids]
sv1_fc <- NULL
for (k in 1:length(sv1_genes)) {
  fcs <- foldchange(num = mean(dat_norm_stim[sv1_genes[k],]), denom = mean(dat_norm_ctrl[sv1_genes[k],]))
  sv1_fc <- c(sv1_fc, fcs)
}
logfc <- log2(sv1_fc)
length(which(logfc > 0))
int_genes <- intersect(sv1_genes, rownames(stim_dge_sig_up))
grid.newpage()
draw.pairwise.venn(area1 = nrow(stim_dge_sig_up), area2 = length(sv1_genes),
                   cross.area = length(int_genes), category = c("Differential Expression (Wilcox)", "IA-SVA"),
                   cat.cex = 2, cex = 3, cat.pos = 0)
# gene list
int_genes

# make a scatterplot
avg.exp <- log1p(AverageExpression(all_dat, show.progress = FALSE))
avg.exp$gene <- rownames(avg.exp)
genes.to.label1 = sv1_genes
# label only a few items
avg.exp$labelint <- ""
avg.exp$up <- ""
ix_label <- which(avg.exp$gene %in% genes.to.label1)
avg.exp$labelint[ix_label] <- avg.exp$gene[ix_label]
ix_up <- which(avg.exp$gene %in% rownames(stim_dge_sig_up))
avg.exp$up[ix_up] <- avg.exp$gene[ix_up]
# character vector of colors
delta_colz <- character(length = nrow(avg.exp))
for (k in 1:nrow(avg.exp)) {
  if (avg.exp$labelint[k] != "") {
    delta_colz[k] <- "red"
  } else if (avg.exp$up[k] != "") {
    delta_colz[k] <- "blue"
  } else {
    delta_colz[k] <- "grey50"
  }
}

p3 <- ggplot(avg.exp, aes(CTRL, STIM)) + geom_point() + ggtitle("CD8 T Cells") +
  geom_point(color = delta_colz) +
  geom_text_repel(aes(label = labelint), size=2, show.legend = F, nudge_x = -0.5) + 
  annotate("text", x = 3, y = 6, label = paste("IA-SVA and DGE Genes (", length(int_genes), ")", sep = ""), color = "red") + 
  annotate("text", x = 3, y = 5.5, label = paste("DGE Genes Only (", nrow(stim_dge_sig_up)-length(int_genes), ")", sep = ""), color = "blue")
  #geom_text_repel(aes(label = up), size=2, show.legend = F, nudge_x = 0.5) 
plot(p3)

Session Information

sessionInfo()


dleelab/leedonghyung documentation built on May 7, 2019, 8:43 a.m.