In dleelab/leedonghyung: a set of useful R functions

Load packages

rm(list = ls())
library(Seurat)
library(leedonghyung)
library(SummarizedExperiment)
library(irlba)
library(Rtsne)
library(RColorBrewer)
library(pheatmap)
library(corrplot)
library(tmod)
library(tagcloud)
library(limma)
library(VennDiagram)
library(ggrepel)
library(data.table)

# normalize function
normalize <- function (counts) 
{
  normfactor <- colSums(counts)
  return(t(t(counts)/normfactor)*median(normfactor))
}
color.vec <- c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "black", "#a65628", "#f781bf", "#999999")

Filter genes based on over-dispersion

path <- ("~/Dropbox/CZI_project/Public_Data/Kang_2017/")
name <- "Kang.2017.all.cells"

ctrl.data <- as.data.frame(fread(paste0("gzcat ", path,"immune_control_expression_matrix.txt.gz")))
dim(ctrl.data)
rownames(ctrl.data) <- ctrl.data$V1
ctrl.data$V1 <- NULL
dim(ctrl.data)
#head(ctrl.data[,1:5])

stim.data <- as.data.frame(fread(paste0("gzcat ", path,"immune_stimulated_expression_matrix.txt.gz")))
dim(stim.data)
rownames(stim.data) <- stim.data$V1
stim.data$V1 <- NULL
dim(stim.data)
#head(stim.data[,1:5])

# Set up control object
ctrl <- CreateSeuratObject(raw.data = ctrl.data, project = "IMMUNE_CTRL", min.cells = 5)
ctrl@meta.data$stim <- "CTRL"
ctrl <- FilterCells(ctrl, subset.names = "nGene", low.thresholds = 500, high.thresholds = Inf)
ctrl <- NormalizeData(ctrl)
ctrl <- ScaleData(ctrl, display.progress = F)
# Set up stimulated object
stim <- CreateSeuratObject(raw.data = stim.data, project = "IMMUNE_STIM", min.cells = 5)
stim@meta.data$stim <- "STIM"
stim <- FilterCells(stim, subset.names = "nGene", low.thresholds = 500, high.thresholds = Inf)
stim <- NormalizeData(stim)
stim <- ScaleData(stim, display.progress = F)

# Gene selection for input to CCA
ctrl <- FindVariableGenes(ctrl, do.plot = F)
stim <- FindVariableGenes(stim, do.plot = F)
g.1 <- head(rownames(ctrl@hvg.info), 2000)
g.2 <- head(rownames(stim@hvg.info), 2000)
genes.use <- union(g.1, g.2)

# combine data
ctrl.data.sel <- ctrl.data[genes.use, rownames(ctrl@meta.data)]
stim.data.sel <- stim.data[genes.use, rownames(stim@meta.data)]
all.counts <- cbind(ctrl.data.sel, stim.data.sel)
dim(all.counts)
# overdispersed genes
#saveRDS(all.counts, file = "immune_control_stimulated_filtered_counts.Rds")

Genes Detected

# load data
all.counts <- readRDS("~/Dropbox/Kang.2017/IASVA/By_Celltype/Kang.2017.by.cell.CD8 T.overdispersed_raw_counts.Rds")
dim(all.counts)

# load in seurat annotations
imm.comb <- readRDS("~/Dropbox/Kang.2017/Kang.2017.PBMC.stimulated.Seurat.immune.combined.Rds")
imm.meta <- imm.comb@meta.data
imm.meta <- cbind(imm.meta, imm.comb@ident)
colnames(imm.meta)[ncol(imm.meta)] <- "celltype"
dim(imm.meta)
all.meta <- imm.meta
imm.meta <- imm.meta[imm.meta$celltype == "CD8 T",]
table(rownames(imm.meta) == colnames(all.counts))

# calculate lib size
#pdf(file = paste(name, cell_interest, "IASVA.genes.detected.hist.pdf", sep = "."))
Num_Detected_Genes <- colSums(all.counts > 0)
hist(Num_Detected_Genes, col = "dodgerblue4", breaks = 20, ylab = "Number of Cells")
summ <- summary(Num_Detected_Genes)
legend("topright", legend = paste(names(summ), summ))
abline(v = 500, col = "red", lty = 2)
legend("right", legend = paste("< 500 genes = ", length(which(Num_Detected_Genes < 500)), " Cells\n",
                                  "> 500 genes = ", length(which(Num_Detected_Genes > 500)), " Cells", sep = ""))
#dev.off()

Known Factors

Here, the only known factors are stimulation, celltype, and library size

# make vector of condition
Condition <- factor(imm.meta$stim)
CellType <- factor(imm.meta$celltype)
# normalize counts
counts.filt <- normalize(all.counts)
Geo_Lib <- colSums(log(counts.filt+1))
Num_Detected_Genes <- colSums(all.counts > 0)
# categorize lib size
Geo_Lib_Cat <- cut(Num_Detected_Genes, breaks = 4)
summary(Geo_Lib)
barplot(Geo_Lib, xlab="Cell", las=2, ylab = "Geometric Library Size")               
lcounts <- log(counts.filt + 1)
lcounts <- as.matrix(lcounts)
# PC1 and Geometric library size correlation
pc1 = irlba(lcounts - rowMeans(lcounts), 1)$v[,1] ## partial SVD
cor(Geo_Lib, pc1)
cor(Num_Detected_Genes, pc1)
cor(as.numeric(Condition), pc1)
cor(Num_Detected_Genes, Geo_Lib)

T-SNE prior to IASVA analysis

#tsne.res <- Rtsne(unique(t(lcounts)), dims = 2)
#saveRDS(tsne.res, file = paste(name, "tsne.preIASVA.Rds", sep = "."))

# load tsne results
tsne.res <- readRDS("~/Dropbox/Kang.2017/IASVA/By_Celltype/Kang.2017.by.cell.CD8 T.tsne.preIASVA.Rds")

# by celltype
par(mfrow=c(2,2))
#par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
plot(tsne.res$Y[,1:2], main="Pre IA-SVA", xlab="tSNE Dim1",
     ylab="tSNE Dim2", pch=20, col=color.vec[CellType], 
     bg=color.vec[CellType], cex = 0.5)
#par(xpd=TRUE)
legend(50,20, levels(CellType), border="white", fill=color.vec, bty="n", title = "Cell Type")
# by condition
#par(mar=c(5, 4, 4, 2), xpd=FALSE)
plot(tsne.res$Y[,1:2], main="Pre IA-SVA", xlab="tSNE Dim1",
     ylab="tSNE Dim2", pch=20, col=color.vec[Condition], 
     bg=color.vec[Condition], oma=c(4,4,6,12), cex = 0.5)
legend("topleft", levels(Condition), border="white", fill=color.vec, bty="n", title = "Condition")
# by genes detect
plot(tsne.res$Y[,1:2], main="Pre IA-SVA", xlab="tSNE Dim1",
     ylab="tSNE Dim2", pch=20, col=color.vec[Geo_Lib_Cat], 
     bg=color.vec[Geo_Lib_Cat], oma=c(4,4,6,12), cex = 0.5)
legend("topleft", levels(Geo_Lib_Cat), border="white", fill=color.vec, bty="n", title = "Genes Detected")

Run IASVA using num.sv = 10

# run iasva
mod <- model.matrix(~Num_Detected_Genes)
summ_exp <- SummarizedExperiment(assays = as.matrix(counts.filt))
pct_cutt <- 1
iasva.res<- fast_iasva(summ_exp, mod[,-1, drop = F], verbose=FALSE, pct.cutoff = pct_cutt, num.sv = 10)
# load results
#iasva.res <- readRDS("/Users/lawlon/Documents/CZI/Seurat_Public_Data/immune_alignment_expression_matrices/IASVA/By_Celltype/Kang.2017.by.cell.CD8 T.IASVA.1.output.Rds")
iasva.sv <- iasva.res$sv
#saveRDS(iasva.res, file = paste(name, "IASVA", pct_cutt, "output.Rds", sep = "."))
#markers_pct1 <- find_markers(Y = summ_exp, iasva.sv = iasva.sv)
#markers_pct1 <- readRDS("/Users/lawlon/Documents/CZI/Seurat_Public_Data/immune_alignment_expression_matrices/IASVA/By_Celltype/Kang.2017.by.cell.CD8 T.IASVA.1.markers.Rds")

# revised find markers function
#source("/Users/lawlon/iasva_markers_update/R/find_markers.R")
#all_markers <- .GlobalEnv$find_markers(Y = summ_exp, iasva.sv = iasva.sv)
# all_markers <- readRDS("/Users/lawlon/Documents/CZI/Seurat_Public_Data/immune_alignment_expression_matrices/IASVA/All_Cells/Kang.2017.all.cells.IASVA.by.SV.markers.Rds")

#saveRDS(markers_pct1, file = paste(name, "IASVA", pct_cutt, "markers.Rds", sep = "."))
# tsne on markers
# set.seed(100)
# if (ncol(markers_pct1) > 100) {
#   perplex <- 30
# } else {
#   perplex <- round(ncol(markers_pct1)/4)
# }
# tsne.res.iasva <- Rtsne(unique(t(log(markers_pct1+1))), dims = 2, perplexity = perplex)
# saveRDS(tsne.res.iasva, file = paste(name, cell_interest, stim_cond, "IASVA", pct_cutt, "tsne.markers.Rds", sep = "."))

Visualize SVs

ncol(iasva.sv)
# if (ncol(iasva.sv) > 5) {
#   pairs(iasva.sv[,1:5], main="IA-SVA", pch=20, col=color.vec[CellType],
#       bg=color.vec[CellType], cex=0.5, oma=c(4,4,6,12))
#   legend("right", levels(CellType), border="white", fill=color.vec, bty="n")
#   pairs(iasva.sv[,1:5], main="IA-SVA", pch=20, col=color.vec[Condition],
#       bg=color.vec[Condition], cex=0.5, oma=c(4,4,6,12))
#   legend("right", levels(Condition), border="white", fill=color.vec, bty="n")
#   pairs(iasva.sv[,1:5], main="IA-SVA", pch=20, col=color.vec[Geo_Lib_Cat],
#       bg=color.vec[Geo_Lib_Cat], cex=0.5, oma=c(4,4,6,12))
#   legend("right", levels(Geo_Lib_Cat), border="white", fill=color.vec, bty="n")
# } else {
  par(mar=c(5.1, 4.1, 4.1, 8.2), xpd=TRUE)
  pairs(iasva.sv, main="IA-SVA", pch=20, col=color.vec[Condition],
      bg=color.vec[Condition], cex=0.5, oma=c(4,4,6,12))
  legend("right", levels(Condition), border="white", fill=color.vec, bty="n")


# }

Use SVs separate cells. For example, SV1, SV4, SV8 are good candidates for further analysis.

Correlate SVS

iasva_vars <- cbind(iasva.sv, Geo_Lib, Condition)
corrplot(cor(iasva_vars))

Marker genes for SV1

SV1.marker.counts <- leedonghyung::find_markers(summ_exp, as.matrix(iasva.sv[,1]), rsq.cutoff=0.2)
dim(SV1.marker.counts)

anno.col <- data.frame(Condition = Condition, Genes_Detect = Geo_Lib_Cat, #CellType = CellType, 
                       SV1 = iasva.sv[,1]) #SV4= iasva.sv[,4])
rownames(anno.col) <- colnames(SV1.marker.counts)

pheatmap(log(SV1.marker.counts+1), show_colnames =FALSE, 
         clustering_method = "ward.D2",cutree_cols = 3,annotation_col = anno.col)
         #annotation_colors = anno.colors)

Marker genes for SV4

SV4.marker.counts <- leedonghyung::find_markers(summ_exp, as.matrix(iasva.sv[,4]), rsq.cutoff=0.2)
dim(SV4.marker.counts)

anno.col <- data.frame(Condition = Condition, Genes_Detect = Geo_Lib_Cat, #CellType = CellType, 
                       SV4 = iasva.sv[,4]) 
rownames(anno.col) <- colnames(SV4.marker.counts)

pheatmap(log(SV4.marker.counts+1), show_colnames =FALSE, 
         clustering_method = "ward.D2",cutree_cols = 3,annotation_col = anno.col)
         #annotation_colors = anno.colors)

Marker genes for SV8

SV8.marker.counts <- leedonghyung::find_markers(summ_exp, as.matrix(iasva.sv[,8]), rsq.cutoff=0.1)
dim(SV8.marker.counts)

anno.col <- data.frame(Condition = Condition, Genes_Detect = Geo_Lib_Cat, #CellType = CellType, 
                       SV8 = iasva.sv[,8]) 
rownames(anno.col) <- colnames(SV8.marker.counts)

pheatmap(log(SV8.marker.counts+1), show_colnames =FALSE, 
         clustering_method = "ward.D2",cutree_cols = 3,annotation_col = anno.col)
         #annotation_colors = anno.colors)

Marker genes for SV1 and SV4

SV1.SV4.marker.counts <- leedonghyung::find_markers(summ_exp, iasva.sv[,c(1,4)], rsq.cutoff=0.2)
dim(SV1.SV4.marker.counts)

anno.col <- data.frame(Condition = Condition, Genes_Detect = Geo_Lib_Cat, #CellType = CellType, 
                       SV1 = iasva.sv[,1], SV4= iasva.sv[,4])
rownames(anno.col) <- colnames(SV1.SV4.marker.counts)

pheatmap(log(SV1.SV4.marker.counts+1), show_colnames =FALSE, 
         clustering_method = "ward.D2",cutree_cols = 3,annotation_col = anno.col)
         #annotation_colors = anno.colors)

Modules enriched in each SV

all_markers <- find_markers2(summ_exp, iasva.sv[,c(1,4,8)], rsq.cutoff=0.1)

design <- model.matrix(~Condition)
fit <- eBayes(lmFit(counts.filt, design))
tt <- topTable(fit, coef=2, number=Inf, genelist = rownames(counts.filt))
head(tt, 10)

# iterative analysis of SVs an modules
all_sv_res <- list()
par(mfrow = c(3,1), mar = c(4,2,2,2))
for (j in 1:3) {
    sv_genes <- all_markers[[j]]
    # module analysis
    fg <- tt[sv_genes,]
    fg.ord <- fg[order(fg$logFC, decreasing = T),]
    res <- tmodHGtest(fg = fg.ord$ID, bg = tt$ID)
    res.ord <- res[order(res$E, decreasing = T),]
    # append results
    all_sv_res[[j]] <- res.ord
    names(all_sv_res)[j] <- names(all_markers)[j]
    if (nrow(res.ord) > 1) {
      # make plots
    bp1 <- barplot(res.ord$E, main = paste(names(all_markers)[j], " Module Enrichment", sep = ""), xlab = "", horiz = T,
                  cex.main = 2, cex.lab = 2)
    #barplot(res.ord$E, xlab = "Module Enrichment", main = paste(names(all_markers)[j], " Genes", sep = ""), horiz = T)
    par(xpd=TRUE)
    text(x = max(res.ord$E)/2, y = bp1[,1], labels = paste(res.ord$Title, " (", res$b, "/", res$B, ")", sep = ""), cex = 1, font = 2)
    }
}

Differential Expression Analysis

# create seurat object
cd8_norm <- imm.meta[imm.meta$celltype == "CD8 T" & imm.meta$stim == "CTRL",]
cd8_stim <- imm.meta[imm.meta$celltype == "CD8 T" & imm.meta$stim == "STIM",]

cd8.norm.ids <- gsub("\\.","-",rownames(cd8_norm))
cd8.stim.ids <- gsub("\\.","-",rownames(cd8_stim))

all.counts.raw <- cbind(ctrl.data[, cd8.norm.ids], stim.data[, cd8.stim.ids])
all.anns <- rbind(cd8_norm, cd8_stim)
all_dat <- CreateSeuratObject(raw.data = all.counts.raw, min.cells = 5, project = "CD8_Cells")
all_dat@meta.data$Stim <- all.anns$stim
all_dat <- NormalizeData(all_dat)
all_dat <- ScaleData(all_dat, display.progress = F)
# find markers
all_dat <- SetAllIdent(object = all_dat, id = "Stim")
stim_dge <- FindMarkers(object = all_dat, ident.1 = "STIM", ident.2 = "CTRL")
stim_dge_sig_up <- stim_dge[stim_dge$p_val_adj < 0.05 & stim_dge$avg_logFC > 0,]
stim_dge_sig_dw <- stim_dge[stim_dge$p_val_adj < 0.05 & stim_dge$avg_logFC < 0,]

## find markers based on raw data.
summ_exp_raw <- SummarizedExperiment(assays = as.matrix(all.counts.raw))
all_markers_diff <- find_markers2(summ_exp_raw, as.matrix(iasva.sv[,c(1)]), rsq.cutoff=0.1) ## For fair comparison.
sv1_genes <- all_markers_diff$All_Unique_Markers
length(sv1_genes)
# venn diagram of iasva markers
library(gtools)
# calc fold change of iasva markers (SV1)
dat_norm <- as.matrix(all_dat@data)
dat_norm_ctrl <- dat_norm[, cd8.norm.ids]
dat_norm_stim <- dat_norm[, cd8.stim.ids]
sv1_fc <- NULL
for (k in 1:length(sv1_genes)) {
  fcs <- foldchange(num = mean(dat_norm_stim[sv1_genes[k],]), denom = mean(dat_norm_ctrl[sv1_genes[k],]))
  sv1_fc <- c(sv1_fc, fcs)
}
logfc <- log2(sv1_fc)
length(which(logfc > 0))
int_genes <- intersect(sv1_genes, rownames(stim_dge_sig_up))
grid.newpage()
draw.pairwise.venn(area1 = nrow(stim_dge_sig_up), area2 = length(sv1_genes),
                   cross.area = length(int_genes), category = c("Differential Expression (Wilcox)", "IA-SVA"),
                   cat.cex = 2, cex = 3, cat.pos = 0)
# gene list
int_genes

# make a scatterplot
avg.exp <- log1p(AverageExpression(all_dat, show.progress = FALSE))
avg.exp$gene <- rownames(avg.exp)
genes.to.label1 = sv1_genes
# label only a few items
avg.exp$labelint <- ""
avg.exp$up <- ""
ix_label <- which(avg.exp$gene %in% genes.to.label1)
avg.exp$labelint[ix_label] <- avg.exp$gene[ix_label]
ix_up <- which(avg.exp$gene %in% rownames(stim_dge_sig_up))
avg.exp$up[ix_up] <- avg.exp$gene[ix_up]
# character vector of colors
delta_colz <- character(length = nrow(avg.exp))
for (k in 1:nrow(avg.exp)) {
  if (avg.exp$labelint[k] != "") {
    delta_colz[k] <- "red"
  } else if (avg.exp$up[k] != "") {
    delta_colz[k] <- "blue"
  } else {
    delta_colz[k] <- "grey50"
  }
}

p3 <- ggplot(avg.exp, aes(CTRL, STIM)) + geom_point() + ggtitle("CD8 T Cells") +
  geom_point(color = delta_colz) +
  geom_text_repel(aes(label = labelint), size=2, show.legend = F, nudge_x = -0.5) + 
  annotate("text", x = 3, y = 6, label = paste("IA-SVA and DGE Genes (", length(int_genes), ")", sep = ""), color = "red") + 
  annotate("text", x = 3, y = 5.5, label = paste("DGE Genes Only (", nrow(stim_dge_sig_up)-length(int_genes), ")", sep = ""), color = "blue")
  #geom_text_repel(aes(label = up), size=2, show.legend = F, nudge_x = 0.5) 
plot(p3)

Session Information

sessionInfo()

dleelab/leedonghyung documentation built on May 7, 2019, 8:43 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dleelab/leedonghyung
a set of useful R functions

In dleelab/leedonghyung: a set of useful R functions

Load packages

Filter genes based on over-dispersion

Genes Detected

Known Factors

T-SNE prior to IASVA analysis

Run IASVA using num.sv = 10

Visualize SVs

Correlate SVS

Marker genes for SV1

Marker genes for SV4

Marker genes for SV8

Marker genes for SV1 and SV4

Modules enriched in each SV

Differential Expression Analysis

Session Information

R Package Documentation

Browse R Packages

We want your feedback!

dleelab/leedonghyung a set of useful R functions

In dleelab/leedonghyung: a set of useful R functions

Load packages

Filter genes based on over-dispersion

Genes Detected

Known Factors

T-SNE prior to IASVA analysis

Run IASVA using num.sv = 10

Visualize SVs

Correlate SVS

Marker genes for SV1

Marker genes for SV4

Marker genes for SV8

Marker genes for SV1 and SV4

Modules enriched in each SV

Differential Expression Analysis

Session Information

R Package Documentation

Browse R Packages

We want your feedback!

dleelab/leedonghyung
a set of useful R functions