scCD4:

# Will Connell
# 2018.09.20

# Monocle Analysis - REPREX


# Depends -----------------------------------------------------------------
# source("http://bioconductor.org/biocLite.R")
# biocLite()
# biocLite("monocle")

library(rhdf5)
library(monocle)
library(dplyr)
library(tictoc)
library(readr)
library(stringr)
devtools::load_all()

# Prepare output paths ----------------------------------------------------
start.date <- Sys.Date()
out.dir <- sprintf("../out/%s_monocle-mat2-clust0", start.date)
dir.create(out.dir)

print('Number of cores:')
print(detectCores())

# Import ------------------------------------------------------------------
tic('Importing & Preparing Data.....')

# view available strucutre
# file_loc <- "/ye/yelabstore2/dosageTF/tfko_140/combined/nsnp20.raw.sng.km_vb1_default.norm.h5ad"
#  ----- 2018.09.28: UPDATE NEW MATRIX -----------
file_loc <- "/ye/yelabstore2/dosageTF/tfko_140/combined/nsnp20.raw.sng.guide_sng.norm.h5ad"
h5ls(file_loc)

# import meta data
path_cell_meta <- "~/ye/projects/scanpy/KO_cells.csv"
obs <- read_csv(path_cell_meta) # observations, or rows, including metadata
vars <- h5read(file_loc, "/var") # variables, or columns

# subset a sample to effectively
#clust_indices <- sample(1:nrow(obs), 20000)
clust_indices <- which(obs$louvain == 0)
#clust_indices <- sample(clust_indices, 25000)
h5closeAll()
data <- h5read(file_loc, "/X", index = list(NULL, clust_indices)) #normalized matrix
obs_indices <- obs[clust_indices,]

data <- data %>%
  t() %>%
  as.data.frame()
colnames(data) <- c(vars$index)
data <- cbind(obs_indices, data) %>%
  mutate_if(is.array, as.vector) %>%
  mutate(guide_cov = str_extract(guide_cov, "^[:alnum:]+\\.[:digit:]+")) %>%
  mutate(guide_cov = if_else(is.na(guide_cov), '0', guide_cov))

print("Dimensions of data:")
print(dim(data))

# import EN/DE guides
guides <- read_tsv("/netapp/home/rgate/tfko_140/nsnp20.raw.sng.guide_sng.meta.guidecluster.enrichment.zsig.txt")

tidy_guides <- apply(guides, 1, function(x) strsplit(x, split = ","))
enriched_guides <- tidy_feats(tidy_guides, "enriched")
# enriched guides for cluster X
clust1_guide_data <- enriched_guides %>%
  filter(louvain == 0)

# Prepare for CellDataSet object ---------------------------------------------
expr_matrix <- t(as.matrix(data[11:ncol(data)]))
colnames(expr_matrix) <- data$index
sample_sheet <- data[2:10]
rownames(sample_sheet) <- data$index
gene_annotation <- data.frame(gene_short_name = colnames(data[11:ncol(data)]))
rownames(gene_annotation) <- colnames(data[11:ncol(data)])

# create CellDataSet object
pd <- new("AnnotatedDataFrame", data = sample_sheet)
fd <- new("AnnotatedDataFrame", data = gene_annotation)
# choose a data distribution
# using guassianff() here since data is already normally dist,
# on raw want to use negbinomial.size()
HSMM <- newCellDataSet(as(expr_matrix, 'sparseMatrix'),
                       phenoData = pd,
                       featureData = fd,
                       lowerDetectionLimit = 0.5,
                       expressionFamily = inv.gaussianff()) # must check this inv.gaussian()

toc()


# METHOD: Construct single cell trajectories ------------------------------
tic('Gene abnd cell filtering.....')

# use low level of expresison detection - these genes
# account for DE genes already so want to keep them all
# skip cell filtering in general...already preprocessed
HSMM <- detectGenes(HSMM, min_expr = 0.0001)
print('Number cells expressing retained genes summary:')
print(summary(fData(HSMM)$num_cells_expressed))

# superset of genes expressed in at least 5% of all cells
fData(HSMM)$use_for_ordering <-
  fData(HSMM)$num_cells_expressed > 0.05 * ncol(HSMM)
# percentage of retained genes
print('Percentage of retained genes:')
print(sum(fData(HSMM)$use_for_ordering) / length(fData(HSMM)$use_for_ordering))
toc()

# visualize importance of PCs by how much
# variation they explain
png(sprintf('%s/pc_variance.png', out.dir), width = 6, height = 6, units = 'in', res = 200)
plot_pc_variance_explained(HSMM, return_all = F)
dev.off()

tic('Calc PCs and reduce via tSNE...')
# reduce the top PCs further using tSNE
HSMM <- reduceDimension(HSMM,
                        max_components = 6,
                        norm_method = 'none',
                        pseudo_expr = 0,
                        #num_dim = 4,
                        reduction_method = 'tSNE',
                        cores = round(detectCores()*0.9),
                        verbose = T)
toc()
# tmp save reduced model
saveRDS(HSMM, sprintf("%s/analyzed_obj.RDS", out.dir))

tic('Louvain sub-clustering.....')
# run density peak clustering to identify the
# clusters on the 2-D t-SNE space
HSMM <- clusterCells(HSMM,
                     verbose = F,
                     cores = round(detectCores()*0.9),
                     #delta_threshold = 5,
                     #rho_threshold = 400,
                     method = "louvain",
                     k = round(sqrt(dim(HSMM)[[2]])) ,
                     louvain_iter = 3)
toc()

# visualize
pData(HSMM)$louvain <- as.factor(pData(HSMM)$louvain)
pData(HSMM)$guide_cov <- as.factor(pData(HSMM)$guide_cov)
pData(HSMM)$wt <- as.factor(pData(HSMM)$guide_cov == "0")
# vectorize enriched guides in cluster 1 vs non
pData(HSMM)$en.gu.clust0 <- replace(as.vector(pData(HSMM)$guide_cov),
                                    !(as.vector(pData(HSMM)$guide_cov)%in% clust1_guide_data$guide),
                                    "non-enriched_guides")
pData(HSMM)$en.gu.clust0 <- as.factor(pData(HSMM)$en.gu.clust0)
# facet plot all phenotypically encoded data types
louv_plot <- plot_cell_clusters(HSMM, color_by = 'louvain')
clust_plot <- plot_cell_clusters(HSMM, color_by = 'Cluster')
guide_plot <- plot_cell_clusters(HSMM, color_by = 'en.gu.clust0')
wt_plot <-  plot_cell_clusters(HSMM, color_by = 'wt')
png(sprintf('%s/subclusters.png', out.dir), width = 30, height = 10, units = 'in', res = 200)
gridExtra::grid.arrange(clust_plot, louv_plot, wt_plot, guide_plot, ncol = 4)
dev.off()

# visualize cell local density (P) vs.
# nearest comparative cell distance (delta)
# for cluster understanding
# Can only employ this plot when using
# Density Peak clustering
# png(sprintf('%s/rho_delta_thresholds.png', out.dir), width = 6, height = 6, units = 'in', res = 200)
# plot_rho_delta(HSMM)
# dev.off()

# HSMM <- clusterCells(HSMM,
#                      rho_threshold = 23,
#                      delta_threshold = 23,
#                      skip_rho_sigma = T,
#                      verbose = F)
# louv_plot <- plot_cell_clusters(HSMM, color_by = 'louvain')
# clust_plot <- plot_cell_clusters(HSMM, color_by = 'Cluster')
# gridExtra::grid.arrange(louv_plot, clust_plot, ncol = 2)

# add DE guides to visualize and cal DE genes by....
pData(HSMM)$louvain <- as.factor(pData(HSMM)$louvain)
pData(HSMM)$guide_cov <- as.factor(pData(HSMM)$guide_cov)
pData(HSMM)$cluster_guide <- as.factor(paste0(pData(HSMM)$louvain, "-", pData(HSMM)$guide_cov))

tic('DE gene test....')
# perform DE gene test to extract distinguishing genes
clustering_DEG_genes <- differentialGeneTest(HSMM,
                                             fullModelFormulaStr = "~en.gu.clust0 + louvain + guide_cov",
                                             reducedModelFormulaStr = "~louvain + guide_cov",
                                             cores = round(detectCores()*0.9))
toc()

# select top significant genes
HSMM_ordering_genes <-
  row.names(clustering_DEG_genes)[order(clustering_DEG_genes$qval)][1:200]

HSMM <- setOrderingFilter(HSMM,
                          ordering_genes = HSMM_ordering_genes)

tic('Reduce dimensions via DDRTree.....')
HSMM <- reduceDimension(HSMM, method = 'DDRTree', cores = round(detectCores()*0.9))
toc()

tic('Order cells according to learned trajectory of pseudotime.....')
HSMM <- orderCells(HSMM)
toc()

# visualize
louv_plot <- plot_cell_trajectory(HSMM, color_by = "louvain")
clust_plot <- plot_cell_trajectory(HSMM, color_by = "Cluster")
state_plot <- plot_cell_trajectory(HSMM, color_by = "State")
pseudo_plot <- plot_cell_trajectory(HSMM, color_by = "Pseudotime")
guide_plot <- plot_cell_trajectory(HSMM, color_by = "en.gu.clust0")
wt_plot <- plot_cell_trajectory(HSMM, color_by = "wt")
png(sprintf('%s/trajectory.png', out.dir), width = 30, height = 20, units = 'in', res = 200)
gridExtra::grid.arrange(louv_plot, clust_plot, state_plot,
                        pseudo_plot, wt_plot, guide_plot,
                        ncol = 3, nrow = 3)
dev.off()

# get DE genes associated with each EN guide
# sigpos guide-gene combos
path_guide_data <- "/ye/yelabstore2/dosageTF/tfko_140/combined/nsnp20.raw.sng.guide_sng.norm.vs0.igtb.guide.de.seurate.txt.meta.guide.meta.sigpos.txt"
guide_data <- read_tsv(path_guide_data) %>%
  rename(guide = cluster) %>%
  select(guide, gene, fdr) %>%
  mutate(guide = str_extract(guide, "^[:alnum:]+\\.[:digit:]+")) %>%
  filter(guide %in% clust1_guide_data$guide)

png(sprintf("%s/EN-gene-traj.png", out.dir), width = 15, height = 15, units = 'in', res = 200)
plot_cell_trajectory(HSMM, color_by = "en.gu.clust0", markers = guide_data$gene, use_color_gradient = FALSE,
                     markers_linear = TRUE)
dev.off()

# save object for futher use
saveRDS(HSMM, sprintf("%s/analyzed_obj.RDS", out.dir))