# library(flexdashboard) # library(data.table::data.table) # library(magrittr) # library(kableExtra) # library(RColorBrewer) # library(ggplot2) # library(cicero) # library(viridis)
read_conf <- function(){ system(paste0('grep = ', params$configure_user, " | grep -v ^# | awk -F= '{print $1}' | awk '{$1=$1;print}' > ", params$output_dir, "/vrs.txt")) system(paste0('grep = ', params$configure_user, " | grep -v ^# | awk -F= '{print $2}' | awk -F# '{print $1}' | awk '{$1=$1;print}' > ", params$output_dir, "/vls.txt")) vrs = readLines(paste0(params$output_dir, '/vrs.txt')) vls = readLines(paste0(params$output_dir, '/vls.txt')) for(i in 1:length(vrs)){ assign(vrs[i], vls[i], envir = .GlobalEnv) } # system(paste0('rm', params$output_dir, '/vrs.txt')) # system(paste0('rm', params$output_dir, '/vls.txt')) } read_conf() plotEPS = as.logical(plotEPS) mapping_qc_file = paste0(params$output_dir, '/summary/', OUTPUT_PREFIX, '.MappingStats') fragments_file = paste0(params$output_dir, '/summary/', OUTPUT_PREFIX, '.fragments.txt') frags = data.table::fread(fragments_file) names(frags) = c('chr', 'start', 'end', 'bc', 'ndup') mapping_qc = data.table::fread(mapping_qc_file, header = F) lib_complx = nrow(frags)/sum(frags$ndup) lib_complx = paste0(100 * round(lib_complx, 3), '%') mapping_qc$frac = round(mapping_qc$V2/mapping_qc$V2[1], 3) mapping_qc$frac = paste0(100*mapping_qc$frac, '%') mapping_qc = rbind(mapping_qc, data.frame(V1 ='Library Complexity (nonredudant fraction)', V2 = '', frac = lib_complx)) kable(mapping_qc, col.names = NULL, format = 'html', caption = paste('Sample:', OUTPUT_PREFIX)) %>% kable_styling("striped", full_width = F, position = 'left', font_size = 15)
cell_mapping_qc_file = paste0(params$output_dir, '/summary/cell_barcodes.MappingStats') cell_mapping_qc = data.table::fread(cell_mapping_qc_file, header = F) bc_stat_file = paste0(params$output_dir, '/summary/', OUTPUT_PREFIX, '.qc_per_barcode.txt') selected_bcs = paste0(params$output_dir, '/filtered_matrix/', PEAK_CALLER, '/', CELL_CALLER, '/barcodes.txt') bc_stat = data.table::fread(bc_stat_file) barcodes = data.table::fread(selected_bcs, header = F)$V1 qc_sele = bc_stat[bc %in% barcodes, ] qc_nonsele = bc_stat[!bc %in% barcodes, ] #frags = frags[bc %in% barcodes] ncells = length(barcodes) mapq30.frac.in.cell = paste0(round(sum(frags[bc %in% barcodes]$ndup)/sum(frags$ndup), 3) * 100, '%') frac.in.cell = paste0(round(cell_mapping_qc$V2[2]/as.integer(as.character(mapping_qc$V2[2])), 3) * 100, '%') med.frag.per.cell = round(median(qc_sele$total_frags)) frac.uniq = paste0(round(nrow(frags[bc %in% barcodes])/sum(frags[bc %in% barcodes]$ndup), 3) * 100, '%') cell.table = data.frame(c(CELL_CALLER, paste0(ncells), paste0(med.frag.per.cell), frac.in.cell, mapq30.frac.in.cell)) rownames(cell.table) = c('Cell called by', 'Estimated # of cells', 'Median fragments per cell', 'Fraction of Mapped reads in cells', 'Fraction of MAPQ30 in cells') kable(cell.table, row.names = T, col.names = NULL, format = 'html') %>% kable_styling("striped", full_width = F, position = 'left', font_size = 15)
if(CELL_MAP_QC){ lib_complx = frac.uniq cell_mapping_qc$frac = round(cell_mapping_qc$V2/cell_mapping_qc$V2[1], 3) cell_mapping_qc$frac = paste0(100*cell_mapping_qc$frac, '%') cell_mapping_qc = rbind(cell_mapping_qc, data.frame(V1 ='Library Complexity (nonredudant fraction)', V2 = '', frac = lib_complx)) kable(cell_mapping_qc, col.names = NULL, format = 'html') %>% kable_styling("striped", full_width = F, position = 'left', font_size = 15) }
bc_stat[, 'group' := ifelse(bc %in% barcodes, 'cell', 'non-cell')] # library(ggplot2) # library(grid) nsub_frags = min(15000, nrow(bc_stat)) g <- ggplot(data = bc_stat[sample(1:nrow(bc_stat), nsub_frags), ], aes(x = total_frags, y = frac_peak, col = group)) + geom_point(size = 0.5) + scale_x_continuous(trans='log10') + theme_bw() + theme(legend.position = 'none', legend.title=element_blank(), axis.text = element_text(size = 15, family = "Helvetica"), axis.title = element_text(size = 18, family = "Helvetica")) + xlab('Total #Unique Fragments') + ylab('Fraction in Peak') #bb = ggplot_build(g) #cols = unique(bb[[1]][[1]]$colour) text1 <- grobTree(textGrob("Cell", x=0.8, y=0.93, hjust=0, gp=gpar(col='#F8766D', fontsize=15, fontface = 'bold', fontfamily = "Helvetica"))) text2 <- grobTree(textGrob("Non-cell", x=0.8, y=0.83, hjust=0, gp=gpar(col='#00BFC4', fontsize=15, fontface = 'bold', fontfamily = "Helvetica"))) g <- g + annotation_custom(text1) + annotation_custom(text2) g if(plotEPS){ system(paste0('mkdir -p ', params$output_dir, '/summary/Figures')) pfname = paste0(params$output_dir, '/summary/Figures/fracInPeak_vs_totalFragments.eps') ggsave(g, file = pfname, device = 'eps', width = 6, height = 6) }
frags[, 'isize' := end - start] frags = frags[sample(1:nrow(frags), 1000000), ] p1 <- ggplot(data = frags[isize < 800], aes(x = isize)) + geom_density(fill = 'lightblue') + xlab('Insert Size') + ylab('Density') + theme_bw() + theme(legend.title=element_blank(), legend.background = NULL, axis.text = element_text(size = 15, family = "Helvetica"), axis.title = element_text(size = 18, family = "Helvetica")) p1 if(plotEPS){ pfname = paste0(params$output_dir, '/summary/Figures/dist_insert_size.eps') ggsave(p1, file = pfname, device = 'eps', width = 6, height = 6) }
#tss_escore_file = paste0(params$output_dir, '/signal/', OUTPUT_PREFIX, '.aggregated.mtx.gz') tss_escore_file = paste0(params$output_dir, '/signal/cell_barcodes.aggregated.mtx.gz') set.cols = brewer.pal(n=5, name = 'Dark2') tss.mat = data.table::fread(tss_escore_file) tss.mat = tss.mat[, -c(1:6)] tss.mat[is.na(tss.mat)] = 0 tss.escore = colSums(tss.mat) ma <- function(x, n = 10){stats::filter(x, rep(1 / n, n), sides = 2)} tss.escore = ma(tss.escore) tss.escore = tss.escore[14:213] df = data.table::data.table(index = 10*(-100:99), escore = tss.escore/tss.escore[1]) p0 <- ggplot(data = df, aes(x = index, y = escore)) + geom_line(size = 1, col = set.cols[1]) + theme_bw() + xlab('Distance to TSS (bp)') + ylab('TSS enrichment score') + theme(legend.title=element_blank(), axis.text = element_text(size = 15, family = "Helvetica"), axis.title = element_text(size = 18, family = "Helvetica")) p0 if(plotEPS){ pfname = paste0(params$output_dir, '/summary/Figures/tss_enrich.eps') ggsave(p0, file = pfname, device = 'eps', width = 6, height = 6) }
bc_stat[, 'group' := ifelse(bc %in% barcodes, 'cell', 'non-cell')] p <- ggplot(data = bc_stat, aes(x = total_frags, fill = group)) + geom_density() + scale_x_continuous(trans = 'log10') + theme_bw() + theme(legend.position='none', legend.title=element_blank(), axis.title = element_text(size = 18, family = "Helvetica"), axis.text = element_text(size = 15, family = "Helvetica")) + xlab('Total #Unique Fragments') + ylab('Density') #bb = ggplot_build(p) #cols = unique(bb$data[[1]][['fill']]) text1 <- grobTree(textGrob("Cell", x=0.8, y=0.93, hjust=0, gp=gpar(col='#F8766D', fontsize=15, fontface = 'bold', fontfamily = "Helvetica"))) text2 <- grobTree(textGrob("Non-cell", x=0.8, y=0.83, hjust=0, gp=gpar(col='#00BFC4', fontsize=15, fontface = 'bold', fontfamily = "Helvetica"))) p <- p + annotation_custom(text1) + annotation_custom(text2) p #ggplotly(p) if(plotEPS){ system(paste0('mkdir -p ', params$output_dir, '/summary/Figures')) pfname = paste0(params$output_dir, '/summary/Figures/dist_frags.eps') ggsave(p, file = pfname, device = 'eps', width = 6, height = 6) }
qc_sele_df = data.table::data.table(frac = c(qc_sele$frac_peak, qc_sele$frac_tss, qc_sele$frac_promoter, qc_sele$frac_enh, qc_sele$frac_mito), 'type' = rep(c('Peaks', 'Tss', 'Promoter', 'Enhancer', 'Mito'), each = nrow(qc_sele))) qc_sele_df$type = factor(qc_sele_df$type, levels = c('Peaks', 'Tss', 'Promoter', 'Enhancer', 'Mito')) p0 <- ggplot(data = qc_sele_df, aes(y = frac, x = type, fill = type)) + ylab('Fraction') + theme_bw() + geom_boxplot(outlier.size = 0.01, show.legend = FALSE) + theme(legend.position = 'none', axis.text = element_text(size = 18, family = "Helvetica"), axis.title.x = element_blank(), axis.title.y = element_text(size = 18, family = "Helvetica")) + xlab('') p0 if(plotEPS){ pfname = paste0(params$output_dir, '/summary/Figures/overlap_annotation.eps') ggsave(p0, file = pfname, device = 'eps', width = 6, height = 6) }
frac_peak = sum(qc_sele$total_frags * qc_sele$frac_peak)/sum(qc_sele$total_frags) frac_mito = sum(qc_sele$total_frags * qc_sele$frac_mito)/sum(qc_sele$total_frags) frac_promoter = sum(qc_sele$total_frags * qc_sele$frac_promoter)/sum(qc_sele$total_frags) frac_enh = sum(qc_sele$total_frags * qc_sele$frac_enhancer)/sum(qc_sele$total_frags) frac_tss = sum(qc_sele$total_frags * qc_sele$frac_tss)/sum(qc_sele$total_frags) fracs = data.frame(c(frac_peak, frac_promoter, frac_enh, frac_tss)) row.names(fracs) = c('Fraction in peaks', 'Fraction in promoters', 'Fraction in Enhancers(ENCODE)', 'Fraction in TSS') colnames(fracs) = 'pr' fracs$pr = round(fracs$pr, 3) fracs$pr = paste0(100*fracs$pr, '%') kable(fracs, row.names = T, col.names = NULL) %>% kable_styling(full_width = F, position = 'left', font_size = 15)
rm(frags) # library(Seurat) down.dir = paste0(params$output_dir, '/downstream_analysis/', PEAK_CALLER, '/', CELL_CALLER) seurat_file = paste0(down.dir, '/seurat_obj.rds') if(file.exists(seurat_file)){ ss = readRDS(seurat_file) cg <- Seurat::DimPlot(ss, reduction = 'umap', group.by = 'active_clusters', label = T) + scale_color_brewer(palette = "Paired") + theme(legend.text = element_text(size = 18, family = "Helvetica")) if(plotEPS){ pfname = paste0(params$output_dir, '/summary/Figures/umap_clusters.eps') ggsave(cg, file = pfname, device = 'eps', width = 6, height = 6) } cg }
go_file = paste0(down.dir, '/enrichedGO_differential_peak_cluster_', group1, '_VS_cluster_', group2, '.txt.xlsx') #go_file = paste0(down.dir, '/enrichedGO_differential_peak_cluster_', group1, # '_VS_cluster_', group2, '.txt.csv') if(file.exists(go_file)){ # library(xlsx) go_res1 = xlsx::read.xlsx(go_file, sheetName = paste0('cluster', group1)) go_res2 = xlsx::read.xlsx(go_file, sheetName = paste0('cluster', group2)) go_res = go_res1 group.sele = group1 if(nrow(go_res) == 0) { go_res = go_res2 group.sele = group2 } if(nrow(go_res) == 0) { print('No enriched terms in any of the two groups') }else{ #go_res = data.table::fread(go_file) go_res = data.table::data.table(go_res) go_res[, 'score' := -log10(p.adjust)] go_res = go_res[order(-score), ] go_res = go_res[1:20, ] go_res = go_res[order(score), ] go_res$Description = factor(go_res$Description, levels = go_res$Description) p_go <- ggplot(go_res, aes(y = score, x = Description, fill = Count)) + geom_bar(width = 0.7, stat = 'identity') + ggtitle(paste0("Enriched terms: cluster_", group2)) + theme_classic() + theme(legend.position = 'bottom', legend.direction = "horizontal") + coord_flip() + scale_fill_continuous(name = "#genes", type = "viridis") + xlab('') + ylab('-log10(p.adjust)') if(plotEPS){ pfname = paste0(params$output_dir, '/summary/Figures/enriched_GO.eps') ggsave(p_go, file = pfname, device = 'eps', width = 10, height = 6) } p_go } }
## check enriched TFs for each cluster # library(chromVAR) # library(BiocParallel) register(SerialParam()) # Do DA/DE with one cluster vs the rest clusters # clusters are the data frame with <barcode> <cluster> do_DA <- function(mtx_score, clusters, test = 'wilcox', only.pos = T, fdr = 0.05, topn = 10){ clusters$cluster = as.character(clusters$cluster) cls = unique(clusters$cluster) res = NULL features = rownames(mtx_score) for(cluster0 in cls){ bc0 = clusters[cluster == cluster0]$barcode mtx1 = mtx_score[, colnames(mtx_score) %in% bc0] mtx2 = mtx_score[, !colnames(mtx_score) %in% bc0] mu1 = sapply(1:length(features), function(x) mean(mtx1[x, ])) mu2 = sapply(1:length(features), function(x) mean(mtx2[x, ])) pvs = rep(0.5, length(features)) for(x in 1:length(features)){ a1 = mtx1[x, ] a2 = mtx2[x, ] if(length(which(!is.na(a1))) < 2 || length(which(!is.na(a2))) < 2) next pvs[x] = wilcox.test(a1, a2, alternative = 'greater')$p.value } pvs.adj = p.adjust(pvs, method = 'fdr') res0 = data.table::data.table('feature' = features, 'cluster' = cluster0, 'mean1' = mu1, 'mean2' = mu2, 'pv' = pvs, 'pv_adjust' = pvs.adj) res0 = res0[order(pv_adjust), ] res0 = res0[pv_adjust <= fdr] if(nrow(res0) > topn) res0 = res0[1:topn, ] res = rbind(res, res0) } return(res) } if(file.exists(seurat_file)){ metaData = ss@meta.data rm(ss) } if(file.exists(paste0(down.dir, '/chromVar_obj.rds'))){ chromVar.obj = readRDS(paste0(down.dir, '/chromVar_obj.rds')) #variability <- computeVariability(chromVar.obj) #variability = data.table::data.table(variability, stringsAsFactors = F) #variability = variability[order(-variability), ] ## calculate DA #dev = chromVar.obj@assays$data$deviations dev = deviations(chromVar.obj) da.res = do_DA(dev, clusters = data.table::data.table('barcode' = rownames(metaData), 'cluster' = metaData$active_clusters), topn = 10) rm(dev) write.csv(da.res, file = paste0(down.dir, '/differential_TF_cluster_enrich.txt'), quote = F, row.names = F ) ## plot enriched TFs in heatmap sele.tfs = da.res$feature #zscores = chromVar.obj@assays$data$z zscores = deviationScores(chromVar.obj) sele.zscores = zscores[sele.tfs, ] # change tf name to be more readable rnames = rownames(sele.zscores) nnames = sapply(rnames, function(x) unlist(strsplit(x, '_'))[3]) nnames1 = sapply(rnames, function(x) unlist(strsplit(x, '_'))[1]) rownames(sele.zscores) = ifelse(grepl(nnames, pattern = 'LINE'), nnames1, nnames) metaData$active_clusters = as.character(metaData$active_clusters) metaData = data.table::data.table(metaData, keep.rownames = T) setkey(metaData, active_clusters) rr = metaData$rn[metaData$rn %in% colnames(sele.zscores)] sele.zscores = sele.zscores[, rr] sele.zscores = sele.zscores[!duplicated(sele.zscores), ] ann_col = data.frame('cluster' = metaData$active_clusters) rownames(ann_col) = metaData$rn up_cut = quantile(sele.zscores, 0.95, na.rm = T) low_cut = quantile(sele.zscores, 0.05, na.rm = T) sele.zscores[is.na(sele.zscores)] = 0 low_cut = min(0, low_cut) sele.zscores[sele.zscores > up_cut] = up_cut sele.zscores[sele.zscores < low_cut] = low_cut cluster = brewer.pal(n=length(unique(metaData$active_clusters)), name = 'Paired') names(cluster) = sort(unique(metaData$active_clusters)) ann_colors = list('cluster' = cluster) # resample to reduce memory used set.seed(2019) rids = sort(sample(1:ncol(sele.zscores), floor(ncol(sele.zscores)/6))) ann_col0 = data.frame(ann_col[rids, ]) rownames(ann_col0) = colnames(sele.zscores)[rids] mtx0 = sele.zscores[, rids] names(ann_col0) = 'cluster' ph <- pheatmap::pheatmap(mtx0, cluster_cols = F, cluster_rows = T, show_colnames = F, fontsize = 13, annotation_col = ann_col0, color = viridis(100), annotation_colors = ann_colors, fontsize_row = 9) if(plotEPS){ pfname = paste0(params$output_dir, '/summary/Figures/heatmap_motif_enrich.eps') #postscript(file = pfname, width = 9, height = 12) ggsave(ph, filename = pfname, device = 'eps', height = 12, width = 9) #dev.off() } }
comp_cls = paste0('cluster_', group1_fp, '_cluster_', group2_fp) footprint_stats.file = paste0(down.dir, '/data_by_cluster/footprint/', comp_cls, '/differential_statistics.txt') if(file.exists(footprint_stats.file)){ # library(ggrepel) # library(gridExtra) # # library(grid) footprint_stats = data.table::fread(footprint_stats.file) footprint_stats[, 'motif1' := unlist(strsplit(Motif, '.', fixed = T))[3], by = Motif] footprint_stats[, 'motif2' := unlist(strsplit(Motif, '.', fixed = T))[4], by = Motif] footprint_stats[, 'motif2' := ifelse(is.na(motif2), "", motif2)] footprint_stats[, 'motif' := paste0(motif1, motif2)] footprint_stats[, c('motif1', 'motif2') := NULL] footprint_stats[, 'isSig' := ifelse(P_values <= 0.05, 'differentiated', 'no difference')] footprint_stats[, 'isSig' := ifelse(P_values <= 0.05 & TF_Activity > 0, paste0('cluster', group2_fp, '_high'), isSig)] footprint_stats[, 'isSig' := ifelse(P_values <= 0.05 & TF_Activity < 0, paste0('cluster', group1_fp, '_high'), isSig)] footprint_stats$motif_show = "" footprint_stats$motif = toupper(footprint_stats$motif) footprint_stats[P_values <= 0.05]$motif_show = footprint_stats[P_values <= 0.05]$motif p <- ggplot(data = footprint_stats, aes(x = motif, y = TF_Activity, colour = factor(isSig), label = motif_show)) + geom_point() + xlab("") + ylab('TF Activity Difference') + theme(legend.text = element_text(size=18, family = "Helvetica"), axis.title = element_text(size = 18, family = "Helvetica"), axis.text.x = element_blank(), legend.title = element_blank(), legend.direction = 'horizontal', plot.title = element_text(size = 15, family = "Helvetica", face = 'bold'), panel.background = element_rect(fill = "white")) + geom_text_repel(force = 10) + theme(plot.margin=unit(c(0.2, 0, 0.2, 0), "cm"), legend.position = 'bottom') + scale_color_manual(values = c('#F8766D', '#619CFF', 'gray40')) if(plotEPS){ pfname = paste0(params$output_dir, '/summary/Figures/footprint_tf_activity.eps') ggsave(p, filename = pfname, device = 'eps', height = 6, width = 6) } p }
cicero_conn.file = paste0(down.dir, '/cicero_interactions.txt') if(file.exists(cicero_conn.file)){ conns = data.table::fread(cicero_conn.file) conns = data.frame(conns) temp <- tempfile() if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) { download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp) } if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) { download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp) } if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) { download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp) } if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) { download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp) } gene_anno <- rtracklayer::readGFF(temp) unlink(temp) # rename some columns to match requirements gene_anno$chromosome <- paste0("chr", gene_anno$seqid) gene_anno$gene <- gene_anno$gene_id gene_anno$transcript <- gene_anno$transcript_id gene_anno$symbol <- gene_anno$gene_name gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand, transcript, gene, symbol)) gene_anno = gene_anno[complete.cases(gene_anno), ] chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000 region0 = unlist(strsplit(Cicero_Display_Region, ':'))[2] start0 = as.integer(unlist(strsplit(region0, '-'))[1]) end0 = as.integer(unlist(strsplit(region0, '-'))[2]) if(plotEPS){ pdf(paste0(params$output_dir,'/summary/Figures/interactions_example_region.pdf'), width = 8, height = 5, fonts = 'sans') plot_connections(conns, chr0, start0, end0, gene_model = gene_anno, coaccess_cutoff = .4, connection_width = 1, collapseTranscripts = "longest", viewpoint_alpha = 0) dev.off() } plot_connections(conns, chr0, start0, end0, gene_model = gene_anno, coaccess_cutoff = .4, connection_width = 1, collapseTranscripts = "longest", viewpoint_alpha = 0) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.