title: "Cleaning the DGE Data" author: "Eric Kernfeld" date: "September 7, 2016" output: html_document
#' Characterize genes by behavior over pseudotime, returning cluster assignments and p values. #' #' @param dge should be a seurat object with a field "pseudotime". #' The field `dge@data` is accessed for expression levels -- for Eric's objects, the units will be log2(1+CP10K). #' @param results_path is a character vector showing where to dump the output. #' @param num_periods_initial_screen Cells are partitioned into this many pseudotime periods (equal number of cells in each). Initial screening is based on a piecewise linear model where expression is constant within these bins. #' @param pval_cutoff Genes are screened by p-value to avoid too much computationally expensive smoothing. #' @param do_adjust Logical -- if TRUE, then apply BH correction. #' @param abcissae_kmeans Gene expression is fed into k-means as a series of predictions at successive time points. #' The arguments says how many time points to predict and feed in (if length one) or what time points (if longer). #' @param num_clusters Genes are partitioned into this many modules. If NULL (default) the value is selected via the gap statistics and their SEs using the method in the original gap statistic paper: #' #' Tibshirani, R., Walther, G. and Hastie, T. (2001). Estimating the number #' of data clusters via the Gap statistic. Journal of the Royal Statistical Society B, 63, 411–423. #' #' There's one adjustment: this function will never use just one cluster. It will issue a warning and use 2. #' #' @return A list with elements: #' \itemize{ #' \item dge: the Seurat object #' \item gene_stats: genes with effect sizes and cluster labels. #' \item smoothers: fitted regression models, one for each gene. #' \item cluster_mod: output from stats::kmeans #' \item gap_stats: output from cluster::clusGap #' } #' #' #' This function helps explore gene dynamics over pseudotime. It goes through three main steps: #' \itemize{ #' \item find genes that respond strongly to pseudotime. #' \item smooth those genes' expression to form an overall pseudotime trend. #' \item cluster genes based on smoothed expression patterns that have been shifted/scaled to the unit interval. #' } #' #' @export #' smooth_and_cluster_genes = function( dge, results_path, genes.use = get_mouse_tfs(), num_periods_initial_screen = 20, pval_cutoff = 0.05, do_adjust = T, abcissae_kmeans = 20, num_clusters = NULL ){ dir.create.nice( results_path ) # # Filter genes based on qvals from piecewise linear model cell_data = Seurat::FetchData( dge , "pseudotime" ) nbreaks = num_periods_initial_screen+1 breaks = quantile(cell_data$pseudotime, probs = (1:nbreaks - 0.5)/nbreaks) cell_data$period = cut( x = cell_data$pseudotime, breaks = breaks, labels = as.character( 1:num_periods_initial_screen ), include.lowest = T, ordered_result = T) dge = Seurat::AddMetaData( dge, cell_data[, "period"] %>% matrixify_preserving_rownames, "period" ) ssq = function(x) sum((x - mean(x))^2) genes.use = FetchDataZeroPad( dge, genes.use ) %>% apply(2, prop_nz) %>% is_greater_than(0.05) %>% which %>% names expr_mat = FetchDataZeroPad( dge, genes.use ) %>% as.matrix n = length(dge@cell.names) df1 = n - 1 df2 = n - num_periods_initial_screen mss_constant = apply( expr_mat, 2, ssq ) %>% divide_by(df1) mss_piecewise = aggregate_nice( expr_mat, by = cell_data$period, FUN = ssq ) %>% t %>% rowSums %>% divide_by(df2) get_p = function(x) pf( x, df1 = df1, df2 = df2, lower.tail = F ) pvals = (mss_constant / mss_piecewise) %>% sapply( get_p ) names(pvals) = genes.use if( do_adjust ){ pvals %<>% p.adjust() } genes_included = names(which(pvals < pval_cutoff )) # Smooth gene expression over pseudotime pt_data = FetchData( dge, c( "pseudotime", genes_included ) ) s = mgcv:::s fast_smooth = function( i ) { gene = genes_included[i] expr = pt_data[[gene]] pseudotime = pt_data[["pseudotime"]] if( i %% 50 == 0 ){ cat("Smoothing gene ", i, " of ", length( genes_included ), "\n" ) } return( mgcv::gam( expr ~ s( pseudotime ), family = mgcv::nb() ) ) } smoothers = lapply( seq_along( genes_included ), fast_smooth ) names( smoothers ) = genes_included # # Generate kmeans features if( length( abcissae_kmeans ) == 0 ) { abcissae_kmeans = 20 } if( length( abcissae_kmeans ) == 1 ) { abcissae_kmeans = quantile(cell_data$pseudotime, (1:abcissae_kmeans - 0.5)/abcissae_kmeans ) } kmeans_features = sapply( smoothers, predict, type = "response", newdata = data.frame( pseudotime = abcissae_kmeans ) ) kmeans_features = t(kmeans_features) # # Rescale each gene to the unit interval kmeans_features = apply( kmeans_features, MARGIN = 1, FUN = function( x ) (x - min(x)) ) %>% t kmeans_features = apply( kmeans_features, MARGIN = 1, FUN = div_by_max ) %>% t atae(ncol(kmeans_features), length(abcissae_kmeans)) atae(nrow(kmeans_features), length(genes_included)) rownames(kmeans_features) = genes_included cat(" Calculating gap statistics... \n") gap_stats = cluster::clusGap( x = kmeans_features, FUNcluster = kmeans, K.max = 25, spaceH0 = "scaledPCA", iter.max = length(abcissae_kmeans) * 2 ) { pdf( file.path( results_path, "gap_stats.pdf" ) ) plot( gap_stats ) dev.off() } if( is.null( num_clusters ) ) { num_clusters = cluster::maxSE(gap_stats$Tab[, 3], gap_stats$Tab[, 4], "Tibs2001SEmax") if( num_clusters == 1 ){ warning("Based on gap statistics, Tibshirani et alii suggest one cluster, but that's boring so I'll use two.") num_clusters = 2 } } cat(" Continuing with ", num_clusters, " clusters.\n") cluster_mod = kmeans( kmeans_features, centers = num_clusters, iter.max = 500 ) # Reorder clusters by hierarchical clustering peak_position = apply( cluster_mod$centers, 1, function( x ) mean( which( x > 0.75 ) ) ) preferred_ordering = order( peak_position ) old_given_new = preferred_ordering new_given_old = function( k ){ which(k==old_given_new)} cluster_mod$cluster = sapply( cluster_mod$cluster, new_given_old ) cluster_mod$centers = cluster_mod$centers [old_given_new, ] cluster_mod$withinss = cluster_mod$withinss[old_given_new] cluster_mod$size = cluster_mod$size [old_given_new] rownames(cluster_mod$centers) = NULL # # Assemble data for export cat("Exporting cluster assignments and pvals for genes under study... \n") to_return = data.frame( gene = genes_included, pval = pvals[genes_included], cluster = cluster_mod$cluster[genes_included] ) to_return = to_return[order( to_return$cluster, to_return$pval ), ] write.table( to_return, file.path(results_path, "gene_pt_dependence_stats.txt"), sep = "\t", quote = F, row.names = F, col.names = T) cat("Done.\n") return( list( dge = dge, # smoothing/filtering gene_stats = to_return, smoothers = smoothers, # clustering abcissae_kmeans = abcissae_kmeans, kmeans_features = kmeans_features, cluster_mod = cluster_mod, gap_stats = gap_stats ) ) } #' Draw overlaid line plots of gene clusters from output of `smooth_and_cluster_genes`. #' #' @param facet_ncol For resulting facet plots, number of columns. #' @export facet_plot_gene_clusters = function( dge, results_path, cluster_mod, kmeans_features, abcissae_kmeans, facet_ncol = NULL ){ cluster_mod$cluster = factor( cluster_mod$cluster ) num_clusters = length(levels(cluster_mod$cluster)) centers = aggregate_nice( kmeans_features, by = cluster_mod$cluster, FUN = mean ) data_wide = data.frame( rbind( centers, kmeans_features ) ) data_wide$is_center = F; data_wide$is_center[1:num_clusters] = T data_wide$cluster = c( levels(cluster_mod$cluster), as.character( cluster_mod$cluster ) ) data_wide$gene = rownames( data_wide ) data_long = reshape2::melt( data_wide, id.vars = c( "is_center", "cluster", "gene" )) data_long$unit_scaled_expression = data_long$value data_long$pseudotime = abcissae_kmeans[ data_long$variable ] data_long$cluster %<>% factor(levels = rev(sort(unique(data_long$cluster))), ordered = T) p_faceted_clusters = ggplot() + ggtitle( "Major gene clusters" ) + geom_line( data = subset( data_long, !is_center), alpha = 0.6, aes( x = pseudotime, y = unit_scaled_expression, group = gene, colour = cluster ) ) + geom_line( data = subset( data_long, is_center), alpha = 1, colour = "black", aes( x = pseudotime, y = unit_scaled_expression, group = gene ) ) + facet_wrap( ~cluster, ncol = facet_ncol ) + theme(strip.background = element_blank(), legend.position = "none") + ylab("Unit-scaled expression") + scale_colour_grey() ggsave( filename = file.path(results_path, "faceted_gene_clusters.pdf"), p_faceted_clusters, width = 5, height = 6 ) return( p_faceted_clusters ) } #' Draw heatmaps of gene clusters from output of `smooth_and_cluster_genes`. #' #' @param dge Seurat object with raw data and pseudotime metadata used to train smoothers. #' If metadata `simple_branch` is present, function is hardwired to look for "mTEC", "cTEC", "branchpoint", and "progenitor" #' and make a heatmap similar to figure 2 in http://dx.doi.org/10.1101/122531. #' @param results_path Where to save plots and files. #' @param cluster_mod K-means output with cluster labels for the genes in `smoothers` and also with cluster centers. #' @param smoothers List of regression models for the genes in `cluster_mod` and `gene_stats`. #' @param gap_size White bars separating clusters are formed by adding fake genes. gap_size is how many fake genes per bar. #' @param genes_use Gene names or anything in `AvailableData(dge)`. #' @param genes_to_label Subset of `genes_use` to write tick labels for. #' @export heatmap_gene_clusters = function( dge, results_path, cluster_mod, smoothers, gap_size = NULL, genes_use = NULL, genes_to_label = NULL ){ cell_data = dge %>% FetchData("pseudotime") num_clusters = length( unique( cluster_mod$cluster ) ) atae( names( smoothers ), names(cluster_mod$cluster) ) if( is.null( genes_use ) ){ genes_use = names(cluster_mod$cluster) } else { cluster_mod$cluster = cluster_mod$cluster[genes_use] smoothers = smoothers[genes_use] } # # set up wide-format data if( "simple_branch" %in% AvailableData( dge ) ){ abcissae_heatmap = FetchData(dge, "pseudotime")[[1]] #avoids exact duplicates; need to use these as dimnames abcissae_heatmap = abcissae_heatmap + rnorm( n = length(abcissae_heatmap), mean = 0, sd = 1e-8 ) abcissae_heatmap %<>% sort } else { abcissae_heatmap = seq( min(cell_data$pseudotime), max(cell_data$pseudotime), length.out = 100 ) } data_wide_heat = sapply( smoothers, predict, type = "response", newdata = data.frame( pseudotime = abcissae_heatmap ) ) rownames(data_wide_heat) = abcissae_heatmap get_expression_peak = function(x){ weighted.mean( x = seq_along( x ), w = x * ( standardize(x)>0 ) ) } peak_expression = apply( data_wide_heat, 2, get_expression_peak ) names(peak_expression) = names(smoothers) data_wide_heat = apply(data_wide_heat, 2, standardize) %>% t %>% as.data.frame data_wide_heat$gene = names(smoothers) data_wide_heat$cluster = cluster_mod$cluster # # Add bars between clusters by adding dummy genes ranked last in each cluster if( is.null(gap_size) ){ gap_size = ifelse( length(genes_use) < 50, 0, ceiling( length(smoothers) / 100 ) ) } scaffold = rep( 1:nrow(cluster_mod$centers), each = gap_size ) if( gap_size > 0 ){ dummy_genes = matrix( NA, ncol = length( abcissae_heatmap ), nrow = length( scaffold ) ) %>% as.data.frame dummy_genes$gene = paste0("DUMMY_GENE_", scaffold ) %>% make.unique dummy_genes$cluster = scaffold colnames(dummy_genes) = colnames(data_wide_heat) data_wide_heat = rbind( data_wide_heat, dummy_genes ) peak_expression = c( peak_expression, rep( Inf, length(scaffold) ) ) #used for ranking } # # Add vertical bar for branching heatmap if( "simple_branch" %in% AvailableData( dge ) ){ pt_by_branch = aggregate_nice( x = FetchData(dge, "pseudotime"), by = FetchData(dge, "simple_branch"), FUN = mean ) atae( pt_by_branch["progenitor", ], 0, tol = 1e-8 ) atae( pt_by_branch["branchpoint", ], 0, tol = 1e-8 ) atat( pt_by_branch["mTEC", ] < 0 ) atat( pt_by_branch["cTEC", ] > 0 ) branch_counts = FetchData(dge, "simple_branch") %>% table boundary = branch_counts["mTEC"] + branch_counts["progenitor"] / 2 + branch_counts["branchpoint"] / 2 dummy_cells = matrix(NA, nrow = nrow( data_wide_heat ), ncol = ncol( data_wide_heat )/40 ) middle_pt = abcissae_heatmap[boundary + 0:1 ] colnames(dummy_cells) = seq( max(middle_pt) + 1e-8, min(middle_pt) - 1e-8, length.out = ncol( dummy_cells ) ) is_left = 1:ncol(data_wide_heat) < boundary data_wide_heat = cbind( data_wide_heat[, is_left], dummy_cells, data_wide_heat[, !is_left] ) } # # Melt data and order rows data_wide_heat$gene = factor( data_wide_heat$gene, levels = data_wide_heat$gene[order(data_wide_heat$cluster, peak_expression)] , ordered = T) data_wide_heat = data_wide_heat[order(data_wide_heat$gene), ] data_long_heat = reshape2::melt(data_wide_heat, id.vars = c("gene", "cluster")) data_long_heat %<>% plyr::rename( replace = c("variable" = "pseudotime", "value" = "rel_log_expr") ) # # Smoothed heatmap p_heat_clusters = ggplot(data = data_long_heat ) + ggtitle( "Genes clustered by temporal expression pattern" ) + geom_raster( aes( x = pseudotime, y = gene, fill = rel_log_expr ), interpolate = T ) + scale_fill_gradientn( colors = blue_gray_red, na.value="white" ) + theme(axis.line = element_blank(), axis.ticks = element_blank(), axis.text.x=element_blank()) # # Label selected genes rest = setdiff( data_wide_heat$gene, genes_to_label ) ylabels = c( genes_to_label, rep("", length(rest)) ) names(ylabels) = c( genes_to_label, rest ) p_heat_clusters = p_heat_clusters + scale_y_discrete( labels = ylabels ) + labs( y = "Genes" ) # # X axis is either by cells (branching) or by pseudotime (not branching). if( "simple_branch" %in% AvailableData( dge ) ){ p_heat_clusters = p_heat_clusters + xlab("Cells") } else { p_heat_clusters = p_heat_clusters + xlab("Pseudotime") } # # Add cluster colorbar boundary_genes = which(1==diff(sort(data_wide_heat$cluster))) boundary_genes = c(0, boundary_genes, length( data_wide_heat$gene ) ) right_edge = max(as.numeric(data_long_heat$pseudotime)) left_edge = min(as.numeric(data_long_heat$pseudotime)) for( i in 1:num_clusters ){ p_heat_clusters = p_heat_clusters + annotate( "rect", xmin = left_edge - 4 * (right_edge - left_edge) / 100, xmax = left_edge - (right_edge - left_edge) / 100, ymin = boundary_genes[i ] + gap_size, ymax = boundary_genes[i+1], fill = scales::hue_pal()(num_clusters)[i]) + annotate( "text", x = left_edge - 8 * (right_edge - left_edge) / 100, y = (boundary_genes[i ] + gap_size + boundary_genes[i+1]) / 2, label = i) } # # Add cell types branch_colors = c(scales::hue_pal()(2), "purple", "gray" ) names(branch_colors) = c("mTEC", "cTEC", "progenitor", "branchpoint" ) if( "simple_branch" %in% AvailableData( dge ) ){ X = FetchData(dge, c("simple_branch", "pseudotime")) X = X[order(X$pseudotime), ] X = X[["simple_branch"]] %>% as.character %>% rle #run-length encoding keeps from overwhelming ggplot X$positions = c(0, cumsum(X$lengths) ) X$positions %<>% div_by_max X$positions = X$positions*right_edge for( i in 1:length(X$values) ){ cell_label_data = data.frame( xmin = X$positions[i], xmax = X$positions[i+1], ymin = 2*ceiling( length(smoothers) / 100 ), ymax = 0 ) p_heat_clusters = p_heat_clusters + geom_rect( data = cell_label_data, aes( xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax ), fill = branch_colors[X$values[i]] ) dummy_cells_pseudotime = as.numeric(colnames(dummy_cells)) * right_edge / max(X$positions) p_heat_clusters = p_heat_clusters + annotate( geom = "rect", fill = "white", xmin = min( dummy_cells_pseudotime ), xmax = max( dummy_cells_pseudotime ), ymin = 2*ceiling( length(smoothers) / 100 ), ymax = 0 ) } } ggsave( filename = file.path(results_path, "heatmapped_gene_clusters.png"), p_heat_clusters, width = 6, height = 7 ) ggsave( filename = file.path(results_path, "heatmapped_gene_clusters.pdf"), p_heat_clusters, width = 6, height = 7 ) return( p_heat_clusters ) } #' Plot a dendrogram, labeling merges given by `converter` with colored rectangles. #' #' @details If converter is c("a", "a", "b", "c"), you get edges labeled as a #' four-colour muted rainbow next to a grayscale with light, light, medium, and dark. #' This is to convey that the first two tips have been somehow grouped. #' @export plot_dendro_with_rect = function( hc, converter, main = "Dendrogram" ){ p = ggdendro::ggdendrogram(hc) p = p + annotate( geom = "tile", x = 1:length(hc$order), y = -2, fill = scales::hue_pal()(length(hc$order)) ) p = p + annotate( geom = "tile", x = 1:length(hc$order), y = -1, fill = scales::grey_pal()(3)[factor(converter[hc$order])] ) p = p + ggtitle(main) print(p) return(p) } #' Plot a dendrogram and also cut it to merge input into `num_desired` groups. #' #' @export dendrogram_merge_points = function( X, num_desired, results_path, FUN = function(x) hclust(dist(x), method = "ward.D2"), REORDER_FUN = function(hc) as.hclust(stats::reorder(as.dendrogram(hc), 1:nrow(X))), PLOT_FUN = plot_dendro_with_rect, CUT_FUN = stats::cutree, main = "Dendrogram", return_hc = F, ... ){ hc = FUN( X, ... ) hc = REORDER_FUN( hc ) converter = CUT_FUN(hc, num_desired) converter = setNames( letters[converter], names(converter) ) atae(as.character(hc$labels), names(converter)) hc$labels = paste0( hc$labels, " (", converter[hc$labels], ")" ) { pdf( file.path( results_path, paste0( main, ".pdf" ) ) ) PLOT_FUN( hc, converter, main = main ) dev.off() } PLOT_FUN( hc, converter, main = main ) if(return_hc){ return( hc ) } return( converter ) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.