title: "Cleaning the DGE Data" author: "Eric Kernfeld" date: "September 7, 2016" output: html_document
#' Remove missing values from the metadata, issuing a warning if changes are made.
#'
#' @param object Seurat object
#' @param filler Replacement for missing entries (default: the string "NA").
#'   Note this is a character filler, so numeric metadata columns containing NA
#'   will be coerced to character by the replacement.
#' @return The same Seurat object with every NA in @data.info replaced by `filler`.
#'
#' @export
#'
FillNA = function( object, filler = "NA" ){
  # Replace NA entries of one vector with the filler value.
  na2filler = function(x){
    x[is.na(x)] = filler
    x
  }
  varnames = names( object@data.info )
  # Which metadata columns actually contain missing values?
  has_missing = vapply(
    varnames,
    function(v) any( is.na( object@data.info[[v]] ) ),
    logical(1)
  )
  missing_list = varnames[has_missing]
  for( var in missing_list ){
    object@data.info[[var]] = na2filler( object@data.info[[var]] )
  }
  if( length( missing_list ) > 0 ){
    warning( paste0( "Missing values found in these variables: \n",
                     paste0( missing_list, collapse = "\n" ),
                     "\nReplacing with ", filler, " .\n\n" ) )
  }
  return( object )
}

#' Get available variable names (genes, identity classes, PCA embeddings, etc)
#'
#' @param object Seurat object
#' @return Returns a character vector of all eligible inputs to the `vars.all`
#'   argument of `FetchData`: metadata columns, PC/tSNE/IC embedding names,
#'   gene names, and the literal "ident".
#' @export
AvailableData = function( object ){
  # Seurat v1 slot layout; each category contributes its column/row names.
  available_categorized = list( metadata = names( object@data.info ),
                                PCs      = names( object@pca.x ),
                                tsne     = names( object@tsne.rot ),
                                ICs      = names( object@ica.rot ),
                                genes    = rownames( object@data ),
                                ident    = "ident" )
  return( Reduce( f = union, x = available_categorized ) )
}

#' FetchData but with zeroes for unavailable genes
#'
#' @export
#' @param dge Seurat object
#' @param vars.all List of all variables to fetch. Missing (NA) entries are ignored.
#' @param warn If TRUE (default), warn when any requested variable is unavailable.
#' @param ... Other arguments to pass to FetchData
#' @return Data frame with one column per entry of `vars.all`, in that order;
#'   unavailable variables are filled with zeroes.
#'
#' @details This function is stupid: if you ask for "PC1" and it's not available,
#' it will think you're asking for a non-expressed gene, so it will return zeroes.
FetchDataZeroPad = function( dge, vars.all, warn = TRUE, ... ){
  vars.all = vars.all[complete.cases(vars.all)]
  # Compute the availability set once (was computed twice).
  available = AvailableData( dge )
  avail   = intersect( vars.all, available )
  unavail = setdiff(   vars.all, available )
  if( warn && length(unavail) > 0 ){
    warning("Some variables are not available. Returning zeroes.\n")
  }
  to_return = FetchData( dge, avail, ... )
  # Zero-filled padding columns for everything FetchData could not supply.
  pad = as.data.frame( matrix( 0,
                               nrow = nrow( to_return ),
                               ncol = length( unavail ),
                               dimnames = list( rownames( to_return ), unavail ) ) )
  to_return = cbind( to_return, pad )
  # BUG FIX: are_equal() returns a logical that was previously discarded, so the
  # sanity check never fired. Wrap it so a mismatch actually raises an error.
  assertthat::assert_that(
    assertthat::are_equal( sort( vars.all ), sort( colnames( to_return ) ) )
  )
  to_return = to_return[, vars.all, drop = FALSE]
  assertthat::assert_that( is.data.frame( to_return ) )
  return( to_return )
}

#' Subset data flexibly from a Seurat object.
#'
#' @param dge Seurat object
#' @param vars.use Variables to fetch for use in `predicate`.
#' @param predicate String to be parsed into an R expression and evaluated as an arg to base::subset.
#' @param preserve_raw If TRUE, then it will not exclude any cells from the @raw.data slot.
#' By default, it leaves cells in @raw.data only if they satisfy the given predicate.
#' @param show_on_tsne If true, save a tSNE showing which cells are kept.
#' @param results_path Where to save the tSNE.
#' @param ... Extra params passed to tsne_colored.
#' @details Calls FetchData, subsets it as a dataframe using base::subset, and
#' subsets the Seurat object correspondingly (using the df rownames.)
#'
#' @export
#'
SubsetDataFlex = function( dge, vars.use, predicate,
                           preserve_raw = FALSE,
                           results_path = NULL,
                           show_on_tsne = !is.null(results_path), ... ){
  # BUG FIX: previously only print()ed a complaint and continued into a
  # guaranteed-confusing downstream failure. Fail fast instead.
  if( typeof(predicate) != "character" ){
    stop("predicate should be a character vector. It will be parsed into `subset` as an R expression.")
  }
  df = FetchData(dge, vars.use) %>% as.data.frame
  # NOTE(review): eval(parse(...)) is the documented interface here; predicate
  # strings must come from trusted callers.
  cu = df %>% subset(eval(parse(text=predicate))) %>% rownames
  if( show_on_tsne ){
    dge@data.info$included = (rownames(dge@data.info) %in% cu) %>% as.character
    tsne_colored( dge, results_path, colour = "included", ... )
  }
  dge = SubsetData(dge, cells.use = cu)
  if( !preserve_raw ){
    dge@raw.data = deseuratify_raw_data(dge)
  }
  return( dge )
}

#' Merge two Seurat objects.
#'
#' By default, preserves any column of @data.info shared between both objects.
#' You can also specify what variables to keep.
#' They will be added to data.info in
#' the output, warning and padding with zeroes if either object lacks any var in vars.keep.
#'
#' @param dge1 Seurat object
#' @param dge2 Seurat object
#' @param preserve_ident If TRUE, carry the @ident labels of both inputs into the merged object.
#' @param vars.keep Metadata columns to carry over; defaults to the columns present in both inputs.
#' @return One merged Seurat object.
#'
#' @export
#'
SeuratMerge = function( dge1, dge2, preserve_ident = FALSE,
                        vars.keep = intersect( names(dge1@data.info),
                                               names(dge2@data.info) ) ){
  # Save on memory: drop the (recomputable) scaled matrices before merging.
  dge1@scale.data = matrix()
  dge2@scale.data = matrix()
  gc()
  # Do merging of the raw counts, then re-wrap as a Seurat object.
  dge_all = list( dge1 = deseuratify_raw_data( dge1 ),
                  dge2 = deseuratify_raw_data( dge2 ) ) %>%
    dge_merge_list %>%
    seuratify_thy_data
  # Factors from the two objects may have different level sets; compare as character.
  characterize_factor = function(x){
    if(is.factor(x)) as.character(x) else x
  }
  all_factors_to_char = function(X) {
    data.frame(lapply(X, characterize_factor), stringsAsFactors=FALSE)
  }
  if( length(vars.keep) > 0 ){
    preserved_metadata = rbind(
      FetchDataZeroPad( dge1, vars.keep ) %>% all_factors_to_char,
      FetchDataZeroPad( dge2, vars.keep ) %>% all_factors_to_char
    )
    preserved_metadata %<>% as.data.frame
    rownames(preserved_metadata) = c( rownames(dge1@data.info),
                                      rownames(dge2@data.info) )
    # BUG FIX: preserved_metadata was previously computed but never attached to
    # the merged object, so vars.keep silently vanished despite the documented
    # contract. Attach it to the output's @data.info.
    dge_all %<>% AddMetaData(preserved_metadata)
  }
  if(preserve_ident){
    new_ident = c( characterize_factor(dge1@ident),
                   characterize_factor(dge2@ident) )
    names(new_ident) = c( names(dge1@ident), names(dge2@ident) )
    dge_all %<>% SetIdent(new_ident, cells.use = names(new_ident) )
  }
  return(dge_all)
}

#' Make small-multiple pie charts.
#'
#' @param dge Seurat object
#' @param ident.use Becomes the categories in the pie chart
#' @param facet_by Each small multiple contains cases at one level of this variable.
#' @param col Optional colorscale.
#' @param label Logical. If TRUE, percentages are added.
#' @param main Plot title.
#' @param drop_levels If TRUE, omit facets that would be empty.
#' @param do.test If TRUE, test each ident.use level for a quadratic trend in
#'   percentage over embryonic day ("eday"), and annotate the legend with log10
#'   p-values. Defaults to testing only when more than one eday is present.
#'
#' @export
#'
SeuratPie = function( dge,
                      ident.use = "cell_type",
                      facet_by = "eday",
                      do.test = FetchDataZeroPad(dge, "eday")[[1]] %>%
                        unique %>% length %>% is_greater_than(1),
                      col = NULL,
                      label = FALSE,
                      main = "Sample makeup by day",
                      drop_levels = FALSE ){
  cell_types = FetchData(dge, ident.use)[[1]] %>% unique
  if( length(cell_types) < 2 ){
    stop("\n SeuratPie cannot handle cases with only one level in ident.use.\n")
  }
  #### Testing
  # Test each cluster for a quadratic trend in pct by eday, weighted by the
  # number of cells at each eday.
  if(do.test){
    # Assemble percentages for testing: one row per (ident, sample) pair.
    pct_for_testing = FetchData( dge, c( ident.use, "orig.ident" )) %>%
      table %>%
      apply(2, percentify) %>%
      (reshape2::melt) %>%
      plyr::rename(c("value" = "percent"))
    ncells = FetchData( dge, "orig.ident" ) %>% table
    # Fill in eday based on sample id (lookup built from the project metadata sheet).
    map_to_eday = setNames( get_metadata()$eday,
                            get_metadata()$Sample_ID %>% as.character )
    map_to_eday %<>% na.omit
    nm = names( map_to_eday )
    map_to_eday %<>% as.numeric   # as.numeric drops names; restore them below.
    names(map_to_eday) = nm
    pct_for_testing$eday = map_to_eday[pct_for_testing$orig.ident %>% as.character]
    pct_for_testing$eday_sq = pct_for_testing$eday ^ 2
    # Quadratic fit, weighted by day, tested against null model.
    pvals = rep(1, length(cell_types))
    names(pvals) = cell_types
    for( cl in cell_types ){
      this_cluster_data = subset( pct_for_testing,
                                  eval(parse(text = ident.use)) == cl )
      # NOTE(review): assumes `ncells` (a table over orig.ident) aligns row-for-row
      # with this_cluster_data — TODO confirm ordering.
      mod = lm( data = this_cluster_data,
                formula = percent~eday + eday_sq,
                weights = ncells )
      test_result = car::linearHypothesis(mod, c("eday = 0", "eday_sq = 0" ))
      pvals[cl] = test_result$`Pr(>F)`[[2]]
    }
  }
  #### Plotting
  # Get percentages by facet.
  X = FetchData( dge, c( ident.use, facet_by )) %>%
    table %>%
    apply(2, percentify) %>%
    (reshape2::melt) %>%
    plyr::rename(c("value" = "percent"))
  if( drop_levels ){
    X %<>% subset( percent != 0 )
  }
  facet_values = FetchData( dge, facet_by )[[1]]
  # Scalar condition: use && (was vector &).
  if(is.factor(facet_values) && !drop_levels){
    X[[facet_by]] %<>% as.character %>%
      factor(levels = levels(facet_values), ordered = TRUE)
  }
  # Position percentage labels at the angular midpoint of each wedge.
  X$at = 0
  X = X[order(X[[ident.use]]), ]
  for(facet_level in unique(X[[facet_by]])){
    idx = (X[[facet_by]] == facet_level)
    X[idx, "at"] = 100 - ( cumsum(X[idx, "percent"]) - X[idx, "percent"]/2 )
  }
  # Pie charts require stat=identity and x=constant.
  p = ggplot(X) + ggtitle( main ) +
    geom_bar( aes_string( y = "percent", x = "factor('')", fill = ident.use ),
              position = "stack", stat='identity' ) +
    coord_polar(theta = "y") +
    xlab("") + ylab("") +
    facet_wrap(facet_by, nrow = 1) +
    theme(axis.ticks = element_blank(),
          axis.text.y = element_blank(),
          axis.text.x = element_blank())
  if( label ) {
    p = p + geom_text( aes( y = at, x = 1.5, label = percent ) )
  }
  if(do.test){
    pval_text = paste0(" (", round(log10(pvals), 1), ")")
  } else {
    pval_text = ""
  }
  # BUG FIX: the fill scale was previously added twice when `col` was supplied
  # (ggplot2 keeps only the last and warns), and scale_fill_manual was applied
  # with values = NULL when `col` was missing, which fails at render time.
  # Add exactly one fill scale, manual only when colors are given.
  if( is.null(col) ){
    p = p + scale_fill_discrete( name="Cell type (Log10 p)",
                                 breaks=cell_types,
                                 labels=paste0(cell_types, pval_text))
  } else {
    p = p + scale_fill_manual( name="Cell type (Log10 p)",
                               values = col,
                               breaks=cell_types,
                               labels=paste0(cell_types, pval_text))
  }
  return(p)
}

#' Test for markers flexibly from a Seurat object.
#'
#' Calls FindMarkers with extra features.
#'
#' @param ident.use Fetched via FetchData to define the groups being tested.
#' @param ident.1 Level(s) of ident.use forming the first group; collapsed with underscores if length > 1.
#' @param ident.2 Level(s) forming the second group; defaults to everything not in ident.1.
#' @param test.use Passed into FindMarkers unless it is "binomial_batch", in which case
#' it uses approximate p-values based on a binomial glmm with a random effect for batch (1|orig.ident).
#'
#' All other parameters are passed into FindMarkers unless test.use=="binomial_batch", in
#' which case I attempt to match the behavior of FindMarkers.
#'
#' Output contains an extra column for q-values from p.adjust(..., method="fdr").
#'
#' @export
#'
FindMarkersFlex = function( object, ident.use, ident.1,
                            ident.2 = object %>%
                              FetchData(ident.use) %>%
                              extract2(1) %>%
                              unique %>%
                              setdiff(ident.1),
                            order_by_var = "avg_diff",
                            thresh.use = 0.25,
                            test.use = "binomial_batch",
                            genes.use = object@data %>% rownames,
                            min.pct = 0.1, ... ){
  # This chunk handles ident.1 or .2 of length greater than 1 by collapsing
  # them both with underscores.
  new_ident = FetchData( object, ident.use )[[1]] %>% as.character
  names(new_ident) = names(object@ident)
  new_ident[new_ident %in% ident.1] = paste0(ident.1, collapse = "_")
  new_ident[new_ident %in% ident.2] = paste0(ident.2, collapse = "_")
  ident.1 = paste0(ident.1, collapse = "_")
  ident.2 = paste0(ident.2, collapse = "_")
  object %<>% AddMetaData(metadata = new_ident, col.name = ident.use)
  # To interface with Seurat, the @ident slot gets overwritten with the groups
  # for the expression test.
  object %<>% Seurat::SetIdent(ident.use = new_ident)
  # Slice the object down to just the relevant cells, to save time and reduce
  # code complexity downstream.
  predicate = paste0(ident.use, " %in% c( '", ident.1, "', '",
                     paste0(ident.2, collapse = "', '"), "' )")
  object %<>% SubsetDataFlex(vars.use = ident.use, predicate)
  genes.use %<>% intersect(AvailableData(object))
  if (test.use == "binomial_batch") {
    cat(" \n Computing summaries... \n")
    x = data.frame( gene = genes.use, stringsAsFactors = FALSE )
    rownames( x ) = x$gene
    # Per-group mean expression and fraction of nonzero cells, genes as rows.
    group_means = aggregate_nice( x = FetchData(object, genes.use),
                                  by = FetchData(object, ident.use),
                                  FUN = mean ) %>% t
    group_pcts  = aggregate_nice( x = FetchData(object, genes.use),
                                  by = FetchData(object, ident.use),
                                  FUN = prop_nz ) %>% t
    x$avg_diff = group_means[, ident.1] - group_means[, ident.2]
    x$pct.1 = group_pcts[, ident.1]
    x$pct.2 = group_pcts[, ident.2]
    # Mimic FindMarkers pre-filtering: effect size and detection-rate thresholds.
    x = subset(x, abs(avg_diff) > thresh.use &
                  ( pct.1 > min.pct | pct.2 > min.pct ) )
    cat(" Computing p-values... \n")
    # Binomial GLMM on detection (expr > 0) with batch as a random effect;
    # p-value for the group indicator via a Wald chi-square test.
    get_p = function( gene ) {
      data = FetchData(object, c(gene, ident.use, "orig.ident"))
      data[[gene]] %<>% is_greater_than(0)
      data[[ident.1]] = data[[ident.use]] %in% ident.1
      colnames(data) = make.names(colnames(data))
      # Column 1 is the detection indicator; column 4 is the group indicator
      # added just above.
      mod = lme4::glmer(formula = paste0( colnames(data)[1],
                                          " ~ (1|orig.ident) + ",
                                          colnames(data)[4] ),
                        family = "binomial",
                        data = data )
      mod_p = car::linearHypothesis( mod,
                                     hypothesis.matrix =
                                       paste0( make.names(ident.1), "TRUE = 0" ) )
      cat(".")
      return( mod_p$`Pr(>Chisq)`[[2]] )
    }
    # Non-converging fits error out; record NA and handle below.
    x$p.value = parallel::mclapply( x$gene, function(s) {
      tryCatch(get_p(s), error = function(e) NA)
    }) %>% simplify2array()
    failures = is.na(x$p.value)
    cat(" ", sum( failures ), " failed tests out of ", nrow(x),
        ". Setting failures to 1 for conservative FDR control. \n" )
    x$p.value[failures] = 1
  } else {
    x = Seurat::FindMarkers( object,
                             ident.1 = ident.1,
                             ident.2 = ident.2,
                             test.use = test.use,
                             genes.use = genes.use,
                             thresh.use = thresh.use,
                             min.pct = min.pct, ... )
  }
  # Seurat names its p-value column p_val; normalize, then add FDR q-values.
  if(is.null(x$p.value)){ x$p.value = x$p_val }
  if( !is.null( x$p.value ) ){
    x$q.value = p.adjust( x$p.value, method = "fdr" )
  }
  x = x[order(x[[order_by_var]], decreasing = TRUE), ]
  x$gene = rownames(x)
  return( x )
}

#' Sanitize gene names via `make.names`
#'
#' @param dge Seurat object
#' @return The same object with gene identifiers passed through make.names in
#'   every slot that stores them, so they stay consistent with each other.
#' @export
#'
SanitizeGenes = function( dge ){
  rownames( dge@raw.data )   %<>% make.names
  rownames( dge@data )       %<>% make.names
  rownames( dge@scale.data ) %<>% make.names
  # BUG FIX: @var.genes is a character vector of gene names; previously
  # make.names was applied to its (typically NULL) names attribute, leaving the
  # stored gene names unsanitized and mismatched with the sanitized rownames.
  dge@var.genes %<>% make.names
  return( dge )
}
#' Make a FACS-like plot from a single-cell rna-seq dataset.
#'
#' @param dge Seurat object
#' @param gene1 Horizontal axis on plot mimics this gene. Character, usually length 1 but possibly longer.
#' @param gene2 Vertical axis on plot mimics this gene. Character, usually length 1 but possibly longer.
#' @param genesets_predetermined If FALSE, plot the sum of many genes similar to gene1 instead of gene1 alone (same
#' for gene2). See ?get_similar_genes. If TRUE, plot the sum of only the genes given.
#' @param num_genes_add Each axis shows a simple sum of similar genes. This is how many (before removing overlap). Integer.
#' @param return_val If "all", returns a list with several internal calculations revealed.
#' If "plot", returns just a ggplot object. If "seurat", returns a Seurat object with gene scores added.
#' @param cutoffs If given, divide plot into four quadrants and annotate with percentages. Numeric vector of length 2.
#' @param facet_by Optional metadata variable; one panel per level.
#' @param include_panel_with_all If TRUE, add an extra panel pooling all cells (colored black).
#' @param facet_levels Panel order; defaults to the factor levels of facet_by (plus "all" if requested).
#' @param col Named colors for the facet levels; defaults to the ggplot hue palette.
#' @param dge_reference Seurat object. This function relies on gene-gene correlation. If your dataset is perturbed in a way
#' that would substantially alter gene-gene correlations, for example if different time points are present or certain
#' cell types are mostly depleted, you can feed in a reference dge, and TACS will choose axes based on the reference data.
#' @param density If TRUE, plot contours instead of points.
#' @param ... Extra params for stat_density2d.
#'
#' This function is based on a simple scheme: choose genes similar to the ones specified
#' and average them to reduce the noise.
#'
#' @export
#'
TACS = function( dge, gene1, gene2,
                 cutoffs = NULL,
                 return_val = "plot",
                 density = FALSE,
                 facet_by = NULL,
                 include_panel_with_all = FALSE,
                 facet_levels = FetchData(dge, facet_by)[[1]] %>%
                   factor %>% levels %>%
                   c(rep("all", include_panel_with_all), .),
                 col = stats::setNames(
                   scales::hue_pal()( length( facet_levels ) - include_panel_with_all ),
                   facet_levels[ ( include_panel_with_all + 1 ) : length( facet_levels )] ),
                 num_genes_add = 100,
                 genesets_predetermined = FALSE,
                 dge_reference = dge, ... ){
  # Get gene sets to average.
  if(genesets_predetermined){
    g1_similar = gene1
    g2_similar = gene2
  } else {
    g1_similar = get_similar_genes(dge_reference, gene1, num_genes_add) %>% c( gene1, . )
    g2_similar = get_similar_genes(dge_reference, gene2, num_genes_add) %>% c( gene2, . )
    # Genes correlated with both axes would leak signal between them; drop the overlap.
    shared = intersect(g1_similar, g2_similar)
    g1_similar %<>% setdiff(shared)
    g2_similar %<>% setdiff(shared)
  }
  # Average gene sets to get scores.
  g1_score = rowMeans(FetchDataZeroPad(dge, g1_similar))
  g2_score = rowMeans(FetchDataZeroPad(dge, g2_similar))
  g1_score_name = paste0(gene1[1], "_score")
  g2_score_name = paste0(gene2[1], "_score")
  # Add scores as metadata. Extract with faceting var into plotting data.
  dge %<>% AddMetaData(g1_score, col.name = g1_score_name)
  dge %<>% AddMetaData(g2_score, col.name = g2_score_name)
  plot_df = FetchData(dge, c(g1_score_name, g2_score_name, facet_by))
  # Augment data to form extra panel with everything.
  if( include_panel_with_all ){
    plot_df_all = plot_df
    plot_df_all[[facet_by]] = "all"
    plot_df = rbind(plot_df, plot_df_all)
    col = c(col, "all"="black")
  }
  # Prepare to facet.
  if(!is.null(facet_by)) {
    plot_df[[facet_by]] %<>% factor(levels = facet_levels, ordered = TRUE) %>% droplevels
  }
  # Form plot.
  p = ggplot(plot_df)
  if(density){
    # BUG FIX: `...` is documented as extra stat_density2d parameters but was
    # previously never forwarded; pass it through.
    p = p + stat_density2d( aes_string( x = g1_score_name,
                                        y = g2_score_name,
                                        colour = facet_by,
                                        alpha = "..level.." ),
                            bins = 50, ... ) +
      scale_alpha_continuous( range = c(0.4, 1) ) +
      scale_color_manual(values = col)
  } else {
    p = p + geom_point( aes_string( x=g1_score_name, y=g2_score_name ) )
  }
  p = p + expand_limits(y=0, x=0)
  # Facet if desired.
  if(!is.null(facet_by)) {
    p = p + facet_wrap(as.formula(paste0("~", facet_by)))
  }
  # Add quadrants and percentages.
  if( !is.null(cutoffs)){
    p %<>% add_quadrants( g1_score_name = g1_score_name,
                          g2_score_name = g2_score_name,
                          cutoffs = cutoffs,
                          facet_by = facet_by )
  }
  # Return everything or just a plot or just a seurat object.
  if( return_val == "all" ){
    return( list( plot = p,
                  dge = dge,
                  genes = list( gene1 = gene1, gene2 = gene2 ),
                  score_names = c( g1_score_name, g2_score_name ),
                  genesets = list( g1_similar, g2_similar ),
                  cutoffs = cutoffs,
                  plot_df = plot_df ) )
  } else if( return_val == "seurat" ){
    return(dge)
  } else if( return_val == "plot" ){
    return( p )
  } else {
    stop(" return_val should be 'all', 'seurat', or 'plot'. ")
  }
}

#' Split a scatterplot (or similar) into quadrants and label percentages in each quadrant.
#'
#' @param cutoffs numeric vector of length 2. Where to delineate the quadrants.
#' @param facet_by optional facetting variable. Percents are calculated separately for each facet.
#'
#' This is a helper for TACS, but it's exported in case it becomes useful.
#' @param p ggplot object whose $data holds the score columns named below.
#' @param g1_score_name Column of p$data giving the horizontal score.
#' @param g2_score_name Column of p$data giving the vertical score.
#'
#' @export
#'
add_quadrants = function(p, g1_score_name, g2_score_name, cutoffs, facet_by = NULL){
  # Draw the quadrant boundaries at the two cutoffs.
  boundary_v = geom_vline(data = data.frame(xint=cutoffs[1]), aes(xintercept=xint))
  boundary_h = geom_hline(data = data.frame(yint=cutoffs[2]), aes(yintercept=yint))
  p = p + boundary_v + boundary_h
  # Dichotomize each score against its cutoff, then tabulate quadrant occupancy
  # (per facet when one is given) and convert counts to percentages.
  quadrant_df = p$data[c(g1_score_name, g2_score_name, facet_by)]
  quadrant_df[[g1_score_name]] %<>% is_greater_than(cutoffs[1])
  quadrant_df[[g2_score_name]] %<>% is_greater_than(cutoffs[2])
  quadrant_df %<>% table %>% (reshape2::melt)
  if(is.null(facet_by)) {
    quadrant_df$value %<>% percentify()
  } else {
    quadrant_df = quadrant_df[order(quadrant_df[[facet_by]]), ]
    for( lev in unique(p$data[[facet_by]])){
      quadrant_df[quadrant_df[[facet_by]] == lev, "value"] %<>% percentify()
    }
  }
  # One text annotation per nonzero quadrant, nudged toward that quadrant's
  # corner relative to the cutoffs; tag with the facet level so it lands on the
  # right panel.
  for( k in seq_along(quadrant_df$value)){
    pct = quadrant_df$value[k]
    if(pct == 0){ next }
    label_df = data.frame(
      x = ifelse( quadrant_df[k, g1_score_name], cutoffs[1]*2, cutoffs[1]*0.35),
      y = ifelse( quadrant_df[k, g2_score_name], cutoffs[2]*2, cutoffs[2]*0.25),
      label = paste0( round(pct, 1), "%")
    )
    if(!is.null(facet_by)) {
      label_df[[facet_by]] = quadrant_df[k, facet_by]
    }
    p = p + geom_text( data = label_df, aes(x=x,y=y,label=label) )
  }
  return(p)
}
# NOTE(review): trailing lines were website-embed boilerplate from the page this
# file was scraped from ("Add the following code to your website...") and are
# not part of the source; commented out so the file parses.