title: "Cleaning the DGE Data" author: "Eric Kernfeld" date: "September 7, 2016" output: html_document


Data handling

#' Replace missing values in the metadata with a filler value, issuing a warning if changes are made. 
#'
#' @param object Seurat object
#' @param filler Value used in place of NA (default is the string "NA").
#'
#' @export
#'
FillNA = function( object, filler = "NA" ){
  varnames = object@data.info %>% names
  missing_list = c()
  na2filler = function(x){
    x[is.na(x)] = filler
    return(x)
  }
  for( var in varnames ){
    if( any( is.na( object@data.info[[var]] ) ) ){
      missing_list %<>% c(var)
      object@data.info[[var]] %<>% na2filler
    }
  }
  if( length( missing_list ) > 0 ){
    warning( paste0( "Missing values found in these variables: \n", 
                     paste0( missing_list, collapse = "\n" ),
                     "\nReplacing with ", filler, " .\n\n" ) )
  }
  return( object )
}
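A minimal usage sketch; the Seurat object `dge` and the choice of filler are assumptions, not from the original source:

# Replace NAs in every @data.info column with the string "unknown";
# a warning lists the affected columns.
dge = FillNA( dge, filler = "unknown" )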


#' Get available variable names (genes, identity classes, PCA embeddings, etc.)
#'
#' @param object Seurat object
#' @return Returns a character vector of all eligible inputs to the `vars.all` argument of `FetchData`.
#' @export
AvailableData = function( object ){
  available_categorized = list( metadata = names( object@data.info ),
                                PCs = names(object@pca.x),
                                tsne = names(object@tsne.rot),
                                ICs = names(object@ica.rot),
                                genes = rownames( object@data ),
                                ident = "ident" )
  return( Reduce( f = union, x = available_categorized ) )
}
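For example (hypothetical object `dge` and query names), this can be used to check a query before calling FetchData:

# List everything FetchData can retrieve, then check whether a few candidate variables are present.
vars_available = AvailableData( dge )
c( "Foxn1", "PC1", "ident" ) %in% vars_available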


#' FetchData, but with zeroes for unavailable genes
#'
#' @export
#' @param dge Seurat object
#' @param vars.all List of all variables to fetch. Missing entries are ignored.
#' @param warn If TRUE, warn when some requested variables are unavailable.
#' @param ... Other arguments to pass to FetchData
#'
#' @details One limitation: if you ask for "PC1" and it is not available, it is
#' treated as a non-expressed gene, so zeroes are returned for it.
FetchDataZeroPad = function( dge, vars.all, warn = T, ... ){
  vars.all = vars.all[complete.cases(vars.all)]
  avail = intersect( vars.all, AvailableData( dge ) )
  unavail = setdiff( vars.all, AvailableData( dge ) )
  if(warn && length(unavail) > 0){
    warning("Some variables are not available. Returning zeroes.\n")
  }
  to_return  = FetchData( dge,  avail, ... ) 
  pad = as.data.frame( matrix(0,           
                              nrow = nrow( to_return ), 
                              ncol = length( unavail ),
                              dimnames = list( rownames( to_return ),               
                                               unavail) ) )
  to_return = cbind( to_return, pad )
  assertthat::assert_that( assertthat::are_equal( sort( vars.all ), sort( colnames( to_return ) ) ) )
  to_return = to_return[, vars.all, drop = F]
  assertthat::assert_that( is.data.frame( to_return ) )
  return( to_return )
}
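A usage sketch; the Seurat object `dge` and the gene names are placeholders:

# Genes absent from the object come back as all-zero columns (with a warning),
# so downstream code can rely on getting one column per requested variable.
expr = FetchDataZeroPad( dge, c( "Actb", "NotARealGene" ) )
colMeans( expr )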


#' Subset data flexibly from a Seurat object.
#'
#' @param dge Seurat object
#' @param vars.use Variables to fetch for use in `predicate`.
#' @param predicate String to be parsed into an R expression and evaluated as an argument to base::subset.
#' @param preserve_raw If TRUE, then it will not exclude any cells from the @raw.data slot.
#' By default, it leaves cells in @raw.data only if they satisfy the given predicate.
#' @param show_on_tsne If TRUE, save a tSNE plot showing which cells are kept.
#' @param results_path Where to save the tSNE.
#' @param ... Extra params passed to tsne_colored.
#' @details Calls FetchData, subsets it as a dataframe using base::subset, and 
#' subsets the Seurat object correspondingly (using the df rownames.)
#'
#' @export
#'
SubsetDataFlex = function( dge, vars.use, predicate, preserve_raw = FALSE, 
                           results_path = NULL, show_on_tsne = !is.null(results_path), ... ){
  if( typeof(predicate) != "character"){
    stop("predicate should be a character vector. It will be parsed and evaluated as an R expression within base::subset.")
  }

  df = FetchData(dge, vars.use) %>% as.data.frame
  cu = df %>% subset(eval(parse(text=predicate))) %>% rownames
  if( show_on_tsne ){
    dge@data.info$included = (rownames(dge@data.info) %in% cu) %>% as.character
    tsne_colored( dge, results_path, colour = "included", ... )
  }
  dge = SubsetData(dge, cells.use = cu)
  if( !preserve_raw ){
    dge@raw.data = deseuratify_raw_data(dge) 
  }
  return( dge )
}
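A usage sketch; the object, metadata columns, and thresholds are hypothetical:

# Keep cells from embryonic day 12.5 with at least 1000 genes detected.
dge_subset = SubsetDataFlex( dge,
                             vars.use  = c( "eday", "nGene" ),
                             predicate = "eday == 12.5 & nGene >= 1000" )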

#' Merge two Seurat objects.
#'
#' By default, preserves any column of @data.info shared between both objects.  
#' You can also specify which variables to keep. They will be added to @data.info in 
#' the output, warning and padding with zeroes if either object lacks any var in vars.keep.
#'
#' @param dge1,dge2 Seurat objects to merge.
#' @param preserve_ident If TRUE, carry over the @ident slot from both objects.
#' @param vars.keep Metadata variables to keep; defaults to the columns shared by both objects.
#'  
#' @export
#' 
SeuratMerge = function( dge1, dge2, preserve_ident = F, 
                        vars.keep = intersect(names(dge1@data.info), 
                                              names(dge2@data.info) ) ){
  # Save on memory
  dge1@scale.data = matrix()
  dge2@scale.data = matrix()
  gc()

  # Do merging
  dge_all = list( dge1 = deseuratify_raw_data( dge1 ), 
                  dge2 = deseuratify_raw_data( dge2 ) ) %>%
    dge_merge_list %>% seuratify_thy_data 
  characterize_factor = function(x){ if(is.factor(x)) as.character(x) else x }
  all_factors_to_char = function(X) data.frame(lapply(X, characterize_factor), stringsAsFactors=FALSE)

  if( length(vars.keep) > 0 ){
    preserved_metadata = rbind( FetchDataZeroPad( dge1, vars.keep ) %>% all_factors_to_char, 
                                FetchDataZeroPad( dge2, vars.keep ) %>% all_factors_to_char )
    preserved_metadata %<>% as.data.frame
    rownames(preserved_metadata) = c( rownames(dge1@data.info), rownames(dge2@data.info) ) 
    # Attach the preserved metadata to the merged object (otherwise it would be computed but never used).
    dge_all %<>% AddMetaData( metadata = preserved_metadata )
  }

  if(preserve_ident){
    new_ident = c( characterize_factor(dge1@ident), 
                   characterize_factor(dge2@ident) )
    names(new_ident) = c(names(dge1@ident), names(dge2@ident))
    dge_all %<>% SetIdent(new_ident, cells.use = names(new_ident) )
  }

  return(dge_all)
}
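A usage sketch, assuming two Seurat objects `dge_rep1` and `dge_rep2` that share a `cell_type` metadata column (hypothetical names):

# Merge replicates, carrying over cluster assignments and the shared metadata column.
dge_combined = SeuratMerge( dge_rep1, dge_rep2,
                            preserve_ident = TRUE,
                            vars.keep = "cell_type" )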

#' Make small-multiple pie charts.
#'
#' @param dge Seurat object
#' @param ident.use Becomes the categories in the pie chart
#' @param facet_by Each small multiple contains cases at one level of this variable.
#' @param do.test If TRUE, test each category for a quadratic trend in percentage over eday
#' (weighted by the number of cells at each eday) and report log10 p-values in the legend.
#' @param col Optional colorscale.
#' @param label Logical. If TRUE, percentages are added.
#' @param main Plot title.
#' @param drop_levels If TRUE, omit facets that would be empty. 
#'
#' @export
#'
SeuratPie = function( dge, ident.use = "cell_type", facet_by = "eday",
                      do.test = FetchDataZeroPad(dge, "eday")[[1]] %>% unique %>% length %>% is_greater_than(1),
                      col = NULL, label = F,
                      main = "Sample makeup by day", drop_levels = F ){
  cell_types = FetchData(dge, ident.use)[[1]] %>% unique 
  if( length(cell_types) < 2 ){
    stop("\n SeuratPie cannot handle cases with only one level in ident.use.\n")
  }

  #### Testing
  # Test each cluster for a quadratic trend in pct by eday, weighted by the number of cells at each eday.
  if(do.test){

    # Assemble percentages for testing
    pct_for_testing = FetchData( dge, c( ident.use, "orig.ident" )) %>%
      table %>%
      apply(2, percentify) %>%
      (reshape2::melt) %>%
      plyr::rename(c("value" = "percent"))
    ncells =  FetchData( dge, "orig.ident" ) %>% table

    # Fill in eday based on sample id
    map_to_eday = setNames(get_metadata()$eday,
                           get_metadata()$Sample_ID %>% as.character )

    map_to_eday %<>% na.omit
    nm = names( map_to_eday )
    map_to_eday %<>% as.numeric
    names(map_to_eday) = nm
    pct_for_testing$eday = map_to_eday[pct_for_testing$orig.ident %>% as.character]
    pct_for_testing$eday_sq = pct_for_testing$eday ^ 2

    # Quadratic fit, weighted by day, tested against null model
    pvals = rep(1, length(cell_types))
    names(pvals) = cell_types
    for( cl in cell_types ){
      this_cluster_data = subset( pct_for_testing, eval(parse(text = ident.use)) == cl )
      mod = lm( data = this_cluster_data, formula = percent~eday + eday_sq, weights = ncells )
      test_result = car::linearHypothesis(mod, c("eday = 0", "eday_sq = 0" ))
      pvals[cl] = test_result$`Pr(>F)`[[2]]
    }
  }

  #### Plotting
  # Get percentages by facet
  X = FetchData( dge, c( ident.use, facet_by )) %>%
    table %>%
    apply(2, percentify) %>%
    (reshape2::melt) %>%
    plyr::rename(c("value" = "percent"))
  if( drop_levels ){
    X %<>% subset( percent != 0 )
  }
  facet_values = FetchData( dge, facet_by )[[1]]
  if(is.factor(facet_values) && !drop_levels){
    X[[facet_by]] %<>% as.character %>% factor(levels = levels(facet_values), ordered = T)
  }

  # Position percentages decently
  X$at = 0
  X = X[order(X[[ident.use]]), ]
  for(facet_level in unique(X[[facet_by]])){
    idx = (X[[facet_by]] == facet_level)
    X[idx, "at"] = 100 - ( cumsum(X[idx, "percent"]) - X[idx, "percent"]/2 )
  }

  # Pie charts require stat=identity and x=constant
  p = ggplot(X) + ggtitle( main) +
    geom_bar( aes_string( y = "percent", x = "factor('')", fill = ident.use ),
              position = "stack", stat='identity' ) +
    coord_polar(theta = "y") + xlab("") + ylab("") +
    facet_wrap(facet_by, nrow = 1) + theme(axis.ticks = element_blank(),
                                           axis.text.y = element_blank(),
                                           axis.text.x = element_blank())
  if( label ) { p = p + geom_text( aes( y = at, x = 1.5, label = percent ) ) }
  if(do.test){
    pval_text = paste0(" (", round(log10(pvals), 1), ")")
  } else {
    pval_text = ""
  }
  # Add a single fill scale; scale_fill_manual needs colours, so fall back to the default palette when col is NULL.
  if( is.null(col) ){
    p = p + scale_fill_discrete( name = "Cell type (Log10 p)",
                                 breaks = cell_types,
                                 labels = paste0(cell_types, pval_text) )
  } else {
    p = p + scale_fill_manual( name = "Cell type (Log10 p)",
                               values = col,
                               breaks = cell_types,
                               labels = paste0(cell_types, pval_text) )
  }

  return(p)
}
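A usage sketch; metadata columns `cell_type` and `eday` are assumed to exist, as in the defaults:

# One pie per embryonic day, colored by cell type, with percentage labels.
p = SeuratPie( dge, ident.use = "cell_type", facet_by = "eday", label = TRUE )
print( p )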


#' Test for markers flexibly from a Seurat object.
#'
#' Calls FindMarkers with extra features.
#'
#' @param object Seurat object.
#' @param ident.use Fetched via FetchData to define the groups being tested.
#' @param ident.1,ident.2 Group labels (possibly vectors of labels) to compare. ident.2 defaults
#'   to all other levels of ident.use.
#' @param test.use Passed into FindMarkers unless it is "binomial_batch", in which case 
#'   it uses approximate p-values based on a binomial GLMM with a random effect for batch (1|orig.ident). 
#'
#' @details All other parameters are passed into FindMarkers unless test.use=="binomial_batch", in 
#' which case I attempt to match the behavior of FindMarkers.
#' 
#' Output contains an extra column for q-values from p.adjust(..., method="fdr").
#'
#' @export
#'
FindMarkersFlex = function( object,
                            ident.use, ident.1, 
                            ident.2 = object %>% FetchData(ident.use) %>% extract2(1) %>% unique %>% setdiff(ident.1),
                            order_by_var = "avg_diff",
                            thresh.use = 0.25, 
                            test.use = "binomial_batch",
                            genes.use = object@data %>% rownames,
                            min.pct = 0.1, ... ){
  # This chunk handles ident.1 or .2 of length greater than 1 by collapsing them both with underscores.
  new_ident = FetchData( object, ident.use )[[1]] %>% as.character
  names(new_ident) = names(object@ident)
  new_ident[new_ident %in% ident.1] = paste0(ident.1, collapse = "_")
  new_ident[new_ident %in% ident.2] = paste0(ident.2, collapse = "_")
  ident.1 = paste0(ident.1, collapse = "_")
  ident.2 = paste0(ident.2, collapse = "_")
  object %<>% AddMetaData(metadata = new_ident, col.name = ident.use)
  # To interface with Seurat, the @ident slot gets overwritten with the groups for the expression test.
  object %<>% Seurat::SetIdent(ident.use = new_ident)
  # Slice the object down to just the relevant cells, to save time and reduce code complexity downstream.
  predicate = paste0(ident.use, " %in% c( '", ident.1, "', '", paste0(ident.2, collapse = "', '"), "' )")
  object %<>% SubsetDataFlex(vars.use = ident.use, predicate)
  genes.use %<>% intersect(AvailableData(object))
  if (test.use == "binomial_batch") {
    cat(" \n Computing summaries... \n")
    x = data.frame( gene = genes.use, stringsAsFactors = F )
    rownames( x ) = x$gene
    group_means = aggregate_nice( x  = FetchData(object, genes.use), 
                                  by = FetchData(object, ident.use), 
                                  FUN = mean ) %>% t
    group_pcts  = aggregate_nice( x  = FetchData(object, genes.use), 
                                  by = FetchData(object, ident.use), 
                                  FUN = prop_nz ) %>% t
    x$avg_diff  = group_means[, ident.1] - group_means[, ident.2]
    x$pct.1 = group_pcts[, ident.1]
    x$pct.2 = group_pcts[, ident.2]
    x = subset(x, abs(avg_diff) > thresh.use & ( pct.1 > min.pct | pct.2 > min.pct ) )
    cat(" Computing p-values... \n")
    get_p = function( gene ) {
      data = FetchData(object, c(gene, ident.use, "orig.ident"))
      data[[gene]]  %<>% is_greater_than(0)
      data[[ident.1]] = data[[ident.use]] %in% ident.1
      colnames(data) = make.names(colnames(data))
      mod = lme4::glmer(formula = paste0( colnames(data)[1], " ~ (1|orig.ident) + ", colnames(data)[4] ) , 
                        family = "binomial", data = data )
      mod_p = car::linearHypothesis( mod, hypothesis.matrix = paste0( make.names(ident.1), "TRUE = 0" ) )
      cat(".")
      return( mod_p$`Pr(>Chisq)`[[2]] )
    }
    x$p.value = parallel::mclapply( x$gene, 
                                    function(s) {
                                      tryCatch(get_p(s), error = function(e) NA) 
                                    }) %>% simplify2array()
    failures = is.na(x$p.value)
    cat("    ", sum( failures ), " failed tests out of ", nrow(x), 
        ". Setting failures to 1 for conservative FDR control. \n" )
    x$p.value[failures] = 1
  } else {
    x = Seurat::FindMarkers( object, ident.1 = ident.1, ident.2 = ident.2,                              
                             test.use = test.use,
                             genes.use = genes.use,   
                             thresh.use = thresh.use, 
                             min.pct = min.pct, ... )
  }
  if(is.null(x$p.value)){x$p.value = x$p_val}
  if( !is.null( x$p.value ) ){
    x$q.value = p.adjust( x$p.value, method = "fdr" )
  }
  x = x[order(x[[order_by_var]], decreasing = T), ]
  x$gene = rownames(x)
  return( x )
}
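A usage sketch; the cluster label is hypothetical, and the batch-aware test assumes an `orig.ident` column identifying samples:

# Test cluster "thymus" against all other clusters using the batch-aware binomial GLMM,
# then keep genes passing a 5% FDR.
markers = FindMarkersFlex( dge, ident.use = "cell_type", ident.1 = "thymus",
                           test.use = "binomial_batch" )
markers_sig = subset( markers, q.value < 0.05 )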


#' Sanitize gene names via `make.names`
#'
#' @param dge Seurat object
#'
#' @export 
#'
SanitizeGenes = function( dge ){
  rownames( dge@raw.data )   %<>% make.names
  rownames( dge@data )       %<>% make.names
  rownames( dge@scale.data ) %<>% make.names
  dge@var.genes              %<>% make.names
  return( dge )
}
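For example, gene symbols with characters that are invalid in R names get converted:

# make.names turns characters like "-" into ".", e.g. "Nkx2-1" becomes "Nkx2.1".
dge = SanitizeGenes( dge )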

Transcript averaged cell scoring (TACS) plot

#' Make a FACS-like plot from a single-cell rna-seq dataset.
#'
#' @param dge Seurat object
#' @param gene1 Horizontal axis on plot mimics this gene. Character, usually length 1 but possibly longer.
#' @param gene2 Vertical axis on plot mimics this gene. Character, usually length 1 but possibly longer. 
#' @param genesets_predetermined If FALSE, plot the sum of many genes similar to gene1 instead of gene1 alone (same 
#' for gene2). See ?get_similar_genes. If TRUE, plot the sum of only the genes given.
#' @param num_genes_add Each axis shows a simple sum of similar genes. This is how many (before removing overlap). Integer.
#' @param return_val If "all", returns a list with several internal calculations revealed.
#' If "plot", returns just a ggplot object. If "seurat", returns a Seurat object with gene scores added. 
#' @param cutoffs If given, divide plot into four quadrants and annotate with percentages. Numeric vector of length 2.
#' @param dge_reference Seurat object. This function relies on gene-gene correlation. If your dataset is perturbed in a way 
#' that would substantially alter gene-gene correlations, for example if different time points are present or certain 
#' cell types are mostly depleted, you can feed in a reference dge, and TACS will choose axes based on the reference data.
#'
#' @param density If TRUE, plot contours instead of points.
#' @param facet_by Optional variable to facet the plot by.
#' @param include_panel_with_all If TRUE, add an extra facet containing all cells.
#' @param facet_levels Levels of facet_by to display, in order.
#' @param col Named vector of colors, one per level of facet_by.
#' @param ... Extra params for stat_density2d.
#'
#' @details This function is based on a simple scheme: choose genes similar to the ones specified 
#' and average them to reduce the noise. 
#'
#' @export 
#'
TACS = function( dge, gene1, gene2, cutoffs = NULL, 
                 return_val = "plot", density = F, 
                 facet_by = NULL, 
                 include_panel_with_all = FALSE, 
                 facet_levels = 
                   FetchData(dge, facet_by)[[1]] %>% factor %>% levels %>% 
                   c(rep("all", include_panel_with_all), .),
                 col = stats::setNames( scales::hue_pal()( length( facet_levels ) - include_panel_with_all ), 
                                        facet_levels[ ( include_panel_with_all + 1 ) : length( facet_levels )] ),
                 num_genes_add = 100, genesets_predetermined = F, dge_reference = dge, ... ){
  # Get gene sets to average
  if(genesets_predetermined){
    g1_similar = gene1
    g2_similar = gene2
  } else {
    g1_similar = get_similar_genes(dge_reference, gene1, num_genes_add) %>% c( gene1, . )
    g2_similar = get_similar_genes(dge_reference, gene2, num_genes_add) %>% c( gene2, . ) 
    shared = intersect(g1_similar, g2_similar)
    g1_similar %<>% setdiff(shared)
    g2_similar %<>% setdiff(shared)
  }

  # Average gene sets to get scores
  g1_score = rowMeans(FetchDataZeroPad(dge, g1_similar))
  g2_score = rowMeans(FetchDataZeroPad(dge, g2_similar))
  g1_score_name = paste0(gene1[1], "_score")
  g2_score_name = paste0(gene2[1], "_score")

  #Add scores as metadata. Extract with faceting var into plotting data.
  dge %<>% AddMetaData(g1_score, col.name = g1_score_name)
  dge %<>% AddMetaData(g2_score, col.name = g2_score_name)
  plot_df = FetchData(dge, c(g1_score_name, g2_score_name, facet_by))

  # Augment data to form extra panel with everything
  if( include_panel_with_all ){
    plot_df_all = plot_df
    plot_df_all[[facet_by]] = "all"
    plot_df = rbind(plot_df, plot_df_all)
    col = c(col, "all"="black")
  } 
  # Prepare to facet
  if(!is.null(facet_by)) {
    plot_df[[facet_by]] %<>% factor(levels = facet_levels, ordered = T) %>% droplevels
  }

  # Form plot
  p = ggplot(plot_df) 
  if(density){ 
    p = p + stat_density2d( aes_string( x = g1_score_name, y = g2_score_name, 
                                        colour = facet_by, alpha = "..level.." ), bins = 50 ) +
      scale_alpha_continuous( range = c(0.4, 1) ) + 
      scale_color_manual(values = col)
  } else {
    p = p + geom_point( aes_string( x=g1_score_name, y=g2_score_name ) ) 
  }
  p = p + expand_limits(y=0, x=0)
  # Facet if desired
  if(!is.null(facet_by)) {
    p = p + facet_wrap(as.formula(paste0("~", facet_by)))
  }

  # Add quadrants and percentages
  if( !is.null(cutoffs)){
    p %<>% add_quadrants(g1_score_name = g1_score_name,
                         g2_score_name = g2_score_name, 
                         cutoffs = cutoffs,
                         facet_by = facet_by)
  } 

  # Return everything or just a plot or just a seurat object
  if( return_val == "all" ){
    return( list( plot = p, 
                  dge = dge, 
                  genes = list( gene1 = gene1, gene2 = gene2 ),
                  score_names = c( g1_score_name, g2_score_name ), 
                  genesets = list( g1_similar, g2_similar ),
                  cutoffs = cutoffs,
                  plot_df = plot_df ) )
  } else if( return_val == "seurat" ){
    return(dge)
  } else if( return_val == "plot" ){
    return( p )
  } else {
    stop(" return_val should be 'all', 'seurat', or 'plot'. ")
  }
}
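A usage sketch; the gene names and cutoffs are hypothetical, and `dge` is an assumed Seurat object:

# FACS-style plot of an Epcam-like score versus a Ptprc-like score, split into quadrants,
# with density contours faceted by embryonic day.
p = TACS( dge, gene1 = "Epcam", gene2 = "Ptprc",
          cutoffs = c( 0.5, 0.5 ),
          facet_by = "eday", density = TRUE )
print( p )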

#' Split a scatterplot (or similar) into quadrants and label percentages in each quadrant. 
#'
#' @param p ggplot object to annotate.
#' @param g1_score_name,g2_score_name Names of the variables on the horizontal and vertical axes of `p`.
#' @param cutoffs Numeric vector of length 2. Where to delineate the quadrants.
#' @param facet_by Optional faceting variable. Percentages are calculated separately for each facet.
#'
#' This is a helper for TACS, but it's exported in case it becomes useful.
#'
#' @export 
#'
add_quadrants = function(p, g1_score_name, g2_score_name, cutoffs, facet_by = NULL){

  # Calculate percentages
  p = p + geom_vline(data = data.frame(xint=cutoffs[1]), aes(xintercept=xint))
  p = p + geom_hline(data = data.frame(yint=cutoffs[2]), aes(yintercept=yint))
  percentages = p$data[c(g1_score_name, g2_score_name, facet_by)]
  percentages[[g1_score_name]] %<>% is_greater_than(cutoffs[1])
  percentages[[g2_score_name]] %<>% is_greater_than(cutoffs[2])
  percentages %<>% table %>% (reshape2::melt)
  if(!is.null(facet_by)) {
    percentages = percentages[order(percentages[[facet_by]]), ]
    for( facet_level in unique(p$data[[facet_by]])){
      percentages[percentages[[facet_by]] == facet_level, "value"] %<>% percentify()
    } 
  } else {
    percentages$value %<>% percentify()
  }

  # Form annotation DF with correct facet and attempt sensible placement of percentages
  for( i in seq_along(percentages$value)){
    if(percentages$value[i]==0){next}
    annot_df = data.frame(
      x = ifelse( percentages[i, g1_score_name], cutoffs[1]*2, cutoffs[1]*0.35) ,
      y = ifelse( percentages[i, g2_score_name], cutoffs[2]*2, cutoffs[2]*0.25) ,
      label = paste0( round(percentages$value[i], 1), "%") )
    if(!is.null(facet_by)) {
      annot_df[[facet_by]] = percentages[i, facet_by]
    }
    p = p + geom_text( data = annot_df, aes(x=x,y=y,label=label) )                
  }

  return(p)
}
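A usage sketch building on the TACS example above; the score names follow the `<gene>_score` convention that TACS uses, and the cutoffs are placeholders:

# Annotate an existing TACS scatterplot with quadrant lines and per-quadrant percentages.
p = TACS( dge, gene1 = "Epcam", gene2 = "Ptprc", return_val = "plot" )
p = add_quadrants( p, g1_score_name = "Epcam_score", g2_score_name = "Ptprc_score",
                   cutoffs = c( 0.5, 0.5 ) )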

