title: "Cleaning the DGE Data" author: "Eric Kernfeld" date: "September 7, 2016" output: html_document
#' Run Monocle, DPT, or PCA-based pseudotime analysis on a Seurat object. #' #' @param dge Seurat object #' @param results_path Where to save tables and plots #' @param earliest_day Numeric. Used to root the Monocle trajectory; gets compared to `FetchData(dge, "eday")[[1]]` via `==`. #' @param mp Passed to call_monocle_on_seurat as monocle_params arg. #' @param ... Additional parameters passed to call_monocle_on_seurat. #' #' The statistical inference is outsourced to Monocle via call_monocle_on_seurat (or Seurat via pc_as_pt). #' @export master_pt = function( dge, results_path, method = "monocle", earliest_day = NULL, reset_var_genes = T, mp = list( reset_var_genes = reset_var_genes, log_scale_expr_thresh = 0.1, excess_disp = 1, num_mature_types = NULL, reduction_method = "DDRTree" ), ... ){ if( method=="PCA" ){ dge = pc_as_pt( dge )$dge } else if (method=="DPT") { if( !requireNamespace("destiny", quietly = TRUE) | packageVersion("destiny") < "2" ){ stop("Option DPT requires the package 'destiny' version >=2.5.0.") } if( reset_var_genes ){ dge %<>% MeanVarPlot } dm = destiny::DiffusionMap( data = t( as.matrix( dge@scale.data[dge@var.genes, ] ) ) ) dpt_out = destiny::DPT( dm, ... ) dge = add_pseudotime_to_seurat( dge, pt_obj = dpt_out, pt_method = "dpt" ) } else { mobj = call_monocle_on_seurat( dge, results_path, monocle_params = mp, earliest_day = earliest_day, ... ) dge = add_pseudotime_to_seurat( dge, pt_obj = mobj ) } return( dge ) } #' Take a principal component (first by default) and return a list of results gained by interpreting that PC as a pseudotime axis. #' #' @param dge: Seurat object to be used. Should have variable genes and PCA fields already filled in. #' Jackstraw and t-SNE are optional but preferred. #' @param orient_var: pseudotime is flipped to correlate positively with this. Anything available #' from `Seurat::FetchData(dge, orient_var)` is fair game... as long as it's numeric. #' @param pc.use: which principal component to take #' #' @return List with named elements: #' `$dge`: Seurat object with a `pseudotime` metadatum filled in #' `$gene_corrs`: dataframe with genes ordered by correlation with pseudotime. #' Columns are `gene`, `corr`, and (if available from Jackstraw) `p_value`. pc_as_pt = function( dge, pc.use = 1, orient_var = "eday" ){ # # Construct pseudotime pt = Seurat::FetchData(dge, paste0("PC", pc.use)) %>% vectorize_preserving_rownames # # Get correlations with genes and sort through them dge = ProjectPCA( dge, do.print = F ) gene_corrs = dge@pca.x.full[, "PC1", drop = F] %>% vectorize_preserving_rownames %>% sort gene_corrs = data.frame( gene = names(gene_corrs), corr = gene_corrs ) if( prod( dim( dge@jackStraw.empP.full ) ) > 0 ){ # Need to index by gene because of earlier sorting gene_corrs$p_value = dge@jackStraw.empP.full[gene_corrs$gene, "PC1"] } rownames(gene_corrs) = gene_corrs$gene # # Flip everything if necessary ov = Seurat::FetchData( dge, orient_var )[[1]] atat(is.numeric(ov)) wrong_way = cor( pt, ov ) < 0 if( wrong_way ){ orient = function(x) -x } else { orient = function(x) x } pt %<>% orient dge@pca.rot [, 1] %<>% orient dge@pca.x [, 1] %<>% orient dge@pca.x.full[, 1] %<>% orient gene_corrs$corr %<>% orient # # Add pt as metadata and return dge = Seurat::AddMetaData( dge, pt, "pseudotime" ) return( list( dge = dge, gene_corrs = gene_corrs ) ) }
#' Extract data from a Seurat object and run Monocle, returning a Monocle object. #' #' #' @export call_monocle_on_seurat = function( dge, results_path, monocle_params, earliest_day = NULL ){ attach( monocle_params ) # # Make (m)onocle (obj)ect raw_dge = deseuratify_raw_data( dge ) rownames(dge@mean.var) %<>% make.names geneInfo = data.frame( dge@mean.var[rownames(raw_dge), ] ) geneInfo$gene = rownames(geneInfo) adf_class = getClassDef("AnnotatedDataFrame", where, package = "Biobase") pd = new(adf_class, data = dge@data.info[colnames(raw_dge), ]) fd = new(adf_class, data = geneInfo) mobj = monocle::newCellDataSet( as.matrix( raw_dge ), phenoData = pd, featureData=fd, lowerDetectionLimit=1, expressionFamily=VGAM::negbinomial.size( ) ) # # Select variable genes or get from Seurat object mobj = monocle::detectGenes( mobj, min_expr = 0 ) mobj = BiocGenerics::estimateSizeFactors( mobj ) mobj = BiocGenerics::estimateDispersions( mobj ) if(reset_var_genes){ disp_table = monocle::dispersionTable( mobj ) ordering_genes = subset(disp_table, ( mean_expression >= log_scale_expr_thresh ) & ( dispersion_empirical >= excess_disp * dispersion_fit ) )[["gene_id"]] cc_genes_macosko = get_macosko_cc_genes() cc_genes_all = Reduce(union, cc_genes_macosko) %>% sapply( get_ortholog ) %>% unique %>% na.omit ordering_genes %<>% as.character %>% setdiff( cc_genes_all ) expressed_genes = row.names(subset(Biobase::fData(mobj), num_cells_expressed >= 10)) ordering_genes = intersect(expressed_genes,ordering_genes) mobj = monocle::setOrderingFilter(mobj,ordering_genes) } else { mobj = monocle::setOrderingFilter(mobj, dge@var.genes ) } # # Save variable genes and parameters vgsrp = file.path( results_path, "var_gene_select" ) dir.create.nice( vgsrp ) gd = mobj@featureData@data var_genes = gd$gene[gd$use_for_ordering] cell_markers = get_rene_markers() variable_cell_markers = intersect( Capitalize(cell_markers$marker), Capitalize(as.character(var_genes)) ) variable_cell_markers = c(paste0(length(variable_cell_markers), "total"), variable_cell_markers) text2file( file.path( vgsrp, "markers_among_variable_genes_monocle.txt" ), variable_cell_markers ) totalstring = paste(length(as.character(var_genes)), "total") var_genes = c(totalstring, as.character(var_genes)) text2file(file.path(vgsrp, "variable_genes_monocle.txt"), var_genes) vsp = c( excess_disp, log_scale_expr_thresh) names(vsp) = c("excess_disp", "log_scale_expr_thresh") text2file(file.path(vgsrp, "var_gene_selection_params_monocle.txt"), collapse_by_name(vsp)) # # Do Monocle dimension reduction, correcting for cell cycle but not batch effects # # (For us, batch effects are often completely nested inside of embryonic day, # # so we can't adjust for them without removing the temporal signal.) form_str = paste0( "~", paste0( CC_PHASES, collapse = " + " ) ) library(DDRTree) # required to prevent "error in DDRTree_reduce_dim: cannot convert to environment" mobj = monocle::reduceDimension(mobj,residualModelFormulaStr=form_str) mobj = monocle::orderCells(mobj, num_paths = num_mature_types) if ( is.null( earliest_day ) ) { earliest_day = Biobase::phenoData(mobj)[["eday"]] %>% as.character %>% as.numeric %>% min } # # This code roots the lineage tree using a horrible horrible hack! # # Monocle crashes when you give orderCells a root_state that is an internal node of the ddrtree graph. # # I can't tell which f***ing nodes are internal. So I pick the one I want based on embryonic day, # # then rank the states by # # their proximity to it in one of the 2d embeddings. I try rooting the tree at the best # # ones first, stopping when something works. pct_eq_10_5 = function(x) mean( x == earliest_day ) eday = Biobase::phenoData(mobj)[["eday"]] %>% as.character %>% as.numeric state = Biobase::phenoData(mobj)[["State"]] %>% as.character %>% as.numeric coords = t( mobj@reducedDimS ) eday_by_state = aggregate_nice( x = eday, by = state, FUN = pct_eq_10_5 ) centroid_by_state = aggregate_nice( x = coords, by = state, FUN = mean ) root_state = which.max( eday_by_state ) distance_sq = function(x, y) sum((x - y)*(x - y)) dist_sq_to_root = apply( centroid_by_state, 1, FUN = function(x) distance_sq(x, centroid_by_state[root_state,]) ) names( dist_sq_to_root ) = rownames( centroid_by_state ) dist_sq_to_root = sort( dist_sq_to_root, decreasing = F ) for( i in seq_along( dist_sq_to_root ) ){ candidate_state = names( dist_sq_to_root )[[i]] e = tryCatch( expr = { mobj = orderCells( mobj, root_state = candidate_state ); e = "success" }, error=function( err ){ "failure" } ) if( e != "failure" ){ break } } detach( monocle_params ) return(mobj) } #' Transfer data from a pseudotime modeling object to a Seurat object. #' #' @details This function takes Seurat object and a finished object from a pseudotime analysis package #' It transfers info from the latter to the former, guaranteeing that the Seurat object #' will have complete metadata fields of the following names: #' `pseudotime`, `branch`, `branch_viz_1`, `branch_viz_2` #' @export add_pseudotime_to_seurat = function(dge, pt_obj, pt_method = "monocle" ){ if( pt_method == "monocle" ){ to_add = data.frame( branch_viz_1 = pt_obj@reducedDimS[1, dge@cell.names], branch_viz_2 = pt_obj@reducedDimS[2, dge@cell.names], branch = as.character( pt_obj@phenoData[["State"]] ), pseudotime = pt_obj@phenoData[["Pseudotime"]] ) atae( dge@cell.names, rownames( pt_obj@phenoData@data ) ) rownames( to_add ) = dge@cell.names } else if (pt_method == "dpt") { to_add = data.frame( branch_viz_1 = pt_obj@dm@eigenvectors[, 1], branch_viz_2 = pt_obj@dm@eigenvectors[, 2], branch_viz_3 = pt_obj@dm@eigenvectors[, 3], branch = as.character( pt_obj@branch [, 1] ), pseudotime = pt_obj[, 1] ) to_add$pseudotime = to_add$pseudotime*sign(cor(to_add$pseudotime, FetchData(dge, "eday"))) rownames( to_add ) = dge@cell.names warning("Removing root cell from DPT output due to weird gap from rest of dataset.") bad_idx = which( to_add$pseudotime == 0 ) to_add = to_add[-bad_idx, ] dge = Seurat::SubsetData( dge, cells.use = rownames( to_add ) ) } else { warning("Only transfer from monocle and dpt has been implemented. Returning seurat object untouched.") } dge = Seurat::AddMetaData(dge, to_add) return( dge ) } #' Project cells from one Seurat object onto the pseudotime axis (or any other continuous variable) from another Seurat object. #' #' @param dge_train Seurat object with 'pseudotime' field included. #' @param dge_test Seurat object. Cells will be assigned a pseudotime value based on `dge_train`. #' @param genes.use Defaults to `dge_train@var.genes`. If you use method="PCA", use the same genes here. #' If you aren't sure, use method="KNN". #' @param regressout Please put whatever you used for 'latent.vars' in RegressOut when you processed 'dge_train'. #' @param to_project Character vector such that FetchData( dge_train, to_project) contains numeric columns. #' Try "pseudotime" or c("tSNE_1", "tSNE_2"). #' @param k Number of nearest neighbors to use. #' @param pc.use which principal components to use for nearest neighbor search. #' #' @export #' ProjectCells = function( dge_train, dge_test, genes.use = dge_train@var.genes, regressout = c( "IG1.S", "S", "G2.M", "M", "M.G1" ), to_project = "pseudotime", k = 25, pc.use = 1:25, test_mode = F ){ if(any(pc.use > ncol(dge_train@pca.x))){ stop("Not enough principal components available in trainset. Compute moar. \n") } # Assemble data train_genes = FetchDataZeroPad( dge_train, genes.use ) %>% as.matrix test_genes = FetchDataZeroPad( dge_test, genes.use ) %>% as.matrix train_cc = FetchDataZeroPad( dge_train, regressout ) %>% as.matrix %>% cbind(1) test_cc = FetchDataZeroPad( dge_test, regressout ) %>% as.matrix %>% cbind(1) # # Regress out cc from testset using trainset params cc_coeffs = solve( t(train_cc) %*% train_cc, ( t(train_cc) %*% train_genes ) ) train_genes = train_genes - train_cc %*% cc_coeffs test_genes = test_genes - test_cc %*% cc_coeffs # # Standardize all data based on trainset statistics gene_means = apply( train_genes, 2, mean ) gene_sds = apply( train_genes, 2, sd ) atae( length(gene_means), length(genes.use)) train_genes = train_genes %>% as.matrix %>% sweep( 2, STATS = gene_means, "-") %>% sweep( 2, STATS = gene_sds, "/") test_genes = test_genes %>% as.matrix %>% sweep( 2, STATS = gene_means, "-") %>% sweep( 2, STATS = gene_sds, "/") # # Project into principal subspace of training data projection_mat = as.matrix(dge_train@pca.x[paste0("PC", pc.use)]) train_pca_embeddings = train_genes %*% projection_mat test_pca_embeddings = test_genes %*% projection_mat # # Check if( test_mode ){ plot( FetchData(dge_train, "PC1")[[1]], train_pca_embeddings[, 1], main = "Existing versus recomputed PC1 projections of cells", xlab = "existing", ylab = "recomputed") } # # Interpolate via average over nearest neighbors train_response = FetchDataZeroPad( dge_train, to_project ) test_response = data.frame(matrix(NA, nrow = nrow(test_genes), ncol = length(to_project))) colnames(test_response) = to_project rownames(test_response) = rownames(test_genes) neighbors = FNN::get.knnx( data = train_pca_embeddings, query = test_pca_embeddings, k = k ) for(i in 1:nrow(test_response)){ test_response[i, ] = colMeans(train_response[neighbors$nn.index[i, ], ]) } dge_test %<>% AddMetaData( metadata = test_response ) return( dge_test ) }
#' Align simple (y-shaped) lineage trees represented as points in 2D space. #' #' Output from Monocle is arbitrarily rotated and reflected. #' For bootstrap analysis where I call Monocle on subsets of the data, it's useful to #' align all of the results so that cells are in roughly the same positions across bootstrap samples. #' This program does that for any two matrices mat_ref and mat_samp. #' Rows are cells and columns are dimensions of some embedding. #' #' The reference and the query can have different numbers of columns. #' They should have the same number of rows, or else #' the property all( rownames( mat_samp ) %in% rownames( mat_ref ) ). #' The solution is based on linear regression. #' The return value is a projection of each column of mat_ref onto the column space of the #' augmented matrix [mat_samp | 1]. In other terms, mat_samp is translated and linearly combined to #' minimize the l2 distance to each column of mat_ref, then returned. #' This function always returns a numeric matrix with ncol = ncol(mat_ref) and nrow = nrow(mat_samp). #' @export align_embedding_to_reference = function(mat_samp, mat_ref, do.plot = F){ # # Handle vectors if( is.null( dim ( mat_samp ) ) ) { mat_samp = matrixify_preserving_rownames( mat_samp ) } if( is.null( dim ( mat_ref ) ) ) { mat_ref = matrixify_preserving_rownames( mat_ref ) } # # Handle dataframes mat_samp = as.matrix( mat_samp ) mat_ref = as.matrix( mat_ref ) # # Check dimensions if( ncol( mat_samp ) > nrow( mat_samp ) || ncol( mat_ref ) > nrow( mat_ref ) ){ warning("Did you forget to transpose?") } # # Ensure rownames present and compatible between sample and reference if( is.null( rownames( mat_samp ) ) | is.null( rownames( mat_ref ) ) ){ if( nrow( mat_samp ) != nrow( mat_ref ) ){ stop( "Either number of rows should be equal or rows should be named with `all( rownames( mat_samp ) %in% rownames( mat_ref ) )`." ) } else { rownames( mat_samp ) = as.character( 1:nrow( mat_samp ) ) rownames( mat_ref ) = as.character( 1:nrow( mat_samp ) ) } } atat( !is.null( rownames( mat_samp ) ) ) atat( !is.null( rownames( mat_ref ) ) ) atat( all( rownames( mat_samp ) %in% rownames( mat_ref ) ) ) mat_ref = mat_ref[ rownames( mat_samp ) , , drop = F] # drop = F GODDAMMIT # # Do math mat_return = matrix( lm( mat_ref ~ mat_samp )$fitted.values, ncol = ncol( mat_ref ) ) # # guarantee nice named-matrix output rownames( mat_return ) = rownames( mat_samp ) atat( nrow( mat_return ) == nrow( mat_samp ) ) atat( ncol( mat_return ) == ncol( mat_ref ) ) atat( is.numeric( mat_return ) ) # # For testing if(do.plot){ mat_ref = data.frame( mat_ref ); mat_ref$type = "input_ref" mat_samp = data.frame( mat_samp ); mat_samp$type = "input_sample" mat_return = data.frame( mat_return ); mat_return$type = "return_val" df_all = rbind( mat_ref, mat_samp, mat_return ) p = ggplot(df_all) + ggtitle("return_val and input_ref should be aligned") + geom_point(aes(X1, X2, colour = type), alpha = 0.2) print(p) } return( mat_return ) } # # Brief test script # # One dimensional test # refscale = 1:5; names(refscale) = letters[1:5] # atae( matrixify_preserving_rownames( refscale ), # align_embedding_to_reference(mat_ref = refscale, mat_samp = -10*refscale + 4 ), # tolerance = 0.000001, # check.attributes = F) # # # Two dimensional test # noisy_l = cbind(-99:100, c( rep(1, 100), 1:100 ) - 50 ) + matrix( 2*rnorm(400), ncol = 2 ) # rownames( noisy_l ) = make.unique( letters[(1:200 %% 25) + 1] ) # noisy_l_rot = noisy_l %*% matrix( c(1, 1, -1, 1), nrow = 2 ) + matrix( 2*rnorm(400), ncol = 2 ) # tmp = align_embedding_to_reference(mat_ref = noisy_l, # mat_samp = noisy_l, # do.plot = T) # tmp = align_embedding_to_reference(mat_ref = noisy_l, # mat_samp = noisy_l_rot, # do.plot = T)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.