## ------------------------------------------------------------------------
#' Return df with handpicked genes.
#'
#'@export
get_rene_markers = function(){
data( handpicked_markers, envir = environment() )
return( handpicked_markers )
}
#' Return a table of genes generated by head-to-head comparison of cTECs and mTECs on the Atlas data.
#'
#'@export
get_cTEC_mTEC_genes = function(){
data( cTEC_mTEC_data_driven, envir = environment() )
return( cTEC_mTEC_data_driven )
}
#' Return human receptor-ligand pairs from Ramilowski et al 2015.
#'
#'@export
get_ramilowski = function(){
data( ramilowski, envir = environment() )
return( ramilowski )
}
#' Return cell-cycle genes from original Drop-seq paper by Macosko et al. (2015)
#'
#'@export
get_macosko_cc_genes = function( case = "Capital" ){
data( cc_genes, envir = environment() )
return(cc_genes)
}
#' Maehr lab list of mouse transcription factors.
#'
#'@export
get_mouse_tfs = function(){
data( mouse_tfs, envir = environment() )
return( mouse_tfs )
}
#' Poised genes from Lesch 2016 Nature Genetics
#'
#'@export
get_poised_genes = function(){
data( poised_genes, envir = environment() )
return( poised_genes )
}
#' Human-mouse ortholog pairs from Ensembl 87.
#'
#'@export
get_ortholog_table = function(){
data( orthologs_ens87 )
return( orthologs_ens87 )
}
CC_PHASES = c("IG1.S", "S", "G2.M", "M", "M.G1")
black_white = c( colorspace::sequential_hcl( 30, h = 0, c. = c(0, 0), l = c( 0, 100 ) ) )
blue_gray_red = colorspace::diverge_hcl ( 30, c = 180, l = c( 40, 80 ) )
blue_purple_red = colorRampPalette(c("blue", "red"))(30)
yellow_red = c( colorspace::sequential_hcl( 18, h = 60, c. = c(100, 100), l = c( 100, 80 ) ),
colorspace::sequential_hcl( 12, h = 0, c. = c(100, 160), l = c( 80, 40 ) ) )
blue_yellow = c( colorspace::sequential_hcl( 15, h = 260, c. = c(50, 0), l = c( 13, 65 ) ),
colorspace::sequential_hcl( 15, h = 40, c. = c(0, 80), l = c( 65, 95 ) ) )
Thanksgiving_colors = c("yellow", "orange", "red", "brown")
## ------------------------------------------------------------------------
atat = function(my_ass) (assertthat::assert_that(my_ass))
atae = function(x, y, ...) (assertthat::are_equal(x, y, ...))
#' Return a scatterplot, labeling outliers.
#'
#'@details This function plots two variables, labeling the ones that are far from their neighbors.
#' `data` should be a dataframe with first column to go on the horizontal axis,
#' second on the vertical. The third column is treated as the labels and the
#' fourth (optional) is treated as the color.
#' `main` shows up as the figure title.
#' The proportion of points labeled is `prop_label`.
#' @return Prints and returns a ggplot object.
#' @export
outlier_labeled_scatterplot = function( data, main = "", prop_label = 0.02 ){
# sanitize input
if(any(is.na(data))){
cc = complete.cases( data )
warning(c("Removing ", sum(1-cc), " rows with missing data."))
data = data[cc, ]
}
cc = is.finite(data[, 1]) & is.finite(data[, 1])
if(any(!cc)){
warning(c("Removing ", sum(1-cc), " rows with NaN, Inf, or -Inf."))
data = data[cc, ]
}
x = names(data)[[1]]
y = names(data)[[2]]
label = names(data)[[3]]
colour = NULL
if( length( data ) > 3 ){ colour = names(data)[[4]] }
data$dist_to_nn = rowSums( FNN::get.knn( data = apply(X = data[ , 1:2 ], MARGIN = 2, FUN = div_by_max ),
k=3, algorithm=c( "cover_tree" ) )$nn.dist )
data$should_label = data$dist_to_nn > quantile( data$dist_to_nn, probs = (1 - prop_label))
p = ggplot() + ggtitle( main ) +
geom_text( data = subset( data, should_label ),
aes_string( x = x, y = y, colour = colour, label = label ) ) +
geom_point( data = subset( data, !should_label ),
aes_string( x = x, y = y, colour = colour ) )
print( p )
return( p )
}
flip_table = function(X) X[nrow(X):1, ]
#' Clean and save a df containing columns "gene", "avg_diff", "pct.1", "pct.2", "p.value" and possibly "q.value".
#'
#' @export
#'
save_marker_table = function(X, results_path, testname, remove_rp = FALSE, add_to_inventory = FALSE ){
atae(names(X), c("gene", "avg_diff", "pct.1", "pct.2", "p.value", "q.value" ) )
if(remove_rp) X %<>% subset( !is_rp(gene) )
# Only Phred-scale qvals
X$`-log10q` = X$q.value %>% log10 %>% multiply_by(-1)
X$p.value = NULL
X$q.value = NULL
# Round stuff
X$avg_diff %<>% round(2)
X$pct.1 %<>% round(2)
X$pct.2 %<>% round(2)
X$`-log10q` %<>% round(2)
# Print to files
f_up = file.path(results_path, paste0( testname, "_up.txt" ) )
f_dn = file.path(results_path, paste0( testname, "_dn.txt" ) )
X %>% subset( avg_diff > 0 ) %>%
write.table( f_up, quote = F, col.names = T, row.names = F)
X %>% subset(avg_diff < 0 ) %>% flip_table %>%
write.table( f_dn, quote = F, col.names = T, row.names = F)
# Add to freezr inventory
if( add_to_inventory ){
freezr::inventory_add( tag = paste0( "de_genes_", testname, "_up" ),
filename = f_up,
force = T )
freezr::inventory_add( tag = paste0( "de_genes_", testname, "_dn" ),
filename = f_dn,
force = T )
}
}
distance_sq = function( x, y ) { sum( ( x - y )^2 ) }
nnz = function(x)(sum(x>0))
#' @export
prop_nz = function(x)( nnz(x) / length(x))
#' @export
div_by_max = function( x ){ if( max(x) == 0) 0*x else x / max( x ) }
#' @export
div_by_sum = function( x ){ if( sum(x) == 0) 0*x else x / sum( x ) }
#' @export
percentify = function( x ){ return( 100*round( div_by_sum( x ), 3 ) ) }
#' @export
standardize = function( x, nonpar = F ){
if(nonpar){
y = x - median(x)
fake_sd = IQR( y ) / 1.37
z = y / fake_sd
} else {
y = x - mean(x)
z = y / sd( y )
}
return( z )
}
#' @export
matrixify_preserving_rownames = function(x) matrix( x, ncol = 1, dimnames = list( names( x ), "") )
#' @export
vectorize_preserving_rownames = function(x, i = 1) {
v = x[, i]
names(v) = rownames(x)
atat(is.vector(v) | is.factor(v));
return(v)
}
#' @export
down_idx = function(x){ x[[1]] }
#' @export
factor_numeric = function(x) {
atat(is.numeric(x))
x = factor(x, levels = sort(unique(x)), ordered = T)
}
# Like dplyr::top_n but it preserves the rownames.
#' @export
top_n_preserve_rownames = function( x, ...){
if(is.null(rownames(x))){return(top_n(x, ...))}
rownames_tempcol = make.unique( c( colnames( x ), "rownames_tempcol" ) ) %>% rev %>% down_idx
x[[rownames_tempcol]] = attr(x, "row.names")
y = dplyr::top_n(x, ...)
attr(y, "row.names") = y[[rownames_tempcol]]
y[[rownames_tempcol]] = NULL
return(y)
}
#' Aggregate data.frame by a categorical variable, permissively.
#'
#' @details A wrapper for aggregate(). Accepts atomic "by" argument.
#' Guaranteed to return a matrix. Also returns the aggregation levels in the rownames
#' instead of adding a column for them.
#' @export
aggregate_nice = function(x, by, FUN, ... ) {
if( typeof( by ) != "list" ){
by = list ( by )
}
right_type = ( is.atomic(x) | typeof( x ) %in% c( "matrix", "dataframe" ) )
if( !right_type ){
x = as.matrix( x )
}
result = aggregate( x, by, FUN = function(x) FUN(x, ...) )
rownames( result ) = as.character( result[, 1] )
result = result[ , -1, drop = F]
return( as.matrix( result ) )
}
#' Convert a, b, b, a, c, a to 1, 2, 2, 1, 3, 1. Works with any strings alphabetically.
#'
#' @export
replace_with_int_rank = function(x) as.numeric( as.factor( x ) )
#' @export
na2zero = function(df){
df[is.na(df)] <- 0
return(df)
}
#' @export
Capitalize = function(s) {paste0(toupper( substring( s, 1, 1 ) ), tolower( substring( s, 2 ) ) )}
#' Turn a named list or vector of strings into a
#' pipe-separated key-value format: "<name1>=<value1>|<name2>=<value2>|..."
#'
#' @export
collapse_by_name = function(named_list){
name_eq_element = named_list # preallocate
for(field in names( named_list ) ){
name_eq_element[[field]] = paste0(field, "=", named_list[[field]])
}
return(paste(name_eq_element, collapse = "|"))
}
#' Remove a suffix if it is present, but if it's absent, avoid damaging the input.
#'
#' @details Not vectorized! Length-1 character vectors please.
#' @export
strip_suffix = function(s, suffix){
if( !is.atomic( s ) ) { stop( "This function isn't vectorized. Sorry!" ) }
if( length( s ) != 1 ){ stop( "This function isn't vectorized. Sorry!" ) }
nc = nchar(s)
ncs = nchar(suffix)
if( substring(s, nc + 1 - ncs, nc ) == suffix ){ s = substring(s, 1, nc - ncs ) }
return(s)
}
#' Write text to a file
#'
#' @export
text2file = function(filename, vector_of_lines){
vector_of_lines = as.character(vector_of_lines)
fileConn<-file(filename)
writeLines(vector_of_lines, fileConn)
close(fileConn)
}
#' Create new directories recursively and without a warning if they already exist.
#'
#' @export
dir.create.nice = function(my_dir){
if( !dir.exists( my_dir ) ){ dir.create( my_dir, recursive = T) }
}
#' Wrap base::strsplit with a 'drop' option
#'
#' @export
strsplit_drop = function(..., drop = TRUE){
x = strsplit(...)
if(drop){
x = x[[1]]
}
return( x )
}
#' Take in a function from one finite set of strings to another
#' in the form of a named vector `map`. (Inputs are names, outputs are values.)
#' Return its preimage as a named list of vectors.
#'
#' @details If `detailed_output`, returns a list with named elements:
#' - `preimage`: a list where each name is an output (a value of `map`)
#' and each element is a vector of all inputs leading to that output.
#' - `output_occurs_multiple`: a named list or vector subsetted from `map`
#' where each element occurs more than once.
#' - `output_occurs_once`: a named list or vector subsetted from `map` where
#' each element occurs once.
#' Otherwise, it returns just the preimage.
#' @export
get_preimage = function( map, detailed_output = F ){
assertthat::assert_that(is.character(map))
assertthat::assert_that(is.vector(map))
assertthat::assert_that(is.atomic(map))
if( any( is.na( map ) ) ) { warning( "I haven't tested this on missing values." ) }
# # Preallocate. Empty lists are treacherous bastards, so fill in NA's at first.
range_of_map = unique( map )
preimage = setNames( as.list( rep(NA, length( range_of_map ) ) ), nm = range_of_map )
# # Save CPU time by setting the easy ones wholesale
output_occurs_multiple = map[ map %in% map[ duplicated( map ) ] ] # mappity map map map
output_occurs_once = map[ !( map %in% output_occurs_multiple ) ] # map map
preimage[ output_occurs_once ] = names( output_occurs_once )
# # Fill in fibers
for( input in names( output_occurs_multiple ) ){
preimage[[ map[[ input ]] ]] = c( preimage[[ map[[ input ]] ]], input )
}
# # Clean up NA's one by one
remove_NA = function( x ){ x[!is.na(x)] }
preimage = lapply( preimage, FUN = remove_NA )
atae( length( output_occurs_once ),
length( unique( output_occurs_once ) ) )
if(!detailed_output){ return( preimage ) }
return( list( preimage = preimage,
output_occurs_multiple = output_occurs_multiple,
output_occurs_once = output_occurs_once ) )
}
## ------------------------------------------------------------------------
# # Set up data on human-mouse orthologs
ortholog_table = get_ortholog_table()
# # Hash tables for fast access
human_dupes = duplicated( ortholog_table$humansym )
mouse_dupes = duplicated( ortholog_table$mousesym )
h2m = setNames( ortholog_table$mousesym, nm = ortholog_table$humansym )[!human_dupes]
m2h = setNames( ortholog_table$humansym, nm = ortholog_table$mousesym )[!mouse_dupes]
#' Return the ortholog of a given gene or NA (if no match). Not vectorized. Use get_ortholog instead.
#'
get_ortholog_nonvec = function( gene, from = "human", to = "mouse" ){
if ( from == "human" && to == "mouse"){
return( h2m[ gene ] )
} else if( from == "mouse" && to == "human"){
return( m2h[ gene ] )
} else {
warning(' The only working options are from = "human", to = "mouse" and from = "mouse", to = "human". Returning your gene unaltered. ')
return( gene )
}
}
#' Return the ortholog of a given gene or NA (if no match).
#' Human and mouse only.
#'
#' @export
get_ortholog = function(x, ...) {
x %<>% as.character
sapply(x, get_ortholog_nonvec, ...)
}
#' Same as get_ortholog but returns just T or F.
#'
#' @export
has_ortholog = function( ... ){ !is.na( get_ortholog( ..., try_caps = FALSE ) ) }
#' Convert a raw digital gene expression matrix from one species to another.
#'
#'@details If two genes have the same ortholog, the molecule counts get added.
#' If a gene has no ortholog, it is omitted.
#' The input must be a matrix with genes stored in rownames( raw_dge ).
#' @export
convert_species_dge = function( raw_dge, from = "human", to = "mouse"){
cat( paste( "Converting to", to, "...\n" ) )
genes = rownames(raw_dge)
eligible_genes = genes[ has_ortholog( genes, from, to ) ]
genes_by_ortholog = eligible_genes %>% get_ortholog( ., from, to ) %>% get_preimage
raw_dge_converted = matrix( 0, nrow = length( genes_by_ortholog ), ncol = ncol( raw_dge ) )
rownames( raw_dge_converted ) = names( genes_by_ortholog )
colnames( raw_dge_converted ) = colnames( raw_dge )
for( ortholog in names( genes_by_ortholog ) ){
raw_dge_converted[ ortholog, ] = raw_dge[ genes_by_ortholog[[ortholog]], , drop = F ] %>% colSums
}
return( raw_dge_converted )
}
#' Given a gene list and a seurat object, return a gene list compatible with 'species' in the seurat object.
#'
#' @export
harmonize_species = function( gene_list, dge ){
gene_list %<>% as.character
if( !"species" %in% AvailableData( dge ) ){
warning( paste( "Please add a `species` field to the metadata containing 'human', 'mouse', or some of each.\n",
"Attempting to add species for you via `add_maehrlab_metadata( dge, 'species' )`.") )
dge = add_maehrlab_metadata( dge, "species" )
}
has_human = "human" %in% FetchData(dge, "species")[[ "species" ]]
has_mouse = "mouse" %in% FetchData(dge, "species")[[ "species" ]]
if( has_human ){
gene_list %<>% union( get_ortholog( gene_list, from = "mouse", to = "human" ) )
}
if( has_mouse ){
gene_list %<>% union( get_ortholog( gene_list, from = "human", to = "mouse" ) )
}
gene_list %<>% intersect( AvailableData( dge ) )
return( gene_list )
}
#' Remove species-specific genes from a Seurat object.
#'
#'@details The input is a Seurat object with a metadata field "species".
#' Removes genes that appear in less than 3 out of 1000 cells of any species
#' Also removes genes where the proportion in one species is higher by 0.5 than the other.
#' @export
remove_species_specific_genes = function( dge, results_path, threshold = 0.003, diff_thresh = 0.5 ){
proportions_by_species = aggregate_nice( x = as.matrix( t( dge@data > 0 ) ),
by = dge@data.info$species ,
FUN = mean )
min_proportions_by_species = sapply( X = proportions_by_species, FUN = min)
max_proportions_by_species = sapply( X = proportions_by_species, FUN = max)
plot_df = as.data.frame( t( proportions_by_species ) ); names ( plot_df ) = rownames( proportions_by_species )
plot_df$gene = rownames( plot_df )
plot_df$diff_big = ( max_proportions_by_species - min_proportions_by_species > diff_thresh )
plot_df$absent_in_one = ( min_proportions_by_species < threshold )
plot_df$excluded = plot_df$absent_in_one | plot_df$diff_big
pdf( file.path( results_path, "min_proportions_by_species.pdf" ) )
{
hist( unlist( min_proportions_by_species[min_proportions_by_species < 10*threshold ] ),
main = "Min proportions truncated at 10*threshold",
xlab = "Proportion in mouse or human (whichever is lower)")
p = ggplot() + ggtitle("Proportion of cells expressing each gene") +
geom_point( data = subset( plot_df, !diff_big | absent_in_one ),
aes_string( x = names ( plot_df )[1],
y = names ( plot_df )[2],
colour = "excluded" ) ) +
geom_text( data = subset( plot_df, diff_big & !absent_in_one ),
aes_string( x = names ( plot_df )[1],
y = names ( plot_df )[2],
label = "gene",
colour = "excluded" ) )
print( p )
}
dev.off()
passing_genes = plot_df$gene[ !plot_df$excluded ]
print( paste0( "Removing ", 100*round( mean( plot_df$excluded ), 2), "% of the genes." ) )
if( !is.null( dge@data ) ) { dge@data = dge@data [passing_genes, ]}
if( !is.null( dge@raw.data ) ) { dge@raw.data = dge@raw.data [passing_genes, ]}
if( !is.null( dge@scale.data ) ) { dge@scale.data = dge@scale.data[passing_genes, ]}
if( !is.null( dge@mean.var ) ) { dge@mean.var = dge@mean.var [passing_genes, ]}
if( !is.null( dge@var.genes ) ) { dge@var.genes %<>% intersect( passing_genes )}
return( dge )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.