'
Cleaning functions
Authors: Valeria Bonapersona & Heike Schuler
'
# Specify damaged areas ---------------------------------------------------
#' @title Specify damaged areas per sample
#' @description specify_damage() uses dplyr functions to filter
#' a dataframe containing a summary of damaged areas for all samples
#' and adapt it for later use in cleaning functions.
#'
#' @param sample String or number used to select the relevant sample.
#' @param data Dataframe with a summary of damaged brain areas for all samples,
#' in long format (each row is one damaged area). Requires the following
#' variables: "sample_id" with the name of the sample;
#' "area" with the acronym of the damaged brain area according to Allen Brain
#' Atlas nomenclature; "hemisphere" to specify in which hemisphere the damage occurs.
#'
#' @export
#' @return Dataframe with the damaged areas of the selected sample, with an
#' added "area_hemisphere" variable combining area and hemisphere.
#'
#' @examples
#' x <- data.frame(
#'   sample_id = c(1, 1, 2, 3),
#'   area = c("BLA", "VTA", "CA1", "DG"),
#'   hemisphere = c("left", "left", "right", "right")
#' )
#'
#' my_sample <- 1
#'
#' specify_damage(my_sample, x)
specify_damage <- function(sample, data) {
  # check data is a dataframe
  assertthat::assert_that(is.data.frame(data))
  # check sample_id, area and hemisphere are variables of the dataframe
  assertthat::assert_that(assertthat::has_name(data, "sample_id"))
  assertthat::assert_that(assertthat::has_name(data, "area"))
  assertthat::assert_that(assertthat::has_name(data, "hemisphere"))
  # check only one sample is provided
  assertthat::assert_that(assertthat::are_equal(length(sample), 1))
  # warn if the sample is not present in the data
  if (!sample %in% data$sample_id) warning("sample not found in data$sample_id")
  data %>%
    dplyr::filter(sample_id == sample) %>%
    dplyr::mutate(area_hemisphere = paste(area, hemisphere, sep = "_")) %>%
    droplevels()
}
# Clean_counts ------------------------------------------------------------
## ADD MASKS AS DATA OF PACKAGE
#' @title Clean xyz coordinates of identified cells
#' @description This function cleans the annotated cells by: 1) removing the
#' halo around the brain and ventricles, 2) filtering brain areas of interest,
#' 3) removing damaged areas, and 4) re-imputing the damaged areas by mirroring
#' the other hemisphere. clean_counts() saves two .RDS files per processed
#' sample in the specified paths. The first file (*_clean_cells.RDS) contains
#' the xyz coordinates of the cells that met the cleaning criteria, as well as
#' their categorization into the brain areas of interest specified by the user.
#' The second file (*_removed_counts_summary.RDS) contains information about
#' the counts removed during the procedure.
#'
#'
#' @param sample_id String used to save output. Please do not use spaces.
#' @param data Dataframe with cell coordinates for one sample.
#' Requires variables "xPos", "yPos" and "zPos" for the x, y, z coordinates
#' respectively; "maxInt" for the maximum intensity of each cell; "id" for the
#' numerical code of the brain area according to the Allen Brain Atlas.
#' @param atlas Dataframe with meta-data of Allen Brain Atlas (ABA) areas. It can
#' be generated by running the preparation script "atlas_tree.R". Requires
#' variables: "id" for the numerical value of ABA areas; "name" for the character
#' value; "acronym" for the nomenclature; "parent_acronym" for the parent ABA
#' area it belongs to; "category" for the categorization of brain areas difficult
#' to interpret, see XX for details; "my_grouping" for your own categorization of
#' brain areas.
#' @param damaged_areas Dataframe listing the damaged brain areas of all samples.
#' The function specify_damage() automatically selects the relevant damaged areas.
#' Requires the following variables: "area" with the acronym of the brain area,
#' matching the "my_grouping" variable of the atlas dataframe (see above);
#' "hemisphere", which specifies the hemisphere where the damage occurs
#' ("right", "left").
#' @param dodgy_cells Dataframe with information about cells with abnormally high
#' intensity (likely unspecific binding, identified for example by checking the
#' scans). Three columns named "sample_id" (id of the sample), "my_grouping"
#' (brain area), and "threshold" (maximum intensity above which cells are
#' considered "spots" and filtered out). This dataframe can be NULL.
#' @param out_mask Mask identifying the outer part of the brain, to be removed
#' due to the halo. Ask Heike for more info: created in Python by finding all
#' zero voxels that border the brain (i.e. are next to non-zero values) and,
#' from there, moving 3 voxels in all directions (k), then creating a new matrix
#' with non-zero values for the 3 voxels around k.
#' @param vent_mask Mask identifying the outer part of the ventricles, to be
#' removed due to the halo and unspecific binding of the antibody. Ask Heike
#' for more info.
#' @param warning_percentage Number between 0 and 1 defining the threshold above
#' which a warning message is issued. The warning specifies whether brain areas
#' removed during the cleaning procedure had abnormally high cell counts. A value
#' of 0 never triggers a warning; a value of 1 always does.
#' @param path_cleaned String specifying the path where the cleaned annotated cells will be saved.
#' @param path_removed String specifying the path where the summary of removed cells will be saved.
#'
#' @return Called for its side effects: saves the two .RDS files described above
#' and prints a confirmation message.
#' @export
#'
#' @examples
#' # For a thorough example, please see XX.
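#' \dontrun{
#' # A sketch of a typical call, assuming the input objects (cells_sample1,
#' # atlas, damaged_areas, out_mask, vent_mask) have been prepared as
#' # described above; object and folder names are illustrative only.
#' clean_counts(
#'   sample_id = "sample1",
#'   data = cells_sample1,
#'   atlas = atlas,
#'   damaged_areas = damaged_areas,
#'   dodgy_cells = NULL,
#'   out_mask = out_mask,
#'   vent_mask = vent_mask,
#'   warning_percentage = 0.2,
#'   path_cleaned = "results/cleaned/",
#'   path_removed = "results/removed/"
#' )
#' }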
clean_counts <- function(sample_id, data, atlas, damaged_areas, dodgy_cells = NULL,
                         out_mask, vent_mask,
                         warning_percentage = 0.2, path_cleaned, path_removed) {
  ## add checks for my_path and sample_id (must be strings)
  ## add checks for each dataframe to verify that it and its variables are in
  ## the right format
  ## specify somewhere the reason for which we split as we split it
  ## checks
  # atlas must have xyz coordinates and id
  # check atlas and data are dataframes
  assertthat::assert_that(is.data.frame(atlas))
  assertthat::assert_that(is.data.frame(data))
  # check all required variables are present
  ## data
  assertthat::assert_that(assertthat::has_name(data, "xPos"))
  assertthat::assert_that(assertthat::has_name(data, "yPos"))
  assertthat::assert_that(assertthat::has_name(data, "zPos"))
  assertthat::assert_that(assertthat::has_name(data, "id"))
  ## atlas
  assertthat::assert_that(assertthat::has_name(atlas, "id"))
  assertthat::assert_that(assertthat::has_name(atlas, "name"))
  assertthat::assert_that(assertthat::has_name(atlas, "acronym"))
  assertthat::assert_that(assertthat::has_name(atlas, "parent_acronym"))
  assertthat::assert_that(assertthat::has_name(atlas, "category"))
  assertthat::assert_that(assertthat::has_name(atlas, "my_grouping"))
  # if dodgy_cells is not NULL, do all checks
  if (!is.null(dodgy_cells)) {
    assertthat::assert_that(is.data.frame(dodgy_cells))
    assertthat::assert_that(assertthat::has_name(dodgy_cells, "sample_id"))
    assertthat::assert_that(assertthat::has_name(dodgy_cells, "my_grouping"))
    assertthat::assert_that(assertthat::has_name(dodgy_cells, "threshold"))
    assertthat::assert_that(nrow(dodgy_cells) > 0)
  }
  # warning_percentage must be a number between 0 and 1
  assertthat::assert_that(assertthat::is.number(warning_percentage))
  if (warning_percentage < 0 | warning_percentage > 1) {
    stop("warning_percentage provided is not a number between 0 and 1")
  }
# Match info categorization brain area ----------------------------------
  data <- data.frame(data,
                     atlas[match(data$id, atlas$id),
                           c("category", "my_grouping")])
  # recode warning_percentage: the checks below warn when counts in a removed
  # category exceed (1 - warning_percentage) of all counts
  warn <- 1 - warning_percentage
# Specify damaged areas ---------------------------------------------------
  damaged_areas_sample <- specify_damage(sample_id, damaged_areas)
# Areas checks ------------------------------------------------------------
  ## correct if you change category variable levels
  if (any(data$category %in% c(1, 3))) warning("Subcategorized areas return non-zero counts, in disagreement with the Allen Brain Atlas. Removed, but categorization must be checked.")
  if (nrow(dplyr::filter(data, category == 4)) > nrow(data) * warn) warning("Trimmed areas have disproportionately high counts.")
  if (nrow(dplyr::filter(data, category == 5)) > nrow(data) * warn) warning("Sub-categorized, not interpretable areas have disproportionately high counts.")
  if (nrow(dplyr::filter(data, category == 7)) > nrow(data) * warn) warning("Not reliable areas have disproportionately high counts.")
# Cleaning -------------------------------------------------------------
  clean_data <-
    data %>%
    # Remove halo
    dplyr::mutate_at(dplyr::vars(xPos, yPos, zPos), floor) %>%
    dplyr::anti_join(rbind(out_mask, vent_mask)) %>%
    { . ->> no_halo } %>% # saves intermediate output
    # Remove background and non-interpretable brain areas
    dplyr::filter(id != 0,
                  category %in% c(0, 2)) %>%
    { . ->> no_background } %>% # saves intermediate output ## CHECK ALL AREAS HAVE CATEGORIZATION
    # Split zPos (hemispheres) into half. Left areas will be < 0, right areas will be > 0.
    dplyr::mutate(zPos = zPos - 228, # dimensions of template: x*y*456
                  hemisphere = dplyr::case_when(zPos < 0 ~ "left",
                                                zPos > 0 ~ "right",
                                                TRUE ~ "midline")) %>%
    { . ->> hemisphere_sep } %>% # saves intermediate output
    # Clean up factors
    droplevels() %>% # necessary?
    # Remove damaged areas
    dplyr::mutate(grouping_hemisphere = paste(my_grouping, hemisphere, sep = "_"),
                  potentially_damaged = dplyr::case_when(
                    grouping_hemisphere %in% damaged_areas_sample$area_hemisphere ~ "yes",
                    TRUE ~ "no")) %>%
    { . ->> pot_damaged } %>% # saves intermediate output
    dplyr::filter(potentially_damaged != "yes") %>%
    # Impute mirror of the other hemisphere for the damaged brain areas removed
    dplyr::full_join(
      pot_damaged %>%
        # unique() rather than levels() so character columns also work
        dplyr::filter(my_grouping %in% damaged_areas_sample$area,
                      !hemisphere %in% c(unique(as.character(damaged_areas_sample$hemisphere)), "midline")) %>%
        dplyr::mutate(zPos = zPos * -1,
                      hemisphere = dplyr::case_when(hemisphere == "right" ~ "left",
                                                    hemisphere == "left" ~ "right"),
                      grouping_hemisphere = paste(my_grouping, hemisphere, sep = "_"))
    )
  # remove dodgy cells if necessary
  if (!is.null(dodgy_cells)) {
    # select what to filter (.env disambiguates the sample_id argument from
    # the sample_id column)
    threshold <- dodgy_cells %>%
      dplyr::filter(.data$sample_id == .env$sample_id) %>%
      dplyr::select(my_grouping, threshold)
    # filter out dodgy cells
    clean_data <- clean_data %>%
      # merge with df
      dplyr::left_join(threshold, by = "my_grouping") %>%
      # remove cells on the upper boundary
      dplyr::filter(is.na(threshold) | threshold > maxInt) %>%
      dplyr::select(-threshold)
  }
  # select relevant variables
  clean_data <- clean_data %>%
    # select only vars of interest
    dplyr::select(dplyr::ends_with("Pos"), maxInt, my_grouping, hemisphere)
  # Save cleaned counts to the specified path
  saveRDS(clean_data, file = paste0(path_cleaned, sample_id, "_clean_cells.RDS"))
# Removed counts -------------------------------------------------------------
  removed_counts <- data.frame(
    "sample_id" = sample_id,
    "initial_counts" = nrow(data),
    "halo_removal" = (nrow(data) - nrow(no_halo)),
    "background_removal" = (nrow(no_halo) - nrow(no_background)),
    "damaged_counts_removed" = nrow(pot_damaged[pot_damaged$potentially_damaged == "yes", ]),
    "replacement_counts" = nrow(pot_damaged[
      pot_damaged$my_grouping %in% damaged_areas_sample$area &
        !pot_damaged$hemisphere %in% c(unique(as.character(damaged_areas_sample$hemisphere)), "midline"), ]),
    "final_counts" = nrow(clean_data))
  saveRDS(removed_counts, file = paste0(path_removed, sample_id, "_removed_counts_summary.RDS"))
# Return ------------------------------------------------------------------
print(paste0("Cleaned data of sample '", sample_id,
"' has been saved in the specified folder."))
}
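# The out_mask used above is currently created in Python (see the @param
# out_mask documentation). Purely as an illustration of that idea, here is a
# minimal R sketch: it marks every zero voxel lying within k voxels of brain
# tissue (box dilation along each axis) and returns their coordinates. The
# `template` argument (3D array, non-zero = brain), the k = 3 default, and
# the xPos/yPos/zPos column names are assumptions, not part of the package.
sketch_halo_mask <- function(template, k = 3) {
  brain <- template != 0
  # shift a logical 3D array along axis `ax` by `by` voxels, padding with FALSE
  shift <- function(a, ax, by) {
    d <- dim(a)
    out <- array(FALSE, d)
    src <- dst <- lapply(d, seq_len)
    if (by > 0) { dst[[ax]] <- (by + 1):d[ax]; src[[ax]] <- 1:(d[ax] - by) }
    if (by < 0) { dst[[ax]] <- 1:(d[ax] + by); src[[ax]] <- (1 - by):d[ax] }
    out[dst[[1]], dst[[2]], dst[[3]]] <- a[src[[1]], src[[2]], src[[3]]]
    out
  }
  # grow the brain by one voxel per iteration, k times, along every axis
  dilated <- brain
  for (i in seq_len(k)) {
    for (ax in 1:3) {
      dilated <- dilated | shift(dilated, ax, 1) | shift(dilated, ax, -1)
    }
  }
  # zero voxels near the brain surface, i.e. the halo to remove
  halo <- which(dilated & !brain, arr.ind = TRUE)
  data.frame(xPos = halo[, 1], yPos = halo[, 2], zPos = halo[, 3])
}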
####### this part can be later deleted
## check no weird areas >> categories 1 and 3 should not be present
# categories:
# 0 = included
# 1 = subcategorized (returns null counts)
# 2 = subcategorized, non-zero, interpretable
# 3 = subcategorization does not exist in ABA, returns null
# 4 = trimmed areas (trimmed by whom??)
# 5 = subcategorized, non-zero, not interpretable
# 6 = fiber tract
# 7 = not reliable, around ventricles and small
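# The legend above could also be kept as a lookup vector for labelling
# outputs (a sketch only; the functions in this file do not use it):
category_labels <- c(
  "0" = "included",
  "1" = "subcategorized (returns null counts)",
  "2" = "subcategorized, non-zero, interpretable",
  "3" = "subcategorization does not exist in ABA, returns null",
  "4" = "trimmed areas",
  "5" = "subcategorized, non-zero, not interpretable",
  "6" = "fiber tract",
  "7" = "not reliable, around ventricles and small"
)
# e.g. category_labels[as.character(data$category)]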
# samples_files_to_df ------------------------------------------------------
#' @title Merge the separate samples' files into one data frame
#' @description Wrapper around dplyr functions to merge multiple files output
#' by clean_counts(). It can merge both the cleaned cell files
#' (extension = "_clean_cells.RDS") and the summaries of the removed cells
#' (extension = "_removed_counts_summary.RDS").
#'
#' @param path String with the path of the files' location. It does not matter
#' if other files are present in this folder: only files with the specified
#' extension will be read.
#' @param extension Final part of the files' names. Same as the output of
#' clean_counts().
#'
#' @return Dataframe with the content of all matching files, bound by row, with
#' a "sample_id" column identifying the original sample.
#' @export
#'
#' @examples
#' # For a thorough example, please see XX.
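#' \dontrun{
#' # A sketch of typical calls, assuming clean_counts() saved its output in
#' # these (illustrative) folders:
#' cells <- samples_files_to_df("results/cleaned/", "_clean_cells.RDS")
#' removed <- samples_files_to_df("results/removed/", "_removed_counts_summary.RDS")
#' }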
samples_files_to_df <- function(path, extension = c("_clean_cells.RDS", "_removed_counts_summary.RDS")) {
  if (length(extension) > 1) extension <- extension[[1]]
  file_names <- list.files(path = path)
  # use fixed() so the "." in the extension is not treated as a regex wildcard
  file_list <- file_names[stringr::str_detect(file_names, stringr::fixed(extension))]
  data_ls <- lapply(paste0(path, file_list), readRDS)
  # get sample names
  samples <- stringr::str_remove_all(file_list, stringr::fixed(extension))
  names(data_ls) <- samples
  # from list to data frame
  dplyr::bind_rows(data_ls, .id = "sample_id")
}
# remove_blinding --------------------------------------------------------------
#' @title Remove (or scramble) blinding of your groups
#' @description Convert random group names (e.g. "a", "b") into meaningful
#' groups (e.g. "control", "experimental"). It can be used to unblind at the
#' *end* of the analysis, or with scramble = TRUE at the beginning of the
#' analysis. Returns a data frame in which "blinded_group" is replaced by a
#' "group" variable taken from key; all other columns of data are kept.
#'
#'
#' @param data Dataframe with (at least) two columns: "sample_id" and "blinded_group"
#' @param key Dataframe with (at least) two columns: "blinded_group" and "key"
#' @param scramble Specifies whether to scramble the groups or not. Can be
#' either TRUE or FALSE
#'
#' @return Dataframe like data, with "blinded_group" replaced by "group".
#' @export
#'
#' @examples
#' x <- data.frame(sample_id = c(1:20), blinded_group = c(rep(c(1:4), each = 5)))
#' y <- data.frame(blinded_group = 1:4, key = c("control", "drug_a", "drug_b", "drug_c"))
#' z <- remove_blinding(x, y)
remove_blinding <- function(data, key, scramble = FALSE) {
  # check data and key are dataframes
  assertthat::assert_that(is.data.frame(data))
  assertthat::assert_that(is.data.frame(key))
  # check data has the correct vars
  assertthat::assert_that(assertthat::has_name(data, "sample_id"))
  assertthat::assert_that(assertthat::has_name(data, "blinded_group"))
  # check that key has the correct vars
  assertthat::assert_that(assertthat::has_name(key, "blinded_group"))
  assertthat::assert_that(assertthat::has_name(key, "key"))
  if (!scramble %in% c(TRUE, FALSE)) stop("scramble can only be TRUE or FALSE")
  # create new key with scrambling
  if (scramble == TRUE) {
    scramble_order <- sample(nrow(key), replace = FALSE)
    key$key <- key$key[scramble_order]
  }
  # unblind
  data %>%
    # merge with unblinding key
    dplyr::left_join(key, by = "blinded_group") %>%
    # remove the old variable and rename the new one
    dplyr::select(-blinded_group) %>%
    dplyr::rename(group = key)
}
# Normalization -----------------------------------------------
#' @title Normalize cell counts
#' @description Wrapper function around the Yeo-Johnson transformation from the
#' recipes package. Similar to the Box-Cox transformation, but it also accepts
#' 0 (and negative) values.
#'
#'
#' @param x Numeric vector with values to normalize
#'
#' @return Numeric vector with the Yeo-Johnson-transformed values of x.
#' @export
#'
#' @examples
#' x <- rpois(100, 3)
#' y <- normalize(x)
#'
#' par(mfrow=c(1,2))
#' hist(x)
#' hist(y)
#' par(mfrow=c(1,1))
normalize <- function(x) {
  # check that x is numeric
  assertthat::assert_that(is.numeric(x))
  # wrapper around the recipes package
  y <- data.frame(x)
  rec <- recipes::recipe(x ~ 1, data = y)
  # specify the transformation
  yj_transform <- recipes::step_YeoJohnson(rec, recipes::all_numeric())
  # estimate lambda from the data
  yj_estimates <- recipes::prep(yj_transform, training = y)
  # get the transformed values
  yj_te <- recipes::bake(yj_estimates, y)
  # return
  return(yj_te$x)
}
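# For reference, the Yeo-Johnson transformation that recipes estimates and
# applies inside normalize() can be written out explicitly. A sketch for a
# known lambda (normalize() is what actually estimates lambda from the data):
yeo_johnson_sketch <- function(x, lambda) {
  out <- numeric(length(x))
  pos <- x >= 0
  if (abs(lambda) > 1e-8) {
    out[pos] <- ((x[pos] + 1)^lambda - 1) / lambda
  } else {
    out[pos] <- log(x[pos] + 1)     # lambda == 0
  }
  if (abs(lambda - 2) > 1e-8) {
    out[!pos] <- -((-x[!pos] + 1)^(2 - lambda) - 1) / (2 - lambda)
  } else {
    out[!pos] <- -log(-x[!pos] + 1) # lambda == 2
  }
  out
}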