# R/cleaning_functions.R


# Cleaning functions
#
# Authors: Valeria Bonapersona & Heike Schuler


# Specify damaged areas ---------------------------------------------------
#' @title Specify damaged areas per sample
#' @description specify_damage() uses dplyr functions to filter
#' a dataframe containing a summary of damaged areas for all samples
#' down to one sample, and adapts it for later use in the cleaning functions.
#'
#'
#' @param sample String (or value) used to select the relevant sample.
#' @param data Dataframe with a summary of damaged brain areas for all samples.
#' Long format in which each row is a damaged area. Requires the following
#' variables: "sample_id" with the name of the sample;
#' "area" with the acronym of the damaged brain area according to Allen Brain
#' Atlas nomenclature; "hemisphere" specifying which hemisphere is damaged.
#'
#' @export
#' @return Dataframe with the damaged areas of the selected sample, with an
#' added "area_hemisphere" variable (area and hemisphere pasted together).
#'
#' @examples
#' x <- data.frame(
#'   sample_id = c(1, 1, 2, 3),
#'   area = c("BLA", "VTA", "CA1", "DG"),
#'   hemisphere = c("left", "left", "right", "right")
#' )
#'
#' my_sample <- 1
#'
#' specify_damage(my_sample, x)

## TODO: add a warning if sample is not present in data$sample_id.
specify_damage <- function(sample, data) {

  # check data is a dataframe
  assertthat::assert_that(is.data.frame(data))

  # check the required variables are present in the df
  # (has_name() only returns TRUE/FALSE, so wrap it in assert_that()
  # to actually stop on failure)
  assertthat::assert_that(assertthat::has_name(data, "sample_id"))
  assertthat::assert_that(assertthat::has_name(data, "area"))
  assertthat::assert_that(assertthat::has_name(data, "hemisphere"))

  # check that only one sample was provided (length 1)
  assertthat::assert_that(length(sample) == 1)

  data %>%
    dplyr::filter(sample_id == sample) %>%
    dplyr::mutate(area_hemisphere = paste(area, hemisphere, sep = "_")) %>%
    droplevels()

}
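# A quick sanity-check sketch (same toy data as in the @examples above):
# specify_damage(1, x)
# keeps the two rows of sample 1 and adds the area_hemisphere variable
# ("BLA_left", "VTA_left").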


# Clean_counts ------------------------------------------------------------
## TODO: add masks as data of the package
#' @title Clean xyz coordinates of identified cells
#' @description This function cleans the annotated cells by: 1) removing the
#' halo around the brain and ventricles, 2) filtering brain areas of interest,
#' 3) removing damaged areas, and 4) re-imputing the damaged areas by mirroring
#' the other hemisphere. clean_counts() saves two .RDS files per processed
#' sample in the specified paths. The first file (*_clean_cells.RDS) contains
#' the xyz coordinates of cells that met the cleaning criteria, as well as
#' their assignment to the user-defined brain areas of interest. The second
#' file (*_removed_counts_summary.RDS) summarizes the counts removed during
#' the procedure.
#'
#'
#' @param sample_id String used to save output. Please do not use spaces.
#' @param data Dataframe with cell coordinates for one sample.
#' Requires variables "xPos", "yPos" and "zPos" for x, y, z coordinates respectively; "id"
#' for the code of the brain area according to the Allen Brain Atlas; "maxInt"
#' for the maximum intensity of each cell.
#' @param atlas Dataframe with meta-data of Allen Brain Atlas areas. It can be generated by running the
#' preparation script "atlas_tree.R". Requires variables: "id" for the numerical value of ABA areas;
#' "name" for the character value; "acronym" for the nomenclature; "parent_acronym" for the parent ABA
#' area it belongs to; "category" for the categorization of brain areas difficult to interpret, see XX
#' for details; "my_grouping" for your categorization of brain areas.
#' @param damaged_areas Dataframe with the list of damaged brain areas of all samples. By using
#' the function specify_damage(), the relevant damaged areas will be automatically selected.
#' Requires the following variables: "area" with the acronym of the brain area, matching
#' the "my_grouping" variable of the atlas dataframe (see above); "hemisphere" which specifies
#' the hemisphere where the damage occurs ("right", "left").
#' @param dodgy_cells Dataframe with information about cells with abnormally high intensity (likely
#' unspecific binding, for example after checking the scans). Three columns named "sample_id" (id of
#' the sample), "my_grouping" (brain area), and "threshold" (threshold above which cells are considered
#' "spots"; filters based on maximum intensity). This argument can be NULL.
#' @param out_mask Mask to identify the outer part of the brain, to be removed due to halo.
#' Ask Heike for more info: created in Python by finding all zero voxels that border the brain
#' (next to non-zero values), moving 3 voxels in all directions from there (k), and creating a
#' new matrix with non-zero values for 3 voxels around k.
#' @param vent_mask Mask to identify the outer part of the ventricles, to be removed due
#' to halo and unspecific binding of the antibody. Ask Heike for more info.
#' @param warning_percentage Number between 0 and 1. It defines the threshold above which a warning
#' is issued when brain areas removed during the cleaning procedure contain (abnormally) many cells.
#' 0 will never return a warning, 1 (almost) always will.
#' @param path_cleaned String to specify path where cleaned annotated cells will be saved
#' @param path_removed String to specify path where summary of removed cells will be saved
#'
#' @return No useful return value; called for its side effects of saving two .RDS files per sample.
#' @export
#'
#' @examples
#' # For a thorough example, please see XX.

clean_counts <- function(sample_id, data, atlas, damaged_areas, dodgy_cells = NULL,
                         out_mask, vent_mask,
                         warning_percentage = 0.2, path_cleaned, path_removed) {


  ## TODO: add checks for path_cleaned, path_removed and sample_id (must be strings)
  ## TODO: add checks that each dataframe passed in has the right format
  ## and that its variables have the right type
  ## TODO: document somewhere the reason for splitting the hemispheres the way we do


  ## checks
  # data must have xyz coordinates and id
  # check atlas and data are dataframes
  assertthat::assert_that(is.data.frame(atlas))
  assertthat::assert_that(is.data.frame(data))

  # check all required variables are present
  # (has_name() only returns TRUE/FALSE, so wrap it in assert_that())
  ## data
  assertthat::assert_that(assertthat::has_name(data, "xPos"))
  assertthat::assert_that(assertthat::has_name(data, "yPos"))
  assertthat::assert_that(assertthat::has_name(data, "zPos"))
  assertthat::assert_that(assertthat::has_name(data, "id"))

  ## atlas
  assertthat::assert_that(assertthat::has_name(atlas, "id"))
  assertthat::assert_that(assertthat::has_name(atlas, "name"))
  assertthat::assert_that(assertthat::has_name(atlas, "acronym"))
  assertthat::assert_that(assertthat::has_name(atlas, "parent_acronym"))
  assertthat::assert_that(assertthat::has_name(atlas, "category"))
  assertthat::assert_that(assertthat::has_name(atlas, "my_grouping"))

  # if dodgy_cells is not null, do all checks
  if (!is.null(dodgy_cells)) {
    assertthat::assert_that(is.data.frame(dodgy_cells))
    assertthat::assert_that(assertthat::has_name(dodgy_cells, "sample_id"))
    assertthat::assert_that(assertthat::has_name(dodgy_cells, "my_grouping"))
    assertthat::assert_that(assertthat::has_name(dodgy_cells, "threshold"))
    assertthat::assert_that(nrow(dodgy_cells) > 0) # at least one row
  }

  # warning_percentage must be a number between 0 and 1
  assertthat::assert_that(assertthat::is.number(warning_percentage))
  if (warning_percentage < 0 | warning_percentage > 1) stop("warning_percentage provided is not a number between 0 and 1")



  # Match brain area categorization info -------------------------------------
  data <- data.frame(data,
                     atlas[match(data$id, atlas$id),
                           c("category", "my_grouping")])
  # recode warning_percentage
  warn <- 1 - warning_percentage


  # Specify damaged areas ---------------------------------------------------
  damaged_areas_sample <- specify_damage(sample_id, damaged_areas)

  # Areas checks ------------------------------------------------------------
  ## correct if you change category variable levels
  if ( any(data$category %in% c(1,3)) ) warning("Subcategorized areas return non-zero counts, in disagreement with the Allen Brain Atlas.
    Removed, but categorization must be checked.")

  if (nrow(data %>% dplyr::filter(category == 4)) > nrow(data)*warn) warning("Trimmed areas have disproportionately high counts.")
  if (nrow(data %>% dplyr::filter(category == 5)) > nrow(data)*warn) warning("Sub-categorized, not interpretable areas have disproportionately high counts.")
  if (nrow(data %>% dplyr::filter(category == 7)) > nrow(data)*warn) warning("Not reliable areas have disproportionately high counts.")

  # Cleaning -------------------------------------------------------------

  clean_data <-
    data %>%

    # Remove halo
    dplyr::mutate_at(dplyr::vars(xPos, yPos, zPos), floor) %>%
    dplyr::anti_join(rbind(out_mask, vent_mask)) %>%
    { . ->> no_halo} %>% # saves intermediate output

    # Remove background and brain areas that are not interpretable
    dplyr::filter(id != 0,
           category %in% c(0,2))  %>%
    { . ->> no_background} %>% # saves intermediate output ##CHECK ALL AREAS HAVE CATEGORIZATION

    # Split zPos (hemispheres) into half. Left areas will be < 0, right areas will be > 0.
    dplyr::mutate(zPos = zPos - 228, # dimensions of template: x*y*456
           hemisphere = dplyr::case_when(zPos < 0 ~ "left",
                                         zPos > 0 ~ "right",
                                         TRUE ~ "midline")) %>%

    { . ->> hemisphere_sep} %>% # saves intermediate output

    # Clean up factors
    droplevels() %>% # necessary?

    # Remove damaged areas
    dplyr::mutate(grouping_hemisphere = paste(my_grouping, hemisphere, sep = "_"),
           potentially_damaged = dplyr::case_when(grouping_hemisphere %in%
                                             damaged_areas_sample$area_hemisphere ~ "yes",
                                           TRUE ~ "no")) %>%

    { . ->> pot_damaged}  %>% # saves intermediate output

    dplyr::filter(potentially_damaged != "yes") %>%

    # Impute mirror of other hemisphere for damaged brain areas removed.
    # Note: unique(as.character()) instead of levels(), so this also works
    # when hemisphere is a character vector rather than a factor.
    dplyr::full_join(
      pot_damaged %>%
        dplyr::filter(my_grouping %in% damaged_areas_sample$area,
               !hemisphere %in% c(unique(as.character(damaged_areas_sample$hemisphere)), "midline")) %>%
        dplyr::mutate(zPos = zPos * -1,
               hemisphere = dplyr::case_when(hemisphere == "right" ~ "left",
                                      hemisphere == "left" ~ "right"),
               grouping_hemisphere = paste(my_grouping, hemisphere, sep = "_"))
    )


  # remove dodgy cells if necessary
  if (!is.null(dodgy_cells)) {

    # select what to filter
    # (the function argument sample_id shares its name with the dodgy_cells
    # column, so store it in a helper variable before filtering)
    this_sample <- sample_id
    threshold <- dodgy_cells %>%
      dplyr::filter(sample_id == this_sample) %>%
      dplyr::select(my_grouping, threshold)

    # filter out dodgy cells
    clean_data <- clean_data %>%

      # merge with df
      dplyr::left_join(threshold, by = "my_grouping") %>%

      # remove cells on upper boundary
      dplyr::filter(is.na(threshold) | threshold > maxInt) %>%
      dplyr::select(-threshold)
  }

  # select relevant variables
  clean_data <- clean_data %>%

    # select only vars of interest
    dplyr::select(dplyr::ends_with("Pos"), maxInt, my_grouping, hemisphere)




  # Save cleaned counts in the specified folder
  saveRDS(clean_data, file = paste0(path_cleaned, sample_id, "_clean_cells.RDS"))

  # Removed counts -------------------------------------------------------------

  removed_counts <- data.frame(
    "sample_id" = sample_id,
    "initial_counts" = nrow(data),
    "halo_removal" = (nrow(data) - nrow(no_halo)),
    "background_removal" = (nrow(no_halo) - nrow(no_background)),
    "damaged_counts_removed" = nrow(pot_damaged[pot_damaged$potentially_damaged == "yes",]),
    "replacement_counts" = nrow(pot_damaged[pot_damaged$my_grouping %in% damaged_areas_sample$area &
                                              !pot_damaged$hemisphere %in% c(levels(damaged_areas_sample$hemisphere), "midline"),]),
    "final_counts" = nrow(clean_data))

  saveRDS(removed_counts, file = paste0(path_removed, sample_id, "_removed_counts_summary.RDS"))


  # Return ------------------------------------------------------------------
  print(paste0("Cleaned data of sample '", sample_id,
               "' has been saved in the specified folder."))
}
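
# A minimal usage sketch (all object names and paths below are hypothetical;
# adapt them to your own project):
# clean_counts(
#   sample_id     = "sample_01",
#   data          = my_cells,        # xPos/yPos/zPos, id, maxInt
#   atlas         = my_atlas,        # e.g. output of "atlas_tree.R"
#   damaged_areas = my_damage,       # long format, one damaged area per row
#   dodgy_cells   = NULL,
#   out_mask      = out_mask,
#   vent_mask     = vent_mask,
#   warning_percentage = 0.2,
#   path_cleaned  = "output/cleaned/",
#   path_removed  = "output/removed/"
# )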


####### this part can be later deleted
## check no weird areas >> categories 1 and 3 should not be present
# categories:
#   0 = included
#   1 = subcategorized (returns null counts)
#   2 = subcategorized, non-zero, interpretable
#   3 = subcategorization does not exist in ABA, returns null
#   4 = trimmed areas (trimmed by whom??)
#   5 = subcategorized, non-zero, not interpretable
#   6 = fiber tract
#   7 = not reliable, around ventricles and small





# samples_files_to_df ------------------------------------------------------
#' @title Merge all the separate samples' files into one data frame
#' @description Wrapper around dplyr functions to merge together the multiple files output
#' by clean_counts(). It can merge both the cleaned cell files (extension = "_clean_cells.RDS")
#' and the summaries of the removed cells (extension = "_removed_counts_summary.RDS").
#'
#' @param path String for the path of the folder containing the files. It does not matter if
#' other files are present in this folder; only files with the specified extension will be read.
#' @param extension Final part of the file names. Same as the output of clean_counts().
#'
#' @return Dataframe with the content of all matching files bound together, plus a "sample_id" column.
#' @export
#'
#' @examples
#' # For a thorough example, please see XX.

samples_files_to_df <- function(path, extension = c("_clean_cells.RDS", "_removed_counts_summary.RDS")) {

  if (length(extension) > 1) {extension <- extension[[1]]}
  file_names <- list.files(path = path)
  # match the extension literally at the end of the file name
  # (fixed() so the "." is not treated as a regex wildcard)
  file_list <- file_names[stringr::str_ends(file_names, stringr::fixed(extension))]
  data_ls <- lapply(paste0(path, file_list), readRDS)

  # get sample names
  samples <- stringr::str_remove(file_list, stringr::fixed(extension))
  names(data_ls) <- samples

  # from list to data frame
  dplyr::bind_rows(data_ls, .id = "sample_id")
}
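
# A minimal usage sketch (hypothetical folder; assumes clean_counts() already
# saved its per-sample files there; note the trailing slash, since path and
# file names are combined with paste0()):
# all_cells   <- samples_files_to_df("output/cleaned/", "_clean_cells.RDS")
# all_removed <- samples_files_to_df("output/removed/", "_removed_counts_summary.RDS")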

# remove_blinding --------------------------------------------------------------
#' @title Remove (or scramble) the blinding of your groups
#' @description Convert random group names (e.g. "a", "b") into meaningful
#' groups (e.g. "control", "experimental"). It can be used to unblind at the
#' *end* of the analysis, or with scramble = TRUE to keep scrambled groups at
#' the beginning of the analysis. Returns a data frame with all remaining
#' columns of data and key.
#'
#'
#' @param data Dataframe with (at least) two columns: "sample_id" and "blinded_group"
#' @param key Dataframe with (at least) two columns: "blinded_group" and "key"
#' @param scramble Specifies whether to scramble the groups or not. Can be
#' either TRUE or FALSE. Defaults to FALSE.
#'
#' @return Dataframe like data, with the "blinded_group" variable replaced by
#' a "group" variable taken from key.
#' @export
#'
#' @examples
#' x <- data.frame(sample_id = c(1:20), blinded_group = c(rep(c(1:4), each = 5)))
#' y <- data.frame(blinded_group = 1:4, key = c("control", "drug_a", "drug_b", "drug_c"))
#' z <- remove_blinding(x, y)

remove_blinding <- function(data, key, scramble = FALSE) {

  # check data and key are dataframes
  assertthat::assert_that(is.data.frame(data))
  assertthat::assert_that(is.data.frame(key))

  # check data has the correct vars
  # (has_name() only returns TRUE/FALSE, so wrap it in assert_that())
  assertthat::assert_that(assertthat::has_name(data, "sample_id"))
  assertthat::assert_that(assertthat::has_name(data, "blinded_group"))

  # check that key has the correct vars
  assertthat::assert_that(assertthat::has_name(key, "blinded_group"))
  assertthat::assert_that(assertthat::has_name(key, "key"))

  if (! scramble %in% c(TRUE, FALSE)) stop("scramble can only be TRUE or FALSE")


  # create new key with scrambling
  if (scramble == TRUE) {
    scramble_order <- sample(nrow(key), replace = FALSE)
    key$key <- key$key[scramble_order]
  }

  # unblind
  data %>%

    # merge with unblinding
    dplyr::left_join(key, by = "blinded_group") %>%

    # remove old and rename new
    dplyr::select(-blinded_group) %>%
    dplyr::rename(group = key)

}
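
# A scrambling sketch (same toy data as in the @examples above; set a seed if
# you want the scrambled group assignment to be reproducible):
# set.seed(123)
# z_scrambled <- remove_blinding(x, y, scramble = TRUE)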



# Normalization  -----------------------------------------------
#' @title Normalize cell counts
#' @description Wrapper function around the Yeo-Johnson transformation from the recipes package.
#' Similar transformation to Box-Cox, but it also accepts 0 (and negative) values.
#'
#'
#' @param x Numeric vector with values to normalize
#'
#' @return Numeric vector with the transformed (normalized) values.
#' @export
#'
#' @examples
#' x <- rpois(100, 3)
#' y <- normalize(x)
#'
#' par(mfrow=c(1,2))
#' hist(x)
#' hist(y)
#' par(mfrow=c(1,1))

normalize <- function(x) {

  # check that x is numeric
  assertthat::assert_that(is.numeric(x))

  # wrapper around the recipes package
  y <- data.frame(x)
  rec <- recipes::recipe(x~1, data = y)

  # get transformation values
  yj_transform <- recipes::step_YeoJohnson(rec,  recipes::all_numeric())

  # get estimates
  yj_estimates <- recipes::prep(yj_transform, training = y)

  # get transformed values
  yj_te <- recipes::bake(yj_estimates, y)

  # return
  return(yj_te$x)

}