#' Find inForm data files.
#'
#' `list_cell_seg_files` finds inForm cell seg data files in a single directory
#' or a directory hierarchy.
#'
#' @param path Path to the base directory to search.
#' @param ... Additional arguments passed to [list.files][base::list.files()].
#' Pass `recursive=TRUE` to search a directory hierarchy.
#' @return A character vector of file paths.
#' @export
#' @family file readers
#' @md
list_cell_seg_files <- function(path, ...) {
  # Escape the dot and anchor the pattern so only files whose names truly end
  # in "cell_seg_data.txt" match. The previous pattern 'cell_seg_data.txt'
  # treated '.' as a regex wildcard and was unanchored, so it also matched
  # names such as "cell_seg_dataXtxt" or backup copies like
  # "cell_seg_data.txt.bak".
  list.files(path, pattern='cell_seg_data\\.txt$', full.names=TRUE, ...)
}
#' Read and clean an inForm data file.
#'
#' \code{read_cell_seg_data} makes it easier to use data from PerkinElmer's
#' inForm program. It reads data files written by inForm 2.0 and later and does
#' useful cleanup on the result. Data files written by inForm 2.0 can be read
#' easily using \code{\link[utils]{read.delim}} or
#' \code{\link[readr]{read_tsv}}. However there is still some useful cleanup to
#' be done.
#'
#' \code{read_cell_seg_data} reads both single-image tables and merged tables
#' and does useful cleanup on the data:
#' \itemize{
#' \item Removes columns that are all NA.
#' These are typically unused summary columns.
#' \item Converts percent columns to numeric fractions.
#' \item Converts pixel distances to microns. The conversion factor may be
#' specified as a parameter, by setting
#' \code{options(phenoptr.pixels.per.micron)}, or by reading an associated
#' \code{component_data.tif} file.
#' \item Optionally removes units from expression names
#' \item If the file contains multiple sample names,
#' a \code{tag} column is created
#' containing a minimal, unique tag for each sample.
#' This is useful when a
#' short name is needed, for example in chart legends.
#' }
#'
#' If \code{pixels_per_micron='auto'}, \code{read_cell_seg_data} looks for
#' a \code{component_data.tif} file in the same directory as \code{path}.
#' If found, \code{pixels_per_micron} is read from the file \strong{and}
#' the cell coordinates are offset to the correct spatial location.
#'
#' @param path Path to the file to read, or NA to use a file chooser.
#' @param pixels_per_micron Conversion factor to microns
#' (default 2 pixels/micron, the resolution of 20x MSI fields
#' taken on Vectra Polaris and Vectra 3.).
#' Set to NA to skip conversion. Set to \code{'auto'} to read from
#' an associated \code{component_data.tif} file.
#' @param remove_units If TRUE (default),
#' remove the unit name from expression columns.
#' @return A \code{\link[tibble]{tibble}}
#' containing the cleaned-up data set.
#' @export
#' @family file readers
#' @examples
#' path <- sample_cell_seg_path()
#' csd <- read_cell_seg_data(path)
#'
#' # count all the phenotypes in the data
#' table(csd$Phenotype)
#'
#' \dontrun{
#' # Use purrr::map_df to read all cell seg files in a directory
#' # and return a single data_frame.
#' paths <- list_cell_seg_files(path)
#' csd <- purrr::map_df(paths, read_cell_seg_data)
#' }
read_cell_seg_data <- function(
    path=NA,
    pixels_per_micron=getOption('phenoptr.pixels.per.micron'),
    remove_units=TRUE) {
  # Interactively prompt for a file when no path was supplied.
  if (is.na(path)) {
    path <- file.choose() # nocov not going to happen...
    cat('Loading', path) # nocov
  }
  if (path=='')
    stop("File name is missing.")

  # Read the data. Supplying col_types prevents output of the imputed types
  df <- readr::read_tsv(path, na=c('NA', '#N/A'), col_types=readr::cols())

  sample_name = 'Sample Name'
  # If there are multiple sample names, make 'tag' be an abbreviated
  # Sample.Name column and insert it as the first column
  # Use the 'tag' column when you need a short name for the sample,
  # e.g. in chart legends
  if (length(unique(df[[sample_name]])) > 1 && !('tag' %in% names(df))) {
    tag <- as.factor(remove_extensions(remove_common_prefix(df[[sample_name]])))
    df <- cbind(tag, df)
  }

  # Convert percents if it is not done already.
  # inForm may write percent/confidence columns as text like "55 %";
  # strip the trailing % sign and convert to a numeric fraction (0.55).
  pcts <- grep('percent|confidence', names(df), ignore.case=TRUE)
  for (i in pcts)
    if (is.character(df[[i]]))
      df[[i]] <- as.numeric(sub('\\s*%$', '', df[[i]]))/100

  # Remove columns that are all NA or all blank
  # The first condition is a preflight that speeds this up a lot
  na_columns <- purrr::map_lgl(df, function(col)
    (is.na(col[1]) | col[1]=='') && all(is.na(col) | col==''))
  df <- df[!na_columns]

  # Convert distance to microns if requested.
  # No way to tell in general if this was done already...
  # Try looking for 'micron' in column names
  if (any(stringr::str_detect(names(df), 'micron')) && !is.na(pixels_per_micron)) {
    message('Data is already in microns, no conversion performed')
  } else if (!is.na(pixels_per_micron)) {
    if (pixels_per_micron=='auto') {
      # Get pixels_per_micron and field location from component_data.tif.
      # get_field_info() is defined elsewhere in this package; presumably it
      # returns a list with microns_per_pixel and location components —
      # confirm against its definition.
      component_path = sub('_cell_seg_data.txt', '_component_data.tif', path)
      stopifnot(file.exists(component_path))
      info = get_field_info(component_path)
      pixels_per_micron = 1/info$microns_per_pixel
      location = info$location
    } else {
      # No spatial offset is available when a numeric factor was given.
      location = NA
    }

    # Linear (distance) columns: pixels -> microns.
    cols = get_pixel_columns(df)
    for (col in cols) {
      # If col has a lot of NAs it may have been read as chr
      if (!is.numeric(df[[col]]))
        df[[col]] = as.numeric(df[[col]])
      df[[col]] = df[[col]] / pixels_per_micron
      names(df)[col] = sub('pixels', 'microns', names(df)[col])
    }

    # Position columns only get an offset, if available
    if (!anyNA(location)) {
      df$`Cell X Position` = df$`Cell X Position` + location[1]
      df$`Cell Y Position` = df$`Cell Y Position` + location[2]
    }

    # Area columns: square pixels -> square microns.
    cols = get_area_columns(df)
    for (col in cols) {
      if (!is.numeric(df[[col]]))
        df[[col]] = as.numeric(df[[col]])
      df[[col]] = df[[col]] / (pixels_per_micron^2)
      names(df)[col] = sub('pixels', 'sq microns', names(df)[col])
    }

    # Density columns: counts per megapixel * (pixels/micron)^2
    # = counts per 10^6 sq microns = counts per sq mm.
    cols = get_density_columns(df)
    for (col in cols) {
      if (!is.numeric(df[[col]]))
        df[[col]] = as.numeric(df[[col]])
      df[[col]] = df[[col]] * (pixels_per_micron^2)
      names(df)[col] = sub('megapixel', 'sq mm', names(df)[col])
    }
  }

  if (remove_units) {
    # Find the first unit suffix following 'Mean' in a column name,
    # e.g. ' (Normalized Counts, Total Weighting)', then strip that exact
    # suffix (fixed, not regex) from every column name.
    unit_name = stringr::str_match(names(df), 'Mean( \\(.*\\))$')[, 2]
    unit_name = unit_name[!is.na(unit_name)][1]
    if (!is.na(unit_name))
      names(df) = sub(unit_name, '', names(df), fixed=TRUE)
  }

  dplyr::as_data_frame(df)
}
# Indices of columns whose values are pixel distances: cell positions,
# distances to region/category edges, and ellipse axis lengths.
get_pixel_columns = function(df) {
  pixel_rx = paste('position',
                   'Distance from Process Region Edge',
                   'Distance from Tissue Category Edge',
                   'axis',
                   sep='|') # nolint
  which(grepl(pixel_rx, names(df), ignore.case=TRUE))
}
# Indices of columns whose values are areas in square pixels.
get_area_columns = function(df) {
  is_area = grepl('area \\(pixels\\)', names(df), ignore.case=TRUE)
  which(is_area)
}
# Indices of columns whose values are densities per megapixel.
get_density_columns = function(df) {
  is_density = grepl('megapixel', names(df), ignore.case=TRUE)
  which(is_density)
}
# Remove the common prefix from a vector of strings
remove_common_prefix <- function(x) {
  # The common prefix of the whole vector equals the common prefix of its
  # lexicographic min and max, so only those two strings need comparing.
  lo <- min(x)
  hi <- max(x)
  if (lo == hi) return(x) # All strings are identical
  lo_chars <- strsplit(lo, split='')[[1]]
  hi_chars <- strsplit(hi, split='')[[1]]
  # Suppress the recycling warning when lo and hi differ in length;
  # recycling still yields FALSE at (or before) the first real difference.
  same <- suppressWarnings(lo_chars == hi_chars)
  first_diff <- match(FALSE, same)
  substring(x, first_diff)
}
# Remove the extensions from a vector of strings
remove_extensions <- function(x) {
  # Drop a final dot plus any trailing run of non-dot characters, if present.
  sub(pattern='\\.[^.]+$', replacement='', x=x)
}
# (Removed: website-embed boilerplate accidentally pasted into this source
# file; it was not valid R code.)