# ** Spatial Method-Specific Convenience Functions for Giotto Object Creation ** #
# Common Utility Functions ####
#' @title Read a structured folder of exported data
#' @name read_data_folder
#' @description Read the exported folder of a spatial method and
#' detect the presence of needed files. NULL values denote missing items.
#' @param spat_method spatial method for which the data is being read
#' @param data_dir exported data directory to read from
#' @param dir_items named list of directory items to expect and keywords to match
#' @param data_to_use which type(s) of expression data to build the gobject with
#' @param require_data_DT data.table detailing if expected data items are required
#' or optional for each \code{data_to_use} workflow
#' @param cores cores to use
#' @param verbose be verbose
#' @details Steps performed:
#' \itemize{
#' \item{1. detection of items within \code{data_dir} by looking for keywords
#' assigned through \code{dir_items}}
#' \item{2. check of detected items to see if everything needed has been found.
#' Dictionary of necessary vs optional items for each \code{data_to_use} workflow
#' is provided through \code{require_data_DT}}
#' \item{3. if multiple filepaths match, select the first one. This function is
#' only intended to find first-level subdirectories and files.}
#' }
#' @keywords internal
read_data_folder = function(spat_method = NULL,
data_dir = NULL,
dir_items,
data_to_use,
load_format = NULL,
require_data_DT,
cores = NA,
verbose = TRUE) {
ch = box_chars()
# 0. check params
if(is.null(data_dir) || !dir.exists(data_dir)) stop(wrap_txt('The full path to a', spat_method, 'directory must be given.\n'))
if(isTRUE(verbose)) wrap_msg('A structured', spat_method, 'directory will be used')
if(!data_to_use %in% require_data_DT$workflow) stop(wrap_txt('Data requirements for data_to_use not found in require_data_DT'))
# 1. detect items
dir_items = lapply_flex(dir_items, function(x) {
Sys.glob(paths = file.path(data_dir, x))
}, cores = cores)
# (length = 1 if present, length = 0 if missing)
dir_items_lengths = lengths(dir_items)
# 2. check directory contents
if(isTRUE(verbose)) wrap_msg('Checking directory contents...')
for(item in names(dir_items)) {
# IF ITEM FOUND
if(dir_items_lengths[[item]] > 0) {
if(isTRUE(verbose)) {
message(ch$s, '> ' ,item, ' found')
for(item_i in seq_along(dir_items[[item]])) { # print found item names
subItem = gsub(pattern = '.*/', replacement = '', x = dir_items[[item]][[item_i]])
message(ch$s, ch$s, ch$l, ch$h, ch$h, subItem)
}
}
} else {
# IF ITEM MISSING
# necessary (error)
# optional (warning)
# data.table variables
workflow = needed = filetype = NULL
require_data_DT = require_data_DT[workflow == data_to_use,]
if(!is.null(load_format)) require_data_DT = require_data_DT[filetype == load_format,]
if(item %in% require_data_DT[needed == TRUE, item]) stop(item, ' is missing\n')
if(item %in% require_data_DT[needed == FALSE, item]) warning(item, ' is missing (optional)\n')
}
}
# 3. select first path in list if multiple are detected
if(any(dir_items_lengths > 1)) {
warning(wrap_txt('Multiple matches for expected directory item(s).
First matching item selected'))
multiples = which(dir_items_lengths > 1)
for(mult_i in multiples) {
message(names(dir_items)[[mult_i]], ' multiple matches found:')
print(dir_items[[mult_i]])
dir_items[[mult_i]] = dir_items[[mult_i]][[1]]
}
}
if(isTRUE(verbose)) message('Directory check done')
return(dir_items)
}
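# Illustrative sketch of how a method-specific reader calls read_data_folder().
# (See read_merscope_folder() below for the actual MERSCOPE wrapper; the keywords,
# requirement table, and path here are placeholder assumptions.)
#
# dir_items = list(`cell feature matrix` = '*cell_by_gene*',
#                  `cell metadata` = '*cell_metadata*')
# require_data_DT = data.table::data.table(workflow = 'aggregate',
#                                          item = names(dir_items),
#                                          needed = c(TRUE, TRUE))
# found_items = read_data_folder(spat_method = 'MERSCOPE',
#                                data_dir = '/path/to/output/',
#                                dir_items = dir_items,
#                                data_to_use = 'aggregate',
#                                require_data_DT = require_data_DT)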
# object creation ####
## MERSCOPE ####
#' @title Create Vizgen MERSCOPE Giotto Object
#' @name createGiottoMerscopeObject
#' @description Given the path to a MERSCOPE experiment directory, creates a Giotto
#' object.
#' @param merscope_dir full path to the exported merscope directory
#' @param data_to_use which of either the 'subcellular' or 'aggregate' information
#' to use for object creation
#' @param FOVs which FOVs to use when building the subcellular object.
#' The default (NULL) loads all FOVs, which can be very slow.
#' @param poly_z_indices integer vector of z index slices of the cell boundary
#' polygons to load (default 1:7)
#' @param calculate_overlap whether to run \code{\link{calculateOverlapRaster}}
#' @param overlap_to_matrix whether to run \code{\link{overlapToMatrix}}
#' @param aggregate_stack whether to run \code{\link{aggregateStacks}}
#' @param aggregate_stack_param params to pass to \code{\link{aggregateStacks}}
#' @inheritParams createGiottoObjectSubcellular
#' @return a giotto object
#' @export
#' @details
#' [\strong{Expected Directory}] This function generates a giotto object when given the
#' path to a MERSCOPE output directory. It expects the following items within the directory,
#' where the \strong{bolded} portions are what this function matches against:
#' \itemize{
#' \item{\strong{cell_boundaries} (folder of .hdf5 files)}
#' \item{\strong{images} (folder of .tif images and a scalefactor/transform table)}
#' \item{\strong{cell_by_gene}.csv (file)}
#' \item{\strong{cell_metadata}.csv (file)}
#' \item{detected_\strong{transcripts}.csv (file)}
#' }
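#'
#' [\strong{Usage sketch}] A minimal call for the subcellular workflow is shown below;
#' the directory path is a placeholder and must point to a local MERSCOPE export.
#' @examples
#' \dontrun{
#' merscope_gobject = createGiottoMerscopeObject(
#'   merscope_dir = '/path/to/merscope/output/',
#'   data_to_use = 'subcellular',
#'   FOVs = c(1, 2) # small FOV subset; NULL (default) loads all FOVs (slow)
#' )
#' }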
createGiottoMerscopeObject = function(merscope_dir,
data_to_use = c('subcellular', 'aggregate'),
FOVs = NULL,
poly_z_indices = 1:7,
calculate_overlap = TRUE,
overlap_to_matrix = TRUE,
aggregate_stack = TRUE,
aggregate_stack_param = list(summarize_expression = 'sum',
summarize_locations = 'mean',
new_spat_unit = 'cell'),
instructions = NULL,
cores = NA,
verbose = TRUE) {
fovs = FOVs # use the FOVs param (NULL loads all FOVs)
# 0. setup
merscope_dir = path.expand(merscope_dir)
poly_z_indices = as.integer(poly_z_indices)
if(any(poly_z_indices < 1)) stop(wrap_txt(
'poly_z_indices is a vector of one or more integers starting from 1.',
errWidth = TRUE
))
# determine data to use
data_to_use = match.arg(arg = data_to_use, choices = c('subcellular','aggregate'))
# 1. test if folder structure exists and is as expected
dir_items = read_merscope_folder(merscope_dir = merscope_dir,
data_to_use = data_to_use,
cores = cores,
verbose = verbose)
# 2. load in directory items
data_list = load_merscope_folder(dir_items = dir_items,
data_to_use = data_to_use,
poly_z_indices = poly_z_indices,
fovs = fovs,
cores = cores,
verbose = verbose)
# 3. Create giotto object
if(data_to_use == 'subcellular') {
merscope_gobject = createGiottoMerscopeObject_subcellular(data_list = data_list,
calculate_overlap = calculate_overlap,
overlap_to_matrix = overlap_to_matrix,
aggregate_stack = aggregate_stack,
aggregate_stack_param = aggregate_stack_param,
cores = cores,
verbose = verbose)
} else if(data_to_use == 'aggregate') {
merscope_gobject = createGiottoMerscopeObject_aggregate(data_list = data_list,
cores = cores,
verbose = verbose)
} else {
stop(wrap_txt('data_to_use "', data_to_use, '" not implemented', sep = ''))
}
return(merscope_gobject)
}
#' @describeIn createGiottoMerscopeObject Create giotto object with 'subcellular' workflow
#' @param data_list list of loaded data from \code{\link{load_merscope_folder}}
#' @import data.table
#' @keywords internal
createGiottoMerscopeObject_subcellular = function(data_list,
calculate_overlap = TRUE,
overlap_to_matrix = TRUE,
aggregate_stack = TRUE,
aggregate_stack_param = list(summarize_expression = 'sum',
summarize_locations = 'mean',
new_spat_unit = 'cell'),
cores = NA,
verbose = TRUE) {
feat_coord = neg_coord = cellLabel_dir = instructions = NULL
# unpack data_list
poly_info = data_list$poly_info
tx_dt = data_list$tx_dt
micronToPixelScale = data_list$micronToPixelScale
image_list = data_list$images
# data.table vars
gene = NULL
# split tx_dt by expression and blank
if(isTRUE(verbose)) wrap_msg('Splitting detections by feature vs blank')
feat_id_all = tx_dt[, unique(gene)]
blank_id = feat_id_all[grepl(pattern = 'Blank', feat_id_all)]
feat_id = feat_id_all[!feat_id_all %in% blank_id]
feat_dt = tx_dt[gene %in% feat_id,]
blank_dt = tx_dt[gene %in% blank_id,]
# extract transcript_id col and store as feature meta
feat_meta = unique(feat_dt[, c('gene', 'transcript_id', 'barcode_id'), with = FALSE])
blank_meta = unique(blank_dt[, c('gene', 'transcript_id', 'barcode_id'), with = FALSE])
feat_dt[, c('transcript_id', 'barcode_id') := NULL]
blank_dt[, c('transcript_id', 'barcode_id') := NULL]
if(isTRUE(verbose)) {
message(' > Features: ', feat_dt[, .N])
message(' > Blanks: ', blank_dt[, .N])
}
# build giotto object
if(isTRUE(verbose)) wrap_msg('Building subcellular giotto object...')
z_sub = createGiottoObjectSubcellular(
gpoints = list('rna' = feat_coord,
'neg_probe' = neg_coord),
gpolygons = list('cell' = cellLabel_dir),
polygon_mask_list_params = list(
mask_method = 'guess',
flip_vertical = TRUE,
flip_horizontal = FALSE,
shift_horizontal_step = FALSE
),
instructions = instructions,
cores = cores
)
return(z_sub)
}
#' @describeIn createGiottoMerscopeObject Create giotto object with 'aggregate' workflow
#' @param data_list list of loaded data from \code{\link{load_merscope_folder}}
#' @keywords internal
createGiottoMerscopeObject_aggregate = function(data_list,
cores = NA,
verbose = TRUE) {
# unpack data_list
micronToPixelScale = data_list$micronToPixelScale
expr_dt = data_list$expr_dt
cell_meta = data_list$cell_meta
image_list = data_list$images
# split expr_dt by expression and blank
# feat_id_all =
}
## CosMx ####
#' @title Create Nanostring CosMx Giotto Object
#' @name createGiottoCosMxObject
#' @description Given the path to a CosMx experiment directory, creates a Giotto
#' object.
#' @param cosmx_dir full path to the exported cosmx directory
#' @param data_to_use which type(s) of expression data to build the gobject with.
#' Default is \code{'all'}, which loads all available information. \code{'subcellular'} loads
#' the transcript coordinates only. \code{'aggregate'} loads the provided aggregated expression matrix.
#' @param FOVs field of views to load (only affects subcellular data and images)
#' @param remove_background_polygon try to remove background polygon (default: TRUE)
#' @param background_algo algorithm to remove background polygon
#' @param remove_unvalid_polygons remove invalid polygons (default: TRUE)
#' @inheritParams createGiottoObjectSubcellular
#' @import data.table
#' @return a giotto object
#' @export
#' @details
#' [\strong{Expected Directory}] This function generates a giotto object when given the
#' path to a CosMx output directory. It expects the following items within the directory,
#' where the \strong{bolded} portions are what this function matches against:
#' \itemize{
#' \item{\strong{CellComposite} (folder of images)}
#' \item{\strong{CellLabels} (folder of images)}
#' \item{\strong{CellOverlay} (folder of images)}
#' \item{\strong{CompartmentLabels} (folder of images)}
#' \item{experimentname_\strong{exprMat_file}.csv (file)}
#' \item{experimentname_\strong{fov_positions_file}.csv (file)}
#' \item{experimentname_\strong{metadata_file}.csv (file)}
#' \item{experimentname_\strong{tx_file}.csv (file)}
#' }
#'
#' [\strong{Workflows}] Workflow to use is accessed through the data_to_use param
#' \itemize{
#' \item{'all' - loads and requires subcellular information from tx_file and fov_positions_file
#' and also the existing aggregated information (expression, spatial locations, and metadata)
#' from exprMat_file and metadata_file.}
#' \item{'subcellular' - loads and requires subcellular information from tx_file and
#' fov_positions_file only.}
#' \item{'aggregate' - loads and requires the existing aggregate information (expression,
#' spatial locations, and metadata) from exprMat_file and metadata_file.}
#' }
#'
#' [\strong{Images}] Images in the default CellComposite, CellLabels, CompartmentLabels, and CellOverlay
#' folders will be loaded as giotto largeImage objects in all workflows as long as they are available.
#' Additionally, CellComposite images will be converted to giotto image objects, making plotting with
#' these image objects more responsive when accessing them from a server.
#' \code{\link{showGiottoImageNames}} can be used to see the available images.
#'
#'
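#' [\strong{Usage sketch}] A minimal call for the currently supported subcellular workflow
#' is shown below; the directory path is a placeholder and must point to a local CosMx export.
#' @examples
#' \dontrun{
#' cosmx_gobject = createGiottoCosMxObject(
#'   cosmx_dir = '/path/to/cosmx/output/',
#'   data_to_use = 'subcellular',
#'   FOVs = 1:4 # subset of FOVs; NULL (default) loads all FOVs
#' )
#' showGiottoImageNames(cosmx_gobject)
#' }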
createGiottoCosMxObject = function(cosmx_dir = NULL,
data_to_use = c('all','subcellular','aggregate'),
remove_background_polygon = TRUE,
background_algo = c('range'),
remove_unvalid_polygons = TRUE,
FOVs = NULL,
instructions = NULL,
cores = determine_cores(),
verbose = TRUE) {
# 0. setup
cosmx_dir = path.expand(cosmx_dir)
# determine data to use
data_to_use = match.arg(arg = data_to_use, choices = c('all','subcellular','aggregate'))
if(data_to_use %in% c('all', 'aggregate')) {
stop(wrap_txt('Convenience workflows "all" and "aggregate" are not available yet'))
}
# Define for data.table
fov = target = x_local_px = y_local_px = z = cell_ID = CenterX_global_px = CenterY_global_px =
CenterX_local_px = CenterY_local_px = NULL
# 1. test if folder structure exists and is as expected
dir_items = read_cosmx_folder(cosmx_dir = cosmx_dir,
verbose = verbose)
# 2. load and create giotto object
if(data_to_use == 'subcellular') {
cosmx_gobject = createGiottoCosMxObject_subcellular(dir_items,
FOVs = FOVs,
remove_background_polygon = remove_background_polygon,
background_algo = background_algo,
remove_unvalid_polygons = remove_unvalid_polygons,
cores = cores,
verbose = verbose,
instructions = instructions)
}
if(data_to_use == 'aggregate') {
cosmx_gobject = createGiottoCosMxObject_aggregate(dir_items,
cores = cores,
verbose = verbose,
instructions = instructions)
}
if(data_to_use == 'all') {
cosmx_gobject = createGiottoCosMxObject_all(dir_items,
FOVs = FOVs,
remove_background_polygon = remove_background_polygon,
background_algo = background_algo,
remove_unvalid_polygons = remove_unvalid_polygons,
cores = cores,
verbose = verbose,
instructions = instructions)
}
# load in subcellular information, subcellular FOV objects, then join
# load in pre-generated aggregated expression matrix
if(data_to_use == 'aggregate' | data_to_use == 'all') {
}
message('done')
return(cosmx_gobject)
}
#' @title Load and create a CosMx Giotto object from subcellular info
#' @name createGiottoCosMxObject_subcellular
#' @inheritParams createGiottoCosMxObject
#' @keywords internal
createGiottoCosMxObject_subcellular = function(dir_items,
FOVs = NULL,
remove_background_polygon = TRUE,
background_algo = c('range'),
remove_unvalid_polygons = TRUE,
cores,
verbose = TRUE,
instructions = NULL) {
target = fov = NULL
# load tx detections and FOV offsets
data_list = load_cosmx_folder_subcellular(dir_items = dir_items,
FOVs = FOVs,
cores = cores,
verbose = verbose)
# unpack data_list
FOV_ID = data_list$FOV_ID
fov_offset_file = data_list$fov_offset_file
tx_coord_all = data_list$tx_coord_all
# remove global xy values and cell_ID
tx_coord_all[, c('x_global_px', 'y_global_px', 'cell_ID') := NULL]
data.table::setcolorder(tx_coord_all, c('target', 'x_local_px', 'y_local_px', 'z', 'fov'))
if(isTRUE(verbose)) wrap_msg('Splitting detections by feature vs neg probe')
all_IDs = tx_coord_all[, unique(target)]
neg_IDs = all_IDs[grepl(pattern = 'NegPrb', all_IDs)]
feat_IDs = all_IDs[!all_IDs %in% neg_IDs]
# split detections DT
feat_coords_all = tx_coord_all[target %in% feat_IDs]
neg_coords_all = tx_coord_all[target %in% neg_IDs]
if(isTRUE(verbose)) {
message(' > Features: ', feat_coords_all[, .N])
message(' > NegProbes: ', neg_coords_all[, .N])
}
# Start FOV lapply
fov_gobjects_list = lapply(FOV_ID, function(x) {
# Build image paths
if(isTRUE(verbose)) message('Loading image information...')
composite_dir = Sys.glob(paths = file.path(dir_items$`CellComposite folder`, paste0('*',x, '*')))
cellLabel_dir = Sys.glob(paths = file.path(dir_items$`CellLabels folder`, paste0('*',x, '*')))
compartmentLabel_dir = Sys.glob(paths = file.path(dir_items$`CompartmentLabels folder`, paste0('*',x, '*')))
cellOverlay_dir = Sys.glob(paths = file.path(dir_items$`CellOverlay folder`, paste0('*',x, '*')))
# Missing warnings
if(length(composite_dir) == 0) {warning('[ FOV ', x, ' ] No composite images found') ; composite_dir = NULL}
if(length(cellLabel_dir) == 0) {stop('[ FOV ', x, ' ] No cell mask images found')} # cell masks are necessary
if(length(compartmentLabel_dir) == 0) {warning('[ FOV ', x, ' ] No compartment label images found') ; compartmentLabel_dir = NULL}
if(length(cellOverlay_dir) == 0) {warning('[ FOV ', x, ' ] No cell polygon overlay images found') ; cellOverlay_dir = NULL}
if(isTRUE(verbose)) message('Image load done')
if(isTRUE(verbose)) wrap_msg('[ FOV ', x, ']')
# get FOV specific tx locations
if(isTRUE(verbose)) wrap_msg('Assigning FOV feature detections...')
# feature info
coord_oldnames = c('target', 'x_local_px', 'y_local_px')
coord_newnames = c('feat_ID', 'x', 'y')
feat_coord = feat_coords_all[fov == as.numeric(x)]
data.table::setnames(feat_coord, old = coord_oldnames, new = coord_newnames)
# neg probe info
neg_coord = neg_coords_all[fov == as.numeric(x)]
data.table::setnames(neg_coord, old = coord_oldnames, new = coord_newnames)
# build giotto object
if(isTRUE(verbose)) wrap_msg('Building subcellular giotto object...')
fov_subset = createGiottoObjectSubcellular(
gpoints = list('rna' = feat_coord,
'neg_probe' = neg_coord),
gpolygons = list('cell' = cellLabel_dir),
polygon_mask_list_params = list(
mask_method = 'guess',
flip_vertical = TRUE,
flip_horizontal = FALSE,
shift_horizontal_step = FALSE,
remove_background_polygon = remove_background_polygon,
background_algo = background_algo,
remove_unvalid_polygons = remove_unvalid_polygons
),
instructions = instructions,
cores = cores
)
# find centroids as spatial locations
if(isTRUE(verbose)) wrap_msg('Finding polygon centroids as cell spatial locations...')
fov_subset = addSpatialCentroidLocations(fov_subset,
poly_info = 'cell',
spat_loc_name = 'raw')
# create and add giotto image objects
if(isTRUE(verbose)) message('Attaching image files...')
print(composite_dir)
print(cellOverlay_dir)
print(compartmentLabel_dir)
gImage_list = list()
# load image if files are found
if(!is.null(composite_dir))
gImage_list$composite = createGiottoLargeImage(raster_object = composite_dir,
negative_y = F,
name = 'composite')
if(!is.null(cellOverlay_dir))
gImage_list$overlay = createGiottoLargeImage(raster_object = cellOverlay_dir,
negative_y = F,
name = 'overlay')
if(!is.null(compartmentLabel_dir))
gImage_list$compartment = createGiottoLargeImage(raster_object = compartmentLabel_dir,
negative_y = F,
name = 'compartment') #TODO
if(length(gImage_list) > 0) {
fov_subset = addGiottoImage(gobject = fov_subset,
largeImages = gImage_list)
# convert to MG for faster loading (particularly relevant for pulling from server)
fov_subset = convertGiottoLargeImageToMG(giottoLargeImage = gImage_list$composite,
gobject = fov_subset,
return_gobject = TRUE,
verbose = FALSE)
# fov_subset = convertGiottoLargeImageToMG(giottoLargeImage = gImage_list$overlay, gobject = fov_subset, return_gobject = TRUE)
# fov_subset = convertGiottoLargeImageToMG(giottoLargeImage = gImage_list$compartment, gobject = fov_subset, return_gobject = TRUE)
} else {
message('No images found for FOV ', x)
}
return(fov_subset)
}) #lapply end
if(length(FOVs) == 1) {
return(fov_gobjects_list[[1]])
} else {
# join giotto objects according to FOV positions file
if(isTRUE(verbose)) message('Joining FOV gobjects...')
new_gobj_names = paste0('fov', FOV_ID)
id_match = match(as.numeric(FOV_ID), fov_offset_file$fov)
x_shifts = fov_offset_file[id_match]$x_global_px
y_shifts = fov_offset_file[id_match]$y_global_px
# Join giotto objects
cosmx_gobject = joinGiottoObjects(gobject_list = fov_gobjects_list,
gobject_names = new_gobj_names,
join_method = 'shift',
x_shift = x_shifts,
y_shift = y_shifts)
return(cosmx_gobject)
}
}
#' @title Load and create a CosMx Giotto object from aggregate info
#' @name createGiottoCosMxObject_aggregate
#' @inheritParams createGiottoCosMxObject
#' @keywords internal
createGiottoCosMxObject_aggregate = function(dir_items,
cores,
verbose = TRUE,
instructions = NULL) {
fov = NULL
data_list = load_cosmx_folder_aggregate(dir_items = dir_items,
cores = cores,
verbose = verbose)
# unpack data_list
spatlocs = data_list$spatlocs
spatlocs_fov = data_list$spatlocs_fov
metadata = data_list$metadata
protM = data_list$protM
spM = data_list$spM
fov_shifts = data_list$fov_shifts
# create standard gobject from aggregate matrix
if(isTRUE(verbose)) message('Building giotto object...')
cosmx_gobject = createGiottoObject(expression = list('raw' = spM, 'protein' = protM),
cell_metadata = list('cell' = list('rna' = metadata,
'protein' = metadata)),
spatial_locs = spatlocs,
instructions = instructions,
cores = cores)
# load in images
img_ID = data.table::data.table(fov = fov_shifts[, fov],
img_name = paste0('fov', sprintf('%03d', fov_shifts[, fov]), '-image'))
if(isTRUE(verbose)) message('Attaching image files...')
composite_dir = Sys.glob(paths = file.path(dir_items$`CellComposite folder`, paste0('/*')))
cellLabel_dir = Sys.glob(paths = file.path(dir_items$`CellLabels folder`, paste0('/*')))
compartmentLabel_dir = Sys.glob(paths = file.path(dir_items$`CompartmentLabels folder`, paste0('/*')))
overlay_dir = Sys.glob(paths = file.path(dir_items$`CellOverlay folder`, paste0('/*')))
if(length(cellLabel_dir) > 0) cellLabel_imgList = lapply(cellLabel_dir, function(x) {createGiottoLargeImage(x, name = 'cellLabel', negative_y = TRUE)})
if(length(composite_dir) > 0) composite_imgList = lapply(composite_dir, function(x) {createGiottoLargeImage(x, name = 'composite', negative_y = TRUE)})
if(length(compartmentLabel_dir) > 0) compartmentLabel_imgList = lapply(compartmentLabel_dir, function(x) {createGiottoLargeImage(x, name = 'compartmentLabel', negative_y = TRUE)})
if(length(overlay_dir) > 0) overlay_imgList = lapply(overlay_dir, function(x) {createGiottoLargeImage(x, name = 'overlay', negative_y = TRUE)})
return(cosmx_gobject)
}
#' @title Load and create a CosMx Giotto object from subcellular and aggregate info
#' @name createGiottoCosMxObject_all
#' @param dir_items list of full directory paths from \code{read_cosmx_folder}
#' @inheritParams createGiottoCosMxObject
#' @details Both \emph{subcellular} (subcellular transcript detection information) and
#' \emph{aggregate} (aggregated detection count matrices by cell polygon from NanoString)
#' data will be loaded in. The two will be separated into 'cell' and 'cell_agg'
#' spatial units in order to denote the difference in origin of the two.
#' @seealso createGiottoCosMxObject createGiottoCosMxObject_aggregate
#' createGiottoCosMxObject_subcellular
#' @keywords internal
createGiottoCosMxObject_all = function(dir_items,
FOVs,
remove_background_polygon = TRUE,
background_algo = c('range'),
remove_unvalid_polygons = TRUE,
cores,
verbose = TRUE,
instructions = NULL) {
# 1. create subcellular giotto as spat_unit 'cell'
cosmx_gobject = createGiottoCosMxObject_subcellular(dir_items = dir_items,
FOVs = FOVs,
remove_background_polygon = remove_background_polygon,
background_algo = background_algo,
remove_unvalid_polygons = remove_unvalid_polygons,
cores = cores,
verbose = verbose,
instructions = instructions)
# 2. load and append aggregated information in spat_unit 'cell_agg'
agg_data = load_cosmx_folder_aggregate(dir_items = dir_items,
cores = cores,
verbose = verbose)
# unpack data_list
spatlocs = agg_data$spatlocs
spatlocs_fov = agg_data$spatlocs_fov
metadata = agg_data$metadata
protM = agg_data$protM
spM = agg_data$spM
# add in pre-generated aggregated expression matrix information for 'all' workflow
# Add aggregate expression information
if(isTRUE(verbose)) wrap_msg('Appending provided aggregate expression data as...
spat_unit: "cell_agg"
feat_type: "rna"
name: "raw"')
# add expression data to expression slot
s4_expr = create_expr_obj(name = 'raw',
exprMat = spM,
spat_unit = 'cell_agg',
feat_type = 'rna',
provenance = 'cell_agg')
cosmx_gobject = set_expression_values(cosmx_gobject, values = s4_expr)
# Add spatial locations
if(isTRUE(verbose)) wrap_msg('Appending metadata provided spatial locations data as...
--> spat_unit: "cell_agg" name: "raw"
--> spat_unit: "cell" name: "raw_fov"')
if(isTRUE(verbose)) wrap_msg('Polygon centroid derived spatial locations assigned as...
--> spat_unit: "cell" name: "raw" (default)')
locsObj = create_spat_locs_obj(name = 'raw',
coordinates = spatlocs,
spat_unit = 'cell_agg',
provenance = 'cell_agg')
locsObj_fov = create_spat_locs_obj(name = 'raw_fov',
coordinates = spatlocs_fov,
spat_unit = 'cell_agg',
provenance = 'cell_agg')
cosmx_gobject = set_spatial_locations(cosmx_gobject, spatlocs = locsObj)
cosmx_gobject = set_spatial_locations(cosmx_gobject, spatlocs = locsObj_fov)
# cosmx_gobject = set_spatial_locations(cosmx_gobject,
# spat_unit = 'cell_agg',
# spat_loc_name = 'raw',
# spatlocs = spatlocs)
# cosmx_gobject = set_spatial_locations(cosmx_gobject,
# spat_unit = 'cell_agg',
# spat_loc_name = 'raw_fov',
# spatlocs = spatlocs_fov)
# initialize cell and feat IDs and metadata slots for 'cell_agg' spat_unit
agg_cell_ID = colnames(s4_expr[])
agg_feat_ID = rownames(s4_expr[])
sub_feat_ID = get_feat_id(cosmx_gobject, feat_type = 'rna')
feat_ID_new = unique(c(agg_feat_ID, sub_feat_ID))
cosmx_gobject = set_cell_id(gobject = cosmx_gobject,
spat_unit = 'cell_agg',
cell_IDs = agg_cell_ID)
cosmx_gobject = set_feat_id(gobject = cosmx_gobject,
feat_type = 'rna',
feat_IDs = feat_ID_new)
# cell metadata
# cosmx_gobject@cell_ID[['cell_agg']] = colnames(cosmx_gobject@expression[['cell_agg']][[1]][[1]])
# cosmx_gobject@feat_ID[['rna']] = unique(c(cosmx_gobject@feat_ID, rownames(cosmx_gobject@expression[['cell_agg']][['rna']][[1]])))
# cosmx_gobject@cell_metadata[['cell_agg']][['rna']] = data.table::data.table(cell_ID = cosmx_gobject@cell_ID[['cell_agg']])
# cosmx_gobject@feat_metadata[['cell_agg']][['rna']] = data.table::data.table(feat_ID = cosmx_gobject@feat_ID[['rna']])
# Add metadata to both the given and the poly spat_units
if(isTRUE(verbose)) message('Appending provided cell metadata...')
cosmx_gobject = addCellMetadata(cosmx_gobject,
spat_unit = 'cell',
feat_type = 'rna',
new_metadata = metadata,
by_column = TRUE,
column_cell_ID = 'cell_ID')
cosmx_gobject = addCellMetadata(cosmx_gobject,
spat_unit = 'cell_agg',
feat_type = 'rna',
new_metadata = metadata,
by_column = TRUE,
column_cell_ID = 'cell_ID')
return(cosmx_gobject)
}
## Xenium ####
#' @title Create 10x Xenium Giotto Object
#' @name createGiottoXeniumObject
#' @description Given the path to a Xenium experiment output folder, creates a Giotto
#' object
#' @param xenium_dir full path to the exported xenium directory
#' @param data_to_use which type(s) of expression data to build the gobject with
#' (default: \strong{'subcellular'}; alternatively 'aggregate')
#' @param load_format files formats from which to load the data. Either `csv` or
#' `parquet` currently supported.
#' @param h5_expression (boolean) whether to load cell_feature_matrix from .h5 file.
#' Default is \code{TRUE}
#' @param h5_gene_ids use gene symbols (default) or ensembl ids for the .h5 gene
#' expression matrix
#' @param bounds_to_load vector of boundary information to load (e.g. \code{'cell'}
#' or \code{'nucleus'} by themselves or \code{c('cell', 'nucleus')} to load both
#' at the same time.)
#' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included as
#' a subcellular transcript detection (default = 20)
#' @param key_list (advanced) list of grep-based keywords to split the subcellular
#' feature detections by feature type. See details
#' @inheritParams get10Xmatrix
#' @inheritParams createGiottoObjectSubcellular
#' @details
#'
#' [\strong{QC feature types}]
#' Xenium provides information on feature detections beyond the Gene Expression
#' specific probes. Additional QC probes are included: \emph{blank codeword},
#' \emph{negative control codeword}, and \emph{negative control probe}. Each set of
#' QC probes is treated as its own feature type so that it remains largely
#' independent of the gene expression information.
#'
#' [\strong{key_list}]
#' Related to \code{data_to_use = 'subcellular'} workflow only:
#' Additional QC probe information is in the subcellular feature detections information
#' and must be separated from the gene expression information during processing.
#' The QC probes have prefixes that allow them to be selected from the rest of the
#' feature IDs.
#' Giotto uses a named list of keywords (\code{key_list}) to select these QC probes,
#' with the list names being the names that will be assigned as the feature type
#' of these feature detections. The default list is used when \code{key_list} = NULL.
#'
#' Default list:
#' \preformatted{
#' list(blank_code = 'BLANK_',
#' neg_code = 'NegControlCodeword_',
#' neg_probe = c('NegControlProbe_|antisense_'))
#' }
#'
#' The gene expression subset is taken as the feat_IDs that do not match any of
#' the keys.
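#'
#' A custom \code{key_list} can be passed in the same shape. For example (an
#' illustrative sketch, not a recommended setting), all control probes could be
#' collected under a single feature type:
#' \preformatted{
#' key_list = list(controls = 'BLANK_|NegControlCodeword_|NegControlProbe_|antisense_')
#' }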
#'
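#' [\strong{Usage sketch}] A minimal call for the default subcellular workflow is shown
#' below; the directory path is a placeholder and must point to a local Xenium output folder.
#' @examples
#' \dontrun{
#' xenium_gobject = createGiottoXeniumObject(
#'   xenium_dir = '/path/to/xenium/output/',
#'   data_to_use = 'subcellular',
#'   bounds_to_load = c('cell', 'nucleus'),
#'   qv_threshold = 20
#' )
#' }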
#' @export
createGiottoXeniumObject = function(xenium_dir,
data_to_use = c('subcellular','aggregate'),
load_format = 'csv',
h5_expression = TRUE,
h5_gene_ids = c('symbols', 'ensembl'),
gene_column_index = 1,
bounds_to_load = c('cell'),
qv_threshold = 20,
key_list = NULL,
# include_analysis = FALSE,
instructions = NULL,
cores = NA,
verbose = TRUE) {
# 0. setup
xenium_dir = path.expand(xenium_dir)
# Determine data to load
data_to_use = match.arg(arg = data_to_use, choices = c('subcellular','aggregate'))
# Determine load formats
load_format = 'csv' # TODO Remove this and add as param once other options are available
load_format = match.arg(arg = load_format, choices = c('csv', 'parquet', 'zarr'))
# set number of cores automatically, but with limit of 10
cores = determine_cores(cores)
data.table::setDTthreads(threads = cores)
# 1. detect xenium folder and find filepaths to load
# path_list contents:
# tx_path
# bound_paths
# cell_meta_path
# agg_expr_path
# panel_meta_path
path_list = read_xenium_folder(xenium_dir = xenium_dir,
data_to_use = data_to_use,
bounds_to_load = bounds_to_load,
load_format = load_format,
h5_expression = h5_expression,
verbose = verbose)
# 2. load in data
# data_list contents:
# feat_meta
# tx_dt
# bound_dt_list
# cell_meta
# agg_expr
data_list = load_xenium_folder(path_list = path_list,
load_format = load_format,
data_to_use = data_to_use,
h5_expression = h5_expression,
h5_gene_ids = h5_gene_ids,
gene_column_index = gene_column_index,
cores = cores,
verbose = verbose)
# TODO load images
# 3. Create giotto objects
if(data_to_use == 'subcellular') {
# ** feat type search keys **
if(is.null(key_list)) {
key_list = list(blank_code = 'BLANK_',
neg_code = 'NegControlCodeword_',
neg_probe = c('NegControlProbe_|antisense_'))
}
# needed:
# feat_meta
# tx_dt
# bound_dt_list
xenium_gobject = createGiottoXeniumObject_subcellular(data_list = data_list,
qv_threshold = qv_threshold,
key_list = key_list,
instructions = instructions,
cores = cores,
verbose = verbose)
}
if(data_to_use == 'aggregate') {
# needed:
# feat_meta
# cell_meta
# agg_expr
# optional?
# tx_dt
# bound_dt_list
xenium_gobject = createGiottoXeniumObject_aggregate(data_list = data_list,
instructions = instructions,
cores = cores,
verbose = verbose)
}
return(xenium_gobject)
}
#' @title Create a Xenium Giotto object from subcellular info
#' @name createGiottoXeniumObject_subcellular
#' @description Subcellular workflow for createGiottoXeniumObject
#' @param data_list list of data loaded by \code{\link{load_xenium_folder}}
#' @param key_list regex-based search keys for feature IDs to allow separation
#' into separate giottoPoints objects by feat_type
#' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included as
#' a subcellular transcript detection (default = 20)
#' @inheritParams get10Xmatrix
#' @inheritParams createGiottoObjectSubcellular
#' @seealso createGiottoXeniumObject createGiottoXeniumObject_aggregate
#' @keywords internal
createGiottoXeniumObject_subcellular = function(data_list,
key_list = NULL,
qv_threshold = 20,
instructions = NULL,
cores = NA,
verbose = TRUE) {
# data.table vars
qv = NULL
# Unpack data_list info
feat_meta = data_list$feat_meta
tx_dt = data_list$tx_dt
bound_dt_list = data_list$bound_dt_list
# cell_meta = data_list$cell_meta
# agg_expr = data_list$agg_expr
# define for data.table
cell_id = feat_ID = feature_name = NULL
if(isTRUE(verbose)) message('Building subcellular giotto object...')
# Giotto points object
if(isTRUE(verbose)) message('> points data prep...')
# filter by qv_threshold
if(isTRUE(verbose)) wrap_msg('> filtering feature detections for Phred score >= ', qv_threshold)
n_before = tx_dt[,.N]
tx_dt_filtered = tx_dt[qv >= qv_threshold]
n_after = tx_dt_filtered[,.N]
if(isTRUE(verbose)) wrap_msg('Number of feature points removed: ',
n_before - n_after,
' out of ', n_before)
if(isTRUE(verbose)) message('> splitting detections by feat_type')
# discover feat_IDs for each feat_type
all_IDs = tx_dt_filtered[, unique(feat_ID)]
feat_types_IDs = lapply(key_list, function(x) all_IDs[grepl(pattern = x, all_IDs)])
rna = list('rna' = all_IDs[!all_IDs %in% unlist(feat_types_IDs)])
feat_types_IDs = append(rna, feat_types_IDs)
# separate detections by feature type
points_list = lapply(
feat_types_IDs,
function(types) {
tx_dt_filtered[feat_ID %in% types]
}
)
# Giotto polygons object
if(isTRUE(verbose)) message('> polygons data prep...')
polys_list = lapply(
bound_dt_list,
function(bound_type) {
bound_type[, cell_id := as.character(cell_id)]
}
)
xenium_gobject = createGiottoObjectSubcellular(gpoints = points_list,
gpolygons = polys_list,
instructions = instructions,
cores = cores,
verbose = verbose)
# generate centroids
if(isTRUE(verbose)) message('Calculating polygon centroids...')
xenium_gobject = addSpatialCentroidLocations(xenium_gobject,
poly_info = c(names(bound_dt_list)))
# add in feature metadata
# xenium_gobject = addFeatMetadata(gobject = xenium_gobject,
# new_metadata = feat_meta,
# by_column = TRUE,
# column_feat_ID = 'feat_ID')
return(xenium_gobject)
}
#' @title Create a Xenium Giotto object from aggregate info
#' @name createGiottoXeniumObject_aggregate
#' @description Aggregate workflow for createGiottoXeniumObject
#' @param data_list list of data loaded by \code{load_xenium_folder}
#' @inheritParams get10Xmatrix
#' @inheritParams createGiottoObjectSubcellular
#' @seealso createGiottoXeniumObject createGiottoXeniumObject_subcellular
#' @keywords internal
createGiottoXeniumObject_aggregate = function(data_list,
# include_analysis = FALSE,
instructions = NULL,
cores = NA,
verbose = TRUE) {
# Unpack data_list info
feat_meta = data_list$feat_meta
# tx_dt = data_list$tx_dt
# bound_dt_list = data_list$bound_dt_list
cell_meta = data_list$cell_meta
agg_expr = data_list$agg_expr
# define for data.table
cell_ID = x_centroid = y_centroid = NULL
# clean up names for aggregate matrices
names(agg_expr) = gsub(pattern = ' ', replacement = '_' ,names(agg_expr))
geneExpMat = which(names(agg_expr) == 'Gene_Expression')
names(agg_expr)[[geneExpMat]] = 'raw'
# rename cell_id to cell_ID and set as character
data.table::setnames(cell_meta, old = 'cell_id', new = 'cell_ID')
cell_meta[, cell_ID := as.character(cell_ID)]
# set up spatial locations
agg_spatlocs = cell_meta[, .(x_centroid, y_centroid, cell_ID)]
# set up metadata
agg_meta = cell_meta[, !c('x_centroid','y_centroid')]
if(isTRUE(verbose)) message('Building aggregate giotto object...')
xenium_gobject = createGiottoObject(expression = agg_expr,
spatial_locs = agg_spatlocs,
instructions = instructions,
cores = cores,
verbose = verbose)
# append aggregate metadata
xenium_gobject = addCellMetadata(gobject = xenium_gobject,
new_metadata = agg_meta,
by_column = TRUE,
column_cell_ID = 'cell_ID')
xenium_gobject = addFeatMetadata(gobject = xenium_gobject,
new_metadata = feat_meta,
by_column = TRUE,
column_feat_ID = 'feat_ID')
return(xenium_gobject)
}
# folder reading and detection ####
#' @describeIn read_data_folder Read a structured MERSCOPE folder
#' @keywords internal
read_merscope_folder = function(merscope_dir,
data_to_use,
cores = NA,
verbose = TRUE) {
# prepare dir_items list
dir_items = list(`boundary info` = '*cell_boundaries*',
`image info` = '*images*',
`cell feature matrix` = '*cell_by_gene*',
`cell metadata` = '*cell_metadata*',
`raw transcript info` = '*transcripts*')
# prepare require_data_DT
sub_reqs = data.table::data.table(workflow = c('subcellular'),
item = c('boundary info',
'raw transcript info',
'image info',
'cell feature matrix',
'cell metadata'),
needed = c(TRUE, TRUE, FALSE, FALSE, FALSE))
agg_reqs = data.table::data.table(workflow = c('aggregate'),
item = c('boundary info',
'raw transcript info',
'image info',
'cell feature matrix',
'cell metadata'),
needed = c(FALSE, FALSE, FALSE, TRUE, TRUE))
require_data_DT = rbind(sub_reqs, agg_reqs)
dir_items = read_data_folder(spat_method = 'MERSCOPE',
data_dir = merscope_dir,
dir_items = dir_items,
data_to_use = data_to_use,
require_data_DT = require_data_DT,
cores = cores,
verbose = verbose)
return(dir_items)
}
#' @title Read a structured CosMx folder
#' @name read_cosmx_folder
#' @inheritParams createGiottoCosMxObject
#' @seealso createGiottoCosMxObject load_cosmx_folder
#' @return path_list a list of cosmx files discovered and their filepaths. NULL
#' values denote missing items
#' @keywords internal
read_cosmx_folder = function(cosmx_dir,
verbose = TRUE) {
ch = box_chars()
if(is.null(cosmx_dir) || !dir.exists(cosmx_dir)) stop('The full path to a cosmx directory must be given.\n')
if(isTRUE(verbose)) wrap_msg('A structured CosMx directory will be used\n')
# find directories (length = 1 if present, length = 0 if missing)
dir_items = list(`CellLabels folder` = '*CellLabels',
`CompartmentLabels folder` = '*CompartmentLabels',
`CellComposite folder` = '*CellComposite',
`CellOverlay folder` = '*CellOverlay',
`transcript locations file` = '*tx_file*',
`fov positions file` = '*fov_positions_file*',
`expression matrix file` = '*exprMat_file*',
`metadata file` = '*metadata_file*')
dir_items = lapply(dir_items, function(x) Sys.glob(paths = file.path(cosmx_dir, x)))
dir_items_lengths = lengths(dir_items)
if(isTRUE(verbose)) {
message('Checking directory contents...')
for(item in names(dir_items)) {
if(dir_items_lengths[[item]] > 0) {
message(ch$s, '> ' ,item, ' found')
} else {
warning(item, ' is missing\n')
}
}
}
# select first directory in list if multiple are detected
if(any(dir_items_lengths > 1)) {
warning('Multiple matches for expected subdirectory item(s).\n First matching item selected')
multiples = which(dir_items_lengths > 1)
for(mult_i in multiples) {
message(names(dir_items)[[mult_i]], ' multiple matches found:')
print(dir_items[[mult_i]])
dir_items[[mult_i]] = dir_items[[mult_i]][[1]]
}
}
if(isTRUE(verbose)) message('Directory check done')
return(dir_items)
}
#' @title Read a structured xenium folder
#' @name read_xenium_folder
#' @inheritParams createGiottoXeniumObject
#' @keywords internal
#' @return path_list a list of xenium files discovered and their filepaths. NULL
#' values denote missing items
read_xenium_folder = function(xenium_dir,
data_to_use = 'subcellular',
bounds_to_load = c('cell'),
load_format = 'csv',
h5_expression = FALSE,
verbose = TRUE) {
# Check needed packages
if(load_format == 'parquet') {
package_check(pkg_name = 'arrow', repository = 'CRAN')
package_check(pkg_name = 'dplyr', repository = 'CRAN')
}
if(isTRUE(h5_expression)) {
package_check(pkg_name = 'hdf5r', repository = 'CRAN')
}
ch = box_chars()
# 0. test if folder structure exists and is as expected
if(is.null(xenium_dir) || !dir.exists(xenium_dir)) stop('The full path to a xenium directory must be given.\n')
if(isTRUE(verbose)) message('A structured Xenium directory will be used\n')
# find items (length = 1 if present, length = 0 if missing)
dir_items = list(`analysis info` = '*analysis*',
`boundary info` = '*bound*',
`cell feature matrix` = '*cell_feature_matrix*',
`cell metadata` = '*cells*',
`image info` = '*tif',
`panel metadata` = '*panel*',
`raw transcript info` = '*transcripts*',
`experiment info (.xenium)` = '*.xenium')
dir_items = lapply(dir_items, function(x) Sys.glob(paths = file.path(xenium_dir, x)))
dir_items_lengths = lengths(dir_items)
if(isTRUE(verbose)) {
message('Checking directory contents...')
for(item in names(dir_items)) {
# IF ITEM FOUND
if(dir_items_lengths[[item]] > 0) {
message(ch$s, '> ' ,item, ' found')
for(item_i in seq_along(dir_items[[item]])) { # print found item names
subItem = gsub(pattern = '.*/', replacement = '', x = dir_items[[item]][[item_i]])
message(ch$s, ch$s, ch$l,ch$h,ch$h, subItem)
}
} else {
# IF ITEM MISSING
# Based on workflow, determine if:
# necessary (error)
# optional (warning)
if(data_to_use == 'subcellular') {
# necessary items
if(item %in% c('boundary info', 'raw transcript info')) stop(item, ' is missing\n')
# optional items
if(item %in% c('image info', 'experiment info (.xenium)', 'panel metadata')) warning(item, ' is missing (optional)\n')
# items to ignore: analysis info, cell feature matrix, cell metadata
} else if(data_to_use == 'aggregate') {
# necessary items
if(item %in% c('cell feature matrix', 'cell metadata')) stop(item, ' is missing\n')
# optional items
if(item %in% c('image info', 'experiment info (.xenium)', 'panel metadata', 'analysis info')) warning(item, ' is missing (optional)\n')
# items to ignore: boundary info, raw transcript info
}
}
}
}
# 1. Select data to load
# **** transcript info ****
tx_path = NULL
tx_path = dir_items$`raw transcript info`[grepl(pattern = load_format, dir_items$`raw transcript info`)]
# **** cell metadata ****
cell_meta_path = NULL
cell_meta_path = dir_items$`cell metadata`[grepl(pattern = load_format, dir_items$`cell metadata`)]
# **** boundary info ****
# Select bound load format
if(load_format != 'zarr') { # No zarr available for boundary info
dir_items$`boundary info` = dir_items$`boundary info`[grepl(pattern = load_format, dir_items$`boundary info`)]
} else dir_items$`boundary info` = dir_items$`boundary info`[grepl(pattern = 'csv', dir_items$`boundary info`)]
# Organize bound paths by type of bound (bounds_to_load param)
bound_paths = NULL
bound_names = bounds_to_load
bounds_to_load = as.list(bounds_to_load)
bound_paths = lapply(bounds_to_load, function(x) dir_items$`boundary info`[grepl(pattern = x, dir_items$`boundary info`)])
names(bound_paths) = bound_names
# **** aggregated expression info ****
agg_expr_path = NULL
if(isTRUE(h5_expression)) { # h5 expression matrix loading is default
agg_expr_path = dir_items$`cell feature matrix`[grepl(pattern = 'h5', dir_items$`cell feature matrix`)]
} else if(load_format == 'zarr') {
agg_expr_path = dir_items$`cell feature matrix`[grepl(pattern = 'zarr', dir_items$`cell feature matrix`)]
} else { # No parquet for aggregated expression - default to normal 10x loading
agg_expr_path = dir_items$`cell feature matrix`[sapply(dir_items$`cell feature matrix`, function(x) file_test(op = '-d', x))]
if(length(agg_expr_path) == 0) stop(wrap_txt(
'Expression matrix cannot be loaded.\nHas cell_feature_matrix(.tar.gz) been unpacked into a directory?'
))
}
if(data_to_use == 'aggregate') {
if(length(agg_expr_path) == 0) stop(wrap_txt(
'Aggregated expression not found.\nPlease confirm h5_expression and load_format params are correct\n'
))
}
# **** panel info ****
panel_meta_path = NULL
panel_meta_path = dir_items$`panel metadata`
if(isTRUE(verbose)) message('Directory check done')
path_list = list('tx_path' = tx_path,
'bound_paths' = bound_paths,
'cell_meta_path' = cell_meta_path,
'agg_expr_path' = agg_expr_path,
'panel_meta_path' = panel_meta_path)
return(path_list)
}
# folder loading ####
## MERSCOPE ####
#' @title Load MERSCOPE data from folder
#' @name load_merscope_folder
#' @param dir_items list of full filepaths from \code{\link{read_merscope_folder}}
#' @inheritParams createGiottoMerscopeObject
#' @keywords internal
#' @return list of loaded-in MERSCOPE data
load_merscope_folder = function(dir_items,
data_to_use,
fovs = NULL,
poly_z_indices = 1L:7L,
cores = NA,
verbose = TRUE) {
# 1. load data_to_use-specific
if(data_to_use == 'subcellular') {
data_list = load_merscope_folder_subcellular(dir_items = dir_items,
data_to_use = data_to_use,
fovs = fovs,
poly_z_indices = poly_z_indices,
cores = cores,
verbose = verbose)
} else if(data_to_use == 'aggregate') {
data_list = load_merscope_folder_aggregate(dir_items = dir_items,
data_to_use = data_to_use,
cores = cores,
verbose = verbose)
} else {
stop(wrap_txt('data_to_use "', data_to_use, '" not implemented', sep = ''))
}
# 2. Load images if available
if(!is.null(dir_items$`image info`)) {
## micron to px scaling factor
micronToPixelScale = Sys.glob(paths = file.path(dir_items$`image info`, '*micron_to_mosaic_pixel_transform*'))[[1]]
micronToPixelScale = data.table::fread(micronToPixelScale, nThread = cores)
# add to data_list
data_list$micronToPixelScale = micronToPixelScale
## staining images
## determine types of stains
images_filenames = list.files(dir_items$`image info`)
bound_stains_filenames = images_filenames[grep(pattern = '.tif', images_filenames)]
bound_stains_types = sapply(strsplit(bound_stains_filenames, '_'), `[`, 2)
bound_stains_types = unique(bound_stains_types)
img_list = lapply_flex(bound_stains_types, function(stype) {
img_paths = Sys.glob(paths = file.path(dir_items$`image info`, paste0('*',stype,'*')))
# print(img_paths)
lapply_flex(img_paths, function(img) {
createGiottoLargeImage(raster_object = img)
}, cores = cores)
}, cores = cores)
# add to data_list
data_list$images = img_list
}
return(data_list)
}
#' @describeIn load_merscope_folder Load items for 'subcellular' workflow
#' @keywords internal
load_merscope_folder_subcellular = function(dir_items,
data_to_use,
cores = NA,
poly_z_indices = 1L:7L,
verbose = TRUE,
fovs = NULL) {
if(isTRUE(verbose)) wrap_msg('Loading transcript level info...')
if(is.null(fovs)) {
tx_dt = data.table::fread(dir_items$`raw transcript info`, nThread = cores)
} else {
if(isTRUE(verbose)) wrap_msg('Selecting FOV subset transcripts')
tx_dt = fread_colmatch(file = dir_items$`raw transcript info`,
col = 'fov',
values_to_match = fovs,
verbose = FALSE,
nThread = cores)
}
tx_dt[, c('x','y') := NULL] # remove unneeded cols
data.table::setcolorder(tx_dt, c('gene', 'global_x', 'global_y', 'global_z'))
if(isTRUE(verbose)) wrap_msg('Loading polygon info...')
poly_info = readPolygonFilesVizgenHDF5(boundaries_path = dir_items$`boundary info`,
z_indices = poly_z_indices,
flip_y_axis = TRUE,
fovs = fovs)
data_list = list(
'poly_info' = poly_info,
'tx_dt' = tx_dt,
'micronToPixelScale' = NULL,
'expr_dt' = NULL,
'cell_meta' = NULL,
'images' = NULL
)
return(data_list)
}
#' @describeIn load_merscope_folder Load items for 'aggregate' workflow
#' @keywords internal
load_merscope_folder_aggregate = function(dir_items,
data_to_use,
cores = NA,
verbose = TRUE) {
# metadata is polygon-related measurements
if(isTRUE(verbose)) wrap_msg('Loading cell metadata...')
cell_metadata_file = data.table::fread(dir_items$`cell metadata`, nThread = cores)
if(isTRUE(verbose)) wrap_msg('Loading expression matrix')
expr_dt = data.table::fread(dir_items$`cell feature matrix`, nThread = cores)
data_list = list(
'poly_info' = NULL,
'tx_dt' = NULL,
'micronToPixelScale' = NULL,
'expr_dt' = expr_dt,
'cell_meta' = cell_metadata_file,
'images' = NULL
)
return(data_list)
}
## CosMx ####
#' @title Load CosMx folder subcellular info
#' @name load_cosmx_folder_subcellular
#' @description loads in the feature detections information. Note that the mask
#' images are still required for a working subcellular object, and those are loaded
#' in \code{\link{createGiottoCosMxObject_subcellular}}
#' @inheritParams createGiottoCosMxObject
#' @keywords internal
load_cosmx_folder_subcellular = function(dir_items,
FOVs = NULL,
cores,
verbose = TRUE) {
if(isTRUE(verbose)) wrap_msg('Loading subcellular information...')
# subcellular checks
if(!file.exists(dir_items$`transcript locations file`))
stop(wrap_txt('No transcript locations file (.csv) detected'))
if(!file.exists(dir_items$`fov positions file`))
stop(wrap_txt('No fov positions file (.csv) detected'))
# FOVs to load
if(isTRUE(verbose)) wrap_msg('Loading FOV offsets...')
fov_offset_file = fread(input = dir_items$`fov positions file`, nThread = cores)
if(is.null(FOVs)) FOVs = fov_offset_file$fov # default to ALL FOVs
FOV_ID = as.list(sprintf('%03d', FOVs))
#TODO Load only relevant portions of file?
if(isTRUE(verbose)) wrap_msg('Loading transcript level info...')
tx_coord_all = fread(input = dir_items$`transcript locations file`, nThread = cores)
if(isTRUE(verbose)) wrap_msg('Subcellular load done')
data_list = list(
'FOV_ID' = FOV_ID,
'fov_offset_file' = fov_offset_file,
'tx_coord_all' = tx_coord_all
)
return(data_list)
}
#' @title Load CosMx folder aggregate info
#' @name load_cosmx_folder_aggregate
#' @inheritParams createGiottoCosMxObject
#' @keywords internal
load_cosmx_folder_aggregate = function(dir_items,
cores,
verbose = TRUE) {
# data.table vars
fov = cell_ID = fov_cell_ID = CenterX_global_px = CenterY_global_px = CenterX_local_px =
CenterY_local_px = x_shift = y_shift = NULL
# load aggregate information
wrap_msg('Loading provided aggregated information...')
# aggregate checks
if(!file.exists(dir_items$`expression matrix file`)) stop(wrap_txt('No expression matrix file (.csv) detected'))
if(!file.exists(dir_items$`metadata file`)) stop(wrap_txt('No metadata file (.csv) detected. Needed for cell spatial locations.'))
# read in aggregate data
expr_mat = fread(input = dir_items$`expression matrix file`, nThread = cores)
metadata = fread(input = dir_items$`metadata file`, nThread = cores)
# setorder expression and spatlocs
data.table::setorder(metadata, fov, cell_ID)
data.table::setorder(expr_mat, fov, cell_ID)
# generate unique cell IDs
expr_mat[, cell_ID := paste0('fov', sprintf('%03d', fov), '-', 'cell_', cell_ID)]
# expr_mat$cell_ID = paste0('fov', sprintf('%03d', expr_mat$fov), '-', 'cell_', expr_mat$cell_ID)
expr_mat = expr_mat[, fov := NULL]
metadata[, fov_cell_ID := cell_ID]
metadata[, cell_ID := paste0('fov', sprintf('%03d', fov), '-', 'cell_', cell_ID)]
# metadata$cell_ID = paste0('fov', sprintf('%03d', metadata$fov), '-', 'cell_', metadata$cell_ID)
# reorder
data.table::setcolorder(x = metadata, c('cell_ID','fov','fov_cell_ID'))
# extract spatial locations
spatlocs = metadata[,.(CenterX_global_px, CenterY_global_px, cell_ID)]
spatlocs_fov = metadata[,.(CenterX_local_px, CenterY_local_px, cell_ID)]
# regenerate FOV shifts
metadata[, x_shift := CenterX_global_px - CenterX_local_px]
metadata[, y_shift := CenterY_global_px - CenterY_local_px]
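# average the per-cell shifts within each FOV to get a single x/y offset per FOV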
fov_shifts = metadata[, .(mean(x_shift), mean(y_shift)), fov]
colnames(fov_shifts) = c('fov', 'x_shift', 'y_shift')
# rename spatloc column names
spatloc_oldnames = c('CenterX_global_px', 'CenterY_global_px', 'cell_ID')
spatloc_oldnames_fov = c('CenterX_local_px', 'CenterY_local_px', 'cell_ID')
spatloc_newnames = c('sdimx', 'sdimy', 'cell_ID')
data.table::setnames(spatlocs, old = spatloc_oldnames, new = spatloc_newnames)
data.table::setnames(spatlocs_fov, old = spatloc_oldnames_fov, new = spatloc_newnames)
# cleanup metadata and spatlocs
metadata = metadata[,c('CenterX_global_px', 'CenterY_global_px', 'CenterX_local_px', 'CenterY_local_px') := NULL]
# find unique cell_IDs present in both expression and metadata
giotto_cell_ID = unique(intersect(expr_mat$cell_ID, metadata$cell_ID))
# subset to only unique cell_IDs
expr_mat = expr_mat[cell_ID %in% giotto_cell_ID,]
metadata = metadata[cell_ID %in% giotto_cell_ID,]
# convert protein metadata to expr mat
# take all mean intensity protein information except for MembraneStain and DAPI
protein_meta_cols = colnames(metadata)
protein_meta_cols = protein_meta_cols[grepl(pattern = 'Mean.*', x = protein_meta_cols)]
protein_meta_cols = protein_meta_cols[!protein_meta_cols %in% c('Mean.MembraneStain', 'Mean.DAPI')]
protein_meta_cols = c('cell_ID', protein_meta_cols)
prot_expr = metadata[, protein_meta_cols, with = FALSE]
prot_cell_ID = metadata[, cell_ID]
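# matrices are built with cells as rows, then transposed with t_flex() so that features are rows and cells are columns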
protM = Matrix::Matrix(as.matrix(prot_expr[,-1]), dimnames = list(prot_expr[[1]], colnames(prot_expr[,-1])), sparse = FALSE)
protM = t_flex(protM)
# convert expression to sparse matrix
spM = Matrix::Matrix(as.matrix(expr_mat[,-1]), dimnames = list(expr_mat[[1]], colnames(expr_mat[,-1])), sparse = TRUE)
spM = t_flex(spM)
## Ready for downstream aggregate gobject creation or appending into existing subcellular Giotto object ##
data_list = list(
'spatlocs' = spatlocs,
'spatlocs_fov' = spatlocs_fov,
'metadata' = metadata,
'protM' = protM,
'spM' = spM,
'fov_shifts' = fov_shifts
)
return(data_list)
}
## Xenium ####
#' @title Load xenium data from folder
#' @name load_xenium_folder
#' @param path_list list of full filepaths from read_xenium_folder
#' @inheritParams createGiottoXeniumObject
#' @keywords internal
#' @return list of loaded in xenium data
load_xenium_folder = function(path_list,
load_format = 'csv',
data_to_use = 'subcellular',
h5_expression = FALSE,
h5_gene_ids = 'symbols',
gene_column_index = 1,
cores,
verbose = TRUE) {
if(load_format == 'csv') {
data_list = load_xenium_folder_csv(path_list = path_list,
data_to_use = data_to_use,
h5_expression = h5_expression,
h5_gene_ids = h5_gene_ids,
gene_column_index = gene_column_index,
cores = cores,
verbose = verbose)
}
if(load_format == 'parquet') {
data_list = load_xenium_folder_parquet(path_list = path_list,
data_to_use = data_to_use,
h5_expression = h5_expression,
h5_gene_ids = h5_gene_ids,
gene_column_index = gene_column_index,
cores = cores,
verbose = verbose)
}
if(load_format == 'zarr') {
# TODO
}
return(data_list)
}
#' @describeIn load_xenium_folder Load from csv files
#' @keywords internal
load_xenium_folder_csv = function(path_list,
cores,
data_to_use = 'subcellular',
h5_expression = FALSE,
h5_gene_ids = 'symbols',
gene_column_index = 1,
verbose = TRUE) {
# initialize return vars
feat_meta = tx_dt = bound_dt_list = cell_meta = agg_expr = NULL
if(isTRUE(verbose)) message('Loading feature metadata...')
feat_meta = data.table::fread(path_list$panel_meta_path[[1]], nThread = cores)
colnames(feat_meta)[[1]] = 'feat_ID'
# **** subcellular info ****
if(data_to_use == 'subcellular') {
# append missing QC probe info to feat_meta
if(isTRUE(h5_expression)) {
h5 = hdf5r::H5File$new(path_list$agg_expr_path)
tryCatch({
root = names(h5)
feature_id = h5[[paste0(root, "/features/id")]][]
feature_info = h5[[paste0(root,"/features/feature_type")]][]
feature_names = h5[[paste0(root, "/features/name")]][]
features_dt = data.table::data.table(
'id' = feature_id,
'name' = feature_names,
'feature_type' = feature_info
)
}, finally = {
h5$close_all()
})
} else {
features_dt = data.table::fread(paste0(path_list$agg_expr_path, '/features.tsv.gz'), header = F)
}
colnames(features_dt) = c('id', 'feat_ID', 'feat_class')
feat_meta = merge(features_dt[,c(2,3)], feat_meta, all.x = TRUE, by = 'feat_ID')
if(isTRUE(verbose)) message('Loading transcript level info...')
tx_dt = data.table::fread(path_list$tx_path[[1]], nThread = cores)
data.table::setnames(x = tx_dt,
old = c('feature_name', 'x_location', 'y_location'),
new = c('feat_ID', 'x', 'y'))
if(isTRUE(verbose)) message('Loading boundary info...')
bound_dt_list = lapply(path_list$bound_paths, function(x) data.table::fread(x[[1]], nThread = cores))
}
# **** aggregate info ****
if(isTRUE(verbose)) message('Loading cell metadata...')
cell_meta = data.table::fread(path_list$cell_meta_path[[1]], nThread = cores)
if(data_to_use == 'aggregate') {
if(isTRUE(verbose)) message('Loading aggregated expression...')
if(isTRUE(h5_expression)) agg_expr = get10Xmatrix_h5(path_to_data = path_list$agg_expr_path,
gene_ids = h5_gene_ids,
remove_zero_rows = TRUE,
split_by_type = TRUE)
else agg_expr = get10Xmatrix(path_to_data = path_list$agg_expr_path,
gene_column_index = gene_column_index,
remove_zero_rows = TRUE,
split_by_type = TRUE)
}
data_list = list(
'feat_meta' = feat_meta,
'tx_dt' = tx_dt,
'bound_dt_list' = bound_dt_list,
'cell_meta' = cell_meta,
'agg_expr' = agg_expr
)
return(data_list)
}
#' @describeIn load_xenium_folder Load from parquet files
#' @keywords internal
load_xenium_folder_parquet = function(path_list,
cores,
data_to_use = 'subcellular',
h5_expression = FALSE,
h5_gene_ids = 'symbols',
gene_column_index = 1,
verbose = TRUE) {
# initialize return vars
feat_meta = tx_dt = bound_dt_list = cell_meta = agg_expr = NULL
# dplyr variable
cell_id = NULL
if(isTRUE(verbose)) message('Loading feature metadata...')
feat_meta = data.table::fread(path_list$panel_meta_path[[1]], nThread = cores)
colnames(feat_meta)[[1]] = 'feat_ID'
# **** subcellular info ****
if(data_to_use == 'subcellular') {
# define for data.table
transcript_id = feature_name = NULL
# append missing QC probe info to feat_meta
if(isTRUE(h5_expression)) {
h5 = hdf5r::H5File$new(path_list$agg_expr_path)
tryCatch({
root = names(h5)
feature_id = h5[[paste0(root, "/features/id")]][]
feature_info = h5[[paste0(root,"/features/feature_type")]][]
feature_names = h5[[paste0(root, "/features/name")]][]
features_dt = data.table::data.table(
'id' = feature_id,
'name' = feature_names,
'feature_type' = feature_info
)
}, finally = {
h5$close_all()
})
} else {
features_dt = arrow::read_tsv_arrow(paste0(path_list$agg_expr_path, '/features.tsv.gz'),
col_names = FALSE) %>%
data.table::setDT()
}
colnames(features_dt) = c('id', 'feat_ID', 'feat_class')
feat_meta = merge(features_dt[,c(2,3)], feat_meta, all.x = TRUE, by = 'feat_ID')
if(isTRUE(verbose)) message('Loading transcript level info...')
tx_dt = arrow::read_parquet(file = path_list$tx_path[[1]], as_data_frame = FALSE) %>%
dplyr::mutate(transcript_id = cast(transcript_id, arrow::string())) %>%
dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>%
dplyr::mutate(feature_name = cast(feature_name, arrow::string())) %>%
as.data.frame() %>%
data.table::setDT()
data.table::setnames(x = tx_dt,
old = c('feature_name', 'x_location', 'y_location'),
new = c('feat_ID', 'x', 'y'))
if(isTRUE(verbose)) message('Loading boundary info...')
bound_dt_list = lapply(path_list$bound_paths, function(x) {
arrow::read_parquet(file = x[[1]], as_data_frame = FALSE) %>%
dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>%
as.data.frame() %>%
data.table::setDT()})
}
# **** aggregate info ****
if(data_to_use == 'aggregate') {
if(isTRUE(verbose)) message('Loading cell metadata...')
cell_meta = arrow::read_parquet(file = path_list$cell_meta_path[[1]], as_data_frame = FALSE) %>%
dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>%
as.data.frame() %>%
data.table::setDT()
# NOTE: no parquet for agg_expr.
if(isTRUE(verbose)) message('Loading aggregated expression...')
if(isTRUE(h5_expression)) agg_expr = get10Xmatrix_h5(path_to_data = path_list$agg_expr_path,
gene_ids = h5_gene_ids,
remove_zero_rows = TRUE,
split_by_type = TRUE)
else agg_expr = get10Xmatrix(path_to_data = path_list$agg_expr_path,
gene_column_index = gene_column_index,
remove_zero_rows = TRUE,
split_by_type = TRUE)
}
data_list = list(
'feat_meta' = feat_meta,
'tx_dt' = tx_dt,
'bound_dt_list' = bound_dt_list,
'cell_meta' = cell_meta,
'agg_expr' = agg_expr
)
return(data_list)
}