##
# @title detect_input_rnacounts
# @description Identifies the kind of gene expression input handed to STList.
# @details
# Classifies whatever was passed to the `rnacounts` argument of STList(). When the
# input is a set of text files, the delimiter is detected as well. NOTE that only
# shallow checks are made on file contents: the goal is to tell apart csv/tsv
# files, lists of data frames, Visium/Xenium output directories, and Seurat objects.
#
# @param rnacounts the object, file path(s), or directory path(s) with counts provided to STList.
# @return inputtype a string indicating the input type, or NULL when nothing was
# provided (or when no type could be assigned).
#
#
detect_input_rnacounts = function(rnacounts=NULL){
  # No input at all: nothing to classify.
  if(is.null(rnacounts) || length(rnacounts) == 0){
    return(NULL)
  }

  # CASE: SEURAT OBJECT
  # FUTURE DEV: Consider accepting a list of Seurat objects rather than one object at a time.
  # ALSO CONSIDER: Multiple samples in one Seurat object are currently allowed as
  # long as they are stored as "slices" (names(seurat_obj@images)).
  # FUTURE DEV: Seurat V5.
  if(is(rnacounts, 'Seurat')){
    return('seurat')
  }

  # CASE: NAMED LIST OF DATAFRAMES
  if(inherits(rnacounts, 'list')){
    detected = c()
    if(is.null(names(rnacounts))){
      # The list must carry names.
      raise_err(err_code='error0023')
    } else if(all(vapply(rnacounts, is.data.frame, logical(1)))){
      detected = 'list_dfs'
    } else{
      # Every element of the list must be a data frame.
      raise_err(err_code='error0022')
    }
    return(detected)
  }

  # CASE: FILE PATH(S) TO COUNT MATRICES/TABLES (INCLUDES COSMX-SMI)
  # Every element must exist on disk and none may be a directory.
  all_exist = all(unlist(lapply(rnacounts, file.exists)))
  none_are_dirs = !any(unlist(lapply(rnacounts, dir.exists)))
  if(all_exist && none_are_dirs){
    detected = c()
    # The delimiter is judged from the second line of each file.
    head_lines = lapply(rnacounts, readLines, n=2)
    has_tab = vapply(head_lines, function(lines){ grepl("\t", lines[2]) }, logical(1))
    has_comma = vapply(head_lines, function(lines){ grepl(",", lines[2]) }, logical(1))
    if(all(has_tab)){
      detected = 'tab_delim'
    } else if(all(has_comma)){
      detected = 'comma_delim'
    } else{
      raise_err(err_code='error0024')
    }
    # CosMx-SMI tables are recognized by 'fov' and a cell ID column in the header line.
    looks_cosmx = vapply(head_lines, function(lines){
      grepl('fov', lines[1]) && grepl('cell_ID|cell_id', lines[1])
    }, logical(1))
    if(all(looks_cosmx)){
      detected = paste0(detected, '_cosmx')
    }
    return(detected)
  }

  # CASE: FILE PATHS TO VISIUM/XENIUM DIRECTORIES
  if(all(unlist(lapply(rnacounts, dir.exists)))){
    detected = c()
    # Layouts are tried in this order; the first one with any matching entry decides.
    # 'filtered_feature_bc' (visium), 'raw_feature_bc' (visium), or 'cell_feature_matrix' (xenium)
    layouts = list(
      list(type='visium_filtered_h5', pattern='filtered_feature_bc[_a-zA-Z0-9]*\\.h5$', h5=TRUE),
      list(type='visium_raw_h5', pattern='raw_feature_bc[_a-zA-Z0-9]*\\.h5$', h5=TRUE),
      list(type='xenium_h5', pattern='cell_feature_matrix[_a-zA-Z0-9]*\\.h5$', h5=TRUE),
      list(type='visium_filtered_mex', pattern='filtered_feature_bc[_a-zA-Z0-9]*\\.tar\\.gz$', h5=FALSE),
      list(type='visium_raw_mex', pattern='raw_feature_bc[_a-zA-Z0-9]*\\.tar\\.gz$', h5=FALSE),
      list(type='xenium_mex', pattern='cell_feature_matrix[_a-zA-Z0-9]*\\.tar\\.gz$', h5=FALSE)
    )
    for(layout in layouts){
      hits = lapply(rnacounts, function(d){
        list.files(d, pattern=layout$pattern, include.dirs=TRUE, full.names=TRUE)
      })
      if(rlang::is_empty(unlist(hits))){
        next
      }
      # Validate the first hit of every directory: HDF5 check for .h5 files,
      # file name check for MEX archives.
      if(layout$h5){
        passed = lapply(hits, function(h){ hdf5r::is_hdf5(h[1]) })
      } else{
        passed = lapply(hits, function(h){ grepl(layout$pattern, h[1]) })
      }
      if(all(unlist(passed))){
        detected = layout$type
      }
      return(detected)
    }
    # None of the expected files were found in the directories.
    raise_err(err_code='error0026')
    return(detected)
  }

  # Input is neither an object, a set of files, nor a set of directories.
  raise_err(err_code='error0025')
  invisible(NULL)
} # CLOSE detect_input_rnacounts
##
# @title detect_input_spotcoords
# @description Identifies the kind of coordinates input handed to STList.
# @details
# Classifies whatever was passed to the `spotcoords` argument of STList(). When the
# input is a set of text files, the delimiter is detected as well. NOTE that only
# shallow checks are made on file contents: the goal is to tell apart csv/tsv
# files (including CosMx-SMI tables) and lists of data frames.
#
# @param spotcoords the object or file path(s) with coordinates provided to STList.
# @return inputtype a string indicating the input type, or NULL when nothing was
# provided (or when the input is neither a list nor a set of existing files).
#
#
detect_input_spotcoords = function(spotcoords=NULL){
  # No input at all: nothing to classify.
  if(is.null(spotcoords) || length(spotcoords) == 0){
    return(NULL)
  }

  # CASE: NAMED LIST OF DATAFRAMES (COORDINATES)
  if(inherits(spotcoords, 'list')){
    detected = c()
    if(is.null(names(spotcoords))){
      # The list must carry names.
      raise_err(err_code='error0031')
    } else if(all(vapply(spotcoords, is.data.frame, logical(1)))){
      detected = 'list_dfs'
    } else{
      # Every element of the list must be a data frame.
      raise_err(err_code='error0030')
    }
    return(detected)
  }

  # CASE: FILE PATH(S) TO COORDINATES TABLES (INCLUDES COSMX-SMI)
  # Every element must exist on disk and none may be a directory.
  all_exist = all(unlist(lapply(spotcoords, file.exists)))
  none_are_dirs = !any(unlist(lapply(spotcoords, dir.exists)))
  if(all_exist && none_are_dirs){
    detected = c()
    # The delimiter is judged from the second line of each file.
    head_lines = lapply(spotcoords, readLines, n=2)
    has_tab = vapply(head_lines, function(lines){ grepl("\t", lines[2]) }, logical(1))
    has_comma = vapply(head_lines, function(lines){ grepl(",", lines[2]) }, logical(1))
    if(all(has_tab)){
      detected = 'tab_delim'
    } else if(all(has_comma)){
      detected = 'comma_delim'
    } else{
      raise_err(err_code='error0029')
    }
    # CosMx-SMI tables are recognized by 'fov' and a cell ID column in the header line.
    looks_cosmx = vapply(head_lines, function(lines){
      grepl('fov', lines[1]) && grepl('cell_ID|cell_id', lines[1])
    }, logical(1))
    if(all(looks_cosmx)){
      detected = paste0(detected, '_cosmx')
    }
    return(detected)
  }

  # Anything else is left unclassified.
  invisible(NULL)
} # CLOSE detect_input_spotcoords
##
# @title detect_input_samples
# @description Detects the type of metadata (`samples`) being provided to the function STList.
# @details
# This function detects the input provided to the `samples` argument of the
# STList() function. It also detects the delimiter of the file when relevant. NOTE
# that the function does minimum checking on the contents of files, limited mostly
# to detect csv/tsv/xlsx files, or a vector with strings.
#
# @param samples a path to a sample metadata file (tab-delimited, comma-delimited,
# or .xlsx), or a vector with sample names.
# @return inputtype a string indicating the input type: 'samplesfile_excel',
# 'samplesfile_tab', 'samplesfile_comma', or 'sample_names'. NULL if no input was
# provided to `samples`.
#
#
detect_input_samples = function(samples=NULL){
  # Return NULL if no input was provided to samples
  if(is.null(samples) || length(samples) == 0){
    return(NULL)
  }

  # CASE: SAMPLES FILE
  # A single string pointing to an existing file is treated as a metadata table.
  # A directory is excluded on purpose: a lone sample name may coincide with the
  # name of a directory, and a directory cannot be read as a table anyway.
  if(length(samples) == 1 && file.exists(samples) && !dir.exists(samples)){
    # Excel files are recognized by extension only (the dot must be escaped so it
    # is matched literally); they are binary, so their contents are not inspected.
    if(grepl("\\.xlsx$", samples)){
      return('samplesfile_excel')
    }

    # Otherwise the first two lines must all be tab- or all comma-delimited.
    inputtype_samples = c()
    tests_text = readLines(samples, n=2)
    tests_tab = grepl("\t", tests_text)
    tests_comma = grepl(",", tests_text)
    if(all(tests_tab)){
      inputtype_samples = 'samplesfile_tab'
    } else if(all(tests_comma)){
      inputtype_samples = 'samplesfile_comma'
    } else{
      raise_err(err_code='error0027')
    }
    return(inputtype_samples)
  }

  # CASE: VECTOR WITH SAMPLE NAMES
  # Any other non-empty input is taken as sample names. Validation of the names
  # is currently disabled; the original checks are kept here for reference:
  # test_vector = is.vector(samples, mode='character')
  # if(test_vector){
  #   # Test if elements begin with a number (SHOULD NOT begin with a number)
  #   test_number = any(sapply(samples, function(i){grepl("^[0-9]", i)}))
  #   if(test_number){
  #     raise_err(err_code='error0028')
  #   }
  #
  #   # Test if elements contain only alpha-numerics, spaces, dash, and underscores
  #   test_chars = any(!sapply(samples, function(i){grepl("^[ _A-Za-z0-9-]+$", i)}))
  #   if(test_chars){
  #     raise_err(err_code='error0028')
  #   }
  # }
  return('sample_names')
} # CLOSE detect_input_samples
##
# @title detect_input
# @description Detects the type of data input being provided to the function STList.
# @details
# This function detects what input is being provided to the STList() function. It
# also detects the delimiter of the file when relevant. NOTE that the function does
# minimum checking on the contents of files, limited mostly to detect csv or tsv,
# Visium directories, GeoMx DCC files, or Seurat objects. Checks are performed on
# the first element only, so the remaining elements may not comply with the format.
#
# @param rnacounts the file/directory with counts provided to STList.
# @param spotcoords the file with coordinates provided to STList.
# @param samples the metadata or sample names provided to STList.
# @return inputtype a list with the detected types of the input arguments, stored
# in the elements `rna`, `coords`, and `samples`. An element is absent (NULL) when
# no valid type was detected for it.
#
#' @importFrom magrittr %>%
#
#
detect_input = function(rnacounts=NULL, spotcoords=NULL, samples=NULL){
  # Helper: second line of a text file (NA if the file has fewer than two lines).
  second_line = function(path){
    readLines(path, n=2)[2]
  }

  # Helper: delimiter (tab has priority over comma) found in a line of text.
  # Stops with `err_msg` if neither is present (including when the line is NA).
  pick_delim = function(line, err_msg){
    if(grepl("\t", line)){
      return('\t')
    }
    if(grepl(",", line)){
      return(',')
    }
    stop(err_msg)
  }

  # Helper: input type for a samples (metadata) file given as a path.
  samples_file_type = function(path){
    c('samplesfile', pick_delim(second_line(path), 'Samples file is not comma or tab-delimited'))
  }

  # Helper: format of a Visium output directory. Returns 'h5' or 'mex', or NULL
  # if the directory has no raw/filtered feature-barcode entry or if its .h5
  # file(s) are not valid HDF5 (a warning is issued in that case).
  visium_format = function(dir_path){
    # Alternation needs a group: '[raw|filtered]' would be a character class.
    hits = list.files(dir_path, pattern='(raw|filtered)_feature_bc', include.dirs=TRUE, full.names=TRUE)
    if(length(hits) == 0){
      return(NULL)
    }
    h5_files = grep('\\.h5$', hits, value=TRUE)
    if(length(h5_files) == 0){
      return('mex')
    }
    # Both raw and filtered .h5 files may be present, so each file is checked.
    if(all(unlist(lapply(h5_files, hdf5r::is_hdf5)))){
      return('h5')
    }
    warning('The .h5 file does not seem to be in HDF5 format')
    return(NULL)
  }

  # Define output/return variable.
  # Elements that are never assigned remain NULL: no valid input was given by the user.
  inputtype = list()

  # CASE SEURAT OBJECT(S) WITH SAMPLE NAMES OR SAMPLE FILE
  if(!is.null(rnacounts) && is(rnacounts, 'Seurat')){
    inputtype$rna = 'seurat'
    inputtype$samples = 'samples_from_seurat'
    return(inputtype)
  }

  # CASE DCC FILES FROM GEOMX
  # `rnacounts` is a directory holding .dcc files and `samples` is the metadata file.
  if(!is.null(rnacounts) && !is.null(samples) && is.character(rnacounts) && dir.exists(rnacounts[1])){
    dcc_files = list.files(rnacounts, full.names=TRUE, pattern='\\.dcc$', recursive=TRUE)
    if(length(dcc_files) != 0){
      # Only the first DCC file is inspected.
      if(length(grep('<Code_Summary>', readLines(dcc_files[1]))) != 0){
        inputtype$rna = 'geomx_dcc'
        # Metadata file: Excel (by extension) or delimited text.
        if(grepl('\\.xlsx?$', samples)){
          inputtype$samples = c('samplesfile_geomx', 'xls')
        } else{
          del = pick_delim(second_line(samples), 'Samples file is not comma, tab-delimited, or .xls file')
          inputtype$samples = c('samplesfile_geomx', del)
        }
      }
      # Whenever .dcc files are found, detection stops here.
      return(inputtype)
    }
  }

  # CASE ONLY SAMPLEFILE PROVIDED.
  # `rnacounts` was not entered and `samples` is a single string with a file path.
  if(is.null(rnacounts) && !is.null(samples) && length(samples) == 1 && file.exists(samples)){
    # The second line of the file tells the delimiter and holds the first sample.
    first_record = second_line(samples)
    del = pick_delim(first_record, 'Samples file is not comma or tab-delimited')
    fields = unlist(strsplit(first_record, del))
    # Columns 2 and 3 are expected to hold counts and coordinates file paths, or
    # column 2 is expected to hold a Visium output directory.
    if(file.exists(fields[2]) && file.exists(fields[3])){
      inputtype$samples = c('samplesfile_matrices', del)
    } else if(dir.exists(fields[2]) && !dir.exists(fields[3])){
      visium_kind = visium_format(fields[2])
      if(!is.null(visium_kind)){
        inputtype$samples = c(paste0('samplesfile_visium_', visium_kind), del)
      }
    } else{
      stop('Samples file does not contain file paths or format is not compatible.')
    }
  }

  # CASE: NAMED LIST OF DATAFRAMES WITH OR WITHOUT SAMPLEFILE, OR SAMPLE NAMES.
  if(!is.null(rnacounts) && !is.null(spotcoords)){
    if(inherits(rnacounts, 'list') && inherits(spotcoords, 'list')){
      # Both lists must be named.
      if(!is.null(names(rnacounts)) && !is.null(names(spotcoords))){
        inputtype$rna = 'list_dfs'
        inputtype$coords = 'list_dfs'
        # Use the samples file (metadata) if one was provided; otherwise sample
        # names are taken from the lists.
        if(!is.null(samples) && length(samples) == 1 && file.exists(samples)){
          inputtype$samples = samples_file_type(samples)
        } else{
          inputtype$samples = 'names_from_list_or_df'
        }
      } else{
        raise_err(err_code='error0003')
      }
    }
  }

  # CASE: FILE PATH(S) TO COUNT AND COORDINATE MATRICES, AND SAMPLE NAMES (FILE OR VECTOR). COSMX-SMI INCLUDED
  if(!is.null(rnacounts) && !is.null(spotcoords) && !is.null(samples)){
    # Skip when both inputs are lists (handled above).
    if(!is.list(rnacounts[1]) || !is.list(spotcoords[1])){
      # Determine what was entered as `samples`, i.e., a path to a metadata file or
      # a vector of sample names, and NOT a directory with a name matching the
      # name of the sample (when a single sample is entered).
      if(length(samples) == 1 && file.exists(samples) && !dir.exists(samples)){
        inputtype$samples = samples_file_type(samples)
      } else if(length(samples) == length(rnacounts)){
        inputtype$samples = 'sample_names'
      } else{
        stop('Number of sample names do not match number of RNA counts tables.')
      }

      if(file.exists(rnacounts[1]) && file.exists(spotcoords[1]) && !dir.exists(rnacounts[1]) &&
         length(rnacounts) == length(spotcoords)){
        # Read the first `rnacounts` and `spotcoords` files to detect delimiters.
        rna_file = readLines(rnacounts[1], n=2)
        coord_file = readLines(spotcoords[1], n=2)
        del_rna = pick_delim(rna_file[2], 'RNA counts file is not comma or tab-delimited')
        del_coords = pick_delim(coord_file[2], 'Coordinates file is not comma or tab-delimited')
        # CosMx-SMI is recognized from the header of the counts file.
        if(grepl('fov', rna_file[1]) && grepl('cell_ID|cell_id', rna_file[1])){
          inputtype$rna = c('cosmx', del_rna)
          inputtype$coords = c('cosmx', del_coords)
        } else{
          inputtype$rna = c('rnapath', del_rna)
          inputtype$coords = c('coordpath', del_coords)
        }
      }
      if(is.null(inputtype$rna)){
        stop('Could not open the RNA count file.')
      }
    }
  }

  # CASE: FILE PATHS TO VISIUM DIRECTORIES.
  # `rnacounts` was provided without `spotcoords`; its first element must be a
  # directory. Sample names (or a samples file) are also required.
  if(!is.null(rnacounts) && is.null(spotcoords) && !is.null(samples)){
    if(!dir.exists(rnacounts[1])){
      stop('If intended input is a Visium output, could not find directory path.')
    }
    visium_kind = visium_format(rnacounts[1])
    if(!is.null(visium_kind)){
      inputtype$rna = paste0('visium_out_', visium_kind)
    }

    # Determine what was entered as `samples`. A directory is not taken as a
    # samples file (a sample name may match a directory name).
    if(length(samples) == 1 && file.exists(samples) && !dir.exists(samples)){
      inputtype$samples = samples_file_type(samples)
    } else if(length(samples) == length(rnacounts)){
      inputtype$samples = 'sample_names'
    } else if(is.data.frame(samples)){
      raise_err(err_code='error0004')
    } else{
      stop('Number of sample names do not match number of Visium output folders.')
    }
  }

  return(inputtype)
} # CLOSE detect_input
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.