R/initiateSpataObject.R

Defines functions initiateSpataObjectMERFISH initiateSpataObjectEmpty initiateSpataObject

Documented in initiateSpataObject initiateSpataObjectMERFISH

#' @title Initiate an object of class `SPATA2`
#'
#' @description Initiates a [`SPATA2`] object using the basic inputs: a coordinates
#' data.frame and a count matrix.
#'
#' @param sample_name Character value. The name with which to identify the `SPATA2` object.
#' @param count_mtr A count matrix. Column names correspond to the barcodes of the \link[=concept_observations]{observations}.
#' Rownames correspond to the names of the molecular features (genes, proteins, metabolites etc.).
#' @param modality Character value. Should best describe the molecular type of the count matrix.
#' Additionally, it defines the \link[=MolecularAssay]{assay} name, that is created with the count matrix and further
#' referred to via the argument `assay_name`. Read more on molecular modalities in
#' SPATA2 \link[=concept_molecular_modalities]{here}.
#'
#' @param coords_df Data.frame with a variable called *barcodes* as well as the
#' *x_orig* and *y_orig* or *x* and *y*.
#' @param img The reference image. See details for more information on how or
#' how not to include an image.
#' @param img_dir Character value or `NULL`. If character, the file directory
#' to the reference image. See details for more information on how and why to include
#' a file directory.
#' @param img_name Character value. The name of the reference image. Only
#' required if at least `img` or `img_dir` is specified.
#' @param spatial_method Character value or object of class [`SpatialMethod`].
#' If character, one of `validSpatialMethods()`.
#' @param meta,misc List of meta- and miscellaneous data for the [`SpatialData`]
#' object.
#' @param scale_factors A list of \link[=concept_scale_factors]{scale_factors}
#' set in slot @@scale_factors of the [`HistoImage`] object (if the `SPATA2` object is initiated
#' with an image, see details) or slot @@scale_factors of the [`SpatialData`] object, if
#' no image is provided.
#'
#' @inherit argument_dummy params
#'
#' @inherit tutorial_hint_dummy sections
#'
#' @note In contrast to [`initiateSpataObjectVisium()`] or [`initiateSpataObjectMERFISH()`],
#' a `SPATA2` object of this function the output does not contain a tissue outline yet!
#' Run [`identifyTissueOutline()`] with your choice of parameters afterwards.
#'
#' @section Initiating the object with an image:
#' `SPATA2` allows to register multiple images with one object via file directories.
#' This facilitates exchanging them during the analysis while they must not
#' be loaded altogether in a `SPATA2` object, which saves storage space. By default,
#' the function takes a directory to the image you want to initiate your `SPATA2` object with,
#' then loads the image and saves the directory, too. If the image does not
#' exist in a file on your device but only in the global environment you can
#' use `img` directly. This way, no image directory is stored. In a scenario,
#' where you want to register an additional image and use it for further analysis,
#' you can not *unload* the image with which you initiated the object because it
#' would be lost since there is not directory from which to retrieve it once
#' you want to use it again. Therefore, we recommend to initiate the object
#' with a file directory to the image and not with an image from the global
#' environment.
#'
#' Lastly, if `img` and `img_dir` is specified the image is saved under
#' this directory as a .png file and the image is registered normally.
#'
#' @return An object of class `SPATA2`.
#' @export
#'
initiateSpataObject <- function(sample_name,
                                count_mtr,
                                modality,
                                coords_df,
                                img = NULL,
                                img_dir = NULL,
                                img_name = "image1",
                                scale_factors = list(),
                                spatial_method = "Undefined",
                                verbose = TRUE,
                                ...){

  # spatial method
  if(base::is.character(spatial_method)){

    confuns::check_one_of(
      input = spatial_method,
      against = validSpatialMethods()
    )

    spatial_method <- spatial_methods[[spatial_method]]

  }

  # column names
  cnames_coords <- base::colnames(coords_df)

  if(!base::all(c("barcodes", "x_orig", "y_orig") %in% cnames_coords) &
     !base::all(c("barcodes", "x", "y") %in% cnames_coords)){

    stop("Data.frame `coords_df` must contain variable 'barcodes' as well as either 'x' and 'y' or 'x_orig' and 'y_orig'.")

  }

  if(!"x_orig" %in% cnames_coords){

    coords_df[["x_orig"]] <- coords_df[["x"]]
    coords_df[["x"]] <- NULL

  }

  if(!"y_orig" %in% cnames_coords){

    coords_df[["y_orig"]] <- coords_df[["y"]]
    coords_df[["y"]] <- NULL

  }

  coords_df[["sample"]] <- sample_name

  # barcodes
  barcodes_coords <- coords_df[["barcodes"]]
  barcodes_counts <- base::colnames(count_mtr)

  if(!base::all(barcodes_counts %in% barcodes_coords)){

    stop("All barcodes from `count_mtr` must be present in `coords_df`.")

  }

  # initiate SPATA2 object
  object <-
    initiateSpataObjectEmpty(
      sample_name = sample_name,
      platform = spatial_method@name,
      verbose = verbose
    )

  # sp_data: create pseudohistoimage if no image is available
  if(base::is.null(img_dir) & base::is.null(img)) {

    confuns::give_feedback(
      msg = "`img_dir` and `img` are NULL. No images registered.",
      verbose = verbose
    )

    sp_data <-
      SpatialData(
        coordinates = coords_df,
        method = spatial_method,
        sample = sample_name,
        scale_factors = scale_factors,
        version = current_spata2_version
      )

    object <- setDefault(object, display_image = FALSE)

    # else create histo image with a combination of img_dir and img
  } else {

    if(base::is.null(scale_factors$image)){

      confuns::give_feedback(
        msg = "No scale factor for image provided. Default to 1.",
        verbose = verbose
      )

      scale_factors$image <- 1

    }

    hist_img_ref <-
      createHistoImage(
        active = TRUE,
        dir = img_dir,
        img = img,
        img_name = img_name,
        reference = TRUE,
        sample = sample_name,
        scale_factors = scale_factors,
        verbose = verbose
      )

    sp_data <-
      createSpatialData(
        sample = sample_name,
        hist_img_ref = hist_img_ref,
        active = img_name,
        unload = FALSE,
        coordinates = coords_df,
        method = spatial_method
      )

  }

  object <- setDefault(object, "pt_size" = 1)

  # molecular assay
  ma <-
    MolecularAssay(
      mtr_counts = count_mtr,
      modality = modality
    )

  if(!modality %in% base::names(signatures)){

    confuns::give_feedback(
      msg = glue::glue("SPATA2 does not have signatures stored for modality '{modality}'. Set yourself with `setSignatures()`."),
      verbose = verbose
    )

  } else {

    ma@signatures <- signatures[[modality]]

  }

  object <- setAssay(object, assa = ma)
  object <- activateAssay(object, assay_name = modality, verbose = verbose)
  object <- activateMatrix(object, mtr_name = "counts", verbose = FALSE)

  # meta data.frame
  meta_df <- tibble::tibble(barcodes = barcodes_coords, sample = {{sample_name}})
  object <- setMetaDf(object, meta_df = meta_df)

  # spatial data
  object <- setSpatialData(object, sp_data = sp_data)

  returnSpataObject(object)

}


#' @keywords internal
initiateSpataObjectEmpty <- function(sample_name, platform, verbose = TRUE){

  object <- SPATA2()

  object@logfile <-
    tibble::tibble(
      fn_name = character(0),
      date_time = Sys.Date(),
      args_input = list(),
      pkg_version = character(0)
      )

  object@platform <- platform
  object@sample <- sample_name
  object@version <- current_spata2_version

  object <- setDefaultInstructions(object)

  confuns::give_feedback(
    msg = glue::glue("Initiating SPATA2 object of spatial platform: `{platform}`"),
    verbose = verbose
  )

  returnSpataObject(object)

}



#' @title Initiate an object of class `SPATA2` from platform MERFISH
#'
#' @description This function initiates a [`SPATA2`] object with data generated
#' using the MERFISH platform.
#'
#' @param directory_merfish Character value. Directory to a MERFISH folder
#' that should contain a .csv file called *~/...cell_by_gene.csv* and a .csv file
#' called *~/...cell_metadata.csv*, where **~** is the directory to the folder.
#' Deviating filenames can be specified using arguments `file_counts`
#' and `file_cell_meta`, respectively.
#' @param read_transcripts Logical value. If `TRUE`, the actual transcript positions
#' are read in via  *~/...detected_transcript.csv* or the input of `file_transcripts`,
#' if specified. Note that this argument defaults to `FALSE` since reading file
#' transcripts can increase the object size by several GB.
#' @param file_counts Character value or `NULL`. If character, specifies
#' the filename of .csv file that contains the gene counts by cell. Use only
#' if filename deviates from the default.
#' @param file_cell_meta Character value or `NULL`. If character, specifies
#' the filename of the .csv file that contains cell meta data, in particular,
#' spatial location via the variables *center_x* and *center_y*.
#' @param file_transcripts Character value or `NULL`. If character, specifies
#' the filename of the .csv file that contains molecule transcript positions, in particular,
#' spatial location via the variables *x* and *y*.
#'
#' @inherit argument_dummy params
#'
#' @return A `SPATA2` object from the MERFISH platform. Default for `pt_size` is
#' set to 0.4 which might need adjustment.
#'
#' @seealso [`addMoleculeCoordinates()`], [`setDefault()`]
#'
#' @details MERFISH works in micron space. The coordinates of the cellular centroids are
#' provided in unit um. Therefore no pixel scale factor must be computed or set
#' to work with SI units.
#'
#' @export

initiateSpataObjectMERFISH <- function(sample_name,
                                       directory_merfish,
                                       read_transcripts = FALSE,
                                       file_counts = NULL,
                                       file_cell_meta = NULL,
                                       file_transcripts = NULL,
                                       verbose = TRUE){

  # create SPATA2
  object <-
    initiateSpataObjectEmpty(
      sample_name = sample_name,
      platform = "MERFISH",
      verbose = verbose
    )

  directory_merfish <- base::normalizePath(directory_merfish)

  files_in_dir <-
    base::list.files(path = directory_merfish, full.names = TRUE)

  # test and read transcripts directory input if required
  if(base::isTRUE(read_transcripts)){

    if(!base::is.character(file_transcripts)){

      file_transcripts <-
        stringr::str_subset(files_in_dir, pattern = "detected_transcript") %>%
        stringr::str_subset(pattern = ".csv$")

      if(base::length(file_transcripts) == 0){

        stop("Did not find transcript file. If not specified otherwise, directory must contain
           one '~/...detected_transcripts.csv' file.")

      } else if(base::length(file_transcripts) > 1){

        stop("Found more than one potential transcript file. Please specify argument 'file_transcripts'.")

      }

    } else {

      file_transcripts <- base::file.path(directory_merfish, file_transcripts)

      if(!base::file.exists(file_transcripts)){

        stop(glue::glue("Directory to transcripts '{file_transcripts}' does not exist."))

      }

    }

    confuns::give_feedback(
      msg = glue::glue("Reading transcripts from {file_transcripts}."),
      verbose = verbose
    )

    if(stringr::str_detect(file_transcripts, ".csv$")){

      mol_coords_df <- readr::read_csv(file = file_transcripts, show_col_types = FALSE)

    } else {

      mol_coords_df <- readr::read_delim(file = file_transcripts, show_col_types = FALSE)

    }

    mol_coords_df <- dplyr::select(mol_coords_df, gene, x = global_x, y = global_y)

  }

  # read counts
  if(!base::is.character(file_counts)){

    file_counts <-
      stringr::str_subset(files_in_dir, pattern = "cell_by_gene")

    if(base::length(file_counts) == 0){

      stop("Did not find counts. If not specified otherwise, directory must contain
           one '~/...cell_by_gene.csv' file.")

    } else if(base::length(file_counts) > 1){

      stop("Found more than one potential counts file. Please specify argument `file_counts`.")

    }

  } else {

    file_counts <- base::file.path(directory_merfish, file_counts)

    if(!base::file.exists(file_counts)){

      stop(glue::glue("Directory to counts '{file_counts}' does not exist."))

    }

  }

  confuns::give_feedback(
    msg = glue::glue("Reading counts from: '{file_counts}'."),
    verbose = verbose
  )

  if(stringr::str_detect(string = file_counts, pattern = "\\.csv$")){

    count_mtr <-
      suppressMessages({

        readr::read_csv(file = file_counts, col_types = "c", show_col_types = FALSE)

      }) %>%
        dplyr::rename(cell = ifelse("...1" %in% colnames(.), "...1", "cell")) %>% # in case cell column is not named
        dplyr::rename(barcodes = cell) %>% # keep original barcode names in metadata
        dplyr::select(-dplyr::matches("^\\.")) %>%
        tibble::column_to_rownames("barcodes") %>%
        dplyr::select_if(.predicate = base::is.numeric) %>%
        base::as.matrix() %>%
        base::t() %>%
        Matrix::Matrix()

    original_barcodes <- colnames(count_mtr) # keep original barcode names for metadata
    colnames(count_mtr) <- stringr::str_c("cell", 1:base::length(colnames(count_mtr)), sep = "_")

  } else {

    # more options ?
    stop("Invalid input for `file_counts`.")

  }

  # spatial data
  sp_data <-
    createSpatialDataMERFISH(
      dir = directory_merfish,
      sample = sample_name
    )

  object <- setSpatialData(object, sp_data = sp_data)

  # molecular assay
  ma <-
    MolecularAssay(
      mtr_counts = count_mtr,
      modality = "gene",
      signatures = signatures$gene
    )

  object <- setAssay(object, assay = ma)
  object <- activateAssay(object, assay_name = "gene")
  object <- activateMatrix(object, mtr_name = "counts")

  # add transcripts positions
  if(base::isTRUE(read_transcripts)){

    object <- addMoleculeCoordinates(object, coordinates = mol_coords_df, assay_name = "gene")

  }

  # meta
  meta_df <-
    tibble::tibble(
      barcodes = getCoordsDf(object)$barcodes,
      sample = {{sample_name}}
      )

  object <- setMetaDf(object, meta_df = meta_df)
  object <- setMetaDf(object, cbind(getMetaDf(object), original_barcodes)) # add original barcodes to metadata

  # default processing
  object <- identifyTissueOutline(object, verbose = verbose)

  # set active content
  object <-
    setDefault(
      object = object,
      display_image = FALSE, # MERFISH does not come with an image
      pt_size = 0.4, # many obs of small size
      use_scattermore = TRUE # usually to many points for ggplot2 to handle
    )

  crange <- getCoordsRange(object)

  object <- computeCaptureArea(object)

  confuns::give_feedback(
    msg = "Estimated field of view (capture area) bases on cell coordinates. Specify with `setCaptureaArea()`.",
    verbose = verbose
  )

  returnSpataObject(object)

}


#' @title Initiate an object of class `SPATA2` from platform SlideSeq
#'
#' @description Wrapper function around the necessary content to create a
#' `SPATA2` object from the standardized output of the [`SlideSeqV1`] platform.
#'
#' @param directory_slide_seq Character value. Directory to a SlideSeq folder
#' that contains a count matrix and bead locations.
#' @param file_counts Character value or `NULL`. If character, specifies
#' the filename of the count matrix. If `NULL`, the SlideSeq folder is skimmed
#' for a file ending with *.mtx*.
#' @param file_barcodes Character value or `NULL`. If character, specifies
#' the filename of the barcode names for the count matrix if it does not
#' contain column names. If `NULL`, the SlideSeq folder is skimmed
#' for a file ending with *barcodes.tsv*.
#' @param file_genes Character value or `NULL`. If character, specifies
#' the filename of the gene names for the count matrix if it does not
#' contain row names. If `NULL`, the SlideSeq folder is skimmed
#' for a file ending with *genes.tsv*.
#' @param file_coords Character value or `NULL`. If character, specifies
#' the filename of the coordinates. If `NULL`, the SlideSeq folder is skimmed
#' for a file ending with *MatchedBeadLocation.csv*.
#'
#' @inherit argument_dummy params
#' @inherit initiateSpataObject params return
#'
#' @export

initiateSpataObjectSlideSeqV1 <- function(sample_name,
                                          directory_slide_seq,
                                          file_counts = NULL,
                                          file_barcodes = NULL,
                                          file_genes = NULL,
                                          file_coords = NULL,
                                          verbose = TRUE){

  # create SPATA2
  object <-
    initiateSpataObjectEmpty(
      sample_name = sample_name,
      platform = "SlideSeqV1",
      verbose = verbose
    )

  confuns::give_feedback(
    msg = glue::glue("Reading from directory {directory_slide_seq}."),
    verbose = verbose
  )

  # read counts
  if(base::is.null(file_counts)){

    file_counts <-
      base::list.files(path = directory_slide_seq, full.names = TRUE) %>%
      stringr::str_subset(pattern = "\\.mtx")

    if(base::length(file_counts) == 0){

      stop("Did not find count matrix. Directory must contain a .mtx file.")

    } else if(base::length(file_counts) > 1){

      stop("Found more than one potential count matrices. Please specify argument `mtr`.")

    }

  } else {

    file_counts <- base::file.path(directory_slide_seq, file_counts)

    if(!base::file.exists(file_counts)){

      stop(glue::glue("Directory to count matrix '{file_counts}' does not exist."))

    }

  }

  confuns::give_feedback(
    msg = glue::glue("Reading count matrix from {file_counts}."),
    verbose = verbose
  )

  count_mtr <- Matrix::readMM(file = file_counts)

  # read barcodes
  if(!base::is.character(base::colnames(count_mtr))){

    if(!base::is.character(file_barcodes)){

      file_barcodes <-
        base::list.files(path = directory_slide_seq, full.names = TRUE) %>%
        stringr::str_subset(pattern = "barcodes\\.tsv$")

      if(base::length(file_barcodes) == 0){

        stop("Did not find barcodes. If not specified otherwise, directory must contain one '~...barcodes.tsv' file.")

      } else if(base::length(file_barcodes) > 1){

        stop("Found more than one potential barcode files. Please specify argument `file_barcodes`.")

      }

    } else if(!base::file.exists(file_barcodes)){

      stop(glue::glue("Directory to barcodes '{file_barcodes}' does not exist."))

    }

    confuns::give_feedback(
      msg = glue::glue("Reading barcodes from '{file_barcodes}'."),
      verbose = verbose
    )

    barcodes <-
      readr::read_tsv(
        file = file_barcodes,
        col_names = FALSE,
        show_col_types = FALSE
      )

    base::colnames(count_mtr) <- barcodes[[1]]

  }

  # read genes
  if(!base::is.character(base::rownames(count_mtr))){

    if(!base::is.character(file_genes)){

      file_genes <-
        base::list.files(path = directory_slide_seq, full.names = TRUE) %>%
        stringr::str_subset(pattern = "genes\\.tsv$")

      if(base::length(file_genes) == 0){

        stop("Did not find gene names. If not specified otherwise, directory must contain one '~...genes.tsv' file.")

      } else if(base::length(file_genes) > 1){

        stop("Found more than one potential gene files. Please specify argument `file_genes`.")

      }

    } else {

      file_genes <- base::file.path(directory_slide_seq, file_genes)

      if(!base::file.exists(file_genes)){

        stop(glue::glue("Directory to gene names '{file_genes}' does not exist."))

      }

    }

    confuns::give_feedback(
      msg = glue::glue("Reading gene names from '{file_genes}'."),
      verbose = verbose
    )

    gene_names <-
      readr::read_tsv(
        file = file_genes,
        col_names = FALSE,
        show_col_types = FALSE
      )

    base::rownames(count_mtr) <- gene_names[[1]]

  }

  # create histo sp_data
  sp_data <-
    createSpatialDataSlideSeqV1(
      dir = directory_slide_seq,
      sample = sample_name
    )

  object <- setSpatialData(object, sp_data = sp_data)

  # molecular assay
  ma <-
    MolecularAssay(
      mtr_counts = count_mtr,
      modality = "gene",
      signatures = signatures$gene
    )

  object <- setAssay(object, assay = ma)
  object <- activateAssay(object, assay_name = "gene")
  object <- activateMatrix(object, mtr_name = "counts")

  # meta df
  meta_df <-
    tibble::tibble(
      barcodes = getCoordsDf(sp_data)$barcodes,
      sample = {{sample_name}}
      )

  object <- setMetaDf(object = object, meta_df = meta_df)

  # default processing
  object <- identifyTissueOutline(object, verbose = verbose)

  # set active content
  object <-
    setDefault(
      object = object,
      display_image = FALSE, # SlideSeqV1 does not come with an image)
      pt_size = 1.5, # many beads of small size
      use_scattermore = TRUE
    )

  object <- computeCaptureArea(object)

  returnSpataObject(object)

}

#' @title Initiate an object of class `SPATA2` from platform Visium
#'
#' @description This function initiates a [`SPATA2`] object with data generated
#' using the 10x Genomics Visium platform.
#'
#' @param sample_name Character. The name of the sample.
#' @param directory_visium Character. The directory containing the Visium output files.
#' @param mtr Character. Specifies which matrix to use, either "filtered" or "raw". Default is "filtered".
#' @param img_active Character. The active image to use, either "lowres" or "hires". Default is "lowres".
#' @param img_ref Character. The reference image to use, either "lowres" or "hires". Default is "hires".
#' @inherit createSpatialData params
#' @inherit argument_dummy params
#'
#' @return A `SPATA2` object containing data from the Visium platform. More precise,
#' depending on the set up used to create the raw data it is of either spatial method:
#'
#'  \itemize{
#'   \item{[`VisiumSmall`]}{: Visium data set with capture area of 6.5mm x 6.5mm.}
#'   \item{[`VisiumLarge`]}{: Visium data set with capture area of 11mm x 11m. }
#'   }
#'
#' In any case, the output is an object of class `SPATA2`.
#'
#' @seealso [`flipAll()`] to revert the effect of a horizontally flipped sample. (SPATA2 works
#' in cartesian coordinate system, in which images are displayed, too. Therefore, samples are
#' displayed "upside-down" when compared to the image in your folder.)
#'
#' @details
#' The function requires a directory containing the output files from a 10x Genomics Visium experiment
#' specified with the argument `directory_visum`. This directory (below denoted as **~**) must include the following files
#' and sub-directories:
#'
#' \itemize{
#'   \item \emph{~/filtered_feature_bc_matrix.h5} or \emph{raw_feature_bc_matrix.h5}: The HDF5 file containing the filtered or raw feature-barcode matrix, respectively.
#'   \item \emph{~/spatial/tissue_lowres_image.png} or \emph{spatial/tissue_hires_image.png}: The low-resolution or high-resolution tissue image.
#'   \item \emph{~/spatial/scalefactors_json.json}: A JSON file containing the scale factors for the images.
#'   \item \emph{~/spatial/tissue_positions_list.csv} or \emph{/~spatial/tissue_positions.csv}: A CSV file containing the tissue positions and spatial coordinates.
#' }
#' The function will check for these files and process them to create a `SPATA2` object. It reads the count matrix, loads the spatial data,
#' and initializes the `SPATA2` object with the necessary metadata and settings.
#'
#' @section Gene and Protein Expression:
#' This function also supports reading coupled gene expression and protein expression data. It expects the input directory to contain an HDF5
#' file that includes separate datasets for gene expression and protein expression. The function uses [`Seurat::Read10X_h5()`] with `unique.features = TRUE`, to read in
#' data and, if the result is a list, it assumes that it contains gene and protein expression. This scenario is handled as follows:
#'
#' \itemize{
#'   \item Gene expression data is extracted from the "Gene Expression" dataset in the HDF5 file and stored in an assay named *gene*.
#'   \item Protein expression data is extracted from the "Antibody Capture" dataset in the HDF5 file *protein*.
#' }
#'
#' The function ensures that molecule names do not overlap by normalizing the names:
#'
#' \itemize{
#'   \item Gene expression molecule names remain in upper case (human data) or title case (mouse data).
#'   \item Protein expression molecule names are forced to lowercase.
#' }
#'
#' This naming convention prevents any overlap and ensures that each molecule type is uniquely
#' identified in the resulting `SPATA2` object, which contains two assays. One of molecular
#' modality *gene* and of of molecular modality *protein*.
#'
#' @section Row and column indices:
#' Visium spot coordinates come with column and row indices. The functions `initiateSpataObjectVisium()`
#' and `initiateSpataObjectVisiumHD()` ensure that *col* aligns with the direction of the x-coordinates and
#' that *row* aligns with the direction of the y-coordinates. If they do not, they are adjusted accordingly.
#' Hence, these variables should not be used as keys for data merging.
#'
#' @export
#'
initiateSpataObjectVisium <- function(sample_name,
                                      directory_visium,
                                      mtr = "filtered",
                                      img_active = "lowres",
                                      img_ref = "hires",
                                      resize_images = NULL,
                                      unload = TRUE,
                                      verbose = TRUE){

  confuns::give_feedback(
    msg = "Initiating SPATA2 object for platform Visium.",
    verbose = verbose
  )

  # validate and process input directory
  dir <- base::normalizePath(directory_visium)

  files <- base::list.files(dir, recursive = TRUE, full.names = TRUE)

  # check and load required mtr
  confuns::check_one_of(
    input = mtr,
    against = c("filtered", "raw")
  )

  if(mtr == "filtered"){

    mtr_pattern <- "filtered_feature_bc_matrix.h5$"

  } else if(mtr == "raw"){

    mtr_pattern <- "raw_feature_bc_matrix.h5$"

  }

  mtr_path <- files[stringr::str_detect(files, pattern = mtr_pattern)]

  if(base::length(mtr_path) == 0){

    confuns::give_feedback(
      msg = glue::glue("'{mtr_pattern}' is missing. Looking for folder."),
      verbose = verbose
      )

    mtr_pattern <- stringr::str_remove(mtr_pattern, ".h5\\$$")

    mtr_folder_path <-
      stringr::str_subset(
        string = list.files(directory_visium, full.names = TRUE, recursive = FALSE),
        pattern = stringr::str_remove(mtr_pattern, ".h5\\$")
      )

    if(base::length(mtr_folder_path) == 1){

      confuns::give_feedback(
        msg = glue::glue("Reading count data from folder: {mtr_folder_path}"),
        verbose = verbose
      )

      counts_out <- read_matrix_from_folder(mtr_folder_path)

    } else {

      stop("Can't find count data.")

    }

  } else {

    if(base::length(mtr_path) > 1){

      warning(glue::glue("Multiple matrices found. Picking first: {mtr_path}."))

      mtr_path <- mtr_path[1]

    }

    confuns::give_feedback(
      msg = glue::glue("Reading count data from '{mtr_path}'."),
      verbose = verbose
    )

    counts_out <-
      base::suppressMessages({

        Seurat::Read10X_h5(filename = mtr_path, unique.features = TRUE)

      })

  }

  # load images and spatial data
  confuns::give_feedback(
    msg = "Reading spatial and image data.",
    verbose = verbose
  )

  sp_data <-
    createSpatialDataVisium(
      dir = dir,
      sample = sample_name,
      img_ref = img_ref,
      img_active = img_active,
      resize_images = resize_images,
      unload = unload,
      verbose = verbose
    )

  # create SPATA2 object
  platform <- sp_data@method@name

  object <-
    initiateSpataObjectEmpty(
      sample_name = sample_name,
      platform = platform, # depends on input
      verbose = FALSE
    )

  object <- setSpatialData(object, sp_data = sp_data)

  # molecular data
  if(base::length(counts_out) == 2){

    # gene expression
    gene_counts <- counts_out[["Gene Expression"]]

    base::rownames(gene_counts) <-
      base::rownames(gene_counts) %>%
      stringr::str_remove_all("\\.d*") %>%
      stringr::str_replace_all(pattern = "_", replacement = "-")

    gene_counts <- make_unique_molecules(gene_counts)

    gene_assay <-
      MolecularAssay(
        mtr_counts = gene_counts,
        modality = "gene",
        signatures = signatures$gene
        )

    object <- setAssay(object, assay = gene_assay)

    # protein expression
    protein_counts <- counts_out[["Antibody Capture"]]

    base::rownames(protein_counts) <-
      base::rownames(protein_counts) %>%
      stringr::str_remove_all(pattern = "\\..*$") %>%
      base::tolower() %>% # force protein names to lower case!
      stringr::str_replace_all(pattern = "_", replacement = "-")

    protein_counts <- make_unique_molecules(protein_counts)

    protein_assay <-
      MolecularAssay(
        mtr_counts = protein_counts,
        modality = "protein",
        signatures = purrr::map(signatures$protein, .f = base::tolower)
      )

    object <- setAssay(object, assay = protein_assay)

    # set required content
    object <- activateAssay(object, assay_name = "gene")
    object <- activateMatrix(object, mtr_name = "counts")
    object <- activateMatrix(object, mtr_name = "counts", assay_name = "protein")

  } else {

    counts_out <- make_unique_molecules(counts_out)

    # molecular assay
    ma <-
      MolecularAssay(
        mtr_counts = counts_out,
        modality = "gene",
        signatures = signatures$gene
      )

    object <- setAssay(object, assay = ma)

    # set required content
    object <- activateAssay(object, assay_name = "gene")
    object <- activateMatrix(object, mtr_name = "counts")

  }

  # meta
  meta_df <-
    tibble::tibble(
      barcodes = getCoordsDf(sp_data)$barcodes,
      sample = {{sample_name}}
      )

  object <- setMetaDf(object, meta_df = meta_df)

  # set default
  object <- setDefault(object, pt_size = getSpotSize(object))

  # default processing
  object <- identifyTissueOutline(object, verbose = verbose)

  object <- computeCaptureArea(object)

  returnSpataObject(object)

}


#' @title Initiate an object of class `SPATA2` from platform VisiumHD
#'
#' @description This function initiates a [`SPATA2`] object with data generated
#' using the 10x Genomics [`VisiumHD`] platform.
#'
#' @param sample_name Character. The name of the sample.
#' @param directory_visium Character. The directory containing the Visium output files.
#' @param square_res Character. The square resolution from which to load the data. While *c('16um', '8um', '2um')*
#' are the default resolutions, [`reduceResolutionVisiumHD()`] is applied if the input
#' deviates from these three input options. See section *Visium HD Resolution* for more information.
#' @param mtr Character. Specifies which matrix to use, either "filtered" or "raw". Default is "filtered".
#' @param genes Character or `NULL`. If character, specifies beforehand which genes to keep in the count
#' matrix. If `NULL`, the default, all genes are kept.
#' @param img_active Character. The active image to use, either "lowres" or "hires". Default is "lowres".
#' @param img_ref Character. The reference image to use, either "lowres" or "hires". Default is "hires".
#' @inherit reduceResolutionVisiumHD params
#' @inherit createSpatialData params
#' @inherit argument_dummy params
#'
#' @return A `SPATA2` object the VisiumHD platform.
#'
#' @note It is crucial to install the package `arrow` in a way that [`arrow::read_parquet()`] works. There
#' are several ways. Installing the package with `install.packages('arrow', repos = 'https://apache.r-universe.dev')`
#' worked reliably for us.
#'
#' @details
#' The function requires a directory containing the output files from a 10x Genomics VisiumHD experiment
#' specified with the argument `directory_visium`. This directory (below denoted as **~**)  must include the following
#' sub-directories:
#'
#' \itemize{
#'   \item \emph{~/binned_outputs}: A folder with the following subdirectories.
#'      \itemize{
#'        \item \emph{~/binned_outputs/square_002um}: The folder containing the data for `square_res = '2um'`.
#'        \item \emph{~/binned_outputs/square_008um}: The folder containing the data for `square_res = '8um'`.
#'        \item \emph{~/binned_outputs/square_016um}: The folder containing the data for `square_res = '16um'`.
#'        }
#' }
#'
#' Depending on your input for `square_res` only the corresponding subfolder is required. This subfolder should
#' contain the following files and sub-directories:
#'
#' \itemize{
#'   \item \emph{~/binned_outputs/<square_res>/filtered_feature_bc_matrix.h5} or \emph{~/raw_feature_bc_matrix.h5}: The HDF5 file containing the filtered or raw feature-barcode matrix, respectively.
#'   \item \emph{~/binned_outputs/<square_res>/spatial/tissue_lowres_image.png} or \emph{~/spatial/tissue_hires_image.png}: The low-resolution or high-resolution tissue image.
#'   \item \emph{~/binned_outputs/<square_res>/spatial/scalefactors_json.json}: A JSON file containing the scale factors for the images.
#'   \item \emph{~/binned_outputs/<square_res>/spatial/tissue_position.parquet} A .parguet file containing the tissue positions and spatial coordinates.
#' }
#'
#' The function will check for these files and process them to create a `SPATA2` object. It reads the count matrix, loads the spatial data,
#' and initializes the `SPATA2` object with the necessary metadata and settings.
#'
#' @section Visium HD Resolution:
#' The input for `square_res` can deviate from the standard resolution options as long as it is an
#' even number divisible by one of the standard resolutions. In such cases, data from the next possible
#' lower resolution is read, and [`reduceResolutionVisiumHD()`] is applied to aggregate the data.
#' For example, if `square_res = '6um'`, data is retrieved from \emph{~/binned_outputs/square_002um}
#' and then aggregated accordingly. The same applies if `square_res = 10um`. If `square_res = 24um`,
#' data is read from \emph{~/binned_outputs/square_008um}; if `square_res = 32um`, data is read from
#' \emph{~/binned_outputs/square_016um}, and so on. If the required folder is missing, data from the
#' next higher resolution folder is used if possible, else an error is thrown.
#'
#' Note, that aggregating counts by resolution can take a considerable amount of time. Consider prefiltering
#' the raw counts using `genes` and/or increasing the number of cores to use with `workers`.
#'
#' @section Row and column indices:
#' Visium spot coordinates come with column and row indices. The functions `initiateSpataObjectVisium()`
#' and `initiateSpataObjectVisiumHD()` ensure that *col* aligns with the direction of the x-coordinates and
#' that *row* aligns with the direction of the y-coordinates. If they do not, they are adjusted accordingly.
#' Hence, these variables should not be used as keys for data merging.
#'
#' @export
#'
initiateSpataObjectVisiumHD <- function(sample_name,
                                        directory_visium,
                                        square_res = "16um",
                                        mtr = "filtered",
                                        genes = NULL,
                                        workers = 1,
                                        batch_size = 1000,
                                        img_active = "lowres",
                                        img_ref = "hires",
                                        resize_images = NULL,
                                        unload = TRUE,
                                        verbose = TRUE){

  confuns::give_feedback(
    msg = "Initiating SPATA2 object for platform: 'VisiumHD'",
    verbose = verbose
  )

  is_dist(square_res, error = TRUE)
  input_square_res <- as_unit(square_res, unit = "um")

  res_options <- check_square_res(square_res)
  n_opts <- length(res_options)

  # validate and process input directory
  dir <- base::normalizePath(directory_visium)

  for(i in 1:n_opts){

    res_opt <- res_options[i]

    square_pattern <- stringr::str_c("binned_outputs\\/.*", res_opt)

    files <-
      base::list.files(dir, recursive = TRUE, full.names = TRUE) %>%
      stringr::str_subset(pattern = square_pattern)

    out_folder <-
      stringr::str_extract(files, pattern = square_pattern) %>%
      base::unique() %>%
      base::file.path(directory_visium, .)

    if(length(out_folder) == 1){

      confuns::give_feedback(
        msg = glue::glue("Found ~ binned_outputs\\ for resolution {res_opt}."),
        verbose = verbose
      )

      break()

    } else if(length(out_folder) == 0 & (i+1) <= n_opts){

      next_opt <- res_options[(i+1)]

      confuns::give_feedback(
        msg = glue::glue("Did not find folder for resolution {res_opt}. Trying {next_opt}."),
        verbose = verbose
      )

    } else {

      stop(glue::glue("Could not find appropriate ~ binned_outputs\\ folder to create SPATA2 object for resolution {square_res}."))

    }

  }


  if(base::length(out_folder) != 1){

    stop("Invalid folder structure.")

  }

  # load spatial data
  # load images
  sp_data <-
    createSpatialDataVisiumHD(
      dir = out_folder,
      square_res = res_opt, # use definition from last loop
      sample = sample_name,
      img_ref = img_ref,
      img_active = img_active,
      resize_images = resize_images,
      unload = unload,
      verbose = verbose
    )

  out_files <-
    base::list.files(out_folder, recursive = TRUE, full.names = TRUE)

  # check and load required mtr
  confuns::check_one_of(
    input = mtr,
    against = c("filtered", "raw")
  )

  if(mtr == "filtered"){

    mtr_pattern <- "filtered_feature_bc_matrix.h5$"

  } else if(mtr == "raw"){

    mtr_pattern <- "raw_feature_bc_matrix.h5$"

  }

  mtr_path <- out_files[stringr::str_detect(out_files, pattern = mtr_pattern)]

  ref_path <-
    stringr::str_remove(mtr_path, directory_visium) %>%
    paste0("~", .)

  if(base::length(mtr_path) > 1){

    mtr_path <- mtr_path[1]
    ref_path <- ref_path[1]

    warning(glue::glue("Multiple matrices found. Picking first. {ref_path}"))

  } else if(base::length(mtr_path) == 0){

    stop(glue::glue("'{mtr_pattern}' is missing.", mtr_pattern = stringr::str_remove(mtr_pattern, "\\$")))

  }

  confuns::give_feedback(
    msg = glue::glue("Reading count matrix from '{ref_path}'."),
    verbose = verbose
  )

  count_mtr <- Seurat::Read10X_h5(filename = mtr_path)

  if(is.character(genes)){

    confuns::check_one_of(
      input = genes,
      against = rownames(count_mtr),
      fdb.fn = "warning",
      fdb.opt = 2,
      ref.opt.2 = "genes in count matrix",
      trunc.at = 0
    )

    genes <- genes[genes %in% rownames(count_mtr)]

    count_mtr <- count_mtr[genes, ]

  }

  # create SPATA2 object
  object <-
    initiateSpataObjectEmpty(
      sample_name = sample_name,
      platform = sp_data@method@name, # depends on input
      verbose = FALSE
    )

  # set required content

  # molecular assay
  ma <-
    MolecularAssay(
      mtr_counts = count_mtr,
      modality = "gene",
      signatures = signatures$gene
    )

  object <- setAssay(object, assay = ma)
  object <- activateAssay(object, assay_name = "gene")
  object <- activateMatrix(object, mtr_name = "counts")

  # meta
  meta_df <-
    tibble::tibble(
      barcodes = getCoordsDf(sp_data)$barcodes,
      sample = {{sample_name}}
    )

  object <- setMetaDf(object, meta_df = meta_df)

  # spatial data
  object <- setSpatialData(object, sp_data = sp_data)

  # set default
  object <- setDefault(object, pt_size = getSpotSize(object), use_scattermore = TRUE)

  if(square_res %in% names(visiumHD_resolutions)){

    object <- identifyTissueOutline(object, verbose = verbose)

  } else {

    object <-
      reduceResolutionVisiumHD(
        object = object,
        res_new = input_square_res,
        new_sample_name = sample_name,
        workers = workers,
        batch_size = batch_size,
        verbose = verbose
        )

  }

  object <- computeCaptureArea(object)

  returnSpataObject(object)

}


#' @title Initiate an object of class `SPATA2` from platform Xenium
#'
#' @description This function initiates a [`SPATA2`] object with data generated
#' using the 10x Genomics [`Xenium`] platform.
#'
#' @param directory_xenium Character value. Directory to a Xenium output folder.
#' @inherit initiateSpataObject params
#'
#' @return A `SPATA2` object from the Xenium platform. Default for `pt_size ` is
#' set to 0.1 which might need adjustment.
#'
#' @seealso [`setDefault()`]
#'
#' @details
#' The function requires a directory containing the output files from a 10x Genomics Xenium experiment
#' specified with the argument `directory_xenium`. This directory (below denoted as **~**) must include the following
#' sub-directories:
#'
#' \itemize{
#'   \item \emph{~/cell_feature_matrix}: A folder that contains the following files:
#'      \itemize{
#'        \item \emph{~/cell_feature_matrix/barcodes.tsv.gz}: The cell barcodes.
#'        \item \emph{~/cell_feature_matrix/features.tsv.gz}: The feature (gene) names.
#'        \item \emph{~/cell_feature_matrix/matrix.mtx.gz}: The raw count data matrix.
#'        }
#'    \item \emph{~/cells.csv.gz}: The cellular positions.
#' }
#'
#' @details Xenium works in micron space. The coordinates of the cellular centroids are
#' provided in unit um. Therefore no pixel scale factor must be computed or set
#' to work with SI units.
#'
#' @export
#'
initiateSpataObjectXenium <- function(sample_name,
                                      directory_xenium,
                                      verbose = TRUE){

  # create SPATA2 object
  object <-
    initiateSpataObjectEmpty(
      sample_name = sample_name,
      platform = "Xenium",
      verbose = verbose
    )

  # spatial data
  sp_data <-
    createSpatialDataXenium(
      dir = directory_xenium,
      sample = sample_name
    )

  object <- setSpatialData(object, sp_data = sp_data)

  # read count matrix
  mtr_path <- base::file.path(directory_xenium, "cell_feature_matrix/")

  confuns::give_feedback(
    msg = glue::glue("Reading count matrix from '{mtr_path}'."),
    verbose = verbose
  )

  count_mtr <- read_matrix_from_folder(mtr_path)

  # molecular assay
  ma <-
    MolecularAssay(
      mtr_counts = count_mtr,
      modality = "gene",
      signatures = signatures$gene
    )

  object <- setAssay(object, assay = ma)
  object <- activateAssay(object, assay_name = "gene")
  object <- activateMatrix(object, mtr_name = "counts")

  # meta
  meta_df <-
    tibble::tibble(
      barcodes = getCoordsDf(sp_data)$barcodes,
      sample = {{sample_name}}
    )

  object <- setMetaDf(object, meta_df = meta_df)

  # Xenium works in micron space
  pxl_scale_fct <- magrittr::set_attr(x = 1, which = "unit", value = "um/px")
  object <- setScaleFactor(object, fct_name = "pixel", value = pxl_scale_fct)

  base::options(scipen = 999)

  object <- identifyTissueOutline(object, verbose = verbose)

  # set default
  object <- setDefault(object, use_scattermore = TRUE, display_image = FALSE, pt_size = 0.1)

  object <- computeCaptureArea(object)

  returnSpataObject(object)

}
theMILOlab/SPATA2 documentation built on Feb. 8, 2025, 11:41 p.m.