R/CoNGA.R

Defines functions QuantifyTcrClones CalculateTcrDiversity CalculateTcrDiversityFromSeurat SeuratToCoNGA RunCoNGA

Documented in CalculateTcrDiversity CalculateTcrDiversityFromSeurat QuantifyTcrClones RunCoNGA SeuratToCoNGA

utils::globalVariables(
  names = c('barcode', 'clusters_gex', 'clusters_tcr', 'nndists_gex', 'nndists_tcr', 'is_invariant', 'conga_scores', 'conga_fdr_values', 'raw_clonotype_id', 'sample_div', 'v_a_gene', 'v_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'sampleId'),
  package = 'CellMembrane',
  add = TRUE
)



#' @title Run CoNGA
#'
#' @description Runs CoNGA on a seurat object
#' @param seuratObj The Seurat object containing the data to be run using CoNGA.
#' @param tcrClonesFile The 10x clonotypes file for this Seurat object. Single lanes can use the CellRanger/Vloupe contigs CSV file. Merged lanes need to merge these files and modify them to create unique cellbarcodes and clonotype names, such as the code from Rdiscvr::CreateMergedTcrClonotypeFile().
#' @param seuratToCongaDir The directory to store the results of SeuratToCoNGA() (the input files for the python call to run_CoNGA()).
#' @param assayName Pass-through variable for accessing the assay name within the Seurat object for SeuratToCoNGA().
#' @param organism 'human' or 'rhesus'
#' @param runCongaOutputFilePrefix prefix for the output files from the python call to run_CoNGA().
#' @param gexDatatype This should be "10x_h5" since we're using DropletUtils to write the counts, although "h5ad" is supported.
#' @param runCongaOutfilePrefixForQcPlots Prefix for the qc output files to be generated by run_CoNGA().
#' @param runCongaOutputDirectory The directory that will store the many files created during run_CoNGA().
#' @param congaMetadataPrefix A prefix to be added to the columns of the metadata that will be added from CoNGA within the returned Seurat object.
#' @param pngConversionTool specify the conversion tool used by CoNGA for svg to png conversion. Can be any of: convert, inkscape, rsvg
#' @return A Seurat object with the conga metadata relevant to TCR + gene expression clustering appended. 
#' @examples
#' \dontrun{
#'   CreateMergedTcrClonotypeFile(seuratObj,
#'                                outputFile = "./tcrs.csv")
#'   congaSeuratObj <- RunCoNGA(seuratObj = seuratObj,
#'                              tcrClonesFile = "./tcrs.csv",
#'                              organism = "rhesus")
#' }
#' @export

RunCoNGA <- function(seuratObj=NULL,
                     tcrClonesFile = NULL,
                     seuratToCongaDir = "./seuratToConga", 
                     assayName = "RNA", 
                     organism = NULL,
                     runCongaOutputFilePrefix = "conga_output", 
                     gexDatatype = "10x_h5", 
                     runCongaOutfilePrefixForQcPlots = "qc_plots", 
                     runCongaOutputDirectory = "./conga_output", 
                     congaMetadataPrefix = "conga_",
                     pngConversionTool = NULL) {
  #check python requirements
  if (!reticulate::py_available(initialize = TRUE)) {
    stop(paste0('Python/reticulate not configured. Run "reticulate::py_config()" to initialize python'))
  }
  
  if (!reticulate::py_module_available('conga')) {
    stop('The conga python package has not been installed! If you believe it has been installed, run reticulate::import("conga") to get more information and debug')
  }
  
  #check required inputs
  if (is.null(seuratObj)){
    stop("seuratObj is NULL. Please supply a Seurat object to run CoNGA.")
  }
  
  if (is.null(tcrClonesFile)){
    stop("tcrClonesFile is NULL. Please supply a csv file.")
  } else if (!file.exists(tcrClonesFile)){
    stop("tcrFileFromRdsicvr does not exist. Please ensure your path specification is correct.")
  }

  allowedOrganisms <- c('human', 'mouse', 'rhesus', 'human_gd', 'mouse_gd', 'rhesus_gd')
  if (is.null(organism)){
    stop("Please specify an organism. 'human', 'mouse', and 'rhesus' are supported.")
  } else if (!(organism %in% allowedOrganisms )){
    stop(paste0("Unsupported organism type supplied. Please specify one of: ", paste0(allowedOrganisms, collapse = ', ')))
  }
  
  #strip trailing slashes from runCongaOutputDirectory directory if user-supplied
  if (grepl("/$", runCongaOutputDirectory)){
    runCongaOutputDirectory <- gsub("/$", "", runCongaOutputDirectory)
  }
  #make the runCongaOutputDirectory directory if it doesn't already exist (this is where the output files from run_CoNGA() will be written.)
  if (!dir.exists(runCongaOutputDirectory)) {
    dir.create(runCongaOutputDirectory, recursive = T)
  }

  #normalize paths in case they were specified using non-absolute paths.
  runCongaOutputDirectory <- R.utils::getAbsolutePath(runCongaOutputDirectory)
  tcrClonesFile <- R.utils::getAbsolutePath(tcrClonesFile)
  tcrClones <- read.csv(tcrClonesFile)
  tcrClones <- tcrClones |> dplyr::filter(barcode %in% rownames(seuratObj@meta.data))
  if (nrow(tcrClones) == 0) {
    stop('There were no matching barcodes between the seurat object and TCR clones file')
  }
  write.csv(tcrClones, tcrClonesFile)

  #Run SeuratToCoNGA() to generate files for the python run_CoNGA() call and normalize paths. 
  SeuratToCoNGA(seuratObj, tcrClonesFile, seuratToCongaDir, assayName = assayName)
  variable_features_file <- R.utils::getAbsolutePath(paste0(seuratToCongaDir, "/varfeats.csv"))
  tcr_datafile <- R.utils::getAbsolutePath(paste0(seuratToCongaDir, "/TCRs.csv"))
  gex_datafile <- R.utils::getAbsolutePath(paste0(seuratToCongaDir, "/GEX.h5"))
  clones_file <- R.utils::getAbsolutePath(paste0(seuratToCongaDir,"/clones_file.txt"), mustWork = FALSE)

  #copy run_CoNGA.py in inst/scripts and supply custom arguments 
  str <- readr::read_file(system.file("scripts/run_CoNGA.py", package = "CellMembrane"))
  script <- tempfile()
  readr::write_file(str, script)

  if (!is.null(pngConversionTool)) {
    allowedTools <- c('convert', 'inkscape', 'rsvg')
    if (!pngConversionTool %in% allowedTools) {
      stop(paste0('Unknown pngConversionTool, must be one of: ', paste0(allowedTools, collapse = ',')))
    }

    print(paste0('Setting CoNGA pngConversionTool to: ', pngConversionTool))
    readr::write_file('import os\n', script, append = TRUE)
    readr::write_file(paste0('os.environ["CONGA_PNG_TO_SVG_UTILITY"] = "', pngConversionTool, '"\n'), script, append = TRUE)
  }

  newstr <- paste0("run_CoNGA(features_file = '", variable_features_file,
                   "',tcr_datafile = '", tcr_datafile,
                   "', gex_datafile = '", gex_datafile,
                   "', organism = '", organism,
                   "', outfile_prefix = '", runCongaOutputFilePrefix,
                   "', gex_datatype = '", gexDatatype,
                   "', clones_file = '", clones_file,
                   "', outfile_prefix_for_qc_plots = '", runCongaOutfilePrefixForQcPlots,
                   "', working_directory = '", runCongaOutputDirectory,
                   "')")
  #write the new arguments into the script and execute
  readr::write_file(newstr, script, append = TRUE)
  system2(reticulate::py_exe(), script)
  
  #Find the clustering information within the adata2obs.csv (i.e. the  metadata for the representative clones within CoNGA) and append the information to the input Seurat object.
  if (!file.exists(paste0(runCongaOutputDirectory,"/", runCongaOutputFilePrefix,"_adata2obs.csv"))){
    stop(paste0(runCongaOutputFilePrefix,"_adata2obs.csv was not found! Unable to append CoNGA's clustering information to input Seurat object."))
  }

  conga_dataframe <- read.csv(R.utils::getAbsolutePath(paste0(runCongaOutputDirectory,"/", runCongaOutputFilePrefix,"_adata2obs.csv")))
  #the cell barcodes are stored in "X"
  rownames(conga_dataframe) <- conga_dataframe[,"X"]
  #Append congaMetadataPrefix so we can run Conga iteratively and record the clustering information.
  colnames(conga_dataframe) <- paste0(congaMetadataPrefix, colnames(conga_dataframe))
  seuratObj <- Seurat::AddMetaData(seuratObj, metadata = conga_dataframe[,paste0(congaMetadataPrefix,c("clusters_gex", "clusters_tcr", 'nndists_gex', 'nndists_tcr',
                                                                                                           'is_invariant', 'conga_scores', 'conga_fdr_values'))])
  return(seuratObj)
}

#' @title SeuratToCoNGA
#' @param seuratObj The Seurat object to be written into a 10x file format
#' @param tcrClonesFile The 10x clonotypes file for this Seurat object. Single lanes can use the CellRanger/Vloupe contigs CSV file. Merged lanes need to merge these files and modify them to create unique cellbarcodes and clonotype names, such as the code from Rdiscvr::CreateMergedTcrClonotypeFile().
#' @param seuratToCongaDir The local path to write the output files.
#' @param assayName The name of the gene expression data assay.
#' @description A wrapper function to prepare a Seurat object for Conga.
#' @export
SeuratToCoNGA <- function(seuratObj, 
                           tcrClonesFile,
                           seuratToCongaDir, 
                           assayName = 'RNA') {
  if (!dir.exists(seuratToCongaDir)) {
    dir.create(seuratToCongaDir, recursive = T)
  }

  write.table(Seurat::VariableFeatures(seuratObj), R.utils::getAbsolutePath(paste0(seuratToCongaDir, "/varfeats.csv"), mustWork = FALSE), row.names = FALSE, col.names = FALSE)
  file.copy(from = R.utils::getAbsolutePath(tcrClonesFile, mustWork = FALSE),
            to = R.utils::getAbsolutePath(paste0(seuratToCongaDir, "/TCRs.csv"), mustWork = FALSE),
            overwrite = T)
  
  DropletUtils::write10xCounts(x = Seurat::GetAssayData(seuratObj, assay = assayName, layer = 'counts'),
                               path = R.utils::getAbsolutePath(paste0(seuratToCongaDir, "/GEX.h5"), mustWork = FALSE),
                               overwrite = TRUE)
}

#' @title Calculate TCR diversity from a seurat object
#'
#' @description Calculate TCR a diversity profile for each library in the data
#' @param seuratObj A seurat object with the following columns: TRA, TRA_V, TRB, TRB_V
#' @param groupField The field that defines sample groups.
#' @param order1 The minimum order for calculating the generalized Simpson entropy.
#' @param order2 The maximum order for calculating the generalized Simpson entropy.
#' @return A data frame with the results
#' @export
CalculateTcrDiversityFromSeurat <- function(seuratObj,
                          groupField,
                          order1 = 1,
                          order2 = 200) {

  cols <- c(groupField, c('TRA_V', 'TRB_V', 'TRA', 'TRB'))
  if (!all(cols %in% names(seuratObj@meta.data))) {
    missing <- cols[! cols %in% names(seuratObj@meta.data)]
    stop(paste0('The following columns were missing: ', paste0(missing, collapse = ',')))
  }

  df <- seuratObj@meta.data[cols]
  names(df) <- c('sampleId', 'v_a_gene', 'v_b_gene', 'cdr3_a_aa', 'cdr3_b_aa')
  df <- df %>% dplyr::filter(!is.na(v_a_gene) & !is.na(v_b_gene) & !is.na(cdr3_a_aa) & !is.na(cdr3_b_aa)) %>%
    dplyr::group_by(sampleId, v_a_gene, v_b_gene, cdr3_a_aa, cdr3_b_aa) %>%
    dplyr::summarize(clone_size = n())

  print(paste0('Total cells with paired a/b TCR data: ', sum(df$clone_size), ', out of ', ncol(seuratObj), ' input cells'))

  return(CalculateTcrDiversity(df, order1 = order1, order2 = order2))
}

#' @title Calculate TCR diversity
#'
#' @description Plot a diversity profile for each library in the data
#' @param inputData The a data frame with the columns: sampleId, v_a_gene, v_b_gene, cdr3_a_aa, cdr3_b_aa, and clone_size
#' @param order1 The minimum order for calculating the generalized Simpson entropy.
#' @param order2 The maximum order for calculating the generalized Simpson entropy.
#' @return A data frame with the results
#' @export
CalculateTcrDiversity <- function(inputData,
                     order1 = 1,
                     order2 = 200) {

  cols <- c('sampleId', 'v_a_gene', 'v_b_gene', 'cdr3_a_aa', 'cdr3_b_aa', 'clone_size')
  if (!all(cols %in% names(inputData))) {
    missing <- cols[! cols %in% names(inputData)]
    stop(paste0('The following columns were missing: ', paste0(missing, collapse = ',')))
  }

  #check python requirements
  if (!reticulate::py_available(initialize = TRUE)) {
    stop(paste0('Python/reticulate not configured. Run "reticulate::py_config()" to initialize python'))
  }
  
  if (!reticulate::py_module_available('tcrdist')) {
    stop('The tcrdist3 python package has not been installed! If you believe it has been installed, run reticulate::import("tcrdist") to get more information and debug')
  }
  
  conga_clones_file <- tempfile()
  write.table(inputData, file = conga_clones_file, sep = '\t', row.names = FALSE, quote = FALSE)

  #copy calculate_Diversity.py in inst/scripts and supply custom arguments
  str <- readr::read_file(system.file("scripts/calculate_Diversity.py", package = "CellMembrane"))
  scriptFile <- tempfile()
  readr::write_file(str, scriptFile)

  outputFile <- tempfile()

  newstr <- paste0("calculate_Diversity(conga_clones_file = '", conga_clones_file,
                   "', outputFile = '", outputFile,
                   "', order1 = '", order1,
                   "', order2 = '", order2,
                   "')")
  
  #write the new arguments into the script and execute
  readr::write_file(newstr, scriptFile, append = TRUE)
  system2(reticulate::py_exe(), scriptFile)
  
  df <- read.csv(outputFile)

  y <- colnames(df)[grepl("^Z", colnames(df)) & !grepl("_LCL_|_UCL_", colnames(df))]
  if (is.null(y) || length(y) == 0) {
    stop('Parsing error in tcrdist3 output. No columns containing diversity metrics were found.')
  }
  print(df |> select(tidyr::all_of(c("order", y))) |>
    tidyr::pivot_longer(cols = y, names_to = "sample_div", values_to = "y") |> 
    ggplot(aes(x = order, y = y)) + geom_line(aes(color = sample_div)) + egg::theme_article()
  )

  unlink(conga_clones_file)
  unlink(scriptFile)
  unlink(outputFile)

  return(df)
}

#' @title Append Clone Properties
#' @param seuratObj The Seurat object to append clone properties to
#' @param tcrClonesFile The 10x clonotypes file for this Seurat object. Single lanes can use the CellRanger/Vloupe contigs CSV file. Merged lanes need to merge these files and modify them to create unique cellbarcodes and clonotype names, such as the code from Rdiscvr::CreateMergedTcrClonotypeFile().
#' @param groupingFields A list of fields used to group cells by sample, which establishes librarySize
#' @description A function to append clone properties to a Seurat object
#' @export
QuantifyTcrClones <- function(seuratObj, tcrClonesFile, groupingFields = 'cDNA_ID') {
  
  # Append clonotypeID to seuratObj metadata
  df <- read.csv(tcrClonesFile) |> select(tidyr::all_of(c('barcode', 'raw_clonotype_id'))) |> unique()
  meta <- seuratObj@meta.data |> select()
  meta$barcode <- rownames(meta)
  merged <- left_join(meta, df, by = "barcode")
  seuratObj <- AddMetaData(seuratObj, merged$raw_clonotype_id, col.name = "clonotypeID")
  
  # Calculate and append clone properties to seuratObj metadata
  groupingFieldsWithClone <- c(groupingFields, 'clonotypeID')
  meta2 <- seuratObj@meta.data |> select(tidyr::all_of(groupingFieldsWithClone)) |> group_by(across(all_of(groupingFieldsWithClone))) |> mutate(counts = n())
  meta2$counts <- ifelse(is.na(meta2$clonotypeID), NA, meta2$counts)
  meta2 <- meta2 |> group_by(across(all_of(groupingFields))) |> mutate(libSize = n())
  meta2$prop <- meta2$counts/meta2$libSize
  seuratObj <- AddMetaData(seuratObj, meta2$prop, col.name = "cloneProportion")
  seuratObj <- AddMetaData(seuratObj, meta2$counts, col.name = "cloneSize")
  
  return(seuratObj)
}
bimberlabinternal/CellMembrane documentation built on Oct. 16, 2024, 6:53 a.m.