R/VDJ_build.R

#' Minimal Version of the VDJ Building Part from VDJ_GEX_matrix() Function
#'
#' This function imports Cell Ranger output into an R dataframe for downstream analyses. It is a minimal version of the VDJ building part from the `VDJ_GEX_matrix()` function of the Platypus package, adapted for Cell Ranger v7 and older versions. Seurat objects can be integrated by matching barcodes from the Seurat object's metadata with the barcodes in the `barcode` column of the VDJ dataframe.
#'
#' @param VDJ.directory A string specifying the path to the parent directory containing the output folders (one folder for each sample) of Cell Ranger. This pipeline assumes that the output file names have not been changed from the default 10x settings in the `/outs/` folder. It is compatible with both B and T cell repertoires. The following 5 files are required within each sample folder: 
#' \describe{
#'   \item{`filtered_contig_annotations.csv`}{Contains the filtered contig annotations.}
#'   \item{`filtered_contig.fasta`}{Contains the filtered contig sequences in FASTA format.}
#'   \item{`consensus_annotations.csv`}{Contains the consensus annotations.}
#'   \item{`consensus.fasta`}{Contains the consensus sequences in FASTA format.}
#'   \item{`concat_ref.fasta`}{Contains concatenated reference sequences.}
#' }
#' @param VDJ.sample.list A list specifying the paths to the output folders (one folder for each sample) of Cell Ranger. This pipeline assumes that the output file names have not been changed from the default 10x settings in the `/outs/` folder and requires the same 5 files listed above.
#' @param remove.divergent.cells A logical value (`TRUE`/`FALSE`). If `TRUE`, cells with more than one VDJ transcript or more than one VJ transcript will be excluded. This could be due to multiple cells being trapped in one droplet or light chain dual expression (concerns ~2-5 percent of B cells, see DOI:10.1084/jem.181.3.1245). Defaults to `FALSE`.
#' @param complete.cells.only A logical value (`TRUE`/`FALSE`). If `TRUE`, only cells with both a VDJ transcript and a VJ transcript are included in the VDJ dataframe. Keeping only cells with 1 VDJ and 1 VJ transcript could be preferable for downstream analysis. Defaults to `FALSE`.
#' @param trim.germlines A logical value (`TRUE`/`FALSE`). If `TRUE`, the raw germline sequences of each clone will be trimmed using the consensus sequences of that clone as reference sequences (using `Biostrings::pairwiseAlignment` with the option "global-local" and a gap opening cost specified by `gap.opening.cost`). Defaults to `FALSE`.
#' @param gap.opening.cost A numeric value representing the cost for opening a gap in `Biostrings::pairwiseAlignment` when aligning and trimming germline sequences. Defaults to 10.
#' @param parallel A logical value (`TRUE`/`FALSE`). If `TRUE`, the per-sample VDJ building is executed in parallel (parallelized across samples). Defaults to `FALSE`.
#' @param num.cores An integer specifying the number of cores to be used when `parallel = TRUE`. Defaults to all available cores minus 1 or the number of sample folders in `VDJ.directory` (whichever is smaller).
#'
#' @return A dataframe representing the VDJ / VGM[[1]] object. Each row in this dataframe represents one cell or one unique cell barcode.
#'
#' @details The function extracts and processes VDJ data from Cell Ranger output folders, making it suitable for integration with downstream analysis workflows such as Seurat. It can handle both T and B cell repertoires and is optimized for Cell Ranger v7.
#'
#' @export
#' 
#' @examples
#' \donttest{
#' try({
#'   VDJ <- VDJ_build(
#'     VDJ.directory = "path/to/VDJ_directory",
#'     remove.divergent.cells = TRUE,
#'     complete.cells.only = TRUE,
#'     trim.germlines = TRUE
#'   )
#' })
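#'
#' # Illustrative sketch (all paths and the 'seurat_obj' object below are placeholders):
#' # build the VDJ dataframe from an explicit list of sample folders in parallel, then
#' # subset it to the cell barcodes present in a Seurat object's metadata.
#' try({
#'   VDJ <- VDJ_build(
#'     VDJ.sample.list = list("path/to/sample1", "path/to/sample2"),
#'     parallel = TRUE,
#'     num.cores = 2
#'   )
#'   shared_barcodes <- intersect(VDJ$barcode, gsub("-1$", "", colnames(seurat_obj)))
#'   VDJ_matched <- VDJ[VDJ$barcode %in% shared_barcodes, ]
#' })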
#' }




VDJ_build <- function(VDJ.directory,
                      VDJ.sample.list,
                      remove.divergent.cells,
                      complete.cells.only,
                      trim.germlines,
                      gap.opening.cost,
                      parallel,
                      num.cores){


  # If both the 'VDJ.directory' and 'VDJ.sample.list' are missing, a message is returned and execution is stopped
  if(missing(VDJ.directory) & missing(VDJ.sample.list)){stop('Both the path to the parent directory containing the output folders (one for each sample) from Cell Ranger and the list of these sample directories are missing. Please provide one of the two.')}
  # If both a 'VDJ.directory' and 'VDJ.sample.list' are specified, a message is returned and execution is stopped
  if(!missing(VDJ.directory) & !missing(VDJ.sample.list)){stop('Both a path to the parent directory containing the output folders (one for each sample) from Cell Ranger and a list of sample directories are given as input. Please provide either one parent directory or one list of sample directories.')}
  
  #Set defaults
  if(missing(remove.divergent.cells)){remove.divergent.cells <- FALSE}
  if(missing(complete.cells.only)){complete.cells.only <- FALSE}
  if(missing(trim.germlines)){trim.germlines <- FALSE}
  if(missing(gap.opening.cost)){gap.opening.cost <- 10}
  if(missing(parallel)){parallel <- FALSE}
  
  # Create list with paths to the output folders of Cell Ranger (one path for each sample) in the parent directory 'VDJ.directory'
  if(missing(VDJ.sample.list)){
    VDJ.sample.list <- list.dirs(path=VDJ.directory,
                                 full.names=T,
                                 recursive=F)
  }
  # Add a '/' at the end of each directory (if not already present)
  VDJ.sample.list <- sub("/?$", "/", VDJ.sample.list)
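  # e.g. "path/to/sample" becomes "path/to/sample/", while "path/to/sample/" is left unchanged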

  # If the number of cores is not specified, it is set to the number of CPU cores on the current host minus one, or to the number of samples in 'VDJ.sample.list', whichever is lower
  if(parallel && missing(num.cores)){
    num.cores <- min(c((parallel::detectCores() - 1), length(VDJ.sample.list)))
  }

  # Return a message that performing pairwise alignment for all germline sequences significantly increases computation time (if 'trim.germlines' is set to TRUE)
  if(trim.germlines){message("Warning: Setting 'trim.germlines' to TRUE will significantly increase computation time; please be patient!\n")}

  #Global variable definitions for CRAN checks
  sequence_trimmed <- NULL
  chain <- NULL
  barcode <- NULL
  VDJ_clonotype_id <- NULL
  VJ_clonotype_id <- NULL
  VDJ_chain_count <- NULL
  VJ_chain_count <- NULL
  celltype <- NULL
  VDJ_cgene <- NULL
  clonotype_id <- NULL

  

  get_trim_seqs_from_json <- function(annotation_file){

    # Obtain trimmed sequences from the raw sequences by cutting each sequence from the start index of the annotated 'L-REGION+V-REGION' to the end index of the annotated 'J-REGION', as given in the 'all_contig_annotations.json' or 'consensus_annotations.json' file from Cell Ranger
    # Arguments:
    # - annotation_file: path to JSON file containing annotations of contig or consensus sequences ('all_contig_annotations.json' or 'consensus_annotations.json')
    # Authors: Victor Kreiner, Tudor-Stefan Cotet, Valentijn Tromp

    # Read JSON file and store data object in 'annotations_list'
    # The annotations list, containing a sublist for each contig/consensus sequence, is eventually transformed into a dataframe in which each row represents one trimmed contig/consensus sequence
    # A sequence may contain the following annotation regions: '5-UTR', 'L-REGION+V-REGION', 'D-REGION', 'J-REGION', 'C-REGION' (which are listed in 'annotations')
    annotations_list <- jsonlite::read_json(annotation_file)

    # Define function to extract the following info for each contig/consensus sequence:
    # - id: contig or consensus ID, depending on the JSON file provided ('all_contig_annotations.json' or 'consensus_annotations.json', respectively)
    # - sequence_trimmed: the trimmed sequence, i.e. the raw sequence cut from the start of the 'L-REGION+V-REGION' to the end of the 'J-REGION'
    extract_info <- function(single_annotation){

      # Check if both 'L-REGION+V-REGION' and 'J-REGION' annotations are present
      if (all(c("L-REGION+V-REGION", "J-REGION") %in% sapply(single_annotation$annotations, function(x) x$feature$region_type))){

        # Identify 'L-REGION+V-REGION' annotation (if present)
        lv_region <- single_annotation$annotations[[which(sapply(single_annotation$annotations, function(x) x$feature$region_type == "L-REGION+V-REGION"))]]
        # Identify 'J-REGION' annotation (if present)
        j_region <- single_annotation$annotations[[which(sapply(single_annotation$annotations, function(x) x$feature$region_type == "J-REGION"))]]

        # Create dataframe 'trimmed_sequence' that contains the contig/consensus ID and the trimmed sequence
        trimmed_sequence <- data.frame(
          # Retrieve contig/consensus ID
          id = single_annotation$contig_name,
          # Trim raw sequence using 'L-REGION+V-REGION' start and 'J-REGION' end indices
          sequence_trimmed = substr(single_annotation$sequence,
                                    start = as.numeric(lv_region$contig_match_start)+1,
                                    stop = as.numeric(j_region$contig_match_end))
        )

        # Return 'trimmed_sequence' dataframe
        return(trimmed_sequence)
      }
    }

    # Use 'lapply' to apply the 'extract_info' function to each element of 'annotations_list' and store in 'trimmed_sequences' list
    trimmed_sequences <- lapply(annotations_list, extract_info)
    # Transform the 'trimmed_sequences' list into a dataframe by 'rbind'
    trimmed_sequences <- do.call(rbind, trimmed_sequences[!sapply(trimmed_sequences, is.null)])
    # Remove "-1" from contig/consensus IDs
    trimmed_sequences$id <- gsub(trimmed_sequences$id, pattern="-1", replacement="")

    # Return the 'trimmed_sequences' dataframe
    return(trimmed_sequences)
  }
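
  # Illustrative note (not executed): the returned dataframe has one row per annotated
  # sequence, with columns 'id' (the contig/consensus ID without the "-1" suffix) and
  # 'sequence_trimmed' (the raw sequence cut from the start of the 'L-REGION+V-REGION'
  # to the end of the 'J-REGION').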


  trim_seq <- function(seq1, seq2, gap.opening.cost = 10){

    # Perform global-local alignment with seq1 and seq2 and thereby trim seq2 (subject) using seq1 (pattern)
    # Arguments:
    # - seq1: sequence used as reference during pairwise alignment
    # - seq2: sequence that will be trimmed during pairwise alignment
    # - gap.opening.cost: cost for opening a gap in Biostrings::pairwiseAlignment
    # Authors: Evgenios Kladis, Valentijn Tromp

    # If 'seq1' or 'seq2' is missing, an empty string is returned
    if(is.na(seq1) | is.na(seq2)){
      return("")
    } else{

      # Perform pairwise alignment (the entire 'pattern' is aligned against a local region of the 'subject')
      alignment <- Biostrings::pairwiseAlignment(pattern = Biostrings::DNAString(seq1),
                                                 subject = Biostrings::DNAString(seq2),
                                                 type="global-local",
                                                 gapOpening = gap.opening.cost)

      # Return the subject of the alignment, which is the trimmed seq2
      return(as.character(alignment@subject))
    }
  }
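
  # Illustrative example (not executed): trim_seq("ACGTACGT", "TTTTACGTACGTTTTT")
  # aligns the full pattern against a local region of the subject and is expected to
  # return roughly "ACGTACGT", i.e. seq2 trimmed to the region covered by seq1.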

  translate_DNA<- function(sequence){

    #Translate a nucleotide sequence into an amino acid sequence
    #Arguments:
    #- sequence: nucleotide sequence to be translated

    if (sequence == ""){
      return("")
    }

    #Genetic code
    genetic_code <- list(
      "TTT"="F", "TTC"="F", "TTA"="L", "TTG"="L",
      "TCT"="S", "TCC"="S", "TCA"="S", "TCG"="S",
      "TAT"="Y", "TAC"="Y", "TAA"="*", "TAG"="*",
      "TGT"="C", "TGC"="C", "TGA"="*", "TGG"="W",
      "CTT"="L", "CTC"="L", "CTA"="L", "CTG"="L",
      "CCT"="P", "CCC"="P", "CCA"="P", "CCG"="P",
      "CAT"="H", "CAC"="H", "CAA"="Q", "CAG"="Q",
      "CGT"="R", "CGC"="R", "CGA"="R", "CGG"="R",
      "ATT"="I", "ATC"="I", "ATA"="I", "ATG"="M",
      "ACT"="T", "ACC"="T", "ACA"="T", "ACG"="T",
      "AAT"="N", "AAC"="N", "AAA"="K", "AAG"="K",
      "AGT"="S", "AGC"="S", "AGA"="R", "AGG"="R",
      "GTT"="V", "GTC"="V", "GTA"="V", "GTG"="V",
      "GCT"="A", "GCC"="A", "GCA"="A", "GCG"="A",
      "GAT"="D", "GAC"="D", "GAA"="E", "GAG"="E",
      "GGT"="G", "GGC"="G", "GGA"="G", "GGG"="G"
    )
    #Split the sequence into codons
    codons <- strsplit(sequence, "(?<=.{3})", perl=TRUE)[[1]]

    #Translate the codons
    for (codon_id in 1:length(codons)){
      #Remove codons that are not complete
      if(nchar(codons[codon_id]) < 3){
        codons[codon_id] = ""
      }
      #Codons that contain "-" are replaced with "-"
      else if (grepl("-", codons[codon_id], fixed = TRUE)){
        codons[codon_id] = "-"
      }
      #Translate codons according to the genetic code
      else{
        codons[codon_id] = genetic_code[[codons[codon_id]]]
      }
    }

    #Paste the codons together
    sequence <- paste(codons, collapse="")

    #Return the sequence
    return(sequence)
  }
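
  # Illustrative example (not executed): translate_DNA("ATGGCCTGA") returns "MA*"
  # (ATG -> M, GCC -> A, TGA -> stop); incomplete trailing codons are dropped and
  # codons containing "-" are translated to "-".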


  make_VDJ_sample <- function(VDJ.sample.directory,
                              remove.divergent.cells,
                              complete.cells.only,
                              trim.germlines,
                              gap.opening.cost){

    # Obtains all necessary data from a VDJ directory to construct the VDJ dataframe for one sample
    # Arguments:
    # - VDJ.sample.directory: Path to a single sample directory from Cell Ranger (optimized for v7)
    # - remove.divergent.cells: bool - if TRUE, cells with more than one VDJ transcript or more than one VJ transcript will be excluded. This could be due to multiple cells being trapped in one droplet or due to light chain dual expression (concerns ~2-5% of B cells, see DOI:10.1084/jem.181.3.1245). Defaults to FALSE.
    # - complete.cells.only: bool - if TRUE, only cells with both a VDJ transcript and a VJ transcript are included in the VDJ dataframe. Keeping only cells with 1 VDJ and 1 VJ transcript could be preferable for downstream analysis. Defaults to FALSE.
    # - trim.germlines: bool - if TRUE, the raw germline sequences of each clone will be trimmed using the consensus sequences of that clone as reference sequences (using Biostrings::pairwiseAlignment with the option "global-local" and a gap opening cost = gap.opening.cost). Defaults to FALSE.
    # - gap.opening.cost: float - the cost for opening a gap in Biostrings::pairwiseAlignment when aligning and trimming germline sequences. Defaults to 10.
    # Authors: Tudor-Stefan Cotet, Aurora Desideri Perea, Anamay Samant, Valentijn Tromp

    # 1. Read in the following files from Cell Ranger output and store in list 'out':
    # [[1]]: 'filtered_contig_annotations.csv' contains high-level annotations of each high-confidence contig from cell-associated barcodes
    # [[2]]: 'filtered_contig.fasta' contains contig sequences from barcodes that passed the filtering steps and are annotated in 'filtered_contig_annotations.csv'
    # [[3]]: 'consensus_annotations.csv' contains high-level annotations of each clonotype consensus sequence
    # [[4]]: 'consensus.fasta' contains the consensus sequences of each assembled contig, which is identical to the sequence of the most frequent exact subclonotype
    # [[5]]: 'concat_ref.fasta' contains the concatenated V(D)J reference segments for the segments detected on each consensus sequence and serves as an approximate reference for each consensus sequence

    # Check if the required files are present in the sample directory, otherwise, a message is returned and execution is stopped
    if(!file.exists(paste0(VDJ.sample.directory,"filtered_contig_annotations.csv"))){stop(paste0("Could not find the 'filtered_contig_annotations.csv' file in ", VDJ.sample.directory))}
    if(!file.exists(paste0(VDJ.sample.directory,"filtered_contig.fasta"))){stop(paste0("Could not find the 'filtered_contig.fasta' file in ", VDJ.sample.directory))}
    if(!file.exists(paste0(VDJ.sample.directory,"consensus_annotations.csv"))){stop(paste0("Could not find the 'consensus_annotations.csv' file in ", VDJ.sample.directory))}
    if(!file.exists(paste0(VDJ.sample.directory,"consensus.fasta"))){stop(paste0("Could not find the 'consensus.fasta' file in ", VDJ.sample.directory))}
    if(!file.exists(paste0(VDJ.sample.directory,"concat_ref.fasta"))){stop(paste0("Could not find the 'concat_ref.fasta' file in ", VDJ.sample.directory))}

    # Create empty list to store warnings during data retrieval out of single sample directory
    warnings <- list()

    # Create dataframe to store number of cells/barcodes filtered when 'remove.divergent.cells' and/or 'complete.cells.only' is/are set to TRUE (all counts default to 0)
    filtering_counts <- data.frame(divergent.cells = 0, incomplete.cells = 0, row.names = basename(VDJ.sample.directory))

    # Create empty list ('out') with length 5
    out <- vector("list", 5)

    # Read 'filtered_contig_annotations.csv' file and store dataframe in out[[1]]
    out[[1]] <- utils::read.csv(paste(VDJ.sample.directory,"filtered_contig_annotations.csv",sep=""),sep=",",header=T)
    # Add "_aa" to column names with aa sequences for clarity
    colnames(out[[1]]) <- gsub(colnames(out[[1]]), pattern="^(fwr\\d|cdr\\d)$", replacement="\\1_aa")
    # Remove "raw_" from column names for consistency
    colnames(out[[1]]) <- gsub(colnames(out[[1]]), pattern="raw_", replacement="")
    # Remove "-1" from cell barcodes and contig IDs
    out[[1]]$barcode <- gsub(out[[1]]$barcode, pattern="-1", replacement="")
    out[[1]]$contig_id <- gsub(out[[1]]$contig_id, pattern="-1", replacement="")
    # Remove "_" from 'v_gene', 'd_gene', 'j_gene', and 'c_gene' column
    colnames(out[[1]]) <- gsub(colnames(out[[1]]), pattern = "_gene", replacement="gene")
    # Concatenate nt and aa sequences of FWR1-4 and CDR1-3 regions and place concatenated sequence in new column in dataframe in out[[1]] (if FWR1-4 and CDR1-3 regions are annotated in 'filtered_contig_annotations.csv')
    if(all(c("fwr1_nt","cdr1_nt","fwr2_nt","cdr2_nt","fwr3_nt","cdr3_nt","fwr4_nt","fwr1_aa","cdr1_aa","fwr2_aa","cdr2_aa","fwr3_aa","cdr3_aa","fwr4_aa") %in% colnames(out[[1]]))){
      out[[1]]$sequence_nt_trimmed <- paste0(out[[1]][["fwr1_nt"]], out[[1]][["cdr1_nt"]],
                                             out[[1]][["fwr2_nt"]], out[[1]][["cdr2_nt"]],
                                             out[[1]][["fwr3_nt"]], out[[1]][["cdr3_nt"]],
                                             out[[1]][["fwr4_nt"]])
      out[[1]]$sequence_aa_trimmed <- paste0(out[[1]][["fwr1_aa"]], out[[1]][["cdr1_aa"]],
                                             out[[1]][["fwr2_aa"]], out[[1]][["cdr2_aa"]],
                                             out[[1]][["fwr3_aa"]], out[[1]][["cdr3_aa"]],
                                             out[[1]][["fwr4_aa"]])
    # If FWR1-4 and CDR1-3 regions are not annotated (e.g., in older Cell Ranger versions where only the CDR3 sequence is annotated in 'filtered_contig_annotations.csv'), trimmed sequences are retrieved using the 'all_contig_annotations.json' file
    }else{
      # Check if the 'all_contig_annotations.json' file is present in the sample directory
      if(file.exists(paste(VDJ.sample.directory,"all_contig_annotations.json",sep=""))){
        # Raw sequences are trimmed using the annotated 'L-REGION+V-REGION' and 'J-REGION' and stored in 'trimmed_contigs' dataframe
        trimmed_contigs <- get_trim_seqs_from_json(paste(VDJ.sample.directory,"all_contig_annotations.json",sep=""))
        # Trimmed sequences are appended to dataframe in out[[1]] by merging by 'contig_id'
        out[[1]] <- out[[1]] |>
          dplyr::left_join(trimmed_contigs, by= c("contig_id" = "id")) |>
          dplyr::rename(sequence_nt_trimmed = sequence_trimmed)
        # Translate trimmed nt sequences into aa sequences
        out[[1]]$sequence_aa_trimmed[which(!is.na(out[[1]]$sequence_nt_trimmed))]  <- sapply(out[[1]]$sequence_nt_trimmed[which(!is.na(out[[1]]$sequence_nt_trimmed))], function(x){
          aa_sequence <- suppressWarnings(translate_DNA(x))
          return(as.character(aa_sequence))})
      # If the 'all_contig_annotations.json' file is absent in the sample directory, a warning message is returned and 'sequence_nt_trimmed' and 'sequence_aa_trimmed' columns are set to NA
      } else{
        warnings <- c(warnings, paste0("Could not find the 'all_contig_annotations.json' file in ", VDJ.sample.directory, " so 'sequence_nt_trimmed' and 'sequence_aa_trimmed' columns of sample ", basename(VDJ.sample.directory), "  remain empty."))
        out[[1]]$sequence_nt_trimmed <- NA
        out[[1]]$sequence_aa_trimmed <- NA
      }
    }

    # Read 'filtered_contig.fasta' file and store sequences in dataframe in out[[2]]
    # Since additional filtration steps are performed by enclone during the clonotype grouping step, some contigs present in 'filtered_contig.fasta' may be absent in 'filtered_contig_annotations.csv'
    out[[2]] <- data.frame(Biostrings::readDNAStringSet(paste(VDJ.sample.directory,"filtered_contig.fasta",sep="")))
    # Rename column with sequences of dataframe in out[[2]]
    colnames(out[[2]]) <- 'sequence_nt_raw'
    # Add contig IDs from FASTA file in separate column in dataframe in out[[2]] and remove "-1" from the contig IDs
    out[[2]]$contig_id <- gsub(rownames(out[[2]]), pattern="-1", replacement="")

    # Read 'consensus_annotations.csv' file and store in out[[3]]
    out[[3]] <- utils::read.csv(paste(VDJ.sample.directory,"consensus_annotations.csv",sep=""),sep=",",header=T)
    # Add "_aa" to column names with aa sequences for clarity
    colnames(out[[3]]) <- gsub(colnames(out[[3]]), pattern="^(fwr\\d|cdr\\d)$", replacement="\\1_aa")
    # Modify consensus IDs by placing an underscore between 'consensus' and the subsequent number for consistency
    out[[3]]$consensus_id <- unlist(lapply(out[[3]]$consensus_id, function(x) gsub(x, pattern = "(consensus)(.*)", replacement = "\\1_\\2")))
    # Remove double underscore (if present)
    out[[3]]$consensus_id <- unlist(lapply(out[[3]]$consensus_id, function(x) gsub(x, pattern = "__", replacement = "_")))
    # Remove "_" from 'v_gene', 'd_gene', 'j_gene', and 'c_gene' column
    colnames(out[[3]]) <- gsub(colnames(out[[3]]), pattern = "_gene", replacement="gene")
    # Concatenate nt and aa sequences of FWR1-4 and CDR1-3 regions and place concatenated sequence in new column in dataframe in out[[3]] (if FWR1-4 and CDR1-3 regions are annotated in 'consensus_annotations.csv')
    if(all(c("fwr1_nt","cdr1_nt","fwr2_nt","cdr2_nt","fwr3_nt","cdr3_nt","fwr4_nt","fwr1_aa","cdr1_aa","fwr2_aa","cdr2_aa","fwr3_aa","cdr3_aa","fwr4_aa") %in% colnames(out[[3]]))){
      out[[3]]$consensus_nt_trimmed <- paste0(out[[3]][["fwr1_nt"]], out[[3]][["cdr1_nt"]],
                                              out[[3]][["fwr2_nt"]], out[[3]][["cdr2_nt"]],
                                              out[[3]][["fwr3_nt"]], out[[3]][["cdr3_nt"]],
                                              out[[3]][["fwr4_nt"]])
      out[[3]]$consensus_aa_trimmed <- paste0(out[[3]][["fwr1_aa"]], out[[3]][["cdr1_aa"]],
                                              out[[3]][["fwr2_aa"]], out[[3]][["cdr2_aa"]],
                                              out[[3]][["fwr3_aa"]], out[[3]][["cdr3_aa"]],
                                              out[[3]][["fwr4_aa"]])
    # If FWR1-4 and CDR1-3 regions are not annotated (e.g., in older Cell Ranger versions where only the CDR3 sequence is annotated in 'consensus_annotations.csv'), trimmed sequences are retrieved using the 'consensus_annotations.json' file
    }else{
      # Check if the 'consensus_annotations.json' file is present in the sample directory
      if(file.exists(paste0(VDJ.sample.directory,"consensus_annotations.json"))){
        # Raw sequences are trimmed using the annotated 'L-REGION+V-REGION' and 'J-REGION' and stored in 'trimmed_consensuses' dataframe
        trimmed_consensuses <- get_trim_seqs_from_json(paste(VDJ.sample.directory,"consensus_annotations.json",sep=""))
        # Trimmed sequences are appended to dataframe in out[[3]] by merging by 'consensus_id'
        out[[3]] <- out[[3]] |>
          dplyr::left_join(trimmed_consensuses, by= c("consensus_id" = "id")) |>
          dplyr::rename(consensus_nt_trimmed = sequence_trimmed)
        # Translate trimmed nt sequences into aa sequences
        out[[3]]$consensus_aa_trimmed[which(!is.na(out[[3]]$consensus_nt_trimmed))]  <- sapply(out[[3]]$consensus_nt_trimmed[which(!is.na(out[[3]]$consensus_nt_trimmed))], function(x){
          aa_sequence <- suppressWarnings(translate_DNA(x))
          return(as.character(aa_sequence))})
        # If the 'consensus_annotations.json' file is absent in the sample directory, a warning message is returned and 'consensus_nt_trimmed' and 'consensus_aa_trimmed' columns are set to NA
      } else{
        warnings <- c(warnings, paste0("Could not find the 'consensus_annotations.json' file in ", VDJ.sample.directory, " so 'consensus_nt_trimmed' and 'consensus_aa_trimmed' columns of sample ", basename(VDJ.sample.directory), " remain empty."))
        out[[3]]$consensus_nt_trimmed <- NA
        out[[3]]$consensus_aa_trimmed <- NA
      }
    }

    # Read 'consensus.fasta' file and store sequences in dataframe in out[[4]]
    out[[4]] <- data.frame(Biostrings::readDNAStringSet(paste(VDJ.sample.directory,"consensus.fasta",sep="")))
    # Rename column with sequences of dataframe in out[[4]]
    colnames(out[[4]]) <- 'consensus_nt_raw'
    # Add consensus IDs from FASTA file in separate column in dataframe in out[[4]]
    out[[4]]$consensus_id <- rownames(out[[4]])

    # Read 'concat_ref.fasta' file and store sequences in dataframe in out[[5]]
    out[[5]] <- data.frame(Biostrings::readDNAStringSet(paste(VDJ.sample.directory,"concat_ref.fasta",sep="")))
    # Rename column with sequences of dataframe in out[[5]]
    colnames(out[[5]]) <- 'germline_nt_raw'
    # Add consensus IDs from FASTA file in separate column in dataframe in out[[5]]
    out[[5]]$consensus_id <- rownames(out[[5]])
    # Modify raw consensus IDs by replacing '_concat_ref_' with '_consensus_'
    out[[5]]$consensus_id <- unlist(lapply(out[[5]]$consensus_id, function(x) gsub(x, pattern = "_concat_ref_", replacement = "_consensus_")))


    # 2. Merge data frames in list 'out' and rename items as follows:
    # [[1]] and [[2]] will be merged in 'filtered_contigs'
    # [[3]], [[4]], and [[5]] will be merged in 'consensuses'

    # Merge 'filtered_contig_annotations.csv' in out[[1]] with 'filtered_contig.fasta' in out[[2]] by 'contig_id'
    filtered_contigs <- out[[1]] |>
      dplyr::left_join(out[[2]], by = 'contig_id')
    # Merge 'consensus_annotations.csv' in out[[3]] with 'consensus.fasta' in out[[4]] and with 'concat_ref.fasta' in out[[5]] by 'consensus_id'
    consensuses <- out[[3]] |>
            dplyr::left_join(out[[4]], by="consensus_id") |>
            dplyr::left_join(out[[5]], by="consensus_id")


    # 3. Add 'germline_nt_trimmed' and 'germline_aa_trimmed' to 'consensuses' dataframe

    # If 'trim.germlines' is set to TRUE, perform pairwise alignment with the 'trim_seq' function for each consensus/germline pair with 'mapply'
    if(trim.germlines){
      consensuses$germline_nt_trimmed <- mapply(trim_seq, consensuses$consensus_nt_trimmed, consensuses$germline_nt_raw, gap.opening.cost)
      # Translate trimmed nt sequences into aa sequences
      consensuses$germline_aa_trimmed[which(!is.na(consensuses$germline_nt_trimmed))] <- sapply(consensuses$germline_nt_trimmed[which(!is.na(consensuses$germline_nt_trimmed))], function(x){
        aa_sequence <- suppressWarnings(translate_DNA(x))
        return(as.character(aa_sequence))
      })
    # If 'trim.germlines' is set to FALSE (default), 'germline_nt_trimmed' and 'germline_aa_trimmed' columns are set to NA
    }else{
      consensuses$germline_nt_trimmed <- NA
      consensuses$germline_aa_trimmed <- NA
    }


    # 4. Select the columns out of 'filtered_contigs' and 'consensuses' for each transcript and perform VDJ and VJ pairing
    # Simultaneously filter out cells/barcodes that contain multiple VDJ or VJ transcripts, if 'remove.divergent.cells' is set to TRUE

    # Define the columns to be selected out of 'filtered_contigs' for each VDJ (heavy/beta chain) and VJ (light/alpha chain) transcript
    columns_contigs <- c("barcode",
                         "contig_id",
                         "clonotype_id",
                         "consensus_id",
                         "chain",
                         "umis",
                         "vgene","dgene","jgene","cgene",
                         "fwr1_nt","fwr1_aa",
                         "cdr1_nt","cdr1_aa",
                         "fwr2_nt","fwr2_aa",
                         "cdr2_nt","cdr2_aa",
                         "fwr3_nt","fwr3_aa",
                         "cdr3_nt","cdr3_aa",
                         "fwr4_nt","fwr4_aa",
                         "sequence_nt_raw",
                         "sequence_nt_trimmed",
                         "sequence_aa_trimmed")

    # Define the columns to be selected out of 'consensuses' for each VDJ (heavy/beta chain) and VJ (light/alpha chain) transcript
    columns_consensuses <- c("consensus_id",
                             "consensus_nt_raw",
                             "consensus_nt_trimmed",
                             "consensus_aa_trimmed",
                             "germline_nt_raw",
                             "germline_nt_trimmed",
                             "germline_aa_trimmed")

    # All column names
    all_columns <- c(columns_consensuses, columns_contigs)

    # If columns are not present in 'filtered_contigs', add them with NA
    for(col in columns_contigs){
      if(!(col %in% colnames(filtered_contigs))){
        filtered_contigs[[col]] <- NA
      }
    }

    # If columns are not present in 'consensuses', add them with NA
    for(col in columns_consensuses){
      if(!(col %in% colnames(consensuses))){
        consensuses[[col]] <- NA
      }
    }

    # Pre-merge the filtered contigs and consensuses (before splitting into the VDJ and VJ subsets)
    contigs_merged <- subset(filtered_contigs, select = columns_contigs) |>
      dplyr::left_join(subset(consensuses, select = columns_consensuses), by = "consensus_id")

    # Create a VDJ subset by selecting contigs encoding a TRB, TRG, or IGH (heavy) chain
    vdj_subset <- subset(contigs_merged, chain %in% c("TRB","TRG","IGH"))
    # Add "VDJ_" to column names of 'vdj_subset' (except the 'barcode' column)
    colnames(vdj_subset) <- ifelse(colnames(vdj_subset)  == "barcode", yes = colnames(vdj_subset), no = paste0("VDJ_", colnames(vdj_subset)))

    # Create a VJ subset by selecting contigs encoding a TRA, TRD, IGK, or IGL (light) chain
    vj_subset <- subset(contigs_merged, chain %in% c("TRA","TRD","IGK","IGL")) |>
      # Select all columns except 'dgene' (absent in VJ transcripts)
      dplyr::select(all_columns[all_columns != "dgene"])
    # Add "VJ_" to column names of 'vj_subset' (except the 'barcode' column)
    colnames(vj_subset) <- ifelse(colnames(vj_subset) == "barcode", yes = colnames(vj_subset), no = paste0("VJ_", colnames(vj_subset)))

    # If 'remove.divergent.cells' is set to TRUE, filter out cells with multiple VDJ transcripts or multiple VJ transcripts by removing non-unique barcodes
    if(remove.divergent.cells){
      # Retrieve non-unique barcodes in 'vdj_subset' and 'vj_subset' and store in 'non_singlets_vdj' and 'non_singlets_vj' vector, respectively
      non_singlets_vdj <- unique(vdj_subset$barcode[duplicated(vdj_subset$barcode)])
      non_singlets_vj <- unique(vj_subset$barcode[duplicated(vj_subset$barcode)])
      non_singlets <- unique(c(non_singlets_vdj, non_singlets_vj))
      # Subset by the unique barcodes per VDJ and VJ dataframe
      vdj_subset_processed <- subset(vdj_subset, !(barcode %in% non_singlets))
      vj_subset_processed <- subset(vj_subset, !(barcode %in% non_singlets))
      # Save number of cells excluded as divergent cells (non-unique barcodes) in 'filtering_counts' dataframe
      filtering_counts[basename(VDJ.sample.directory), "divergent.cells"] <- length(non_singlets)
      }

    # If 'remove.divergent.cells' is set to FALSE, merge rows with identical 'barcode' in 'vdj_subset' and 'vj_subset'
    if(!remove.divergent.cells){
      # Concatenate non-missing values in each column into a comma-separated string for 'vdj_subset' and 'vj_subset'
      vdj_subset_processed <- stats::aggregate(data = vdj_subset, x = . ~ barcode, FUN = \(x)toString(stats::na.omit(x)), na.action = identity)
      vj_subset_processed <- stats::aggregate(data = vj_subset, x = . ~ barcode, FUN = \(x)toString(stats::na.omit(x)), na.action = identity)
    }
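    # Illustrative example (not executed): with 'remove.divergent.cells' set to FALSE, a barcode
    # with two VDJ contigs is collapsed into a single row in which, for example, the 'VDJ_cgene'
    # column holds a comma-separated string such as "IGHG1, IGHM"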

    # Merge 'vdj_subset_processed' with 'vj_subset_processed' by 'barcode'
    VDJ_df <- dplyr::full_join(vdj_subset_processed, vj_subset_processed, by = "barcode")

    # Add number of contigs per barcode for both VDJ and VJ transcript in columns 'VDJ_chain_count' and 'VJ_chain_count'
    VDJ_df$VDJ_chain_count <- sapply(VDJ_df$barcode, function(x) nrow(subset(vdj_subset, barcode == x)))
    VDJ_df$VJ_chain_count <- sapply(VDJ_df$barcode, function(x) nrow(subset(vj_subset, barcode == x)))

    # Combine 'VDJ_clonotype_id' and 'VJ_clonotype_id' into a single 'clonotype_id' column
    VDJ_df <- VDJ_df |>
      dplyr::mutate(clonotype_id = ifelse(!is.na(VDJ_clonotype_id) & !is.na(VJ_clonotype_id),
                                          ifelse(VDJ_clonotype_id == VJ_clonotype_id, VDJ_clonotype_id, paste(VDJ_clonotype_id, VJ_clonotype_id, sep = ", ")),
                                          ifelse(is.na(VDJ_clonotype_id), VJ_clonotype_id, VDJ_clonotype_id))) |>
      dplyr::select(-VDJ_clonotype_id, -VJ_clonotype_id)
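    # Illustrative example (not executed): a cell whose VDJ and VJ contigs were both assigned to
    # "clonotype1" gets clonotype_id "clonotype1"; if they were assigned to different clonotypes
    # (e.g. "clonotype1" and "clonotype2"), the IDs are concatenated as "clonotype1, clonotype2";
    # cells with only one chain keep the clonotype ID of that chain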

    # If 'complete.cells.only' is set to TRUE, filter out cells that have a 'VDJ_chain_count' or 'VJ_chain_count' of 0
    if(complete.cells.only){
      number_cells_before <- nrow(VDJ_df)
      VDJ_df <- subset(VDJ_df, VDJ_chain_count > 0 & VJ_chain_count > 0)
      number_cells_after <- nrow(VDJ_df)
      # Save number of cells excluded due to missing VDJ or VJ transcript in 'filtering_counts' dataframe
      filtering_counts[basename(VDJ.sample.directory), "incomplete.cells"] <- number_cells_before - number_cells_after
      }


    # 5. Finalize VDJ dataframe

    # Add column 'sample_id' to the 'VDJ_df' dataframe, using the name of the sample directory folder
    VDJ_df$sample_id <- basename(VDJ.sample.directory)

    # Add column 'celltype' based on chains in the 'VDJ_chain' and 'VJ_chain' columns ("TRA", "TRB", "TRD", and "TRG" --> "T cell"; "IGH", "IGK", and "IGL" --> "B cell")
    VDJ_df$celltype[stringr::str_detect(paste0(VDJ_df$VDJ_chain,VDJ_df$VJ_chain), "TR")] <- "T cell"
    VDJ_df$celltype[stringr::str_detect(paste0(VDJ_df$VDJ_chain,VDJ_df$VJ_chain), "IG")] <- "B cell"

    # Add the 'isotype' column based on the 'VDJ_cgene' column
    VDJ_df <- VDJ_df |>
      dplyr::mutate(
        isotype = ifelse(celltype == "B cell",
                         sub("^IGH([A-Z])([0-9])([A-Z]*)$", "Ig\\1\\2", VDJ_cgene),
                         NA))
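    # Illustrative example (not executed): a 'VDJ_cgene' value of "IGHG1" yields isotype "IgG1"
    # and "IGHA2" yields "IgA2"; values that do not match the pattern (e.g. "IGHM", which lacks
    # a digit) are left unchanged by sub()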

    # Add column 'clonotype_frequency'
    VDJ_df$clonotype_frequency <- unlist(lapply(VDJ_df$clonotype_id, function(x) length(which(VDJ_df$clonotype_id == x))))

    # Specify order of columns in 'VDJ_df'
    new_order <- c("sample_id", "barcode", "celltype", "isotype",
                   "VDJ_contig_id", "VJ_contig_id",
                   "VDJ_consensus_id", "VJ_consensus_id",
                   "clonotype_id", "clonotype_frequency",

                   "VDJ_chain", "VDJ_chain_count", "VDJ_umis",
                   "VDJ_vgene", "VDJ_dgene", "VDJ_jgene", "VDJ_cgene",
                   "VDJ_fwr1_nt", "VDJ_fwr1_aa",
                   "VDJ_cdr1_nt", "VDJ_cdr1_aa",
                   "VDJ_fwr2_nt", "VDJ_fwr2_aa",
                   "VDJ_cdr2_nt", "VDJ_cdr2_aa",
                   "VDJ_fwr3_nt", "VDJ_fwr3_aa",
                   "VDJ_cdr3_nt", "VDJ_cdr3_aa",
                   "VDJ_fwr4_nt", "VDJ_fwr4_aa",
                   "VDJ_sequence_nt_raw", "VDJ_sequence_nt_trimmed", "VDJ_sequence_aa_trimmed",
                   "VDJ_consensus_nt_raw", "VDJ_consensus_nt_trimmed", "VDJ_consensus_aa_trimmed",
                   "VDJ_germline_nt_raw", "VDJ_germline_nt_trimmed", "VDJ_germline_aa_trimmed",

                   "VJ_chain", "VJ_chain_count", "VJ_umis",
                   "VJ_vgene", "VJ_jgene", "VJ_cgene",
                   "VJ_fwr1_nt", "VJ_fwr1_aa",
                   "VJ_cdr1_nt", "VJ_cdr1_aa",
                   "VJ_fwr2_nt", "VJ_fwr2_aa",
                   "VJ_cdr2_nt", "VJ_cdr2_aa",
                   "VJ_fwr3_nt", "VJ_fwr3_aa",
                   "VJ_cdr3_nt", "VJ_cdr3_aa",
                   "VJ_fwr4_nt", "VJ_fwr4_aa",
                   "VJ_sequence_nt_raw", "VJ_sequence_nt_trimmed", "VJ_sequence_aa_trimmed",
                   "VJ_consensus_nt_raw", "VJ_consensus_nt_trimmed", "VJ_consensus_aa_trimmed",
                   "VJ_germline_nt_raw", "VJ_germline_nt_trimmed", "VJ_germline_aa_trimmed")
    VDJ_df <- VDJ_df[, new_order]

    # Reorder rows in 'VDJ_df' dataframe by 'clonotype_id'
    VDJ_df <- VDJ_df |>
      dplyr::arrange(as.integer(stringr::str_extract(clonotype_id, "\\d+")))
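    # e.g. "clonotype2" is placed before "clonotype10", since the numeric part of the clonotype ID is compared rather than the full string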

    # Combine 'VDJ_df' dataframe, 'warnings' list, and 'filtering_counts' dataframe in 'output' list
    output <- list(VDJ_df = VDJ_df, warnings = warnings, filtering = filtering_counts)

    # Return 'output' list
    return(output)
  }


  # If 'parallel' is set to TRUE, the VDJ dataframe is constructed for each sample in parallel, using up to 'num.cores' cores
  if(parallel){

    # Retrieve the operating system
    operating_system <- Sys.info()[['sysname']]

    # Define partial function for 'mclapply' or 'parLapply' to be executed in parallel for each sample:
    partial_function <- function(VDJ.sample.directory){

      # Execute make_VDJ_sample() function
      make_VDJ_sample(VDJ.sample.directory,
                      remove.divergent.cells = remove.divergent.cells,
                      complete.cells.only = complete.cells.only,
                      trim.germlines = trim.germlines,
                      gap.opening.cost = gap.opening.cost)
      }

    # If the operating system is Linux or Darwin, 'mclapply' is used for parallelization
    if(operating_system %in% c('Linux', 'Darwin')){
      # Construct a VDJ dataframe for each sample output folder in 'VDJ.directory' and store output in the list 'output_list'
      output_list <- parallel::mclapply(mc.cores=num.cores, X=VDJ.sample.list, FUN=partial_function)
      # Name items in 'output_list' according to sample names in 'VDJ.sample.list'
      names(output_list) <- basename(VDJ.sample.list)
      }

    # If the operating system is Windows, "parLapply" is used for parallelization
    if(operating_system == "Windows"){
      # Create cluster
      cluster <- parallel::makeCluster(num.cores)
      # Construct a VDJ dataframe for each sample output folder in 'VDJ.directory' and store output in the list 'output_list'
      output_list <- parallel::parLapply(cluster, X=VDJ.sample.list, fun=partial_function)
      # Name items in 'output_list' according to sample names in 'VDJ.sample.list'
      names(output_list) <- basename(VDJ.sample.list)
      # Stop cluster
      parallel::stopCluster(cluster)
      }
  }

  # If 'parallel' is set to FALSE, the per-sample VDJ building is not executed in parallel
  if(!parallel){
    # Construct a VDJ dataframe for each sample output folder in 'VDJ.directory' and store output in the list 'output_list'
    output_list <- lapply(VDJ.sample.list, make_VDJ_sample,
                          remove.divergent.cells = remove.divergent.cells,
                          complete.cells.only = complete.cells.only,
                          trim.germlines = trim.germlines,
                          gap.opening.cost = gap.opening.cost)
    # Name items in 'output_list' according to sample names in 'VDJ.sample.list'
    names(output_list) <- basename(VDJ.sample.list)
    }

  # Select and print warning messages in 'output_list'
  warnings_output <- unlist(sapply(output_list, function(x) x$warnings))
  for(warning in warnings_output){message(paste0("Warning: ",warning,"\n"))}

  # Select and row bind filtering counts in 'output_list'
  filtering_output <- do.call(rbind, lapply(output_list, function(x) x$filtering))
  # Print 'filtering_output' to show number of cells/barcodes excluded (if 'remove.divergent.cells' and/or 'complete.cells.only' is/are set to TRUE)
  message("Warning: Please be aware of the number of cells excluded, when 'remove.divergent.cells' or 'complete.cells.only' is set to TRUE:")
  print(knitr::kable(filtering_output))

  # Select and row bind VDJ dataframes in 'output_list'
  VDJ_output <- do.call(rbind, lapply(output_list, function(x) x$VDJ_df))

  # Remove rownames from 'VDJ_output'
  rownames(VDJ_output) <- NULL

  # Ensure all missing values and empty strings are set to NA
  VDJ_output[VDJ_output == 'None'] <- NA
  VDJ_output[VDJ_output == 'NA'] <- NA
  VDJ_output[VDJ_output == 'NULL'] <- NA
  VDJ_output[VDJ_output == ''] <- NA
  VDJ_output[is.null(VDJ_output)] <- NA

  # Return 'VDJ_output'
  return(VDJ_output)
}
