R/PlatypusDB_VGM_to_AIRR.R
In Platypus: Single-Cell Immune Repertoire and Gene Expression Analysis

Documented in PlatypusDB_VGM_to_AIRR

#'Platypus V3 VGM to AIRR compatibility function
#'
#'@description Exports AIRR compatible tables supplemented with VDJ and GEX information from the Platypus VGM object and the cellranger output airr_rearrangements.tsv
#' @param VGM Output object of the VDJ_GEX_matrix function generated with VDJ.combine = T, GEX.combine = T (to merge all samples) and integrate.VDJ.to.GEX = T (to integrate VDJ and GEX data)
#'@param VDJ.features.to.append Character vector. Defaults to "none". Can be either "all" or column names of the VGM VDJ matrix (VGM[[1]]) to append to the AIRR compatible table.
#'@param GEX.features.to.append Character vector. Defaults to "none". Can be either "all" or GEX metadata column names or Gene names of the VGM GEX object (VGM[[2]])(passed to Seurat::FetchData()) to append to the AIRR compatible table. For a list of available features run: names(VGM[[2]]@meta.data) and rownames(VGM[[2]])
#'@param airr.rearrangements  Source of the airr_rearrangements.tsv file as generated by Cellranger. There are 3 available input options: 1. R list object from Platypus_DB_load_from_disk or Platypus_DB_fetch / 2. List with local paths to airr_rearrangements.tsv / 3. List of airr_rearrangements.tsv loaded in as R objects within the current R enviroment. ! Order of input list must be identical to that of sample_ids in the VGM ! If not provided or set to "none" CIGAR strings in output will be empty.
#' @param airr.integrate Boolean. Defaults to TRUE, whether to integrate output AIRR tables
#' @return A list of length of samples in VGM containing a AIRR-compatible dataframe for each sample if airr.integrate = F or a single dataframe if airr.integrate = T ! Cave the format: VGM object => 1 cell = 1 row; AIRR table 1 cell = as many rows as VDJ and VJ chains available for that cell. GEX cell-level information is attached to all rows containing a chain of that cell.
#' @export
#' @examples
#' \donttest{
#'try({
#'airr.list.out <- PlatypusDB_VGM_to_AIRR(VGM = VGM
#', VDJ.features.to.append = c("VDJ_cdr3s_aa")
#', GEX.features.to.append = c("CTLA4", "TOX"), airr.rearrangements = Data.in)
#'
#'airr.list.out <- PlatypusDB_VGM_to_AIRR(VGM = VGM
#', VDJ.features.to.append = c("VDJ_cdr3s_aa")
#', GEX.features.to.append = c("CTLA4", "TOX"),
#'airr.rearrangements =list("~/path_to/s1/airr.rearrangement.tsv"
#',"~/path_to/s2/airr_rearrangement.tsv"))
#'
#'airr.list.out <- PlatypusDB_VGM_to_AIRR(VGM = VGM
#', VDJ.features.to.append = c("VDJ_cdr3s_aa")
#', GEX.features.to.append = c("CTLA4", "TOX"),
#'airr.rearrangements = list(airr_rearrangements.s1, airr_rearrangements_2))
#'
#'VDJ.out.directory.list <- list()
#'VDJ.out.directory.list[[1]] <- c("~/cellrangerVDJ/s1")
#'VDJ.out.directory.list[[2]] <- c("~/cellrangerVDJ/s2")
#'
#'GEX.out.directory.list <- list()
#'GEX.out.directory.list[[1]] <- c("~/cellrangerGEX/s1")
#'GEX.out.directory.list[[2]] <- c("~/cellrangerGEX/s2")
#'VGM <- VDJ_GEX_matrix(VDJ.out.directory.list = VDJ.out.directory.list,
#'GEX.out.directory.list = GEX.out.directory.list,
#'GEX.integrate = TRUE, VDJ.combine = TRUE, integrate.GEX.to.VDJ = TRUE
#', integrate.VDJ.to.GEX = TRUE,
#'get.VDJ.stats = FALSE, trim.and.align = FALSE)
#'airr.list.out <- PlatypusDB_VGM_to_AIRR(VGM = VGM,
#'VDJ.features.to.append = c("VDJ_sequence_nt_trimmed","VJ_sequence_nt_trimmed"),
#'GEX.features.to.append = c("UMAP_1","UMAP_2","CTLA4", "TOX"),
#'airr.rearrangements = c("~/cellrangerVDJ/s1/airr_rearrangement.tsv"
#',"~/cellrangerVDJ/s2/airr_rearrangement.tsv"))
#' })
#' }
#'

PlatypusDB_VGM_to_AIRR <- function(VGM,
                                   VDJ.features.to.append,
                                   GEX.features.to.append,
                                   airr.rearrangements,
                                   airr.integrate){


  #garbage collector to clear some ram
  gc()

  #grab relevant columns from the VDJ and GEX VGM
  if(missing(VDJ.features.to.append)) VDJ.features.to.append <- "none"
  if(missing(GEX.features.to.append)) GEX.features.to.append <- "none"
  if(missing(airr.integrate)) airr.integrate <- TRUE

  if(VDJ.features.to.append[1] != "none"){
    if(any(c("barcode", "sample_id","celltype", "VDJ_chain_contig", "VJ_chain_contig") %in% names(VGM[[1]]) == FALSE)){
      stop('VGM VDJ matrix does not contain all neccessary columns: "barcode", "sample_id","celltype", "VDJ_chain_contig", "VJ_chain_contig"')
    }

    if(tolower(VDJ.features.to.append[1]) == "all"){
        VDJ.features.to.append <- names(VGM[[1]])[!stringr::str_detect(names(VGM[[1]]), "(VDJ_)|(VJ_)")]
        VDJ.features.to.append <- VDJ.features.to.append[!VDJ.features.to.append %in% c("barcode", "sample_id")]
    }

    to_del <- c(1000) #to avoid an error if all features are found and no entry is added to to_del
    for(k in 1:length(VDJ.features.to.append)){
      if(!VDJ.features.to.append[k] %in% names(VGM[[1]])){
        to_del <- c(to_del, k)
        warning(paste0("Column ", VDJ.features.to.append[k], " was not found in VGM VDJ matrix. Skipping this feature..."))
      }
    }
    VDJ.features.to.append <- VDJ.features.to.append[-to_del]
    #grab features
    VDJ.features.to.append <- c("barcode", "sample_id","celltype", "VDJ_chain_contig", "VJ_chain_contig", VDJ.features.to.append)
    VDJ.to.add <- VGM[[1]][,VDJ.features.to.append]
  } else {
    #grab necessary features
    VDJ.features.to.append <- c("barcode", "sample_id","celltype", "VDJ_chain_contig", "VJ_chain_contig")
    VDJ.to.add <- VGM[[1]][,VDJ.features.to.append]
  }

  if(GEX.features.to.append[1] != "none"){
    if(tolower(GEX.features.to.append[1]) != "all"){
    #first grab gene features
    gene.ids <- which(GEX.features.to.append %in% rownames(VGM[[2]]))
    if(length(gene.ids) > 0){
    GEX.to.add <- SeuratObject::FetchData(VGM[[2]], vars = GEX.features.to.append[gene.ids])
    GEX.features.to.append <- GEX.features.to.append[-gene.ids]
    } else {
        GEX.to.add <- as.data.frame(VGM[[2]]@meta.data[,1])
        names(GEX.to.add) <- "orig.ident"
      }
    } else {
      GEX.to.add <- as.data.frame(VGM[[2]]@meta.data[,1])
      names(GEX.to.add) <- "orig.ident"
    }

    if(length(GEX.features.to.append)>0){
    if(tolower(GEX.features.to.append[1]) == "all"){
      if(tolower(VDJ.features.to.append[1]) == "all"){
        GEX.features.to.append <- names(VGM[[2]]@meta.data)[!names(VGM[[2]]@meta.data) %in% VDJ.features.to.append]
        GEX.features.to.append <- GEX.features.to.append[GEX.features.to.append != "FB_assignment.y"]
      } else {
        GEX.features.to.append <- names(VGM[[2]]@meta.data)[!stringr::str_detect(names(VGM[[2]]@meta.data), "(VDJ_)|(VJ_)")]
        GEX.features.to.append <- GEX.features.to.append[GEX.features.to.append != "FB_assignment.y"]
      }
    }


    #secondly grab metadata items
    metadat.ids <- which(GEX.features.to.append %in% names(VGM[[2]]@meta.data))
    if(length(GEX.features.to.append)>0){
    GEX.metadata.add <- VGM[[2]]@meta.data[,GEX.features.to.append[metadat.ids]]
    GEX.to.add <- cbind(GEX.to.add, GEX.metadata.add)
    GEX.features.to.append <- GEX.features.to.append[-metadat.ids]
    }
    }

    if(length(GEX.features.to.append) > 0){ #any remaining inputs that have not been found in genes or metadata columns
      warning(paste0("Features ", GEX.features.to.append, " were not found in GEX genes or metadata. Skipping these features..."))
    }

    #make barcode column for merging
    GEX.to.add$barcode <- rownames(GEX.to.add)
  } else {
    GEX.to.add <- "none"
  }

  #Figure out input format of airr_rearrangements
  #cases:
    #1. A list of loaded tables => just use as is (order must be the same as samples in VGM)
    #2. A list of data.in from platypus DB (similar to the VGM function) => use code from VGM function to order and make list of tables
    #3. A list of local paths to access => load in as list

  if(inherits(airr.rearrangements,"list")){
    if(inherits(airr.rearrangements[[1]], "data.frame")){ #case 1.
      airr.list <- airr.rearrangements

    } else if(inherits(airr.rearrangements[[1]],"list")){ #case 2.
      #Make sense of input => we need this small parser to accept data loaded locally and downloaded and compiled from the PlatypusDB
      airr.list <- list()
      for(i in 1:length(airr.rearrangements)){ #First level
        if(names(airr.rearrangements[[i]])[1] == "VDJ"){ #check if we are already on a sample level
          #add the respective tsv table
          airr.list[[length(airr.list) + 1]] <- airr.rearrangements[[i]][[1]][[6]] #[[1]] for VDJ [[6]] for the airr_rearrangements table
        } else {
          for(j in 1:length(airr.rearrangements[[i]])){ #Second level
            if(names(airr.rearrangements[[i]][[j]])[1] == "VDJ"){ #check if we are a sample level
              airr.list[[length(airr.list) + 1]] <- airr.rearrangements[[i]][[j]][[1]][[6]]
            } else { #Data structure does not match expectations
              stop("Provided airr_rearrangement list input datastructure does not match necessary input format")
            }
          }
        }
      }

    } else if(inherits(airr.rearrangements[[1]],"character")){ #case 3

        vdj_load_error <- tryCatch({
          airr.list <- list()
          for(j in 1:length(airr.rearrangements)){
            if(file.exists(airr.rearrangements[[j]])){
              airr.list[[j]] <- utils::read.delim(airr.rearrangements[[j]], header = T)
            } else {
              warning(paste0(" airr_rearrangement.tsv not found for sample ", j, ". AIRR compatibility for this sample will not be available"))
              airr.list[[j]] <- "none"
            }
          }

        }, error = function(e){e
          message(e)})
        if(inherits(vdj_load_error,"error")){
          message("Loading airr_rearrangement from disk failed")}

    }
  } else {
    stop("Please provide airr_rearrangement inputs as a list. e.g. for paths list(~/data/project/airr_rearrangement.tsv)")
  }

  #Do some checks on the Airr dataframes
  if(length(airr.list) != length(unique(stringr::str_split(VGM[[1]]$barcode, "_", simplify = T)[,1]))){
    stop("List of airr_rearrangement dataframes has a different length than the number of unique barcode prefixes/sample_ids in the provided VGM. Please provide one dataframe per sample_id in the VGM in the same order.")
  }
  for(j in 1:length(airr.list)){
    if(any(!c("cell_id", "clone_id", "sequence_id", "sequence", "v_cigar") %in% names(airr.list[[j]]))){
      stop(paste0("Airr_rearrangement dataframe ", j, " does not contain all neccessary input columns. Please verifiy column names or table formatting are unaltered from the cellranger output"))
    }
  }
  #Tests passed => can proceed

    message(paste0("Order of samples in VGM is (Barcode prefix: entry in sample_id column): ", paste0(unique(paste0(stringr::str_split(VDJ.to.add$barcode, "_", simplify = T)[,1], ": ", VGM[[1]]$sample_id)), collapse = " / ")))

    #rename barcodes in airr and join data
    names(airr.list) <- paste0("s", c(1:length(airr.list)))
    for(i in 1:length(airr.list)){
      airr.list[[i]]$cell_id <- paste0(names(airr.list)[i], "_", airr.list[[i]]$cell_id)
      airr.list[[i]]$barcode <- airr.list[[i]]$cell_id

      #join VDJ data
      #Because the Airr format has one row per chain, a cell is represented in > 1 row. the merge function below appends information of one cell to all rows containing chains from that cell.

      airr.list[[i]] <- merge(airr.list[[i]], subset(VDJ.to.add, stringr::str_detect(VDJ.to.add$barcode,  names(airr.list)[i])), by = "barcode", all.x = T, all.y = F)

      if(inherits(GEX.to.add) != "character"){
      #join GEX data
      GEX.to.add <- GEX.to.add[!names(GEX.to.add) %in% names(airr.list[[i]])[names(airr.list[[i]]) != "barcode"]] #dont add existing columns
      airr.list[[i]] <- merge(airr.list[[i]], subset(GEX.to.add, stringr::str_detect(GEX.to.add$barcode,  names(airr.list)[i])), by = "barcode", all.x = T, all.y = F)
      }
    }

    if(airr.integrate) airr.list <- do.call("rbind", airr.list)

    return(airr.list)
}