R/io.R

Defines functions save_vdjtools save_immunarch parse_archer parse_10x_filt_contigs parse_10x_consensus parse_immunarch parse_airr parse_imgt parse_vdjtools parse_tcr parse_migmap parse_migec parse_mixcr parse_mitcr parse_immunoseq parse_repertoire .postprocess .remove.alleles .get_coltypes .which_recomb_type .make_names .detect_format .remove.ext repSave repLoad

Documented in repLoad repSave

if (getRversion() >= "2.15.1") {
  utils::globalVariables(c(
    "v_call", "d_call", "j_call", "junction", "junction_aa",
    "CDR3.nt", "D.end", "D.name", "D.start", "DJ.ins", "J.name",
    "J.start", "V.end", "V.name", "VD.ins", "VJ.ins", "barcode",
    "chain", "contig_id", "raw_clonotype_id", "raw_consensus_id"
  ))
}



##### Main IO functions #####


#' Load immune repertoire files into the R workspace
#'
#' @concept io
#'
#' @importFrom readr read_delim read_tsv read_csv col_integer col_character col_double col_logical col_guess cols write_lines
#' @importFrom stringr str_split str_detect str_replace_all
#' @importFrom methods as
#' @importFrom dplyr contains first
#' @importFrom utils read.table
#' @importFrom data.table setDF
#'
#' @description The \code{repLoad} function loads repertoire files
#' into R workspace in the immunarch format where you can immediately use them for
#' the analysis. \code{repLoad} automatically detects the right format for
#' your files, so all you need is simply provide the path to your files.
#'
#' See "Details" for more information on supported formats. See "Examples" for
#' diving right into it.
#'
#' @param .path A character string specifying the path to the input data.
#' Input data can be one of the following:
#'
#' - a single repertoire file.
#' In this case \code{repLoad} returns an R \link{data.frame};
#'
#' - a vector of paths to repertoire files.
#' Same as in the case with no metadata file presented in the next section below;
#'
#' - a path to the folder with repertoire files and, if available, metadata file "metadata.txt".
#' If the metadata file if presented, then the \code{repLoad} returns a list with two elements "data" and "meta".
#' "data" is an another list with repertoire R \link{data.frame}s. "meta" is a data frame with the metadata.
#' If the metadata file "metadata.txt" is not presented, then the \code{repLoad} creates a dummy metadata file with
#' sample names and returns a list with two elements "data" and "meta".
#' If input data has multiple chains or cell types stored in the same file
#' (for example, like in 10xGenomics repertoire files), such repertoire files will be splitted to different
#' R data frames with only one type of chain and cell presented. The metadata file will have additional columns specifying
#' cell and chain types for different samples.
#'
#' @param .format A character string specifying what format to use. Do NOT use it. See "Details" for more information on supported formats.
#'
#' Leave NA (which is default) if you want `immunarch` to detect formats automatically.
#'
#' @param .mode Either "single" for single chain data or "paired" for paired chain data.
#'
#' Currently "single" works for every format, and "paired" works only for 10X Genomics data.
#'
#' By default, 10X Genomics data will be loaded as paired chain data, and other files will be loaded as single chain data.
#'
#' @param .coding A logical value. Pass TRUE to get coding-only clonotypes (by defaul). Pass FALSE to get all clonotypes.
#'
#' @details
#' The metadata has to be a tab delimited file with first column named "Sample".
#' It can have any number of additional columns with arbitrary names.
#' The first column should contain base names of files without extensions in
#' your folder. Example:
#' \tabular{llll}{
#'  Sample \tab Sex \tab Age \tab Status\cr
#'  immunoseq_1 \tab M \tab 1 \tab C\cr
#'  immunoseq_2 \tab M \tab 2 \tab C\cr
#'  immunoseq_3 \tab FALSE \tab 3 \tab A
#' }
#'
#' \code{repLoad} has the ".format" argument that sets the format for input repertoire files.
#' Immunarch detects the file format automatically, and the argument is left only for the compatability
#' purposes. It will be soon removed. Do not pass it or your code will stop working!
#'
#' Currently, Immunarch support the following formats:
#'
#' - "immunoseq" - ImmunoSEQ of any version. http://www.adaptivebiotech.com/immunoseq
#'
#' - "mitcr" - MiTCR. https://github.com/milaboratory/mitcr
#'
#' - "mixcr" - MiXCR (the "all" files) of any version. https://github.com/milaboratory/mixcr
#'
#' - "migec" - MiGEC. http://migec.readthedocs.io/en/latest/
#'
#' - "migmap" - For parsing IgBLAST results postprocessed with MigMap. https://github.com/mikessh/migmap
#'
#' - "tcr" - tcR, our previous package. https://imminfo.github.io/tcr/
#'
#' - "vdjtools" - VDJtools of any version. http://vdjtools-doc.readthedocs.io/en/latest/
#'
#' - "imgt" - IMGT HighV-QUEST. http://www.imgt.org/HighV-QUEST/
#'
#' - "airr" - adaptive immune receptor repertoire (AIRR) data format. http://docs.airr-community.org/en/latest/datarep/overview.html
#'
#' - "10x" - 10XGenomics clonotype annotations tables. https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/annotation
#'
#' - "archer" - ArcherDX clonotype tables. https://archerdx.com/
#'
#' @return
#' A list with two named elements:
#'
#'  - "data" is a list of input samples;
#'
#'  - "meta" is a data frame with sample metadata.
#'
#' @seealso \link{immunr_data_format} for immunarch data format; \link{repSave} for file saving;
#' \link{repOverlap}, \link{geneUsage} and \link{repDiversity} for starting with immune repertoires basic statistics.
#'
#' @examples
#' # To load the data from a single file (note that you don't need to specify the data format):
#' file_path <- paste0(system.file(package = "immunarch"), "/extdata/io/Sample1.tsv.gz")
#' immdata <- repLoad(file_path)
#'
#' # Suppose you have a following structure in your folder:
#' # >_ ls
#' # immunoseq1.txt
#' # immunoseq2.txt
#' # immunoseq3.txt
#' # metadata.txt
#'
#' # To load the whole folder with every file in it type:
#' file_path <- paste0(system.file(package = "immunarch"), "/extdata/io/")
#' immdata <- repLoad(file_path)
#' print(names(immdata))
#'
#' # We recommend creating a metadata file named exactly "metadata.txt" in the folder.
#'
#' # In that case, when you load your data you will see:
#' # > immdata <- repLoad("path/to/your/folder/")
#' # > names(immdata)
#' # [1] "data" "meta"
#'
#' # If you do not have "metadata.txt", you will see the same output,
#' # but your metadata will be almost empty:
#' # > immdata <- repLoad("path/to/your/folder/")
#' # > names(immdata)
#' # [1] "data" "meta"
#' @export repLoad
repLoad <- function(.path, .format = NA, .mode = "paired", .coding = TRUE) {
  if (!is.na(.format)) {
    warning("Please don't provide the .format argument,
            immunarch detects the format automatically.
            The .format argument will soon be removed.")
  }

  exclude_extensions <- c("so", "exe", "bam", "fasta", "fai", "fastq", "bed", "rds", "report", "vdjca")

  # Process a repertoire file: detect format and load the data
  # Return: a named list with a repertoire data frame and it's name
  .read_repertoire <- function(.path, .format, .mode, .coding) {
    parse_res <- list()

    # Detect format
    cur_format <- .format
    if (is.na(.format)) {
      cur_format <- .detect_format(.path)
    }

    # Parse the file
    if (is.na(cur_format)) {
      message("unsupported format, skipping")
    }
    else {
      message(cur_format)

      parse_fun <- switch(cur_format,
        mitcr = parse_mitcr,
        migec = parse_migec,
        migmap = parse_migmap,
        mixcr = parse_mixcr,
        tcr = parse_tcr,
        vdjtools = parse_vdjtools,
        imgt = parse_imgt,
        immunoseq = parse_immunoseq,
        airr = parse_airr,
        `10x (consensus)` = parse_10x_consensus,
        `10x (filt.contigs)` = parse_10x_filt_contigs,
        archer = parse_archer,
        immunarch = parse_immunarch,
        NA
      )

      if (suppressWarnings(is.na(parse_fun))) {
        message("unknown format, skipping")
      }
      else {
        parse_res <- parse_fun(.path, .mode)

        if (is.null(parse_res)) {
          message("  [!] Warning: zero clonotypes found, skipping")
          parse_res <- list()
        } else {
          if (.coding) {
            parse_res <- coding(parse_res)
          }

          if (!has_class(parse_res, "list")) {
            parse_res <- list(parse_res)
            names(parse_res) <- .remove.ext(.path)
          }
        }
      }
    }

    parse_res
  }


  # Process a vector with filepaths to repertoire files, metadata and barcodes:
  # just load all repertoire files.
  # Do NOT (!) create a dummy metadata, return en empty data frame instead
  # Return: list with data, metadata and barcodes (if necessary)
  .process_batch <- function(.files, .format, .mode, .coding) {
    parsed_batch <- list()
    metadata <- tibble()

    for (.filepath in .files) {
      message('  -- [', match(.filepath, .files), '/', length(.files), '] Parsing "', .filepath, '" -- ', appendLF = FALSE)
      if (!file.exists(.filepath)) {
        message('Can\'t find\t"', .filepath, '", skipping')
      }
      else {
        # Check for the type: repertoire, metadata or barcodes
        if (stringr::str_detect(.filepath, "metadata")) {
          message("metadata")

          metadata <- readr::read_tsv(.filepath, trim_ws = TRUE, col_types = cols())

          # The most basic check
          if (!("Sample" %in% colnames(metadata))) {
            stop('No "Sample" column found in the metadata file. The "Sample" column with the names of samples without extensions (e.g., ".txt", ".tsv") is required. Please provide it and run the parsing again.')
          }
        }
        else if (stringr::str_detect(.filepath, "barcode")) {
          # TODO: add the barcode processing subroutine to split by samples
        }
        else {
          repertoire <- .read_repertoire(.filepath, .format, .mode, .coding)
          if (length(repertoire) != 0) {
            parsed_batch <- c(parsed_batch, repertoire)
          }
        }
      }
    }

    list(data = parsed_batch, meta = metadata)
  }

  # Recursively find all subdirectories and create
  # a list of batches with repertoire files to process
  .split_to_batches <- function(.paths, .root_folder) {
    # split .paths to files and folders
    folders <- c()
    files <- c()
    for (path in .paths) {
      if (file.info(path)$isdir) {
        folders <- c(folders, path)
      } else if (!any(endsWith(path, exclude_extensions))) {
        files <- c(files, path)
      }
    }

    batches <- list(files)
    names(batches) <- .root_folder

    # process each subdirectory to batches
    for (folder_path in folders) {
      new_batch <- .split_to_batches(dir(folder_path, full.names = TRUE, recursive = FALSE), folder_path)
      batches <- c(batches, new_batch)
    }

    batches
  }


  .check_metadata <- function(.metadata, .rep_names) {
    error_flag <- FALSE

    # Check for zero .metadata
    if (nrow(.metadata) == 0) {
      message("  -- Metadata file not found; creating a dummy metadata...")
      .metadata <- tibble(Sample = .rep_names)
      error_flag <- TRUE
    }

    # Check for repertoire names
    missed_in_folders <- setdiff(.rep_names, .metadata$Sample)
    missed_in_metadata <- setdiff(.metadata$Sample, .rep_names)
    if (length(missed_in_folders) || length(missed_in_metadata)) {
      if (length(missed_in_metadata)) {
        message("  -- Samples found in the metadata, but not in the folder:\n     ", missed_in_metadata)
        message("  Did you correctly specify all the sample names in the metadata file?")

        error_flag <- TRUE
      }
      if (length(missed_in_folders)) {
        message("  -- Samples found in the folder, but not in the metadata:\n     ", missed_in_folders)
        message("  Did you add all the necessary samples to the metadata file with correct names?")
        message("  Creating dummy sample records in the metadata for now...")

        .metadata <- merge(.metadata, as_tibble(data.frame(Sample = missed_in_folders, stringsAsFactors = FALSE)), all = TRUE)

        error_flag <- TRUE
      }
    }

    # Drop incorrect columns
    to_drop <- c()
    for (col_name in names(.metadata)) {
      if (sum(is.na(.metadata[[col_name]])) == nrow(.metadata)) {
        to_drop <- c(to_drop, col_name)
      }
    }

    if (length(to_drop)) {
      .metadata <- .metadata[-match(to_drop, names(.metadata))]
      message("Dropping ", length(to_drop), "column(s) from .metadata.txt. Do you have spaces or tabs after the name of the last column? Remove them to ensure everything works correctly.")
      error_flag <- TRUE
    }

    if (!error_flag) {
      message("  -- Everything is OK!")
    }

    .metadata
  }

  #
  # Step 1: load repertoire files
  #
  message("\n== Step 1/3: loading repertoire files... ==\n")

  # Split .path to files and folders
  batches <- .split_to_batches(.path, "<initial>")

  # Parse repertoires from each batch
  parsed_batches <- list()
  for (batch_i in 1:length(batches)) {
    if (length(batches[[batch_i]])) {
      message('Processing "', names(batches)[batch_i], '" ...')
      parsed_batches[[names(batches)[batch_i]]] <- .process_batch(batches[[batch_i]], .format, .mode, .coding)
    }
  }

  #
  # Step 2: check metadata files
  #
  message("\n== Step 2/3: checking metadata files and merging files... ==\n")

  for (batch_i in 1:length(parsed_batches)) {
    message('Processing "', names(parsed_batches)[batch_i], '" ...')
    parsed_batches[[batch_i]]$meta <- .check_metadata(parsed_batches[[batch_i]]$meta, names(parsed_batches[[batch_i]]$data))
  }

  names(parsed_batches) <- NA
  parsed_batches <- list(data = do.call(c, lapply(parsed_batches, "[[", "data")), meta = Reduce(function(x, y) merge(x, y, all = TRUE), lapply(parsed_batches, "[[", "meta")))
  names(parsed_batches$data) <- substr(names(parsed_batches$data), 4, nchar(names(parsed_batches$data)))

  if (any(duplicated(names(parsed_batches$data)))) {
    stop("Found non-unique sample names: ", paste0(names(parsed_batches$data)[duplicated(names(parsed_batches$data))], collapse = " "), " \n  Please rename your samples so they all have unique names.")
  }

  #
  # Step 3: split by chains and barcodes
  #
  message("\n== Step 3/3: processing paired chain data... ==\n")

  # Check for mixed chains repertoire files and split them if necessary
  rep_names <- names(parsed_batches$data)
  metadata <- parsed_batches$meta

  for (source_name in rep_names) {
    if ("Chain" %in% colnames(metadata)) {
      if (!is.na(metadata[which(metadata$Sample == source_name), "Chain"])) {
        next
      }
    }

    if ("chain" %in% colnames(parsed_batches$data[[source_name]])) {
      #
      # If mode is "single" then split paired chain data to two chain-specific immune repertoires
      #
      if (.mode == "single") {
        df_split <- split(parsed_batches$data[[source_name]], parsed_batches$data[[source_name]]$chain)
        if (length(df_split) > 1) {
          chain_col <- "Chain"
          source_col <- "Source"

          if (!(chain_col %in% colnames(metadata))) {
            message("Splitting repertoires by their chain type. Columns ", chain_col, " and ", source_col, " added to the metadata.")
            message("  -- Column ", chain_col, " specifies chain types: TRA, TRB, etc.")
            message("  -- Column ", source_col, " specifies the initial sample name for a repertoire after splitting by chain types")
          }

          for (i in 1:length(df_split)) {
            sample_name <- paste0(source_name, "_", names(df_split)[i])
            parsed_batches$data[[sample_name]] <- df_split[[i]]
            new_record <- metadata[which(metadata$Sample == source_name), ]
            new_record$Sample <- sample_name
            new_record[[chain_col]] <- names(df_split)[i]
            new_record[[source_col]] <- source_name
            metadata <- merge(metadata, new_record, all = TRUE)
          }

          parsed_batches$data[[source_name]] <- NULL
          metadata <- metadata[-(which(metadata$Sample == source_name)), ]
        }
      }
      #
      # If mode is "paired" then nothing to do here
      #
      else {
        # pass
      }
    }
  }
  message("Done!\n")

  parsed_batches$meta <- metadata
  parsed_batches
}


#' Save immune repertoires to the disk
#'
#' @concept io
#'
#' @importFrom utils packageVersion
#' @importFrom plyr mapvalues
#'
#' @description The \code{repSave} function is deigned to save your data to the disk
#' in desirable format. Currently supports "immunarch" and "vdjtools" file formats.
#'
#' @param .data An R dataframe, a list of R dataframes or a list with \code{data} and
#' \code{meta} where first element is a list of dataframes and the latter is a dataframe
#' with metadata.
#' @param .path A string with the path to the output directory. It should include file
#' name if a single dataframe is provided to .data argument.
#' @param .format A string with desirable format specification. Current options are
#' "immunarch" and "vdjtools".
#' @param .compress A boolean value. Defines whether the output will be compressed or not.
#'
#' @details It is not necessary to create directories beforehand. If the provided directory
#' does not exist it will be created automatically.
#'
#' @return No return value.
#'
#' @examples
#' data(immdata)
#' dirpath <- tempdir()
#' # Save the list of repertoires
#' repSave(immdata, dirpath)
#' # Load it and check if it is the same
#' new_immdata <- repLoad(dirpath)
#' # sum(immdata$data[[1]] != new_immdata$data[[1]], na.rm = TRUE)
#' # sum(immdata$data[[2]] != new_immdata$data[[2]], na.rm = TRUE)
#' # sum(immdata$meta != new_immdata$meta, na.rm = TRUE)
#' @export repSave
repSave <- function(.data, .path, .format = c("immunarch", "vdjtools"),
                    .compress = TRUE) {
  .format <- .format[1]
  if (!.format %in% c("immunarch", "vdjtools")) {
    stop("Unknown format. Please provide either 'immunarch' or 'vdjtools'")
  }

  if (!has_class(.data, "list")) {
    success <- switch(.format,
      immunarch = save_immunarch(.data, .path, .compress),
      vdjtools = save_vdjtools(.data, .path, .compress),
      NA
    )
  } else {
    if ("meta" %in% names(.data)) {
      ifelse(!dir.exists(.path), dir.create(.path), FALSE)
      readr::write_tsv(.data$meta, path = paste0(.path, "/metadata.txt"))
      for (name in names(.data$data)) {
        success <- switch(.format,
          immunarch = save_immunarch(
            .data$data[[name]],
            paste0(.path, "/", name),
            .compress
          ),
          vdjtools = save_vdjtools(
            .data$data[[name]],
            paste0(.path, "/", name),
            .compress
          ),
          NA
        )
      }
    } else {
      ifelse(!dir.exists(.path), dir.create(.path), FALSE)
      for (name in names(.data)) {
        success <- switch(.format,
          immunarch = save_immunarch(
            .data[[name]],
            paste0(.path, "/", name),
            .compress
          ),
          vdjtools = save_vdjtools(
            .data[[name]],
            paste0(.path, "/", name),
            .compress
          ),
          NA
        )
      }
    }
  }
}





##### IO utility functions #####
.remove.ext <- function(.str) {
  # gsub(pattern = '.*/|[.].*$', replacement = '', x = .str)
  gsub(pattern = ".*/|[.](txt|tsv|csv)$|([.](txt|tsv|csv))?[.](gz|bzip|bzip2|bz2)$", replacement = "", x = .str)
}


.detect_format <- function(.filename) {
  res_format <- NA

  f <- file(.filename, "r")
  l <- readLines(f, 1)
  close(f)

  if (any(str_detect(l, c("MiTCRFullExport", "mitcr")))) {
    res_format <- "mitcr"
  } else if (str_detect(l, "CDR3 amino acid sequence") && str_detect(l, "V segment") && !str_detect(l, "Good events")) {
    res_format <- "mitcr"
  } else if (str_detect(l, "CDR3 amino acid sequence") && str_detect(l, "V segment") && str_detect(l, "Good events")) {
    res_format <- "migec"
  } else if (str_detect(l, "v.end.in.cdr3") && str_detect(l, "cdr3aa")) {
    res_format <- "migmap"
  } else if (str_detect(l, "CDR3.amino.acid.sequence") && str_detect(l, "Umi.count")) {
    res_format <- "tcr"
  } else if (str_detect(tolower(l), "cdr3nt") && str_detect(tolower(l), "vend") && str_detect(tolower(l), "v")) {
    res_format <- "vdjtools"
  } else if (str_detect(tolower(l), "count") && str_detect(tolower(l), "sequence") && str_detect(tolower(l), "d segment")) {
    res_format <- "vdjtools"
  } else if (str_detect(tolower(l), "junction start") && str_detect(tolower(l), "v-d-j-region end") && str_detect(tolower(l), "v-region")) {
    res_format <- "imgt"
  } else if (str_detect(tolower(l), "v_resolved") && str_detect(tolower(l), "amino_acid")) {
    res_format <- "immunoseq"
  } else if (str_detect(tolower(l), "maxresolved")) {
    res_format <- "immunoseq"
  } else if (str_detect(tolower(l), "v_gene") && str_detect(tolower(l), "templates") && str_detect(tolower(l), "amino_acid")) {
    res_format <- "immunoseq"
  } else if (str_detect(tolower(l), "allvalignment") && str_detect(tolower(l), "vhit")) {
    res_format <- "mixcr"
  } else if (str_detect(tolower(l), "bestvhit") && str_detect(tolower(l), "bestjhit")) {
    res_format <- "mixcr"
  } else if (str_detect(tolower(l), "clonal sequence")) {
    res_format <- "mixcr"
  } else if (str_detect(tolower(l), "clonalsequence")) {
    res_format <- "mixcr"
  } else if (str_detect(tolower(l), "targetsequences")) {
    res_format <- "mixcr"
  } else if (str_detect(tolower(l), "junction_aa") && str_detect(tolower(l), "cigar")) {
    res_format <- "airr"
  } else if (str_detect(tolower(l), "raw_clonotype_id") && str_detect(tolower(l), "barcode") && str_detect(tolower(l), "v_gene")) {
    res_format <- "10x (filt.contigs)"
  } else if (str_detect(tolower(l), "clonotype_id") && str_detect(tolower(l), "v_gene")) {
    res_format <- "10x (consensus)"
  } else if (str_detect(tolower(l), "clonotype sequence") && str_detect(tolower(l), "v regions")) {
    res_format <- "archer"
  } else if (str_detect(tolower(l), "exported from immunarch")) {
    res_format <- "immunarch"
  } else if (str_detect(tolower(l), "clones") && str_detect(tolower(l), "v.name") && str_detect(tolower(l), "proportion")) {
    res_format <- "immunarch"
  }

  res_format
}


.make_names <- function(.char) {
  if (is.na(.char[1])) {
    NA
  }
  # else { tolower(make.names(.char)) }
  else {
    tolower(.char)
  }
}


.which_recomb_type <- function(.name) {
  recomb_type <- NA

  i <- 1

  while (is.na(recomb_type) && i < 100) {
    if (any(str_detect(.name[i], c("TCRA", "TRAV", "TCRG", "TRGV", "IGKV", "IGLV")))) {
      recomb_type <- "VJ"
    } else if (any(str_detect(.name[i], c("TCRB", "TRBV", "TCRD", "TRDV", "IGHV")))) {
      recomb_type <- "VDJ"
    }
    i <- i + 1
  }
  if (is.na(recomb_type)) {
    warning("Can't determine the type of V(D)J recombination. No insertions will be presented in the resulting data table.")
  }

  recomb_type
}


.get_coltypes <- function(.filename, .nuc.seq, .aa.seq, .count,
                          .vgenes, .jgenes, .dgenes,
                          .vend, .jstart, .dstart, .dend,
                          .vd.insertions, .dj.insertions, .total.insertions,
                          .skip, .sep, .add = NA) {
  table.colnames <- colnames(readr::read_delim(.filename,
    col_types = cols(),
    delim = .sep,
    quote = "",
    escape_double = FALSE,
    comment = "",
    n_max = 1,
    trim_ws = TRUE,
    skip = .skip
  ))

  swlist <- list(
    col_character(), col_character(),
    col_integer(),
    col_character(), col_character(), col_character(),
    col_integer(), col_integer(), col_integer(), col_integer(),
    col_integer(), col_integer(), col_integer()
  )

  names(swlist) <- tolower(c(
    .nuc.seq, ifelse(is.na(.aa.seq), "NA", .aa.seq),
    .count,
    .vgenes, .jgenes, .dgenes,
    .vend, .jstart, .dstart, .dend,
    .vd.insertions, .dj.insertions, .total.insertions
  ))
  if (!is.na(.add[1])) {
    swlist <- c(swlist, rep(col_guess(), length(.add)))
    names(swlist)[tail(1:length(swlist), length(.add))] <- .add
  }

  swlist <- c(swlist, "_")

  if (is.na(.aa.seq)) {
    swlist <- swlist[-2]
  }

  col.classes <- list(sapply(tolower(table.colnames), function(x) {
    do.call(switch, c(x, swlist))
  }, USE.NAMES = FALSE))[[1]]
  names(col.classes) <- table.colnames

  col.classes
}

.remove.alleles <- function(.data) {
  if (has_class(.data, "list")) {
    lapply(.data, .remove.alleles)
  } else {
    .data[[IMMCOL$v]] <- return_segments(.data[[IMMCOL$v]])
    .data[[IMMCOL$j]] <- return_segments(.data[[IMMCOL$j]])
    .data
  }
}

.postprocess <- function(.data, .mode) {
  .data[[IMMCOL$cdr3nt]][.data[[IMMCOL$cdr3nt]] == "NONE"] <- NA
  logic <- is.na(.data[[IMMCOL$cdr3aa]]) & !is.na(.data[[IMMCOL$cdr3nt]])
  if (any(logic)) {
    .data[[IMMCOL$cdr3aa]][logic] <- bunch_translate(.data[[IMMCOL$cdr3nt]][logic])
  }

  logic <- is.na(.data[[IMMCOL$cdr3aa]]) & is.na(.data[[IMMCOL$cdr3nt]])
  if (any(logic)) {
    warn_msg <- c("  [!] Removed ", sum(logic))
    warn_msg <- c(warn_msg, " clonotypes with no nucleotide and amino acid CDR3 sequence.")
    message(warn_msg)
  }
  .data <- .data[!logic, ]

  if (nrow(.data)) {
    for (colname in c(IMMCOL$ve, IMMCOL$ds, IMMCOL$de, IMMCOL$js, IMMCOL$vnj, IMMCOL$vnd, IMMCOL$dnj)) {
      if (colname %in% colnames(.data)) {
        logic <- is.na(.data[[colname]])
        .data[[colname]][logic] <- -1

        logic <- .data[[colname]] < 0
        .data[[colname]][logic] <- NA
      }
    }

    for (col_i in 1:length(IMMCOL$order)) {
      colname <- IMMCOL$order[col_i]
      if (colname %in% colnames(.data)) {
        if (!has_class(.data[[colname]], IMMCOL$type[col_i])) {
          .data[[colname]] <- as(.data[[colname]], IMMCOL$type[col_i])
        }
      }
    }

    logic <- is.na(.data[[IMMCOL$count]])
    if (any(logic)) {
      message("  [!] Warning: found NAs in clonal counts. Setting them to 1's.")
      .data[[IMMCOL$count]][logic] <- 1
    }
    .data <- .data[order(.data[[IMMCOL$count]], decreasing = TRUE), ]
  } else {
    .data <- NULL
  }

  .data
}


##### Parsers #####

parse_repertoire <- function(.filename, .mode, .nuc.seq, .aa.seq, .count,
                             .vgenes, .jgenes, .dgenes,
                             .vend, .jstart, .dstart, .dend,
                             .vd.insertions, .dj.insertions, .total.insertions,
                             .skip = 0, .sep = "\t", .add = NA) {
  .nuc.seq <- .make_names(.nuc.seq)
  .aa.seq <- .make_names(.aa.seq)
  .count <- .make_names(.count)
  .vgenes <- .make_names(.vgenes)
  .jgenes <- .make_names(.jgenes)
  .dgenes <- .make_names(.dgenes)
  .vend <- .make_names(.vend)
  .jstart <- .make_names(.jstart)
  .vd.insertions <- .make_names(.vd.insertions)
  .dj.insertions <- .make_names(.dj.insertions)
  .total.insertions <- .make_names(.total.insertions)
  .dstart <- .make_names(.dstart)
  .dend <- .make_names(.dend)
  .add <- .make_names(.add)

  col.classes <- .get_coltypes(.filename, .nuc.seq, .aa.seq, .count,
    .vgenes, .jgenes, .dgenes,
    .vend, .jstart, .dstart, .dend,
    .vd.insertions, .dj.insertions, .total.insertions,
    .skip = .skip, .sep = "\t", .add
  )

  # IO_REFACTOR
  suppressMessages(df <- readr::read_delim(.filename,
    col_names = TRUE,
    col_types = col.classes, delim = .sep,
    quote = "", escape_double = FALSE,
    comment = "", trim_ws = TRUE,
    skip = .skip, na = c("", "NA", ".")
  ))
  # suppressMessages(df <- fread(.filename, skip = .skip, data.table = FALSE, na.strings = c("", "NA", ".")))

  names(df) <- tolower(names(df))
  recomb_type <- .which_recomb_type(df[[.vgenes]])

  table.colnames <- names(col.classes)

  df[[.nuc.seq]] <- toupper(df[[.nuc.seq]])

  if (is.na(.aa.seq)) {
    df$CDR3.amino.acid.sequence <- bunch_translate(df[[.nuc.seq]])
    .aa.seq <- "CDR3.amino.acid.sequence"
  }

  if (is.na(.count)) {
    .count <- "Count"
    df$Count <- 1
  }

  df$Proportion <- df[[.count]] / sum(df[[.count]])
  .prop <- "Proportion"

  ins_ok <- FALSE
  if (is.na(.vd.insertions)) {
    .vd.insertions <- "VD.insertions"
    df$VD.insertions <- NA
  }

  if (!(.vd.insertions %in% table.colnames)) {
    .vd.insertions <- "VD.insertions"
    df$VD.insertions <- NA

    if (!is.na(.vend) && !is.na(.dstart)) {
      if (!is.na(recomb_type) && recomb_type == "VDJ") {
        df$VD.insertions <- df[[.dstart]] - df[[.vend]] - 1
        df$VD.insertions[is.na(df[[.dstart]])] <- NA
        df$VD.insertions[is.na(df[[.vend]])] <- NA

        ins_ok <- TRUE
      }
    }
  }

  if (!ins_ok) {
    df$V.end <- NA
    df$D.start <- NA
    df$D.end <- NA
    .vend <- "V.end"
    .dstart <- "D.start"
    .dend <- "D.end"
  }

  ins_ok <- FALSE
  if (is.na(.dj.insertions)) {
    .dj.insertions <- "DJ.insertions"
    df$DJ.insertions <- NA
  }

  if (!(.dj.insertions %in% table.colnames)) {
    .dj.insertions <- "DJ.insertions"
    df$DJ.insertions <- NA

    if (!is.na(.jstart) && !is.na(.dend)) {
      if (!is.na(recomb_type) && recomb_type == "VDJ") {
        df$DJ.insertions <- df[[.jstart]] - df[[.dend]] - 1
        df$DJ.insertions[is.na(df[[.dend]])] <- NA
        df$DJ.insertions[is.na(df[[.jstart]])] <- NA

        ins_ok <- TRUE
      }
    }
  }
  if (!ins_ok) {
    df$J.start <- NA
    df$D.start <- NA
    df$D.end <- NA
    .jstart <- "J.start"
    .dstart <- "D.start"
    .dend <- "D.end"
  }

  ins_ok <- FALSE
  if (is.na(.total.insertions)) {
    .total.insertions <- "Total.insertions"
    df$Total.insertions <- NA
  }

  if (!(.total.insertions %in% table.colnames)) {
    .total.insertions <- "Total.insertions"
    df$Total.insertions <- -1
    if (!is.na(recomb_type)) {
      if (recomb_type == "VJ") {
        df$Total.insertions <- df[[.jstart]] - df[[.vend]] - 1
        df$Total.insertions[df$Total.insertions < 0] <- 0
      } else if (recomb_type == "VDJ") {
        df$Total.insertions <- df[[.vd.insertions]] + df[[.dj.insertions]]
      }
    } else {
      df$Total.insertions <- NA
    }
  }

  vec_names <- c(
    .count, .prop, .nuc.seq, .aa.seq,
    .vgenes, .dgenes, .jgenes,
    .vend, .dstart, .dend, .jstart,
    .total.insertions, .vd.insertions, .dj.insertions
  )
  if (!is.na(.add[1])) {
    vec_names <- c(vec_names, .add)
  }

  df <- df[, vec_names]

  colnames(df)[1] <- IMMCOL$count
  colnames(df)[2] <- IMMCOL$prop
  colnames(df)[3] <- IMMCOL$cdr3nt
  colnames(df)[4] <- IMMCOL$cdr3aa
  colnames(df)[5] <- IMMCOL$v
  colnames(df)[6] <- IMMCOL$d
  colnames(df)[7] <- IMMCOL$j
  colnames(df)[8] <- IMMCOL$ve
  colnames(df)[9] <- IMMCOL$ds
  colnames(df)[10] <- IMMCOL$de
  colnames(df)[11] <- IMMCOL$js
  colnames(df)[12] <- IMMCOL$vnj
  colnames(df)[13] <- IMMCOL$vnd
  colnames(df)[14] <- IMMCOL$dnj

  .postprocess(df, .mode)
}

parse_immunoseq <- function(.filename, .mode, .wash.alleles = TRUE) {
  .fix.immunoseq.genes <- function(.col) {
    # fix ","
    .col <- gsub(",", ", ", .col, fixed = TRUE, useBytes = TRUE)
    # fix forward zeros
    .col <- gsub("-([0])([0-9])", "-\\2", .col, useBytes = TRUE)
    .col <- gsub("([VDJ])([0])([0-9])", "\\1\\3", .col, useBytes = TRUE)
    # fix gene names
    .col <- gsub("TCR", "TR", .col, fixed = TRUE, useBytes = TRUE)
    .col
  }

  filename <- .filename
  file_cols <- list()
  file_cols[[IMMCOL$count]] <- "templates"
  file_cols[[IMMCOL$cdr3nt]] <- "rearrangement"
  file_cols[[IMMCOL$cdr3aa]] <- "amino_acid"
  file_cols[[IMMCOL$v]] <- "v_resolved"
  file_cols[[IMMCOL$d]] <- "d_resolved"
  file_cols[[IMMCOL$j]] <- "j_resolved"
  file_cols[[IMMCOL$vnj]] <- "n1_insertions"
  file_cols[[IMMCOL$vnd]] <- "n1_insertions"
  file_cols[[IMMCOL$dnj]] <- "n2_insertions"

  v_index_col_name <- "v_index"
  d_index_col_name <- "d_index"
  j_index_col_name <- "j_index"
  n1_index_col_name <- "n1_index"
  n2_index_col_name <- "n2_index"

  #
  # Check for the version of ImmunoSEQ files
  #
  f <- file(.filename, "r")
  l <- readLines(f, 2)
  close(f)
  if (str_detect(l[[1]], "v_gene") && !str_detect(l[[1]], "v_resolved")) {
    file_cols[[IMMCOL$v]] <- "v_gene"
    file_cols[[IMMCOL$d]] <- "d_gene"
    file_cols[[IMMCOL$j]] <- "j_gene"
  } else if (str_detect(l[[1]], "MaxResolved")) {
    file_cols[[IMMCOL$v]] <- "vMaxResolved"
    file_cols[[IMMCOL$d]] <- "dMaxResolved"
    file_cols[[IMMCOL$j]] <- "jMaxResolved"

    file_cols[[IMMCOL$vnj]] <- "n1insertion"
    file_cols[[IMMCOL$vnd]] <- "n1insertion"
    file_cols[[IMMCOL$dnj]] <- "n2insertion"
  }


  l_split <- strsplit(l, "\t")
  if (str_detect(l[[1]], "templates")) {
    if (str_detect(l[[1]], "templates/reads")) {
      file_cols[[IMMCOL$count]] <- "count (templates/reads)"
      file_cols[[IMMCOL$cdr3nt]] <- "nucleotide"
      file_cols[[IMMCOL$cdr3aa]] <- "aminoAcid"
    } else if (l_split[[2]][match("templates", l_split[[1]])] == "null") {
      file_cols[[IMMCOL$count]] <- "reads"
    }
  }

  if (!str_detect(l[[1]], "v_index")) {
    v_index_col_name <- "vindex"
    d_index_col_name <- "dindex"
    j_index_col_name <- "jindex"
    n1_index_col_name <- "n1index"
    n2_index_col_name <- "n2index"
  }

  for (col_name in names(file_cols)) {
    file_cols[[col_name]] <- .make_names(file_cols[[col_name]])
  }

  file_cols[[IMMCOL$prop]] <- IMMCOL$prop
  file_cols[[IMMCOL$ve]] <- IMMCOL$ve
  file_cols[[IMMCOL$ds]] <- IMMCOL$ds
  file_cols[[IMMCOL$de]] <- IMMCOL$de
  file_cols[[IMMCOL$js]] <- IMMCOL$js
  file_cols[[IMMCOL$seq]] <- IMMCOL$seq

  # IO_REFACTOR
  suppressMessages(df <- readr::read_delim(.filename,
    col_names = TRUE, col_types = cols(),
    delim = "\t", quote = "",
    escape_double = FALSE, comment = "",
    trim_ws = TRUE, skip = 0
  ))
  # suppressMessages(df <- fread(.filename, data.table = FALSE))

  names(df) <- tolower(names(df))

  df[[file_cols[[IMMCOL$prop]]]] <- df[[file_cols[[IMMCOL$count]]]] / sum(df[[file_cols[[IMMCOL$count]]]])

  # Save full nuc sequences and cut them down to CDR3
  df[[IMMCOL$seq]] <- df[[file_cols[[IMMCOL$cdr3nt]]]]

  # TODO: what if df[["v_index]] has "-1" or something like that?
  df[[file_cols[[IMMCOL$cdr3nt]]]] <- stringr::str_sub(df[[IMMCOL$seq]], df[[v_index_col_name]] + 1, nchar(df[[IMMCOL$seq]]))

  df[[file_cols[[IMMCOL$v]]]] <- .fix.immunoseq.genes(df[[file_cols[[IMMCOL$v]]]])
  df[[file_cols[[IMMCOL$d]]]] <- .fix.immunoseq.genes(df[[file_cols[[IMMCOL$d]]]])
  df[[file_cols[[IMMCOL$j]]]] <- .fix.immunoseq.genes(df[[file_cols[[IMMCOL$j]]]])

  recomb_type <- "VDJ"
  if (recomb_type == "VDJ") {
    df[[file_cols[[IMMCOL$ve]]]] <- df[[n1_index_col_name]] - df[[v_index_col_name]]
    df[[file_cols[[IMMCOL$ds]]]] <- df[[d_index_col_name]] - df[[v_index_col_name]]
    df[[file_cols[[IMMCOL$de]]]] <- df[[n2_index_col_name]] - df[[v_index_col_name]]
    df[[file_cols[[IMMCOL$js]]]] <- df[[j_index_col_name]] - df[[v_index_col_name]]
    file_cols[[IMMCOL$vnj]] <- IMMCOL$vnj
    df[[IMMCOL$vnj]] <- -1
  }

  sample_name_vec <- NA
  if ("sample_name" %in% colnames(df)) {
    if (length(unique(df[["sample_name"]])) > 1) {
      sample_name_vec <- df[["sample_name"]]
    }
  }

  df <- df[unlist(file_cols[IMMCOL$order])]
  names(df) <- IMMCOL$order

  if (.wash.alleles) {
    df <- .remove.alleles(df)
    df[[IMMCOL$v]] <- gsub("([VDJ][0-9]*)$", "\\1-1", df[[IMMCOL$v]], useBytes = TRUE)
    df[[IMMCOL$j]] <- gsub("([VDJ][0-9]*)$", "\\1-1", df[[IMMCOL$j]], useBytes = TRUE)
  }

  if (nrow(df) > 0) {
    if (has_class(df[[IMMCOL$vnj]], "character")) {
      df[[IMMCOL$vnj]][df[[IMMCOL$vnj]] == "no data"] <- NA
    }
    if (has_class(df[[IMMCOL$vnd]], "character")) {
      df[[IMMCOL$vnd]][df[[IMMCOL$vnd]] == "no data"] <- NA
    }
    if (has_class(df[[IMMCOL$dnj]], "character")) {
      df[[IMMCOL$dnj]][df[[IMMCOL$dnj]] == "no data"] <- NA
    }

    df[[IMMCOL$vnj]] <- as.integer(df[[IMMCOL$vnj]])
    df[[IMMCOL$vnd]] <- as.integer(df[[IMMCOL$vnd]])
    df[[IMMCOL$dnj]] <- as.integer(df[[IMMCOL$dnj]])
  }

  df[[IMMCOL$v]][df[[IMMCOL$v]] == "unresolved"] <- NA
  df[[IMMCOL$d]][df[[IMMCOL$d]] == "unresolved"] <- NA
  df[[IMMCOL$j]][df[[IMMCOL$j]] == "unresolved"] <- NA

  if (!is.na(sample_name_vec[1])) {
    df <- lapply(split(df, sample_name_vec), .postprocess)
  } else {
    .postprocess(df)
  }
}

parse_mitcr <- function(.filename, .mode) {
  .skip <- 0
  f <- file(.filename, "r")
  l <- readLines(f, 1)
  # Check for different levels of the MiTCR output
  if (any(stringr::str_detect(l, c("MiTCRFullExport", "mitcr")))) {
    .skip <- 1
  }
  mitcr_format <- 1
  if (stringr::str_detect(l, "MiTCRFullExport") || .skip == 0) {
    mitcr_format <- 2
  }
  close(f)

  if (mitcr_format == 1) {
    filename <- .filename
    .count <- "count"
    nuc.seq <- "cdr3nt"
    aa.seq <- "cdr3aa"
    vgenes <- "v"
    jgenes <- "j"
    dgenes <- "d"
    vend <- "VEnd"
    jstart <- "JStart"
    dstart <- "DStart"
    dend <- "DEnd"
    vd.insertions <- NA
    dj.insertions <- NA
    total.insertions <- NA
    .sep <- "\t"
  } else {
    # Check if there are barcodes
    f <- file(.filename, "r")
    l <- readLines(f, 1 + .skip)[.skip + 1]
    barcodes <- NA
    .count <- "Read count"
    if ("NNNs" %in% strsplit(l, "\t", TRUE)[[1]]) {
      .count <- "NNNs"
    }
    close(f)

    filename <- .filename
    nuc.seq <- "CDR3 nucleotide sequence"
    aa.seq <- "CDR3 amino acid sequence"
    vgenes <- "V segments"
    jgenes <- "J segments"
    dgenes <- "D segments"
    vend <- "Last V nucleotide position"
    jstart <- "First J nucleotide position"
    dstart <- "First D nucleotide position"
    dend <- "Last D nucleotide position"
    vd.insertions <- "VD insertions"
    dj.insertions <- "DJ insertions"
    total.insertions <- "Total insertions"
    .sep <- "\t"
  }

  parse_repertoire(
    .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes,
    .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend,
    .vd.insertions = vd.insertions, .dj.insertions = dj.insertions,
    .total.insertions = total.insertions, .skip = .skip, .sep = .sep
  )
}

parse_mixcr <- function(.filename, .mode) {
  fix.alleles <- function(.data) {
    .data[[IMMCOL$v]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$v]])
    .data[[IMMCOL$d]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$d]])
    .data[[IMMCOL$j]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$j]])
    .data
  }

  .filename <- .filename
  .count <- "clonecount"
  .sep <- "\t"
  .vend <- "allvalignments"
  .jstart <- "alljalignments"
  .dalignments <- "alldalignments"
  .vd.insertions <- "VD.insertions"
  .dj.insertions <- "DJ.insertions"
  .total.insertions <- "Total.insertions"

  table.colnames <- tolower(make.names(read.table(.filename, sep = .sep, skip = 0, nrows = 1, stringsAsFactors = FALSE, strip.white = TRUE, comment.char = "", quote = "")[1, ]))
  table.colnames <- gsub(".", "", table.colnames, fixed = TRUE)

  # Columns of different MiXCR formats
  # Clone count - Clonal sequence(s) - N. Seq. CDR3
  # cloneCount - clonalSequence - nSeqCDR3
  # cloneCount - targetSequences - nSeqImputedCDR3
  # cloneCount - targetSequences - nSeqCDR3
  if ("targetsequences" %in% table.colnames) {
    if ("nseqimputedcdr3" %in% table.colnames) {
      .nuc.seq <- "nseqimputedcdr3"
    } else {
      .nuc.seq <- "nseqcdr3"
    }

    .big.seq <- "targetsequences"
  } else {
    .nuc.seq <- "nseqcdr3"

    if ("clonalsequences" %in% table.colnames) {
      .big.seq <- "clonalsequences"
    } else if ("clonalsequence" %in% table.colnames) {
      .big.seq <- "clonalsequence"
    } else {
      .big.seq <- NA
    }
  }

  if (!("allvalignments" %in% table.colnames)) {
    if ("allvalignment" %in% table.colnames) {
      .vend <- "allvalignment"
    } else {
      .vend <- NA
    }
  }
  if (!("alldalignments" %in% table.colnames)) {
    if ("alldalignment" %in% table.colnames) {
      .dalignments <- "alldalignment"
    } else {
      .dalignments <- NA
    }
  }
  if (!("alljalignments" %in% table.colnames)) {
    if ("alljalignment" %in% table.colnames) {
      .jstart <- "alljalignment"
    } else {
      .jstart <- NA
    }
  }

  if ("bestvhit" %in% table.colnames) {
    .vgenes <- "bestvhit"
  } else if ("allvhits" %in% table.colnames) {
    .vgenes <- "allvhits"
  } else if ("vhits" %in% table.colnames) {
    .vgenes <- "vhits"
  } else if ("allvhitswithscore" %in% table.colnames) {
    .vgenes <- "allvhitswithscore"
  } else if ("bestvgene" %in% table.colnames) {
    .vgenes <- "bestvgene"
  } else {
    message("Error: can't find a column with V genes")
  }

  if ("bestjhit" %in% table.colnames) {
    .jgenes <- "bestjhit"
  } else if ("alljhits" %in% table.colnames) {
    .jgenes <- "alljhits"
  } else if ("jhits" %in% table.colnames) {
    .jgenes <- "jhits"
  } else if ("alljhitswithscore" %in% table.colnames) {
    .jgenes <- "alljhitswithscore"
  } else if ("bestjgene" %in% table.colnames) {
    .jgenes <- "bestjgene"
  } else {
    message("Error: can't find a column with J genes")
  }

  if ("bestdhit" %in% table.colnames) {
    .dgenes <- "bestdhit"
  } else if ("alldhits" %in% table.colnames) {
    .dgenes <- "alldhits"
  } else if ("dhits" %in% table.colnames) {
    .dgenes <- "dhits"
  } else if ("alldhitswithscore" %in% table.colnames) {
    .dgenes <- "alldhitswithscore"
  } else if ("bestdgene" %in% table.colnames) {
    .dgenes <- "bestdgene"
  } else {
    message("Error: can't find a column with D genes")
  }


  # IO_REFACTOR
  df <- read_delim(
    file = .filename, col_types = cols(),
    delim = .sep, skip = 0, comment = "",
    quote = "", escape_double = FALSE, trim_ws = TRUE
  )
  # df <- fread(.filename, data.table = FALSE)

  #
  # return NULL if there is no clonotypes in the data frame
  #
  if (nrow(df) == 0) {
    return(NULL)
  }

  names(df) <- make.names(names(df))
  names(df) <- tolower(gsub(".", "", names(df), fixed = TRUE))
  names(df) <- str_replace_all(names(df), " ", "")

  # check for VJ or VDJ recombination
  # VJ / VDJ / Undeterm
  recomb_type <- "Undeterm"
  if (sum(substr(head(df)[[.vgenes]], 1, 4) %in% c("TCRA", "TRAV", "TRGV", "IGKV", "IGLV"))) {
    recomb_type <- "VJ"
  } else if (sum(substr(head(df)[[.vgenes]], 1, 4) %in% c("TCRB", "TRBV", "TRDV", "IGHV"))) {
    recomb_type <- "VDJ"
  }

  if (!is.na(.vend) && !is.na(.jstart)) {
    .vd.insertions <- "VD.insertions"
    df$VD.insertions <- -1
    if (recomb_type == "VJ") {
      df$VD.insertions <- -1
    } else if (recomb_type == "VDJ") {
      logic <- sapply(strsplit(df[[.dalignments]], "|", TRUE, FALSE, TRUE), length) >= 4 &
        sapply(strsplit(df[[.vend]], "|", TRUE, FALSE, TRUE), length) >= 5
      df$VD.insertions[logic] <-
        as.numeric(sapply(strsplit(df[[.dalignments]][logic], "|", TRUE, FALSE, TRUE), "[[", 4)) -
        as.numeric(sapply(strsplit(df[[.vend]][logic], "|", TRUE, FALSE, TRUE), "[[", 5)) - 1
    }

    .dj.insertions <- "DJ.insertions"
    df$DJ.insertions <- -1
    if (recomb_type == "VJ") {
      df$DJ.insertions <- -1
    } else if (recomb_type == "VDJ") {
      logic <- sapply(strsplit(df[[.jstart]], "|", TRUE, FALSE, TRUE), length) >= 4 &
        sapply(strsplit(df[[.dalignments]], "|", TRUE, FALSE, TRUE), length) >= 5
      df$DJ.insertions[logic] <-
        as.numeric(sapply(strsplit(df[[.jstart]][logic], "|", TRUE, FALSE, TRUE), "[[", 4)) -
        as.numeric(sapply(strsplit(df[[.dalignments]][logic], "|", TRUE, FALSE, TRUE), "[[", 5)) - 1
    }

    # VJ.insertions
    logic <- (sapply(strsplit(df[[.vend]], "|", TRUE, FALSE, TRUE), length) > 4) & (sapply(strsplit(df[[.jstart]], "|", TRUE, FALSE, TRUE), length) >= 4)
    .total.insertions <- "Total.insertions"
    if (recomb_type == "VJ") {
      df$Total.insertions <- NA
      if (length(which(logic)) > 0) {
        df$Total.insertions[logic] <-
          as.numeric(sapply(strsplit(df[[.jstart]][logic], "|", TRUE, FALSE, TRUE), "[[", 4)) - as.numeric(sapply(strsplit(df[[.vend]][logic], "|", TRUE, FALSE, TRUE), "[[", 5)) - 1
      }
    } else if (recomb_type == "VDJ") {
      df$Total.insertions <- df[[.vd.insertions]] + df[[.dj.insertions]]
    } else {
      df$Total.insertions <- NA
    }
    df$Total.insertions[df$Total.insertions < 0] <- -1

    df$V.end <- -1
    df$J.start <- -1
    df[[.vend]] <- gsub(";", "", df[[.vend]], fixed = TRUE)
    logic <- sapply(strsplit(df[[.vend]], "|", TRUE, FALSE, TRUE), length) >= 5
    df$V.end[logic] <- sapply(strsplit(df[[.vend]][logic], "|", TRUE, FALSE, TRUE), "[[", 5)
    logic <- sapply(strsplit(df[[.jstart]], "|", TRUE, FALSE, TRUE), length) >= 4
    df$J.start[logic] <- sapply(strsplit(df[[.jstart]][logic], "|", TRUE, FALSE, TRUE), "[[", 4)
  } else {
    df$V.end <- -1
    df$J.start <- -1
    df$Total.insertions <- -1
    df$VD.insertions <- -1
    df$DJ.insertions <- -1

    .dj.insertions <- "DJ.insertions"
    .vd.insertions <- "VD.insertions"
  }

  .vend <- "V.end"
  .jstart <- "J.start"

  if (!is.na(.dalignments)) {
    logic <- sapply(str_split(df[[.dalignments]], "|"), length) >= 5
    df$D5.end <- -1
    df$D3.end <- -1
    df$D5.end[logic] <- sapply(str_split(df[[.dalignments]][logic], "|"), "[[", 4)
    df$D3.end[logic] <- sapply(str_split(df[[.dalignments]][logic], "|"), "[[", 5)
    .dalignments <- c("D5.end", "D3.end")
  } else {
    df$D5.end <- -1
    df$D3.end <- -1
  }

  .dalignments <- c("D5.end", "D3.end")

  if (!(.count %in% table.colnames)) {
    warn_msg <- c("  [!] Warning: can't find a column with clonal counts. Setting all clonal counts to 1.")
    warn_msg <- c(warn_msg, "\n      Did you apply repLoad to MiXCR file *_alignments.txt?")
    warn_msg <- c(warn_msg, " If so please consider moving all *.clonotypes.*.txt MiXCR files to")
    warn_msg <- c(warn_msg, " a separate folder and apply repLoad to the folder.")
    warn_msg <- c(warn_msg, "\n      Note: The *_alignments.txt file IS NOT a repertoire file suitable for any analysis.")
    message(warn_msg)

    df[[.count]] <- 1
  }
  .freq <- "Proportion"
  df$Proportion <- df[[.count]] / sum(df[[.count]], na.rm = TRUE)

  .aa.seq <- IMMCOL$cdr3aa
  df[[.aa.seq]] <- bunch_translate(df[[.nuc.seq]])

  if (is.na(.big.seq)) {
    .big.seq <- "BigSeq"
    df$BigSeq <- df[[.nuc.seq]]
  }

  df <- df[, make.names(c(
    .count, .freq,
    .nuc.seq, .aa.seq,
    .vgenes, .dgenes, .jgenes,
    .vend, .dalignments, .jstart,
    .total.insertions, .vd.insertions, .dj.insertions, .big.seq
  ))]

  colnames(df) <- IMMCOL$order

  df[[IMMCOL$v]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.,]*[[:digit:]]*[)])", "", df[[IMMCOL$v]])
  df[[IMMCOL$v]] <- gsub(",", ", ", df[[IMMCOL$v]])
  df[[IMMCOL$v]] <- str_replace_all(df[[IMMCOL$v]], '"', "")
  df[[IMMCOL$v]] <- sapply(
    strsplit(df[[IMMCOL$v]], ", ", useBytes = TRUE),
    function(x) paste0(sort(unique(x)), collapse = ", ")
  )

  df[[IMMCOL$d]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.,]*[[:digit:]]*[)])", "", df[[IMMCOL$d]])
  df[[IMMCOL$d]] <- gsub(",", ", ", df[[IMMCOL$d]])
  df[[IMMCOL$d]] <- str_replace_all(df[[IMMCOL$d]], '"', "")
  df[[IMMCOL$d]] <- sapply(
    strsplit(df[[IMMCOL$d]], ", ", useBytes = TRUE),
    function(x) paste0(sort(unique(x)), collapse = ", ")
  )

  df[[IMMCOL$j]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.,]*[[:digit:]]*[)])", "", df[[IMMCOL$j]])
  df[[IMMCOL$j]] <- gsub(",", ", ", df[[IMMCOL$j]])
  df[[IMMCOL$j]] <- str_replace_all(df[[IMMCOL$j]], '"', "")
  df[[IMMCOL$j]] <- sapply(
    strsplit(df[[IMMCOL$j]], ", ", useBytes = TRUE),
    function(x) paste0(sort(unique(x)), collapse = ", ")
  )

  .postprocess(fix.alleles(df))
}

parse_migec <- function(.filename, .mode) {
  filename <- .filename
  nuc.seq <- "CDR3 nucleotide sequence"
  aa.seq <- "CDR3 amino acid sequence"
  .count <- "Good events"
  vgenes <- "V segments"
  jgenes <- "J segments"
  dgenes <- "D segments"
  vend <- "Last V nucleotide position"
  jstart <- "First J nucleotide position"
  dstart <- "First D nucleotide position"
  dend <- "Last D nucleotide position"
  vd.insertions <- "VD insertions"
  dj.insertions <- "DJ insertions"
  total.insertions <- "Total insertions"
  .skip <- 0
  .sep <- "\t"

  parse_repertoire(
    .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count,
    .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes,
    .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend,
    .vd.insertions = vd.insertions, .dj.insertions = dj.insertions,
    .total.insertions = total.insertions, .skip = .skip, .sep = .sep
  )
}

parse_migmap <- function(.filename, .mode) {
  filename <- .filename
  nuc.seq <- "cdr3nt"
  aa.seq <- "cdr3aa"
  .count <- "count"
  vgenes <- "v"
  jgenes <- "j"
  dgenes <- "d"
  vend <- "v.end.in.cdr3"
  jstart <- "j.start.in.cdr3"
  dstart <- "d.start.in.cdr3"
  dend <- "d.end.in.cdr3"
  vd.insertions <- NA
  dj.insertions <- NA
  total.insertions <- NA
  .skip <- 0
  .sep <- "\t"

  parse_repertoire(
    .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes,
    .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend,
    .vd.insertions = vd.insertions, .dj.insertions = dj.insertions,
    .total.insertions = total.insertions, .skip = .skip, .sep = .sep
  )
}

parse_tcr <- function(.filename, .mode) {
  f <- file(.filename, "r")
  l <- readLines(f, 2)[2]
  close(f)

  nuc.seq <- "CDR3.nucleotide.sequence"
  aa.seq <- "CDR3.amino.acid.sequence"
  .count <- "Read.count"
  vgenes <- "V.gene"
  jgenes <- "J.gene"
  dgenes <- "D.gene"
  vend <- "V.end"
  jstart <- "J.start"
  dstart <- "D5.end"
  dend <- "D3.end"
  vd.insertions <- "VD.insertions"
  dj.insertions <- "DJ.insertions"
  total.insertions <- "Total.insertions"
  .skip <- 0
  .sep <- "\t"

  if (substr(l, 1, 2) != "NA") {
    .count <- "Umi.count"
  }

  parse_repertoire(
    .filename = .filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes,
    .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend,
    .vd.insertions = vd.insertions, .dj.insertions = dj.insertions,
    .total.insertions = total.insertions, .skip = .skip, .sep = .sep
  )
}

parse_vdjtools <- function(.filename, .mode) {
  skip <- 0

  # Check for different VDJtools outputs
  f <- file(.filename, "r")
  l <- readLines(f, 1)
  close(f)

  .skip <- 0
  .count <- "count"
  filename <- .filename
  nuc.seq <- "cdr3nt"
  aa.seq <- "CDR3aa"
  vgenes <- "V"
  jgenes <- "J"
  dgenes <- "D"
  vend <- "Vend"
  jstart <- "Jstart"
  dstart <- "Dstart"
  dend <- "Dend"
  vd.insertions <- NA
  dj.insertions <- NA
  total.insertions <- NA
  .sep <- "\t"

  if (length(strsplit(l, "-", TRUE)) > 0) {
    if (length(strsplit(l, "-", TRUE)[[1]]) == 3) {
      if (strsplit(l, "-", TRUE)[[1]][2] == "header") {
        .count <- "count"
        .skip <- 1
      }
    } else if (tolower(substr(l, 1, 2)) == "#s") {
      .count <- "#Seq. count"
      nuc.seq <- "N Sequence"
      aa.seq <- "AA Sequence"
      vgenes <- "V segments"
      jgenes <- "J segments"
      dgenes <- "D segment"
      vend <- NA
      jstart <- NA
      dstart <- NA
      dend <- NA
    } else if (stringr::str_detect(l, "#")) {
      .count <- "X.count"
    } else {
      .count <- "count"
    }
  }

  parse_repertoire(
    .filename = filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes,
    .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend,
    .vd.insertions = vd.insertions, .dj.insertions = dj.insertions,
    .total.insertions = total.insertions, .skip = .skip, .sep = .sep
  )
}

parse_imgt <- function(.filename, .mode) {
  .fix.imgt.alleles <- function(.col) {
    sapply(strsplit(.col, " "), function(x) {
      if (length(x) > 1) {
        x[[2]]
      } else {
        NA
      }
    })
  }

  f <- file(.filename, "r")
  l <- readLines(f, 2)[2]
  close(f)

  nuc.seq <- "JUNCTION"
  aa.seq <- NA
  .count <- NA
  vgenes <- "V-GENE and allele"
  jgenes <- "J-GENE and allele"
  dgenes <- "D-GENE and allele"
  vend <- "3'V-REGION end"
  jstart <- "5'J-REGION start"
  dstart <- "D-REGION start"
  dend <- "D-REGION end"
  vd.insertions <- NA
  dj.insertions <- NA
  total.insertions <- NA
  .skip <- 0
  .sep <- "\t"
  junc_start <- .make_names("JUNCTION start")

  df <- parse_repertoire(
    .filename = .filename, .mode = .mode, .nuc.seq = nuc.seq, .aa.seq = aa.seq, .count = .count, .vgenes = vgenes, .jgenes = jgenes, .dgenes = dgenes,
    .vend = vend, .jstart = jstart, .dstart = dstart, .dend = dend,
    .vd.insertions = vd.insertions, .dj.insertions = dj.insertions,
    .total.insertions = total.insertions, .skip = .skip, .sep = .sep, .add = junc_start
  )

  df[[IMMCOL$ve]] <- df[[IMMCOL$ve]] - df[[junc_start]]
  df[[IMMCOL$ds]] <- df[[IMMCOL$ds]] - df[[junc_start]]
  df[[IMMCOL$de]] <- df[[IMMCOL$de]] - df[[junc_start]]
  df[[IMMCOL$js]] <- df[[IMMCOL$js]] - df[[junc_start]]

  df[[IMMCOL$ve]][df[[IMMCOL$ve]] < 0] <- NA
  df[[IMMCOL$ds]][df[[IMMCOL$ds]] < 0] <- NA
  df[[IMMCOL$de]][df[[IMMCOL$de]] < 0] <- NA
  df[[IMMCOL$js]][df[[IMMCOL$js]] < 0] <- NA

  df[[IMMCOL$v]] <- .fix.imgt.alleles(df[[IMMCOL$v]])
  df[[IMMCOL$d]] <- .fix.imgt.alleles(df[[IMMCOL$d]])
  df[[IMMCOL$j]] <- .fix.imgt.alleles(df[[IMMCOL$j]])

  df[[junc_start]] <- NULL

  df
}

# parse_vidjil <- function (.filename) {
#   stop(IMMUNR_ERROR_NOT_IMPL)
# }
#
# parse_rtcr <- function (.filename) {
#   stop(IMMUNR_ERROR_NOT_IMPL)
# }
#
# parse_imseq <- function (.filename) {
#   stop(IMMUNR_ERROR_NOT_IMPL)
# }

parse_airr <- function(.filename, .mode) {
  df <- airr::read_rearrangement(.filename)

  df <- df %>%
    select(
      sequence, v_call, d_call, j_call, junction, junction_aa,
      contains("v_germline_end"), contains("d_germline_start"), contains("d_germline_end"),
      contains("j_germline_start"), contains("np1_length"), contains("np2_length"),
      contains("duplicate_count")
    )

  namekey <- c(
    duplicate_count = IMMCOL$count, junction = IMMCOL$cdr3nt, junction_aa = IMMCOL$cdr3aa,
    v_call = IMMCOL$v, d_call = IMMCOL$d, j_call = IMMCOL$j, v_germline_end = IMMCOL$ve,
    d_germline_start = IMMCOL$ds, d_germline_end = IMMCOL$de, j_germline_start = IMMCOL$js,
    np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq
  )

  names(df) <- namekey[names(df)]

  if (!("unidins" %in% colnames(df))) {
    df["unidins"] <- NA
  }

  recomb_type <- .which_recomb_type(df[[IMMCOL$v]])

  if (!is.na(recomb_type)) {
    if (recomb_type == "VJ") {
      df[IMMCOL$vnj] <- df["unidins"]
      df[IMMCOL$vnd] <- NA
      df[IMMCOL$dnj] <- NA
    } else if (recomb_type == "VDJ") {
      df[IMMCOL$vnj] <- NA
      df[IMMCOL$vnd] <- df["unidins"]
    }
  }

  for (column in IMMCOL$order) {
    if (!(column %in% colnames(df))) {
      df[column] <- NA
    }
  }

  df <- df[IMMCOL$order]
  total <- sum(df$Clones)
  df[IMMCOL$prop] <- df[IMMCOL$count] / total
  df[IMMCOL$seq] <- stringr::str_remove_all(df[[IMMCOL$seq]], "N")
  df <- .postprocess(df)
  df
}

parse_immunarch <- function(.filename, .mode) {
  df <- readr::read_tsv(.filename, col_types = cols(), comment = "#")
  if (ncol(df) == 1) {
    # "," in the files, parse differently then
    df <- readr::read_csv(.filename, col_types = cols(), comment = "#")
  }
  df <- .postprocess(df)
  df
}

parse_10x_consensus <- function(.filename, .mode) {
  df <- parse_repertoire(.filename,
    .mode = .mode,
    .nuc.seq = "cdr3_nt", .aa.seq = NA, .count = "umis",
    .vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene",
    .vend = NA, .jstart = NA, .dstart = NA, .dend = NA,
    .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA,
    .skip = 0, .sep = ",", .add = c("chain", "clonotype_id", "consensus_id")
  )
  setnames(df, "clonotype_id", "ClonotypeID")
  setnames(df, "consensus_id", "ConsensusID")
  df
}

parse_10x_filt_contigs <- function(.filename, .mode) {
  df <- parse_repertoire(.filename,
    .mode = .mode,
    .nuc.seq = "cdr3_nt", .aa.seq = NA, .count = "umis",
    .vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene",
    .vend = NA, .jstart = NA, .dstart = NA, .dend = NA,
    .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA,
    .skip = 0, .sep = ",", # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id")
    .add = c("chain", "barcode", "raw_clonotype_id", "contig_id")
  )
  # setnames(df, "raw_clonotype_id", "RawClonotypeID")
  # setnames(df, "raw_consensus_id", "RawConsensusID")

  # Process 10xGenomics filtered contigs files - count barcodes, merge consensues ids, clonotype ids and contig ids
  df <- df[order(df$chain),]
  setDT(df)

  if (.mode == "paired") {
    df <- df %>%
      lazy_dt() %>%
      group_by(barcode, raw_clonotype_id) %>%
      summarise(
        CDR3.nt = paste0(CDR3.nt, collapse = IMMCOL_ADD$scsep),
        CDR3.aa = paste0(CDR3.aa, collapse = IMMCOL_ADD$scsep),
        V.name = paste0(V.name, collapse = IMMCOL_ADD$scsep),
        J.name = paste0(J.name, collapse = IMMCOL_ADD$scsep),
        D.name = paste0(D.name, collapse = IMMCOL_ADD$scsep),
        chain = paste0(chain, collapse = IMMCOL_ADD$scsep),
        # raw_clonotype_id = gsub("clonotype", "", paste0(raw_clonotype_id, collapse = IMMCOL_ADD$scsep)),
        # raw_consensus_id = gsub("clonotype|consensus", "", paste0(raw_consensus_id, collapse = IMMCOL_ADD$scsep)),
        contig_id = gsub("_contig_", "", paste0(contig_id, collapse = IMMCOL_ADD$scsep))
      ) %>%
      as.data.table()
  }
  df <- df %>%
    lazy_dt() %>%
    group_by(CDR3.nt, V.name, J.name) %>%
    summarise(
      Clones = length(unique(barcode)),
      CDR3.aa = first(CDR3.aa),
      D.name = first(D.name),
      chain = first(chain),
      barcode = paste0(unique(barcode), collapse = IMMCOL_ADD$scsep),
      raw_clonotype_id = gsub("clonotype|None", "", paste0(unique(raw_clonotype_id), collapse = IMMCOL_ADD$scsep)),
      # raw_clonotype_id = gsub("clonotype", "", paste0(raw_clonotype_id, collapse = IMMCOL_ADD$scsep)),
      # raw_consensus_id = gsub("clonotype|consensus", "", paste0(raw_consensus_id, collapse = IMMCOL_ADD$scsep)),
      contig_id = paste0(contig_id, collapse = IMMCOL_ADD$scsep)
    ) %>%
    as.data.table()

  df$V.end <- NA
  df$J.start <- NA
  df$D.end <- NA
  df$D.start <- NA
  df$VD.ins <- NA
  df$DJ.ins <- NA
  df$VJ.ins <- NA
  df$Sequence <- df$CDR3.nt

  setnames(df, "contig_id", "ContigID")
  setnames(df, "barcode", "Barcode")

  df[[IMMCOL$prop]] <- df[[IMMCOL$count]] / sum(df[[IMMCOL$count]])
  setcolorder(df, IMMCOL$order)

  setDF(df)

  .postprocess(df)
}

parse_archer <- function(.filename, .mode) {
  parse_repertoire(.filename,
    .mode = .mode,
    .nuc.seq = "Clonotype Sequence", .aa.seq = NA, .count = "Clone Abundance",
    .vgenes = "Predicted V Region", .jgenes = "Predicted J Region", .dgenes = "Predicted D Region",
    .vend = NA, .jstart = NA, .dstart = NA, .dend = NA,
    .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA,
    .skip = 0, .sep = "\t"
  )
}

##### Savers #####

save_immunarch <- function(.data, .path, .compress = TRUE) {
  if (.compress) {
    filepath <- gzfile(paste0(.path, ".tsv.gz"), compression = 9)
  } else {
    filepath <- paste0(.path, ".tsv")
  }
  readr::write_lines(paste0("# Exported from immunarch ", packageVersion("immunarch"), " https://immunarch.com"),
    path = filepath
  )
  filepath <- gzfile(paste0(.path, ".tsv.gz"), compression = 9)
  readr::write_tsv(x = .data, path = filepath, append = TRUE, col_names = TRUE)
}

save_vdjtools <- function(.data, .path, .compress = TRUE) {
  if (.compress) {
    filepath <- gzfile(paste0(.path, ".tsv.gz"), compression = 9)
  } else {
    filepath <- paste0(.path, ".tsv")
  }

  old <- c(
    IMMCOL$count,
    IMMCOL$prop,
    IMMCOL$cdr3nt,
    IMMCOL$cdr3aa,
    IMMCOL$v,
    IMMCOL$d,
    IMMCOL$j
  )

  new <- c(
    "#Seq. Count",
    "Percent",
    "N Sequence",
    "AA Sequence",
    "V Segments",
    "D Segment",
    "J Segments"
  )

  names(.data) <- plyr::mapvalues(names(.data),
    from = old,
    to = as.character(new)
  )

  readr::write_tsv(x = .data, path = filepath)
}
abrown435/immunarch-test documentation built on July 29, 2020, 12:04 a.m.