R/readImmunoSeq.R
In LymphoSeq: Analyze high-throughput sequencing of T and B cell receptors

Documented in readImmunoSeq

#' Read ImmunoSeq files
#' 
#' Imports tab-separated value (.tsv) files exported by the Adaptive 
#' Biotechnologies ImmunoSEQ analyzer and stores them as a list of data frames.
#' 
#' @param path Path to the directory containing tab-delimited files.  Only 
#' files with the extension .tsv are imported.  The names of the data frames are 
#' the same as names of the files.
#' @param columns Column names from the tab-delimited files that you desire to 
#' import, all others will be disregarded.  May use "all" to import all columns.  
#' A warning may be called if columns contain no data or must be coereced to a 
#' different class.  Usually this warning can be ignored.
#' @param recursive A Boolean value indicating whether tab-delimited files in 
#' subdirectories should be imported.  If TRUE, then all files in the parent as 
#' well as the subdirectory are imported.  If FALSE, then only files in the 
#' parent directory are imported.
#' @details May import tab-delimited files containing antigen receptor 
#' sequencing from other sources (e.g. miTCR or miXCR) as long as the column 
#' names are the same as used by ImmunoSEQ files.  Available column headings in 
#' ImmunoSEQ files are:  
#' "nucleotide", "aminoAcid", "count", "count (templates)", "count (reads)", 
#' "count (templates/reads)", "frequencyCount", "frequencyCount (\%)", "cdr3Length", 
#' "vMaxResolved", "vFamilyName", "vGeneName", "vGeneAllele", "vFamilyTies", 
#' "vGeneNameTies", "vGeneAlleleTies", "dMaxResolved", "dFamilyName", "dGeneName", 
#' "dGeneAllele", "dFamilyTies", "dGeneNameTies", "dGeneAlleleTies", "jMaxResolved", 
#' "jFamilyName", "jGeneName", "jGeneAllele", "jFamilyTies", "jGeneNameTies", 
#' "jGeneAlleleTies", "vDeletion", "d5Deletion", "d3Deletion", "jDeletion", 
#' "n2Insertion", "n1Insertion", "vIndex", "n2Index", "dIndex", "n1Index", 
#' "jIndex", "estimatedNumberGenomes", "sequenceStatus", "cloneResolved", 
#' "vOrphon", "dOrphon", "jOrphon", "vFunction", "dFunction", "jFunction", 
#' "fractionNucleated".  
#'  
#' IMPORTANT: be aware that Adaptive has changed the 
#' column names of their files over time and if the headings of your files are 
#' inconsistent, then specify column = "all" or include all variations of the 
#' headings you want to important.  For example, column = c("count", 
#' "count (templates)", "count (reads)").  Also be aware that the "count" 
#' column previously reported the number of sequencing reads in earlier 
#' versions of ImmunoSEQ but now is equivalent to the 
#' "estimatedNumberGenomes" column.
#' @return Returns a list of data frames.  The names of each data frame are
#' assigned according to the original ImmunoSEQ file names.
#' @examples
#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
#' 
#' file.list <- readImmunoSeq(path = file.path, 
#'                            columns = c("aminoAcid", "nucleotide", "count", 
#'                                      "count (templates)", "count (reads)", 
#'                                      "count (templates/reads)",
#'                                      "frequencyCount", "frequencyCount (%)", 
#'                                      "estimatedNumberGenomes"), 
#'                            recursive = FALSE)
#' @export
#' @importFrom data.table fread
#' @importFrom plyr llply
readImmunoSeq <- function(path, columns = c("aminoAcid", "nucleotide", "count", 
                                            "count (templates)", "count (reads)", 
                                            "count (templates/reads)",
                                            "frequencyCount", "frequencyCount (%)", 
                                            "estimatedNumberGenomes", "vFamilyName", 
                                            "dFamilyName", "jFamilyName","vGeneName", 
                                            "dGeneName", "jGeneName"), 
                          recursive = FALSE) {
    file.paths1 <- list.files(path, full.names = TRUE, all.files = FALSE, 
                             recursive = recursive, pattern = ".tsv", 
                             include.dirs = FALSE)
    file.info <- file.info(file.paths1)
    file.paths2 <- rownames(file.info)[file.info$size > 0]
    if(!identical(file.paths1,file.paths2)){
        warning("One or more of the files you are trying to import has no sequences and will be ignored.", call. = FALSE)
    }
    if (any(columns == "all")) {
        file.list <- suppressWarnings(plyr::llply(file.paths2, data.table::fread, 
                                 data.table = FALSE, .progress = "text"))
    } else {
        file.list <- suppressWarnings(plyr::llply(file.paths2, data.table::fread, select = columns, 
                                 data.table = FALSE, .progress = "text"))
        if(length(unique(plyr::llply(file.list, ncol))) > 1){
            warning("One or more of the files you are trying to import do not contain all the columns you specified.", call. = FALSE)
        }
    }
    file.names <- gsub(".tsv", "", basename(file.paths2))
    names(file.list) <- file.names
    i <- 1
    for (i in 1:length(file.list)) {
        colnames(file.list[[i]]) <- ifelse(grepl("frequencyCount*", 
                                                 colnames(file.list[[i]])), 
                                           "frequencyCount", 
                                           colnames(file.list[[i]]))
        colnames(file.list[[i]]) <- ifelse(grepl("count*", 
                                                 colnames(file.list[[i]])), 
                                           "count", 
                                           colnames(file.list[[i]]))
    }
    return(file.list)
}