R/Sequence.R
In alakazam: Immunoglobulin Clonal Lineage and Diversity Analysis

Documented in collapseDuplicates extractVRegion getAAMatrix getDNAMatrix junctionAlignment maskSeqEnds maskSeqGaps nonsquareDist padSeqEnds pairwiseDist seqDist

# Common DNA, amino acid, and gene annotation operations for Alakazam

#### Distance functions ####

#' Build a DNA distance matrix
#'
#' \code{getDNAMatrix} returns a Hamming distance matrix for IUPAC ambiguous
#' DNA characters with modifications for gap, \code{c("-", ".")}, and missing, 
#' \code{c("?")}, character values.
#' 
#' @param    gap  value to assign to characters in the set \code{c("-", ".")}.
#' 
#' @return   A \code{matrix} of DNA character distances with row and column names 
#'           indicating the character pair. By default, distances will be either 0 
#'           (equivalent), 1 (non-equivalent or missing), or -1 (gap). 
#' 
#' @seealso  Creates DNA distance matrix for \link{seqDist}.
#'           See \link{getAAMatrix} for amino acid distances.
#' 
#' @examples
#' # Set gap characters to Inf distance
#' # Distinguishes gaps from Ns
#' getDNAMatrix()
#' 
#' # Set gap characters to 0 distance
#' # Makes gap characters equivalent to Ns
#' getDNAMatrix(gap=0)
#' 
#' @export
getDNAMatrix <- function(gap=-1) {
    # Define Hamming distance matrix
    sub_mat <- diag(18)
    colnames(sub_mat) <- rownames(sub_mat) <- c(names(IUPAC_DNA), c("-", ".", "?"))
    for (i in 1:length(IUPAC_DNA)) {
        for (j in i:length(IUPAC_DNA)) {
            sub_mat[i, j] <- sub_mat[j, i] <- any(IUPAC_DNA[[i]] %in% IUPAC_DNA[[j]])
        }
    }
    
    # Add gap characters
    sub_mat[c("-", "."), c("-", ".")] <- 1 
    sub_mat[c("-", "."), 1:15] <- 1 - gap 
    sub_mat[1:15, c("-", ".")] <- 1 - gap
    
    return(1 - sub_mat)
}


#' Build an AA distance matrix
#'
#' \code{getAAMatrix} returns a Hamming distance matrix for IUPAC ambiguous
#' amino acid characters.
#' 
#' @param    gap  value to assign to characters in the set \code{c("-", ".")}.
#' 
#' @return   A \code{matrix} of amino acid character distances with row and column names 
#'           indicating the character pair.
#' 
#' @seealso  Creates an amino acid distance matrix for \link{seqDist}.
#'           See \link{getDNAMatrix} for nucleotide distances.
#' 
#' @examples
#' getAAMatrix()
#' 
#' @export
getAAMatrix <- function(gap=0) {
  # Define Hamming distance matrix
  sub_mat <- diag(27)
  colnames(sub_mat) <- rownames(sub_mat) <- c(names(IUPAC_AA), c("-", "."))
  for (i in 1:length(IUPAC_AA)) {
    for (j in i:length(IUPAC_AA)) {
      sub_mat[i, j] <- sub_mat[j, i] <- any(IUPAC_AA[[i]] %in% IUPAC_AA[[j]])
    }
  }
  
  # Add gap characters
  sub_mat[c("-", "."), c("-", ".")] <- 1 
  sub_mat[c("-", "."), c(1:27)] <- 1 - gap 
  sub_mat[c(1:27), c("-", ".")] <- 1 - gap
  
  return(1 - sub_mat)
}


#' Remove duplicate DNA sequences and combine annotations
#' 
#' \code{collapseDuplicates} identifies duplicate DNA sequences, allowing for ambiguous 
#' characters, removes the duplicate entries, and combines any associated annotations.
#'
#' @param    data         data.frame containing Change-O columns. The data.frame 
#'                        must contain, at a minimum, a unique identifier column 
#'                        and a column containg a character vector of DNA sequences.
#' @param    id           name of the column containing sequence identifiers.
#' @param    seq          name of the column containing DNA sequences.
#' @param    text_fields  character vector of textual columns to collapse. The textual 
#'                        annotations of duplicate sequences will be merged into a single 
#'                        string with each unique value alphabetized and delimited by 
#'                        \code{sep}.
#' @param    num_fields   vector of numeric columns to collapse. The numeric annotations
#'                        of duplicate sequences will be summed. 
#' @param    seq_fields   vector of nucletoide sequence columns to collapse. The sequence 
#'                        with the fewest numer of non-informative characters will be 
#'                        retained. Where a non-informative character is one of 
#'                        \code{c("N", "-", ".", "?")}. Note, this is distinct from the 
#'                        \code{seq} parameter which is used to determine duplicates.
#' @param    add_count    if \code{TRUE} add the column \code{collpase_count} that 
#'                        indicates the number of sequences that were collapsed to build 
#'                        each unique entry.
#' @param    ignore       vector of characters to ignore when testing for equality.
#' @param    sep          character to use for delimiting collapsed annotations in the 
#'                        \code{text_fields} columns. Defines both the input and output 
#'                        delimiter.
#' @param    dry          if \code{TRUE} perform dry run. Only labels the sequences without 
#'                        collapsing them.
#' @param    verbose      if \code{TRUE} report the number input, discarded and output 
#'                        sequences; if \code{FALSE} process sequences silently.
#'                        
#' @return   A modified \code{data} data.frame with duplicate sequences removed and 
#'           annotation fields collapsed if \code{dry=FALSE}. If \code{dry=TRUE}, 
#'           sequences will be labeled with the collapse action, but the input will be
#'           otherwise unmodifed (see Details).
#'           
#' @details
#' \code{collapseDuplicates} identifies duplicate sequences in the \code{seq} column by
#' testing for character identity, with consideration of IUPAC ambiguous nucleotide codes. 
#' A cluster of sequences are considered duplicates if they are all equivalent, and no 
#' member of the cluster is equivalent to a sequence in a different cluster. 
#' 
#' Textual annotations, specified by \code{text_fields}, are collapsed by taking the unique
#' set of values within in each duplicate cluster and delimiting those values by \code{sep}.
#' Numeric annotations, specified by \code{num_fields}, are collapsed by summing all values 
#' in the duplicate cluster. Sequence annotations, specified by \code{seq_fields}, are 
#' collapsed by retaining the first sequence with the fewest number of N characters.
#' 
#' Columns that are not specified in either \code{text_fields}, \code{num_fields}, or 
#' \code{seq_fields} will be retained, but the value will be chosen from a random entry 
#' amongst all sequences in a cluster of duplicates.
#' 
#' An ambiguous sequence is one that can be assigned to two different clusters, wherein
#' the ambiguous sequence is equivalent to two sequences which are themselves 
#' non-equivalent. Ambiguous sequences arise due to ambiguous characters at positions that
#' vary across sequences, and are discarded along with their annotations when \code{dry=FALSE}. 
#' Thus, ambiguous sequences are removed as duplicates of some sequence, but do not create a potential
#' false-positive annotation merger. Ambiguous sequences are not included in the 
#' \code{collapse_count} annotation that is added when \code{add_count=TRUE}.
#' 
#' If \code{dry=TRUE} sequences will not be removed from the input. Instead, the following columns
#' will be appended to the input defining the collapse action that would have been performed in the
#' \code{dry=FALSE} case.
#' 
#' \itemize{
#'   \item  \code{collapse_id}:     an identifer for the group of identical sequences.
#'   \item  \code{collapse_class}:  string defining how the sequence matches to the other in the set.
#'                                  one of \code{"duplicated"} (has duplicates),
#'                                  \code{"unique"} (no duplicates), \code{"ambiguous_duplicate"} 
#'                                  (no duplicates after ambiguous sequences are removed), 
#'                                  or \code{"ambiguous"} (matches multiple non-duplicate sequences).
#'   \item  \code{collapse_pass}:   \code{TRUE} for the sequences that would be retained.
#' }
#' 
#' @seealso  Equality is tested with \link{seqEqual} and \link{pairwiseEqual}. 
#'           For IUPAC ambiguous character codes see \link{IUPAC_DNA}.
#'
#' @examples
#' # Example data.frame
#' db <- data.frame(sequence_id=LETTERS[1:4],
#'                  sequence_alignment=c("CCCCTGGG", "CCCCTGGN", "NAACTGGN", "NNNCTGNN"),
#'                  c_call=c("IGHM", "IGHG", "IGHG", "IGHA"),
#'                  sample_id=c("S1", "S1", "S2", "S2"),
#'                  duplicate_count=1:4,
#'                  stringsAsFactors=FALSE)
#' 
#' # Annotations are not parsed if neither text_fields nor num_fields is specified
#' # The retained sequence annotations will be random
#' collapseDuplicates(db, verbose=TRUE)
#' 
#' # Unique text_fields annotations are combined into a single string with ","
#' # num_fields annotations are summed
#' # Ambiguous duplicates are discarded
#' collapseDuplicates(db, text_fields=c("c_call", "sample_id"), num_fields="duplicate_count", 
#'                    verbose=TRUE)
#'
#' # Use alternate delimiter for collapsing textual annotations
#' collapseDuplicates(db, text_fields=c("c_call", "sample_id"), num_fields="duplicate_count", 
#'                    sep="/", verbose=TRUE)
#' 
#' # Add count of duplicates
#' collapseDuplicates(db, text_fields=c("c_call", "sample_id"), num_fields="duplicate_count", 
#'                    add_count=TRUE, verbose=TRUE)
#' 
#' # Masking ragged ends may impact duplicate removal
#' db$sequence_alignment <- maskSeqEnds(db$sequence_alignment)
#' collapseDuplicates(db, text_fields=c("c_call", "sample_id"), num_fields="duplicate_count", 
#'                    add_count=TRUE, verbose=TRUE)
#'
#' @export
collapseDuplicates <- function(data, id="sequence_id", seq="sequence_alignment",
                               text_fields=NULL, num_fields=NULL, seq_fields=NULL,
                               add_count=FALSE, ignore=c("N", "-", ".", "?"), 
                               sep=",", dry=FALSE, verbose=FALSE) {
    # Stop if ids are not unique
    if (any(duplicated(data[[id]]))) {
        stop("All values in the id column are not unique")
    }
    # Verify column classes and exit if they are incorrect
    if (!is.null(text_fields)) {
        if (!all(sapply(subset(data, select=text_fields), is.character))) {
            stop("All text_fields columns must be of type 'character'")
        }
    }
    if (!is.null(num_fields)) {
        if (!all(sapply(subset(data, select=num_fields), is.numeric))) {
            stop("All num_fields columns must be of type 'numeric'")
        }
    }
    if (!is.null(seq_fields)) {
        if (!all(sapply(subset(data, select=seq_fields), is.character))) {
            stop("All seq_fields columns must be of type 'character'")
        }
    }
    seq_len <- stri_length(data[[seq]])
    if (any(seq_len != seq_len[1])) {
        warning("All sequences are not the same length for data with first ", 
                id, " = ", data[[id]][1])
    }
    
    # Define verbose reporting function
    .printVerbose <- function(n_total, n_unique, n_discard) {
        cat(" FUNCTION> collapseDuplicates\n", sep="")
        cat(" FIRST_ID> ", data[[id]][1], "\n", sep="")
        cat("    TOTAL> ", n_total, "\n", sep="")
        cat("   UNIQUE> ", n_unique, "\n", sep="")
        cat("COLLAPSED> ", n_total - n_unique - n_discard, "\n", sep="")
        cat("DISCARDED> ", n_discard, "\n", sep="")
        cat("\n")
    }
    
    # Define function to count informative positions in sequences
    .informativeLength <- function(x) {
        stri_length(gsub("[N\\-\\.\\?]", "", x, perl=TRUE))
    }
    
    # Initialize collapse_count with 1 for each sequence
    if(add_count) {
        data[["collapse_count"]] <- rep(1, nrow(data))
        num_fields <- c(num_fields, "collapse_count")
    }
    
    # Initialize dry run columns 
    if (dry) {
        data$collapse_id <- NA
        data$collapse_class <- NA
        data$collapse_pass <- TRUE
    }
    
    # Return input if there are no sequences to collapse
    nseq <- nrow(data)
    if (nseq <= 1) { 
        if (verbose) { .printVerbose(nseq, 1, 0) }
        if (dry) {
            data[['collapse_id']] <- 1
            data[['collapse_class']] <- "unique"
            data[['collapse_pass']] <- TRUE            
        }
        return(data)
    }
    
    # Build distance matrix
    exact_duplicates <- any(duplicated(data[[seq]]))
    d_mat <- pairwiseEqual(unique(data[[seq]]))
    colnames(d_mat) <- rownames(d_mat) <- unique(data[[seq]])
    n_uniqueseq <- nrow(d_mat)
    
    # Return input if no sequences are equal
    if (!any(d_mat[lower.tri(d_mat, diag=F)]) & !exact_duplicates) {
        if (verbose) { .printVerbose(nseq, nseq, 0) }
        if (dry) {
            data[['collapse_id']] <- 1:nrow(data)
            data[['collapse_class']] <- "unique"
            data[['collapse_pass']] <- TRUE            
        }
        return(data)
    }        
    
    # Find sequences that will cluster ambiguously
    ambig_rows <- numeric()
    for (i in 1:n_uniqueseq) {
        idx <- which(d_mat[i, ])
        tmp_mat <- d_mat[idx, idx]
        if (!all(tmp_mat)) { 
            ambig_rows <- append(ambig_rows, i) 
        }
    }
    discard_count <- length(ambig_rows)
    
    # from ambiguous rows in d_mat to 
    # ambiguous rows in data
    data_ambig_rows <- data[[seq]] %in% rownames(d_mat)[ambig_rows]
    data_discard_count <- sum(data_ambig_rows)
    
    if (dry & length(ambig_rows)>0) {
        data[["collapse_class"]][data_ambig_rows] <- "ambiguous"
        data[["collapse_pass"]][data_ambig_rows] <- FALSE
    }
    
    # Return single sequence if all or all but one sequence belong to ambiguous clusters 
	if (nrow(data) - data_discard_count <= 1) {
        inform_len <- data.frame(list("inform_len"=.informativeLength(data[[seq]])))
        # For each ambiguous cluster, return the best sequence
        g <- igraph::simplify(igraph::graph_from_adjacency_matrix(d_mat))
        inform_len$clusters <- igraph::components(g)$membership[data[[seq]]]
        inform_len$select_id <- 1:nrow(inform_len)
        selected <- inform_len %>%
            dplyr::group_by(!!rlang::sym("clusters")) %>%
            dplyr::slice(which.max(!!rlang::sym("inform_len"))) %>%
            dplyr::ungroup() %>%
            dplyr::select(!!rlang::sym("select_id")) %>% unlist() 
        
        if (verbose) { .printVerbose(nseq, 0, discard_count - 1) }
        if (dry) {
            data[["collapse_id"]] <- inform_len$clusters
            data[["collapse_pass"]][selected] <- TRUE
        } else {
            return(data[selected, ])
        }
    }
    
    # Exclude ambiguous sequences from clustering
    if (!dry & discard_count > 0) {
            d_mat <- d_mat[-ambig_rows, -ambig_rows]
            data <- data[!data_ambig_rows,]
    }
    
    # Cluster remaining sequences into unique and duplicate sets
    dup_taxa <-  list()
    uniq_taxa <- character()
    done_taxa <- character()
    taxa_names <- rownames(d_mat)
    collapse_id <- 1
    
    for (taxa_i in 1:length(taxa_names)) {
        
        taxa <- taxa_names[taxa_i]
        data_taxa_i <- which(data[[seq]] %in% taxa)
        
        # Skip taxa if previously assigned to a cluster
        # or if ambiguous
        # (ambiguous taxa don't get their own collapse_id)
        if (taxa %in% done_taxa) { next }
        if (dry & taxa_i %in% ambig_rows) { next }
        
        # Find all zero distance taxa
        idx <- which(d_mat[taxa, ])
        # Translate from d_mat idx to data idx
        data_idx <- which(data[[seq]] %in% colnames(d_mat)[idx])
        
        # Update vector of clustered taxa
        done_taxa <- c(done_taxa, taxa_names[idx])
        
        # Update collapse group
        if (dry) {
            data[["collapse_id"]][data_idx] <- paste(data[["collapse_id"]][data_idx], collapse_id, sep=",")
        }
        
        if (dry) {
            #idx_copy <- idx
            data_idx_copy <- data_idx
            idx <- idx[idx %in% ambig_rows == FALSE]
            data_idx <- which(data[[seq]] %in% colnames(d_mat)[idx])
        }
        
        if (length(data_idx) == 1) {
            # Assign unique sequences to unique vector
            uniq_taxa <- append(uniq_taxa, taxa_names[idx])
            if (dry) {
                if (length(data_idx_copy)==1) {
                    ## 'truly' unique
                    data[["collapse_class"]][data_taxa_i] <- "unique"    
                } else {
                    ## unique after ambiguous removal
                    data[["collapse_class"]][data_taxa_i] <- "ambiguous_duplicate"
                }
                data[["collapse_pass"]][data_taxa_i] <- TRUE
            }
        } else if (length(data_idx) > 1) {
            # Assign clusters of duplicates to duplicate list            
            dup_taxa <- c(dup_taxa, list(taxa_names[idx]))    
            if (dry) {
                # Keep collpase_pass==TRUE for the sequence with the
                # larger number of informative positions 
                # (the first one if ties)
                max_info_idx <- which.max(.informativeLength(data[[seq]][data_idx]))[1]
                data[["collapse_class"]][data_idx] <- "duplicated"
                data[["collapse_pass"]][data_idx[-max_info_idx]] <- FALSE
            }
        } else {
            # Report error (should never occur)
                stop("Error in distance matrix of collapseDuplicates")
        }
        
        collapse_id <- collapse_id + 1
    }
   
    if (dry) {
        data[["collapse_id"]] <-  sub("^NA,","",data[["collapse_id"]])
        return(data)
    }
    
    # Collapse duplicate sets and append entries to unique data.frame
    unique_list <- list(data[data[[seq]] %in% uniq_taxa, ])
    for (taxa in dup_taxa) {
        # Define row indices of identical sequences
        idx <- which(data[[seq]] %in% taxa)
        tmp_df <- data[idx[1], ]

        if (length(idx) > 1) {

            # Initialize with data from most informative sequence
            seq_set <- data[idx, c(id, seq)]
            inform_len <- .informativeLength(seq_set[[seq]])
            max_inform <- which.max(inform_len)[1] # if ties, pick first
            tmp_df <- data[idx[max_inform], ]

            # Define set of text fields for row
            for (f in text_fields) {
                f_set <- na.omit(data[[f]][idx])
                if (length(f_set) > 0) {
                    f_set <- unlist(strsplit(f_set, sep))
                    f_set <- sort(unique(f_set))
                    f_val <- paste(f_set, collapse=sep)
                } else {
                    f_val <- NA
                }
                tmp_df[, f] <- f_val
            }

            # Sum numeric fields
            for (f in num_fields) {
                f_set <- na.omit(data[[f]][idx])
                if (length(f_set) > 0) {
                    f_val <- sum(f_set)
                } else {
                    f_val <- NA
                }
                tmp_df[, f] <- f_val
            }

            # Select sequence fields with fewest Ns
            for (f in seq_fields) {
                f_set <- na.omit(data[[f]][idx])
                if (length(f_set) > 0) {
                    f_len <- .informativeLength(f_set)
                    f_val <- f_set[which.max(f_len)]
                } else {
                    f_val <- NA
                }
                tmp_df[, f] <- f_val
            }

        }

        # Add row to unique list
        unique_list <- c(unique_list, list(tmp_df))
    }

    # Combine all rows into unique data.frame
    unique_df <- as.data.frame(bind_rows(unique_list))
    
    if (verbose) { .printVerbose(nseq, nrow(unique_df), discard_count) }
    return(unique_df)
}


#### Transformation functions ####

#' Translate nucleotide sequences to amino acids
#' 
#' \code{translateDNA} translates nucleotide sequences to amino acid sequences.
#' 
#' @param   seq     vector of strings defining DNA sequence(s) to be converted to translated.
#' @param   trim    boolean flag to remove 3 nts from both ends of seq
#'          (converts IMGT junction to CDR3 region).
#' 
#' @return  A vector of translated sequence strings.
#' 
#' @seealso  \code{\link[seqinr]{translate}}.
#' 
#' @examples
#' # Translate a single sequence
#' translateDNA("ACTGACTCGA")
#'
#' # Translate a vector of sequences
#' translateDNA(ExampleDb$junction[1:3])
#' 
#' # Remove the first and last codon from the translation
#' translateDNA(ExampleDb$junction[1:3], trim=TRUE)
#' 
#' @export
translateDNA <- function (seq, trim=FALSE) {
  # Function to translate a single string
  .translate <- function(x) {
    if (stri_length(x) >= 3 & !is.na(x)) {
      stri_join(seqinr::translate(unlist(strsplit(x, "")), ambiguous=TRUE), 
                collapse="")
    } else {
      NA
    }
  }
  
  # Remove 3 nucleotides from each end
  # Eg,  "ACTGACTCGA" -> "GACT" (with "ACT" and "CGA" removed)
  if (trim) { seq <- substr(seq, 4, stri_length(seq) - 3) }
  
  # Replace gaps with N
  seq <- gsub("[-.]", "N", seq)
  
  # Apply translation
  aa <- sapply(seq, .translate, USE.NAMES=FALSE)
  
  return(aa)
}


#' Masks gap characters in DNA sequences
#' 
#' \code{maskSeqGaps} substitutes gap characters, \code{c("-", ".")}, with \code{"N"} 
#' in a vector of DNA sequences.
#'
#' @param    seq         character vector of DNA sequence strings.
#' @param    mask_char   character to use for masking.
#' @param    outer_only  if \code{TRUE} replace only contiguous leading and trailing gaps;
#'                       if \code{FALSE} replace all gap characters.
#'                       
#' @return   A modified \code{seq} vector with \code{"N"} in place of \code{c("-", ".")} 
#'           characters.
#' 
#' @seealso  See \link{maskSeqEnds} for masking ragged edges.
#'           
#' @examples
#' # Mask with Ns
#' maskSeqGaps(c("ATG-C", "CC..C"))
#' maskSeqGaps("--ATG-C-")
#' maskSeqGaps("--ATG-C-", outer_only=TRUE)
#' 
#' # Mask with dashes
#' maskSeqGaps(c("ATG-C", "CC..C"), mask_char="-")
#' 
#' @export
maskSeqGaps <- function(seq, mask_char="N", outer_only=FALSE) {
  if (outer_only) {
    for (i in 1:length(seq)) {
      head_match <- attr(regexpr("^[-\\.]+", seq[i]), "match.length")
      tail_match <- attr(regexpr("[-\\.]+$", seq[i]), "match.length")
      if (head_match > 0) { 
        seq[i] <- gsub("^[-\\.]+", 
                       paste(rep(mask_char, head_match), collapse=""), 
                       seq[i]) 
      }
      if (tail_match > 0) { 
        seq[i] <- gsub("[-\\.]+$", 
                       paste(rep(mask_char, tail_match), collapse=""), 
                       seq[i]) 
      }
    }
  } else {
    seq <- gsub("[-\\.]", mask_char, seq)
  }
  
  return(seq)
}


#' Masks ragged leading and trailing edges of aligned DNA sequences
#' 
#' \code{maskSeqEnds} takes a vector of DNA sequences, as character strings,
#' and replaces the leading and trailing characters with \code{"N"} characters to create 
#' a sequence vector with uniformly masked outer sequence segments.
#' 
#' @param    seq        character vector of DNA sequence strings.
#' @param    mask_char  character to use for masking.
#' @param    max_mask   the maximum number of characters to mask. If set to 0 then
#'                      no masking will be performed. If set to \code{NULL} then the upper 
#'                      masking bound will be automatically determined from the maximum 
#'                      number of observed leading or trailing \code{"N"} characters amongst 
#'                      all strings in \code{seq}. 
#' @param    trim       if \code{TRUE} leading and trailing characters will be cut rather 
#'                      than masked with \code{"N"} characters.
#' @return   A modified \code{seq} vector with masked (or optionally trimmed) sequences.
#' 
#' @seealso  See \link{maskSeqGaps} for masking internal gaps.
#'           See \link{padSeqEnds} for padding sequence of unequal length.
#' 
#' @examples
#' # Default behavior uniformly masks ragged ends
#' seq <- c("CCCCTGGG", "NAACTGGN", "NNNCTGNN")
#' maskSeqEnds(seq)
#'
#' # Does nothing
#' maskSeqEnds(seq, max_mask=0)
#' 
#' # Cut ragged sequence ends
#' maskSeqEnds(seq, trim=TRUE)
#'
#' # Set max_mask to limit extent of masking and trimming
#' maskSeqEnds(seq, max_mask=1)
#' maskSeqEnds(seq, max_mask=1, trim=TRUE)
#' 
#' # Mask dashes instead of Ns
#' seq <- c("CCCCTGGG", "-AACTGG-", "---CTG--")
#' maskSeqEnds(seq, mask_char="-")
#' 
#' @export
maskSeqEnds <- function(seq, mask_char="N", max_mask=NULL, trim=FALSE) {
  # Find length of leading and trailing Ns
  left_lengths <- attr(regexpr(paste0("(^", mask_char, "*)"), seq, perl=T), "capture.length")
  right_lengths <- attr(regexpr(paste0("(", mask_char, "*$)"), seq, perl=T), "capture.length")
  
  # Mask to minimal inner sequence length
  left_mask <- min(max(left_lengths[, 1]), max_mask)
  right_mask <- min(max(right_lengths[, 1]), max_mask)
  seq_lengths <- stri_length(seq)
  if (trim) {
    seq <- substr(seq, left_mask + 1, seq_lengths - right_mask)
  } else {
    substr(seq, 0, left_mask) <- paste(rep(mask_char, left_mask), collapse='')
    substr(seq, seq_lengths - right_mask + 1, seq_lengths + 1) <- 
      paste(rep(mask_char, right_mask), collapse='')
  }
  
  return(seq)
}


#' Pads ragged ends of aligned DNA sequences
#' 
#' \code{padSeqEnds} takes a vector of DNA sequences, as character strings,
#' and appends the ends of each sequence with an appropriate number of \code{"N"} 
#' characters to create a sequence vector with uniform lengths.
#' 
#' @param    seq       character vector of DNA sequence strings.
#' @param    len       length to pad to. Only applies if longer than the maximum length of
#'                     the data in \code{seq}.
#' @param    start     if \code{TRUE} pad the beginning of each sequence instead of the end. 
#' @param    pad_char  character to use for padding.
#' @param    mod3      if \code{TRUE} pad sequences to be of length multiple three.
#' 
#' @return   A modified \code{seq} vector with padded sequences.
#' 
#' @seealso  See \link{maskSeqEnds} for creating uniform masking from existing masking.
#' 
#' @examples
#' # Default behavior uniformly pads ragged ends
#' seq <- c("CCCCTGGG", "ACCCTG", "CCCC")
#' padSeqEnds(seq)
#'
#' # Pad to fixed length
#' padSeqEnds(seq, len=15)
#'
#' # Add padding to the beginning of the sequences instead of the ends
#' padSeqEnds(seq, start=TRUE)
#' padSeqEnds(seq, len=15, start=TRUE)
#' 
#' @export
padSeqEnds <- function(seq, len=NULL, start=FALSE, pad_char="N", mod3=TRUE) {
  # Set length to max input length
  width <- max(stringi::stri_length(seq),len)
  if (mod3 && width %% 3 != 0) {
    width <- width + (3 - width %% 3)
  }
  
  # Pad
  if (!start) { 
    seq <- stringi::stri_pad_right(seq, width=width, pad=pad_char)
  } else {
    seq <- stringi::stri_pad_left(seq, width=width, pad=pad_char)
  }
  
  return(seq)
}


#### Subregion functions ####

#' Extracts FWRs and CDRs from IMGT-gapped sequences
#' 
#' \code{extractVRegion} extracts the framework and complementarity determining regions of 
#' the V segment for IMGT-gapped immunoglobulin (Ig) nucleotide sequences according to the 
#' IMGT numbering scheme.
#'
#' @param     sequences  character vector of IMGT-gapped nucleotide sequences.
#' @param     region     string defining the region(s) of the V segment to extract. 
#'                       May be a single region or multiple regions (as a vector) from
#'                       \code{c("fwr1", "cdr1", "fwr2", "cdr2" ,"fwr3")}.  By default, all
#'                       regions will be returned.
#'                       
#' @return    If only one region is specified in the \code{region} argument, a character 
#'            vector of the extracted sub-sequences will be returned. If multiple regions 
#'            are specified, then a character matrix will be returned with columns 
#'            corresponding to the specified regions and a row for each entry in 
#'            \code{sequences}.
#' 
#' @seealso   IMGT-gapped region boundaries are defined in \link{IMGT_REGIONS}.
#' 
#' @references
#' \enumerate{
#'   \item  Lefranc M-P, et al. IMGT unique numbering for immunoglobulin and T cell 
#'            receptor variable domains and Ig superfamily V-like domains.
#'            Dev Comp Immunol. 2003 27(1):55-77.
#' }
#' 
#' @examples
#' # Assign example clone
#' clone <- subset(ExampleDb, clone_id == 3138)
#'
#' # Get all regions
#' extractVRegion(clone$sequence_alignment)
#' 
#' # Get single region
#' extractVRegion(clone$sequence_alignment, "fwr1")
#' 
#' # Get all CDRs
#' extractVRegion(clone$sequence_alignment, c("cdr1", "cdr2"))
#' 
#' # Get all FWRs
#' extractVRegion(clone$sequence_alignment, c("fwr1", "fwr2", "fwr3"))
#'
#' @export
extractVRegion <- function(sequences, region=c("fwr1", "cdr1", "fwr2", "cdr2", "fwr3")) {
    # Check region argument
    region <- match.arg(region, several.ok=TRUE)
    
    if (length(region) == 1) {
        sub_sequences <- substr(sequences, 
                                IMGT_REGIONS[[region]][1], 
                                IMGT_REGIONS[[region]][2])
    } else {
        sub_sequences <- sapply(region, function(x) substr(sequences, 
                                                           IMGT_REGIONS[[x]][1], 
                                                           IMGT_REGIONS[[x]][2]))
    }
    
    return(sub_sequences)
}


#' Calculate junction region alignment properties
#'
#' \code{junctionAlignment} determines the number of deleted germline nucleotides in the 
#' junction region and the number of V gene and J gene nucleotides in the CDR3.
#'
#' @param   data                \code{data.frame} containing sequence data.
#' @param   germline_db         reference germline database for the V, D and J genes.
#'                              in \code{data}
#' @param   v_call              V gene assignment column.
#' @param   d_call              D gene assignment column.
#' @param   j_call              J gene assignment column.
#' @param   v_germline_start    column containing the start position of the alignment 
#'                              in the V reference germline.
#' @param   v_germline_end      column containing the end position of the alignment in the 
#'                              V reference germline.
#' @param   d_germline_start    column containing the start position of the alignment 
#'                              in the D reference germline.
#' @param   d_germline_end      column containing the start position of the alignment 
#'                              in the D reference germline.
#' @param   j_germline_start    column containing the start position of the alignment 
#'                              in the J reference germline.
#' @param   j_germline_end      column containing the start position of the alignment 
#'                              in the J reference germline.
#' @param   np1_length          combined length of the N and P regions between the 
#'                              V and D regions (heavy chain) or V and J regions (light chain).      
#' @param   np2_length          combined length of the N and P regions between the 
#'                              D and J regions (heavy chain).            
#' @param   junction            column containing the junction sequence.
#' @param   junction_length     column containing the length of the junction region in nucleotides.
#' @param   sequence_alignment  column containing the aligned sequence.
#' 
#' @return  A modified input \code{data.frame} with the following additional columns storing 
#'          junction alignment information:
#'          \enumerate{
#'              \item  \code{e3v_length}:     number of 3' V germline nucleotides deleted.
#'              \item  \code{e5d_length}:     number of 5' D germline nucleotides deleted.
#'              \item  \code{e3d_length}:     number of 3' D germline nucleotides deleted.
#'              \item  \code{e5j_length}:     number of 5' J germline nucleotides deleted.
#'              \item  \code{v_cdr3_length}:  number of sequence_alignment V nucleotides in the CDR3.
#'              \item  \code{j_cdr3_length}:  number of sequence_alignment J nucleotides in the CDR3.
#'          }
#' 
#' @examples
#' germline_db <- list(
#' "IGHV3-11*05"="CAGGTGCAGCTGGTGGAGTCTGGGGGA...GGCTTGGTCAAGCCTGGAGGGTCCCTGAGACT
#' CTCCTGTGCAGCCTCTGGATTCACCTTC............AGTGACTACTACATGAGCTGGATCCGCCAGGCTCCAG
#' GGAAGGGGCTGGAGTGGGTTTCATACATTAGTAGTAGT......AGTAGTTACACAAACTACGCAGACTCTGTGAAG
#' ...GGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGA
#' CACGGCCGTGTATTACTGTGCGAGAGA",
#' "IGHD3-10*01"="GTATTACTATGGTTCGGGGAGTTATTATAAC",
#' "IGHJ5*02"="ACAACTGGTTCGACCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG"
#' )
#' 
#' db <- junctionAlignment(SingleDb, germline_db)
#'
#' @export
junctionAlignment <- function(data, germline_db, 
                              v_call="v_call",
                              d_call="d_call",
                              j_call="j_call",
                              v_germline_start="v_germline_start",
                              v_germline_end="v_germline_end",
                              d_germline_start="d_germline_start",
                              d_germline_end="d_germline_end",
                              j_germline_start="j_germline_start",
                              j_germline_end="j_germline_end",
                              np1_length="np1_length",
                              np2_length="np2_length",
                              junction="junction",
                              junction_length="junction_length",
                              sequence_alignment="sequence_alignment") {
  
  # Check input
  check <- checkColumns(data, 
                        c(v_call, d_call, j_call, 
                          v_germline_start, v_germline_end,
                          d_germline_start, d_germline_end,
                          j_germline_start, j_germline_end,
                          np1_length, np2_length,
                          junction, junction_length,
                          sequence_alignment))
  if (check != TRUE) { stop(check) }
  
  # Get deletions
  for (i in 1:nrow(data))  {
    v_dels <- countDeleted(data[i,], 
                           allele_call=v_call, germline_start=v_germline_start, germline_end=v_germline_end, 
                           germline_db=germline_db, junction=junction, junction_length=junction_length, 
                           sequence_alignment=sequence_alignment)
    d_dels <- countDeleted(data[i,], 
                           allele_call=d_call, germline_start=d_germline_start, germline_end=d_germline_end, 
                           germline_db=germline_db, junction=junction, junction_length=junction_length, 
                           sequence_alignment=sequence_alignment)
    j_dels <- countDeleted(data[i,], 
                           allele_call=j_call, germline_start=j_germline_start, germline_end=j_germline_end, 
                           germline_db=germline_db, junction=junction, junction_length=junction_length, 
                           sequence_alignment=sequence_alignment)
    data[['e3v_length']][i] <- v_dels[2]
    data[['e5d_length']][i] <- d_dels[1]
    data[['e3d_length']][i] <- d_dels[2]
    data[['e5j_length']][i] <- j_dels[1]
    data[['v_cdr3_length']][i] <- v_dels[3]
    data[['j_cdr3_length']][i] <- j_dels[3]
  }
  
  return(data)
}

# Junction alignment helper
#
# Report the number of deleted germline nucleotides in the alignment
#
# @param    db_row               one row from a Rearrangement database.
# @param    allele_call          column containing gene assignments.
# @param    germline_start       column containing the start position of the alignment in the reference germline.
# @param    germline_end         column containing the end position of the alignment in the reference germline.
# @param    germline_db          reference germline database for the V, D and J genes.
# @param    junction             column containing the junction sequence.
# @param    junction_length      column containing the length of the  junction region in nucleotides.
# @param    sequence_alignment   column containing the aligned sequence.
# 
# @return   Alignment deletions
countDeleted <- function(db_row, allele_call, germline_start, germline_end, 
                         germline_db, junction, junction_length,
                         sequence_alignment) {
    # db_row: one row from data
    # allele_call: one of v,d,j
    # germline_db: the reference germline database used to assign genes. 
    allele <- getAllele(db_row[[allele_call]], first=T)
    deleted <- c(NA, NA, NA)
    
    # Check for valid allele information
    if (is.na(allele)) { 
        return(deleted) 
    }
    # Check for allele in reference germlines
    tryCatch(germline <- germline_db[[allele]],
             error=function(e) { stop(allele, " not found in germline_db.") })
    
    allele_germline_start <- as.numeric(db_row[[germline_start]])
    allele_germline_end <- as.numeric(db_row[[germline_end]])
    
    germline_head <- stringi::stri_sub(germline, 1, allele_germline_start - 1)
    deleted_head <- nchar(gsub("\\.", "", germline_head))
    
    germline_tail <- stringi::stri_sub(germline, allele_germline_end+1, nchar(germline))
    deleted_tail <- nchar(gsub("\\.", "", germline_tail))
    
    deleted[1] <- deleted_head
    deleted[2] <- deleted_tail
    
    if (is.na(db_row[[junction]])) {
        warning("NA junction found.")
        return (deleted)
    }
    if (!db_row[[junction_length]]>6) {
        message("Junction length <= 6.")
        return (deleted)
    }
    
    junction_len <- db_row[[junction_length]]
    junction_start <- 310
    # junction_end <- junction_start + junction_len - 1
    
    # get aligned junction end (counting gaps)
    seq_aln <- s2c(db_row[[sequence_alignment]]) != "-"
    seq_aln[1:junction_start-1] <- 0
    junction_end <- which(cumsum(seq_aln[1:length(seq_aln)]) > junction_len)[1] - 1
    
    # For V and J alleles, calculate number of nt in the CDR3
    germ_cdr3_length <- NA
    if (grepl("[Vv]", allele)) {
        last_cdr3_pre_np <- db_row[[germline_end]] - db_row[[germline_start]] + 1 
        first_cdr3_pre_np <- junction_start + 3   # without conserved 
        # len <- last_cdr3_pre_np - first_cdr3_pre_np + 1
        #germ_seq <- stringi::stri_sub(germline, db_row[[germline_end]]+1-len, db_row[[germline_end]] )
        germ_seq <- stringi::stri_sub(db_row[[sequence_alignment]], first_cdr3_pre_np, last_cdr3_pre_np )
        germ_cdr3_length <- nchar(gsub("[\\.-]", "", germ_seq))
    } else if (grepl("[Jj]", allele))  {
        j_aln_len <- db_row[[germline_end]] - db_row[[germline_start]] + 1 
        # germ_seq <- stringi::stri_sub(germline, db_row[[germline_start]], db_row[[germline_end]]-j_tail)
        germ_seq <- stringi::stri_sub(db_row[[sequence_alignment]], 
                                      nchar(db_row[[sequence_alignment]]) - j_aln_len + 1,
                                      junction_end - 3)
        germ_cdr3_length <- nchar(gsub("-", "", germ_seq))
    } 
    
    deleted <- c(deleted_head, deleted_tail, germ_cdr3_length)
    return(deleted)
}


#### Rcpp wrappers ####

#' Calculate distance between two sequences
#' 
#' \code{seqDist} calculates the distance between two DNA sequences.
#'
#' @param    seq1      character string containing a DNA sequence.
#' @param    seq2      character string containing a DNA sequence.
#' @param    dist_mat  Character distance matrix. Defaults to a Hamming distance 
#'                     matrix returned by \link{getDNAMatrix}. If gap 
#'                     characters, \code{c("-", ".")}, are assigned a value of -1 
#'                     in \code{dist_mat} then contiguous gaps of any run length,
#'                     which are not present in both sequences, will be counted as a 
#'                     distance of 1. Meaning, indels of any length will increase
#'                     the sequence distance by 1. Gap values other than -1 will 
#'                     return a distance that does not consider indels as a special case.
#'
#' @return   Numerical distance between \code{seq1} and \code{seq2}.
#' 
#' @seealso  Nucleotide distance matrix may be built with 
#'           \link{getDNAMatrix}. Amino acid distance matrix may be built
#'           with \link{getAAMatrix}. Used by \link{pairwiseDist} for generating
#'           distance matrices. See \link{seqEqual} for testing sequence equivalence.
#'           
#' @examples
#' # Ungapped examples
#' seqDist("ATGGC", "ATGGG")
#' seqDist("ATGGC", "ATG??")
#' 
#' # Gaps will be treated as Ns with a gap=0 distance matrix
#' seqDist("ATGGC", "AT--C", dist_mat=getDNAMatrix(gap=0))
#' 
#' # Gaps will be treated as universally non-matching characters with gap=1
#' seqDist("ATGGC", "AT--C", dist_mat=getDNAMatrix(gap=1))
#' 
#' # Gaps of any length will be treated as single mismatches with a gap=-1 distance matrix
#' seqDist("ATGGC", "AT--C", dist_mat=getDNAMatrix(gap=-1))
#' 
#' # Gaps of equivalent run lengths are not counted as gaps
#' seqDist("ATG-C", "ATG-C", dist_mat=getDNAMatrix(gap=-1))
#'
#' # Overlapping runs of gap characters are counted as a single gap
#' seqDist("ATG-C", "AT--C", dist_mat=getDNAMatrix(gap=-1))
#' seqDist("A-GGC", "AT--C", dist_mat=getDNAMatrix(gap=-1))
#' seqDist("AT--C", "AT--C", dist_mat=getDNAMatrix(gap=-1))
#' 
#' # Discontiguous runs of gap characters each count as separate gaps
#' seqDist("-TGGC", "AT--C", dist_mat=getDNAMatrix(gap=-1))
#' 
#' @export
seqDist <- function(seq1, seq2, dist_mat=getDNAMatrix()) {
    seqDistRcpp(seq1, seq2, dist_mat)
}


#' Calculate pairwise distances between sequences
#' 
#' \code{pairwiseDist} calculates all pairwise distance between a set of sequences.
#'
#' @param    seq       character vector containing a DNA sequences.
#' @param    dist_mat  Character distance matrix. Defaults to a Hamming distance 
#'                     matrix returned by \link{getDNAMatrix}. If gap 
#'                     characters, \code{c("-", ".")}, are assigned a value of -1 
#'                     in \code{dist_mat} then contiguous gaps of any run length,
#'                     which are not present in both sequences, will be counted as a 
#'                     distance of 1. Meaning, indels of any length will increase
#'                     the sequence distance by 1. Gap values other than -1 will 
#'                     return a distance that does not consider indels as a special case.
#'
#' @return   A matrix of numerical distance between each entry in \code{seq}. 
#'           If \code{seq} is a named vector, row and columns names will be added 
#'           accordingly.
#' 
#'           Amino acid distance matrix may be built with \link{getAAMatrix}. 
#'           Uses \link{seqDist} for calculating distances between pairs.
#'           See \link{pairwiseEqual} for generating an equivalence matrix.
#'           
#' @examples
#' # Gaps will be treated as Ns with a gap=0 distance matrix
#' pairwiseDist(c(A="ATGGC", B="ATGGG", C="ATGGG", D="AT--C"), 
#'              dist_mat=getDNAMatrix(gap=0))
#' 
#' # Gaps will be treated as universally non-matching characters with gap=1
#' pairwiseDist(c(A="ATGGC", B="ATGGG", C="ATGGG", D="AT--C"), 
#'              dist_mat=getDNAMatrix(gap=1))
#' 
#' # Gaps of any length will be treated as single mismatches with a gap=-1 distance matrix
#' pairwiseDist(c(A="ATGGC", B="ATGGG", C="ATGGG", D="AT--C"), 
#'              dist_mat=getDNAMatrix(gap=-1))
#' 
#' @export
pairwiseDist <- function(seq, dist_mat=getDNAMatrix()) {
    pairwiseDistRcpp(seq, dist_mat)
}


#' Calculate pairwise distances between sequences
#' 
#' \code{nonsquareDist} calculates all pairwise distance between a set of sequences and a subset of it.
#'
#' @param    seq       character vector containing a DNA sequences. The sequence vector needs to
#'                     be named.
#' @param    indx      numeric vector contating the indices (a subset of indices of \code{seq}).
#' @param    dist_mat  Character distance matrix. Defaults to a Hamming distance 
#'                     matrix returned by \link{getDNAMatrix}. If gap 
#'                     characters, \code{c("-", ".")}, are assigned a value of -1 
#'                     in \code{dist_mat} then contiguous gaps of any run length,
#'                     which are not present in both sequences, will be counted as a 
#'                     distance of 1. Meaning, indels of any length will increase
#'                     the sequence distance by 1. Gap values other than -1 will 
#'                     return a distance that does not consider indels as a special case.
#'
#' @return   A matrix of numerical distance between each entry in \code{seq} and 
#'           sequences specified by \code{indx} indices. 
#'           
#'           Note that the input subsampled indices will be ordered ascendingly. Therefore, 
#'           it is necassary to assign unique names to the input sequences, \code{seq}, 
#'           to recover the input order later. Row and columns names will be added accordingly.
#' 
#'           Amino acid distance matrix may be built with \link{getAAMatrix}. 
#'           Uses \link{seqDist} for calculating distances between pairs.
#'           See \link{pairwiseEqual} for generating an equivalence matrix.
#'           
#' @examples
#' # Gaps will be treated as Ns with a gap=0 distance matrix
#' seq <- c(A="ATGGC", B="ATGGG", C="ATGGG", D="AT--C")
#' pairwiseDist(seq, 
#'              dist_mat=getDNAMatrix(gap=0))
#' 
#' nonsquareDist(seq, indx=c(1,3), 
#'              dist_mat=getDNAMatrix(gap=0))
#' 
#' @export
nonsquareDist <- function(seq, indx, dist_mat=getDNAMatrix()) {
    nonsquareDistRcpp(seq, indx, dist_mat)
}