R/alignSeq.R

Defines functions alignSeq

Documented in alignSeq

#' Align multiple sequences
#'
#' Perform multiple sequence alignment using one of three methods and output
#' results to the console or as a pdf file. One may perform the alignment of all
#' amino acid or nucleotide sequences in a single repertoire_id. Alternatively,
#' one may search for a given sequence within a list of samples using an edit
#' distance threshold.
#'
#' @param study_table A tibble consisting of antigen receptor sequences
#' imported by the LymphoSeq2 function [readImmunoSeq()].
#' @param repertoire_ids A character vector indicating the name of the
#' repertoire_id in the productive sequence list.
#' @param sequence_list A character vector of one ore more amino acid or
#' nucleotide CDR3 sequences to search.
#' @param edit_distance An integer giving the minimum edit distance that the
#' sequence must be less than or equal to.  See details below.
#' @param type A character vector indicating whether "junction_aa" or "junction"
#' sequences should be aligned.  If "junction_aa" is specified, then run
#' [productiveSeq()] first.
#' @param method A character vector indicating the multiple sequence alignment
#' method to be used.  Refer to the Bioconductor "msa" package for more details.
#' Options include "ClustalW", "ClustalOmega", and "Muscle".
#' @param top The number of top sequences to perform alignment.
#' @details Edit distance is a way of quantifying how dissimilar two sequences
#' are to one another by counting the minimum number of operations required to
#' transform one sequence into the other.  For example, an edit distance of 0
#' means the sequences are identical and an edit distance of 1 indicates that
#' the sequences different by a single amino acid or junction.
#' @return Performs a multiple sequence alignment object.
#' @seealso If having trouble saving pdf files, refer to Bioconductor package
#' msa for installation instructions
#' \url{http://bioconductor.org/packages/release/bioc/vignettes/msa/inst/doc/msa.pdf}
#' @examples
#' file_path <- system.file("extdata", "IGH_sequencing", package = "LymphoSeq2")
#' study_table <- LymphoSeq2::readImmunoSeq(path = file_path, threads = 1)
#' study_table <- LymphoSeq2::topSeqs(study_table, top = 100)
#' nucleotide_table <- LymphoSeq2::productiveSeq(study_table, aggregate = "junction")
#' LymphoSeq2::alignSeq(nucleotide_table,
#'   repertoire_ids = "IGH_MVQ92552A_BL", type = "junction",
#'   method = "ClustalW"
#' )
#' @export
alignSeq <- function(study_table, repertoire_ids = NULL,
                     sequence_list = NULL, edit_distance = 15,
                     type = "junction", method = "ClustalOmega",
                     top = 150) {
  # If the sequence list is not null and repertoire ids are NULL,
  # set repertoire_ids to all samples and align all sequences against
  # sequence list
  if (!is.null(sequence_list) & is.null(repertoire_ids)) {
    search_table <- LymphoSeq2::searchSeq(
      study_table = study_table,
      sequence = sequence_list,
      edit_distance = edit_distance,
      seq_type = type,
      match = "partial"
    )
    repertoire_ids <- study_table |>
      dplyr::pull(repertoire_id) |>
      base::unique()
    if (is.null(search_table)) {
      stop("There are no sequences to be aligned", call. = FALSE)
    }
  } else if (!is.null(sequence_list) & !is.null(repertoire_ids)) {
    study_table <- study_table |>
      dplyr::filter(repertoire_id %in% repertoire_ids)
    search_table <- LymphoSeq2::searchSeq(
      study_table = study_table,
      sequence = sequence_list,
      edit_distance = edit_distance,
      seq_type = type,
      match = "partial"
    )
    if (is.null(search_table)) {
      stop("There are no sequences to be aligned", call. = FALSE)
    }
  } else if (is.null(sequence_list) & !is.null(repertoire_ids)) {
    search_table <- study_table |>
      dplyr::filter(repertoire_id %in% repertoire_ids)
  } else if (is.null(sequence_list) & is.null(repertoire_ids)) {
    search_table <- study_table
    repertoire_ids <- study_table |>
      dplyr::pull(repertoire_id) |>
      base::unique()
  }
  # Select only the top sequences to perform alignment
  if (nrow(search_table) > top) {
    if (is.null(sequence_list)) {
      search_table <- search_table |>
        LymphoSeq2::topSeqs(top = top)
    } else {
      search_table <- search_table |>
        dplyr::filter(!!base::as.symbol(type) %in% searchSequence) |>
        LymphoSeq2::topSeqs(top = top)
      message(str_c("Only 150 sequences sampled equally from each search group",
        "will be selected", sep = " "))
    }
  }
  # Select the string set according to the type provided by the user
  # and create a DNA/AAStringSet with this
  if (type == "junction") {
    search_table <- search_table |>
      dplyr::filter(base::nchar(junction) > 15)
    string_list <- search_table |>
      dplyr::pull(junction) |>
      Biostrings::DNAStringSet()
  } else if (type == "junction_aa") {
    search_table <- search_table |>
      dplyr::filter(base::nchar(junction_aa) > 3)
    string_list <- search_table |>
      dplyr::pull(junction_aa) |>
      Biostrings::AAStringSet()
  }
  # Stop if the number of sequences are less than three
  if (nrow(search_table) < 3) {
    stop("There are less than 3 sequences to be aligned", call. = FALSE)
  }
  # Prepare a string list to name annotate the sequences with repertoire_ids
  # if provided or use gene family names with sequence counts
  if (!is.null(repertoire_ids)) {
    names(string_list) <- search_table |>
      dplyr::mutate(rep_id = stringr::str_c(repertoire_id, dplyr::row_number(),
        sep = "_"
      )) |>
      dplyr::pull(rep_id)
  } else {
    names(string_list) <- search_table |>
      dplyr::mutate(
        seq_name = base::paste(v_family, d_family, j_family,
          duplicate_count,
          sep = "_"
        ),
        seq_name = stringr::str_replace(
          seq_name,
          "IGH|IGL|IGK|TCRB|TCRA", ""
        ),
        seq_name = stringr::str_replace(seq_name, "unresolved", "UNR"),
        seq_name = stringr::str_replace_na(seq_name,
          replacement = "UNR"
        )
      ) |>
      dplyr::pull(seq_name)
  }
  # Perform multiple sequence alignment using the method described by the user
  base::set.seed(12357)
  alignment <- suppressMessages(msa::msa(string_list, method = method))
  return(alignment)
}
shashidhar22/LymphoSeq2 documentation built on Jan. 16, 2024, 4:29 a.m.