R/bagpipe.R

Defines functions bagpipe

Documented in bagpipe

#' @useDynLib alphabetr
#' @importFrom Rcpp sourceCpp
NULL

#' Identify candidate alpha/beta pairs.
#'
#' \code{bagpipe()} uses the alphabetr resampling procedure on sequencing data
#'   to identify candidate alpha/beta pairs. The procedure takes a subsample of
#'   the data without replacement, calculates association scores with
#'   \code{\link{chain_scores}}, and then for each well, uses the Hungarian
#'   algorithm to determine the most likely pairings for the chains found in the
#'   well. Each time this is done is a replicate, and the number of replicates is
#'   specified as an option. A threshold is then used to filter the candidate
#'   pairs that appear in proportion of the replicates larger than the threshold,
#'   resulting in the final list of candidate pairs. Bagpipe is an acronym for
#'   \strong{b}ootstrapping \strong{a}lphabetr \strong{g}enerated
#'   \strong{p}a\strong{i}rs \strong{p}rocedur\strong{e} (based on older versions
#'   that utilized bootstrapping)
#'
#' @param alpha Matrix recording which alpha chains appear in each well of the
#'    data. See \code{\link{create_data}}.
#' @param beta Matrix recording which beta chains appear in the each well of the
#'    data. See \code{\link{create_data}}.
#' @param replicates The number of times the resampling procedure is repeated,
#'    i.e. the number of replicates. At least 100 replicates is recommended.
#' @param frac The fraction of the wells resampled in each replicate. Default
#'    is 75\% of the wells
#' @param bootstrap Legacy option. Calls a bootstrapping strategy (which
#'    resamples with replacement) instead of sampling a subset without
#'    replacement.
#'
#' @return A n x 5 matrix where n is the number of clones determined by
#'   \code{bagpipe()}. Each row represents the chains of the clone.
#'   Columns 1 and 2 represent the beta chain indices of the clone.
#'   Columns 3 and 4 represent the alpha chain indices of the clone.
#'   Column 5 represents the number of replicates that this clone was found in.
#'   Column 5 is used to filter our the clones that have not be determined by
#'   a certain "threshold" proportion of replicates.
#'
#'   Note that column 1 == column 2 and column 3 == column 4 since
#'   \code{bagpipe()} does not try to determine dual-alpha or dual-beta clones.
#'
#' @examples
#'  # see the help for create_clones() and create_data()
#'  clones <- create_clones(numb_beta = 1000,
#'                          dual_alpha = .3,
#'                          dual_beta = .06,
#'                          alpha_sharing = c(0.80, 0.15, 0.05),
#'                          beta_sharing  = c(0.75, 0.20, 0.05))
#'  dat <- create_data(clones$TCR, plate = 5,
#'                     error_drop = c(.15, .01),
#'                     error_seq  = c(.05, .001),
#'                     error_mode = c("lognormal", "lognormal"),
#'                     skewed = 10,
#'                     prop_top = 0.6,
#'                     dist = "linear",
#'                     numb_cells = matrix(c(50, 480), ncol = 2))
#'
#'  \dontrun{
#'  # normally want to set replicates to 100 or more
#'  pairs <- bagpipe(alpha = dat$alpha,
#'                beta  = dat$beta,
#'                replicates = 5,
#'                frac = 0.75,
#'                bootstrap = FALSE)
#'
#'  # using a threshold of 0.3 of replicates
#'  pairs <- pairs[pairs[, 5] > 0.3, ]
#'  }
#' @export

bagpipe <- function(alpha, beta, replicates = 100, frac = 0.75, bootstrap = FALSE) {
  # If user asks for bootstrapping, then the resampled size must be equal
  # to the original sample size
  if (bootstrap == TRUE) frac <- 1.0
  # Ensure that there are the same number of wells in the alpha and beta data
  if (nrow(alpha) != nrow(beta))
    stop("Different number of wells in the alpha and beta data inputs.")
  if (replicates < 1) stop("The replicates arguments need to be at least 1.")

  numb_plates <- nrow(alpha) / 96
  size_rep    <- floor(frac * nrow(alpha))  # number of wells to sample

  # for CRAN checks
  list_pair1 <- list_pair2 <- NULL

  for (plat in 1:replicates) {
    # choose a random subset of the wells of the data without replacement
    # (unless the user wants to bootstrap and use replacement)
    ind <- sample(1:(96 * numb_plates), size_rep, replace = bootstrap)
    data_alph <- alpha[ind, ]
    data_beta <- beta[ind, ]

    # calculate association scores between
    pref <- chain_scores(data_a = data_alph, data_b = data_beta)

    # number of alpha/beta chains
    numb_alph  <- ncol(data_alph) # columns of data_alph is the number of alpha chains
    numb_beta  <- ncol(data_beta) # columns of data_beta is the number of beta chains

    # matrices of scores
    score_alph  <- pref$ascores   # matrices of scores that alpha (row i) has for beta (col j)
    score_beta  <- pref$bscores   # matrices of scores that beta (row i) has for beta (col j)
    rm(pref)

    # This section loops through each well, looks at the alpha and beta chains
    # in the well and use the Hungarian algorithm to choose the set of one-to-
    # one beta-alpha pairings that maximizes the sum of the association scores

    # Entry (i, j) of track_pairs keeps track the number of wells beta_i and
    # alpha_j are chosen in; a threshold is then calculated to filter out the
    # beta_i/alpha_j pairs that are most likely not true pairs
    track_pairs <- matrix(0, nrow = numb_beta, ncol = numb_alph)
    #track_assign <- list(length = nrow(data_beta))
    score_mat <- score_beta + t(score_alph)
    for (well in 1:nrow(data_beta)) {
      # find the alpha and beta chain indices in the well
      ind_alph <- which(data_alph[well, ] == 1)
      ind_beta <- which(data_beta[well, ] == 1)
      # Hungarian algorithm is posed differently depending on whether there are
      # more alpha or beta chains in the well
      if (length(ind_beta) <= length(ind_alph)) {
        # use the hungarian algorithm to obtain the beta_i/alpha_j pairings that
        # maximize the sum of the scores for the well
        assign <- clue::solve_LSAP(score_mat[ind_beta, ind_alph, drop = FALSE],
                                   maximum = TRUE)
        assign <- matrix(c(ind_beta, ind_alph[assign]), ncol = 2)
        #track_assign[[well]] <- assign
        track_pairs[assign] <- track_pairs[assign] + 1
      } else {
        assign <- clue::solve_LSAP(t(score_mat[ind_beta, ind_alph, drop=FALSE]),
                                   maximum = TRUE)
        assign <- matrix(c(ind_beta[assign], ind_alph), ncol = 2)
        #track_assign[[well]] <- assign
        track_pairs[assign] <- track_pairs[assign] + 1
      }
    }

    # Determine a threshold for the # number of wells that a candidate pair must
    # be picked in in order to make the cut
    threshold <- mean(track_pairs[track_pairs > 0])

    # arr.ind = TRUE will return a n by 2 matrix, which col 1 is the beta index
    # and col 2 is the alpha index; finding the beta, alpha pairs that are above
    # the threshold
    app <- which(track_pairs > threshold, arr.ind = TRUE)
    app <- app[order(app[, 1]), ]

    # Sorting the resulting candidate pairs into a list pair where the ith
    # element of the list contains the alpha indices pairing with beta_i
    list_pair <- list(length = numb_beta)
    for (i in 1:numb_beta) {
      # find the candidate pairs with the beta index and save those alpha
      # indices to the ith element of the list
      ind_beta <- which(app[, 1] == i)
      list_pair[[i]] <- unique(as.vector(app[ind_beta, 2]))
    } # end for i

    # recording the beta_i\alpha_j pairs from the output of the replicate to the
    # variable "list_pairX" where X is the X_th replicate
    result_name <- paste("list_pair", plat, sep = "")
    assign(result_name, list_pair)
  } # end for - plat

  # combine all the results of the replicates into one master list
  # The vector of the i_th element of list_master represents the indices of the
  # alpha chains paired with beta_i
  list_master <- mapply(c, list_pair1, list_pair2, SIMPLIFY = FALSE)
  rm(list_pair1, list_pair2)
  for (i in 3:replicates) {
    list_name   <- paste("list_pair", i, sep = "")
    list_master <- mapply(c, list_master, eval(parse(text = list_name)),
                          SIMPLIFY = FALSE)
    rm(list = list_name)
  }

  # determine the number of candidate pairs by adding up all the number of
  # unique alpha chain indices paired with each beta chain
  cand_pairs <- sum(sapply(lapply(list_master, unique), length))

  jack_results <- matrix(0, nrow = cand_pairs, ncol = 3)
  ind <- 1
  for (ind_beta in 1:length(list_master)) {
    if (length(list_master[[ind_beta]]) > 0) {
      alphas <- table(list_master[[ind_beta]])
      indices <- as.numeric(names(alphas))
      prop_replicates  <- as.vector(alphas) / replicates
      i1 <- ind
      i2 <- ind + length(unique(indices)) - 1
      jack_results[i1:i2, 1] <- ind_beta
      jack_results[i1:i2, 2] <- indices
      jack_results[i1:i2, 3] <- prop_replicates
      ind <- ind + length(unique(indices))
    }
  }

  jack_results <- jack_results[, c(1, 1, 2, 2, 3), drop = FALSE]
  colnames(jack_results) <- c("beta1", "beta2", "alpha1", "alpha2", "prop_replicates")
  jack_results
} # end function
EdwardSLee/alphabetr documentation built on May 6, 2019, 3:12 p.m.