R/segments.R
In tcR: Advanced Data Analysis of Immune Receptor Repertoires

Documented in pca.segments pca.segments.2D

########## Statistics and analysis of Variable and Joining genes usage ##########


if (getRversion() >= "2.15.1") {
  utils::globalVariables(c('PC1', 'PC2', "Subject", "n"))
}


#' Gene usage.
#' 
#' @aliases geneUsage
#' 
#' @description 
#' Compute frequencies or counts of gene segments ("V / J - usage").
#' 
#' @param .data Cloneset data frame or a list with clonesets.
#' @param .genes Either one of the gene alphabet (e.g., HUMAN_TRBV, \link{genealphabets}) or list with two gene alphabets for computing 
#' joint distribution.
#' @param .quant Which column to use for the quantity of clonotypes: NA for computing only number of genes without using clonotype counts, 
#' "read.count" for the "Read.count" column, "umi.count" for the "Umi.count" column, "read.prop" for the "Read.proportion" column,
#' "umi.prop" for the "Umi.proportion" column.
#' @param .norm If T then return proportions of resulting counting of genes.
#' @param .ambig If F than remove from counting genes which are not presented in the given gene alphabet(s). 
#' 
#' @return 
#' If \code{.data} is a cloneset and \code{.genes} is NOT a list than return a data frame with first column "Gene" with genes and second with counts / proportions.
#' 
#' If \code{.data} is a list with clonesets and \code{.genes} is NOT a list than return a data frame with first column "Gene" 
#' with genes and other columns with counts / proportions for each cloneset in the input list.
#' 
#' If \code{.data} is a cloneset and \code{.genes} IS a list than return a matrix with gene segments for the first gene in \code{.genes}
#' and column names for the second gene in \code{.genes}. See "Examples".
#' 
#' If \code{.data} is a list with clonesets and \code{.genes} IS a list than return a list with matrices like in the previous case.
#' 
#' @seealso \code{\link{genealphabets}}, \code{\link{vis.gene.usage}}, \code{\link{pca.segments}}
#' 
#' @examples
#' \dontrun{
#' # Load your data
#' data(twb)
#' # compute V-segments frequencies of human TCR beta.
#' seg <- geneUsage(twb, HUMAN_TRBV, .norm = T)
#' # plot V-segments frequencies as a heatmap
#' vis.heatmap(seg, .labs = c("Sample", "V gene"))
#' # plot V-segments frequencies directly from clonesets
#' vis.gene.usage(twb, HUMAN_TRBV)
#' # plot V-segments frequencies from the gene frequencies
#' vis.gene.usage(seg, NA)
#' # Compute V-J joint usage.
#' geneUsage(twb, list(HUMAN_TRBV, HUMAN_TRBJ))
#' # for future:
#' # geneUsage(twb, "human", "trbv")
#' }
geneUsage <- function (.data, .genes = HUMAN_TRBV_MITCR, .quant = c(NA, "read.count", "umi.count", "read.prop", "umi.prop"), 
                       .norm = F, .ambig = F #, .species = c("human", "mouse"), .genes = c("trbv", "trbd", "trbj")
                       ) {
  
  message("
========================================
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!! The tcR package WILL SOON BE ORPHANED 
!! AND REMOVED FROM CRAN.
!!
!! A new package is available that is 
!! designed to replace tcR: 
!! immunarch  --  https://immunarch.com/
!!
!! We will be happy to help you to move
!! to the new package. Feel free to contact us:
!! http://github.com/immunomind/immunarch
!!
!! Sincerely, 
!!  immunarch dev team and 
!!  Vadim I. Nazarov, lead developer of tcR
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
=======================================")
  
  .process.df <- function (.df, .quant, .cols) {
    cast.fun <- dcast
    if (length(.cols) == 2) { cast.fun <- acast; len <- 2 }
    
    for (i in 1:length(.cols)) {
      .df[ which(!(.df[[.cols[i]]] %in% .genes[[i]])), .cols[i] ] <- "Ambiguous"
    }
    
    if (length(.cols) == 1) { .cols <- c(.cols, '.'); len <- 1}
    
    .df = .df %>% select(na.exclude(c(.quant, .cols[1:len])))
    .df = .df %>% grouped_df(.cols[1:len])
    
    if (!is.na(.quant)) {
      .df = .df %>% summarise(Freq = sum(.quant))
    } else {
      .df = .df %>% summarise(Freq = n())
    }

    .df
  }
  
  
  .fix.ambig <- function (.res, .ambig) {
    if (length(.genes) == 2) {
      .res <- lapply(.res, function (x) {
        x[row.names(x) != "Ambiguous", ][, colnames(x) != "Ambiguous"]
      })
      if (length(.data) == 1) {
        .res <- .res[[1]]
      }
      .res
    } else {
      .res[.res[,1] != "Ambiguous", ]
    }
  }
  
  
  quant <- NA
  if (!is.na(.quant[1])) { quant <- .column.choice(.quant, T) }
  
  if (has.class(.data, 'data.frame')) { .data <- list(Sample = .data) }
  
  if (has.class(.genes, 'list')) {    
    genecols <- c(paste0(substr(.genes[[1]][1], 4, 4), ".gene"), paste0(substr(.genes[[2]][1], 4, 4), ".gene"))
  } else {
    genecols <- paste0(substr(.genes[1], 4, 4), ".gene")
    .genes <- list(.genes)
  }
  
  tbls <- lapply(.data, .process.df, .quant = quant, .cols = genecols)
  
  
  # JOINT GENE DISTRIBUTION
  if (length(.genes) == 2) {
    tbls <- lapply(tbls, function (x) {
      genrows <- .genes[[1]][is.na(match(.genes[[1]], row.names(x)))]
      gencols <- .genes[[2]][is.na(match(.genes[[2]], colnames(x)))]

      if (length(genrows) > 0) {
        x = rbind(x, matrix(0, ncol = ncol(x), nrow = length(genrows)))
        row.names(x)[(nrow(x) - length(genrows) + 1):nrow(x)] <- genrows
      }

      if (length(gencols) > 0) {
        x = cbind(x, matrix(0, nrow = nrow(x), ncol = length(gencols)))
        colnames(x)[(ncol(x) - length(gencols) + 1):ncol(x)] <- gencols
      }

      x[is.na(x)] <- 0
      # x = x[order(.genes[[1]]), ][, order(.genes[[2]])]
      # View(x, "y")
      x
    })
    
    if (.norm) {
      tbls <- lapply(tbls, function (x) x / sum(x))
    }
    return(.fix.ambig(tbls, .ambig))
  }
  
  # SINGLE GENE DISTRIBUTION
  res <- tbls[[1]]
  colnames(res) <- c("Gene", names(.data)[1])
  
  if (length(.data) > 1) {
    for (i in 2:length(.data)) {
      colnames(tbls[[i]]) <- c("Gene", names(.data)[i])
      res <- merge(res, tbls[[i]], by = "Gene", all = T)
    }
  }
  res <- merge(res, data.frame(Gene = .genes[[1]], Something = 0, stringsAsFactors = F), by = "Gene", all = T)
  res <- res[, -ncol(res)]
  res[is.na(res)] <- 0
  
  if (!.ambig) {
    res <- .fix.ambig(res, .ambig)
  }
  
  if (.norm) {
    if (length(.genes) == 1) {
      res[,-1] <- apply(as.matrix(res[,-1]), 2, function (col) col / sum(col))
    } else {
      res <- res / sum(res)
    }
  }
  
  res
}


#' Perform PCA on segments frequency data.
#' 
#' @aliases pca.segments pca.segments.2D
#' 
#' @description
#' Perform PCA on gene segments frequency data for V- and J-segments and either return pca object or plot the results.
#' 
#' @usage
#' pca.segments(.data, .cast.freq.seg = T, ..., .text = T, .do.plot = T)
#' 
#' pca.segments.2D(.data, .cast.freq.seg = T, ..., .text = T, .do.plot = T)
#' 
#' @param .data Either data.frame or a list of data.frame or a result obtained from the \code{geneUsage} function.
#' @param .cast.freq.seg if T then apply code{geneUsage} to the supplied data.
#' @param ... Further arguments passed to \code{prcomp} or \code{geneUsage}.
#' @param .text If T then plot sample names in the resulting plot.
#' @param .do.plot if T then plot a graphic, else return a pca object.
#' 
#' @return If .do.plot is T than ggplot object; else pca object.
#' 
#' @examples
#' \dontrun{
#' # Load the twins data.
#' data(twb)
#' # Plot a plot of results of PCA on V-segments usage.
#' pca.segments(twb, T, scale. = T)
#' }
pca.segments <- function(.data, .cast.freq.seg = T, ..., .text = T, .do.plot = T){
  if (.cast.freq.seg) { .data <- geneUsage(.data, ...)[,-1] }
  pca.res <- prcomp(t(as.matrix(.data)), ...)
  if (.do.plot) {
    pca.res <- data.frame(PC1 = pca.res$x[,1], PC2 = pca.res$x[,2], Subject = names(.data))
    p <- ggplot() + geom_point(aes(x = PC1, y = PC2, colour = Subject), size = 3, data = pca.res)
    if (.text) {
      p <- p + geom_text(aes(x = PC1, y = PC2, label = Subject), data = pca.res, hjust=.5, vjust=-.3)
    }
    p + theme_linedraw() + guides(size=F) + ggtitle("VJ-usage: Principal Components Analysis") + .colourblind.discrete(length(pca.res$Subject), T)
  } else {
    pca.res
  }
}

pca.segments.2D <- function(.data, .cast.freq.seg = T, ..., .text = T, .do.plot = T){
  if (.cast.freq.seg) { .data <- lapply(geneUsage(.data, ...), function (x) as.vector(x)) }
  pca.res <- prcomp(do.call(rbind, .data), ...)
  if (.do.plot) {
    pca.res <- data.frame(PC1 = pca.res$x[,1], PC2 = pca.res$x[,2], Subject = names(.data))
    p <- ggplot() + geom_point(aes(x = PC1, y = PC2, colour = Subject), size = 3, data = pca.res)
    if (.text) {
      p <- p + geom_text(aes(x = PC1, y = PC2, label = Subject), data = pca.res, hjust=.5, vjust=-.3)
    }
    p <- p + theme_linedraw() + guides(size=F) + ggtitle("VJ-usage: Principal Components Analysis") + .colourblind.discrete(length(pca.res$Subject), T)
  } else {
    pca.res
  }
}