R/internal_functions.R

Defines functions sortGenotype next_divisor draw_segment Write_text novelAlleleAnnotation_geno splitlines novelAlleleAnnotation nonReliableAllelesText_V2 mgsub getDigLegend allelePalette sortDFByGene

Documented in allelePalette draw_segment nonReliableAllelesText_V2 novelAlleleAnnotation novelAlleleAnnotation_geno sortDFByGene sortGenotype splitlines Write_text

#' @include vdjbaseVis.R
NULL

########################################################################################################
#' Sort data frame by genes
#'
#' \code{sortDFByGene} Sort the \code{data.frame} by the genes names or position. For sorting by gene names the \code{sortAlleles} function by TIgGER is used.
#' For sorting by position the defualt package gene location list is used.
#'
#' @param    DATA                 data frame to sort
#' @param    chain                the Ig chain: IGH,IGK,IGL. Default is IGH.
#' @param    method               the method for sorting the genes. If by 'name' the genes in the output are ordered lexicographically,
#' if by 'position' only functional genes are used and are ordered by their chromosomal location. Default is 'position'.
#' @param    removeIGH            if TRUE, 'IGH'\'IGK'\'IGL' prefix is removed from gene names.
#'
#' @return   sorted \code{data.frame}
#' @export
sortDFByGene <- function(DATA, chain = c("IGH", "IGK", "IGL"), method = c("name", "position"), removeIGH = FALSE, geno = FALSE, peseudo_remove = F, ORF_remove = F) {
  if (missing(chain)) {
    chain = "IGH"
  }
  chain <- match.arg(chain)

  if (missing(method)) {
    method = "position"
  }
  method <- match.arg(method)

  GENE.loc.tmp <- GENE.loc[[chain]]
  vs <- grep("V",GENE.loc.tmp,value = T)
  ps <- grep("V",PSEUDO[[chain]],value = T)
  ps <- ps[!ps %fin% vs]
  orf <- unique(grep("OR|NL", DATA$gene,value = T,perl = T))
  GENE.loc.tmp <- c(vs,ps,orf,grep("V",GENE.loc.tmp,value = T,invert = T,perl = T))


  if(peseudo_remove){
    DATA <- DATA[!DATA$gene %fin% PSEUDO[[chain]],]
    GENE.loc.tmp <- GENE.loc.tmp[!GENE.loc.tmp %fin% ps]
  }
  if(ORF_remove){
    DATA <- DATA[!DATA$gene %fin% orf,]
    GENE.loc.tmp <- GENE.loc.tmp[!GENE.loc.tmp %fin% orf]
  }

  names(GENE.loc.tmp) <- GENE.loc.tmp

  if (method == "name") {
    DATA$gene <- factor(DATA$gene, levels = rev(sortAlleles(unique(DATA$gene), method = method)))
    if (removeIGH) {
      DATA$gene <- gsub("IG[H|K|L]", "", DATA$gene)
      DATA$gene <- factor(DATA$gene, levels = rev(sortAlleles(unique(DATA$gene), method = method)))
      if(!geno) DATA$hapBy <- gsub("IG[H|K|L]", "", DATA$hapBy)
    }
  } else {
    DATA$gene <- factor(DATA$gene, levels = rev(GENE.loc.tmp))
    if (removeIGH) {
      GENE.loc.tmp <- gsub("IG[H|K|L]", "", GENE.loc.tmp)
      names(GENE.loc.tmp) <- GENE.loc.tmp
      DATA$gene <- gsub("IG[H|K|L]", "", DATA$gene, perl = T)
      DATA$gene <- factor(DATA$gene, levels = rev(GENE.loc.tmp))
      if(!geno) DATA$hapBy <- gsub("IG[H|K|L]", "", DATA$hapBy)
    }
  }

  return(DATA)
}

########################################################################################################
#' Creates the allele color palette for haplotype graphical output
#'
#' \code{allelePalette} Takes a list of the haplotype alleles and returns the allele color palette.
#'
#' @param   hap_alleles          a list of the haplotype alleles.
#'
#' @return   Haplotype allele color palette
#' @export
allelePalette <- function(hap_alleles, NRA = TRUE) {

  Alleles <- grep("[012]", unique(hap_alleles), value = T, perl = T)
  AlleleCol.tmp <- sort(unique(sapply(strsplit(Alleles, "_"), "[", 1)))
  tmp.col <- ALLELE_PALETTE[AlleleCol.tmp]


  novels <- grep("_", Alleles, value = T)
  if (length(novels) > 0) {
    novels.col <- ALLELE_PALETTE[sapply(strsplit(novels, "_"), "[", 1)]
    names(novels.col) <- novels
    alleles.comb <- c(tmp.col, novels.col)[order(names(c(tmp.col, novels.col)))]
  } else {
    alleles.comb <- c(tmp.col)[order(names(c(tmp.col)))]

  }

  AlleleCol <- names(c(alleles.comb, Unk = "#dedede", Del = "#6d6d6d", NR = "#000000", NRA = "#fbf7f5"))
  names(AlleleCol) <- c(alleles.comb, Unk = "#dedede", Del = "#6d6d6d", NR = "#000000", NRA = "#fbf7f5")
  rm_allele <- function(allele,alleles,AlleleCol){
    if(!allele %in% alleles){
      id <- which(allele == AlleleCol)
      return(AlleleCol[-id])
    }
    return(AlleleCol)
  }
  AlleleCol <- rm_allele("NR",hap_alleles,AlleleCol)
  AlleleCol <- rm_allele("Del",hap_alleles,AlleleCol)
  AlleleCol <- rm_allele("Unk",hap_alleles,AlleleCol)
  AlleleCol <- rm_allele("NRA",hap_alleles,AlleleCol)


  transper <- sapply(AlleleCol, function(x) {
    if (grepl("_", x)) {
      mom_allele <- strsplit(x, "_")[[1]][1]
      all_novel <- grep(paste0(mom_allele, "_"), AlleleCol, value = T)
      if (length(all_novel) == 1) {
        return(0.5)
      }
      if (length(all_novel) == 2) {
        m = which(all_novel == x)
        return(ifelse(m == 1, 0.6, 0.3))
      }
      if (length(all_novel) == 3) {
        m = which(all_novel == x)
        if (m == 1) {
          return(0.6)
        }
        return(ifelse(m == 2, 0.4, 0.2))
      }
      if (length(all_novel) > 9) {
        m = which(all_novel == x)
        if (m == 1) {
          return(1)
        }
        return(1 - m/20)
      }
      if (length(all_novel) > 3) {
        m = which(all_novel == x)
        if (m == 1) {
          return(0.85)
        }
        return(0.85 - m/10)
      }
    } else (1)
  })
  names(transper) <- AlleleCol

  # remove 'mother' allele if added (when there is no germline allele but there is a novel)

  special <- c("Unk", "Del", "NR", "NRA")[c("Unk", "Del", "NR", "NRA") %in% AlleleCol]

  AlleleCol <- AlleleCol[AlleleCol %in% c(sort(grep("[012]", unique(hap_alleles), value = T, perl = T)), special)]

  transper <- transper[names(transper) %in% AlleleCol]

  return(list(transper = transper, AlleleCol = AlleleCol))
}

########################################################################################################
# Get diagonal line for legend
#
getDigLegend <- function(color){

  return(ggplotGrob(ggplot(data.frame(x=c(1,2),y=c(3,4)), aes_string("x","y")) + geom_abline(aes_string(colour="color", intercept = 1, slope = 1), show.legend = T) +
                      scale_color_manual(values = c("white"), name = "", drop = FALSE) + guides(color = guide_legend(override.aes = list(size = 0.5), order = 2)) +
                      theme(legend.justification = "center", legend.key = element_rect(fill = "gray"), legend.position = "bottom")))
}

########################################################################################################
# Get a list of patterns, replacments, and a string
#
# Returns the replaced string
#
mgsub <- function(pattern, replacement, x, ...) {
  if (length(pattern) != length(replacement)) {
    stop("pattern and replacement do not have the same length.")
  }
  result <- x
  for (i in 1:length(pattern)) {
    result <- gsub(pattern[i], replacement[i], result, ...)
  }
  result
}

########################################################################################################
#' Creates the non reliable allele text annotation for plots
#'
#' \code{nonReliableAllelesText_V2} Takes the haplotype data frame
#'
#' @param   geno_table          a data frame of the haplotypes.
#'
#' @return   Non reliable alleles text data frame for plots annotation.
#' @export
nonReliableAllelesText_V2 <- function(non_reliable_alleles_text, size = 3, map = F) {

  id <- grepl("[0-9][0-9]_[0-9][0-9]", non_reliable_alleles_text$alleles)
  num_text <- sapply(1:length(unique(non_reliable_alleles_text$alleles[id])),function(i) paste0('[*',i,']'))
  names(num_text) <- unique(non_reliable_alleles_text$alleles[id])
  non_reliable_alleles_text$text <- ''
  non_reliable_alleles_text$text[id] <- num_text[non_reliable_alleles_text$alleles]
  non_reliable_alleles_text$text_bottom <- ''
  non_reliable_alleles_text$text_bottom[id] <- paste(num_text[non_reliable_alleles_text$alleles[id]],non_reliable_alleles_text$alleles[id])
  non_reliable_alleles_text$pos <- ifelse(non_reliable_alleles_text$freq == 1, 1,
                                          ifelse(non_reliable_alleles_text$freq == 2, 0.5,
                                                 ifelse(non_reliable_alleles_text$freq == 3, 0.33, 0.25)))
  non_reliable_alleles_text$size <- size
  if(!map){
    non_reliable_alleles_text <- non_reliable_alleles_text %>% ungroup() %>% group_by(gene, subject, hapBy) %>%
      dplyr::mutate(pos = pos + ifelse(dplyr::row_number()==2,dplyr::row_number()-1.5,dplyr::row_number()-1))
    }else{
    non_reliable_alleles_text <- non_reliable_alleles_text %>% ungroup() %>% group_by(gene, subject) %>%
      dplyr::mutate(pos = ifelse(n == 1, 0.5,
                          ifelse(n == 2, seq(0.25,1,by = 0.5)[1:max(dplyr::row_number())],
                                 ifelse(n == 3, seq(0.165,1,by = 0.33)[1:max(dplyr::row_number())],
                                        seq(0.125,1,by = 0.25)[1:max(dplyr::row_number())]))))
  }
  #non_reliable_alleles_text$NEW_alleles <- non_reliable_alleles_text$alleles
  non_reliable_alleles_text$alleles[id] <- "NRA"
  return(non_reliable_alleles_text)
}

########################################################################################################
#' Creates the novel allele text annotation for plots
#'
#' \code{novelAlleleAnnotation} Takes the haplotype data frame
#'
#' @param   novel_allele         data frame with the novel allele cordinates.
#'
#' @return   novel alleles text data frame for plots annotation.
#' @export
novelAlleleAnnotation <- function(novel_allele, new_label, size = 3) {
  if (nrow(novel_allele) != 0) {
    novel_allele$text <- sapply(new_label[novel_allele$alleles],function(s) strsplit(s,'-')[[1]][1])
    novel_allele$text_bottom <- paste(new_label[novel_allele$alleles],novel_allele$alleles)
    novel_allele$pos <- ifelse(novel_allele$freq == 1, 1,
                               ifelse(novel_allele$freq == 2, 0.5,
                                      ifelse(novel_allele$freq == 3, 0.33, 0.25)))
    novel_allele$size = size
    novel_allele <- novel_allele %>% dplyr::ungroup() %>% dplyr::group_by(gene, subject) %>%
      dplyr::mutate(pos = ifelse(n == 1, 0.5,
                          ifelse(n == 2, seq(0.25,1,by = 0.5)[1:max(dplyr::row_number())],
                                 ifelse(n == 3, seq(0.165,1,by = 0.33)[1:max(dplyr::row_number())],
                                        seq(0.125,1,by = 0.25)[1:max(dplyr::row_number())]))))
    return(novel_allele)
  } else {
    return(setNames(data.frame(matrix(ncol = 8, nrow = 0)), c("gene", "alleles", "n", "freq", "text", "pos", "size")))
  }
}

########################################################################################################
#' Split lines of short reads assignments
#'
#' The \code{splitlines} function sliplits the text by line width.
#'
#' @param    bottom_annot         annotation text.
#' @param    line_width           the line width allowed.
#'
#' @return Seperated text to lines
#'
#' @export
splitlines<-function(bottom_annot,line_width=60){
  if(line_width<=max(sapply(bottom_annot,nchar))){
    line_width = max(sapply(bottom_annot,nchar))+2
    print(paste0("Set line width to ",line_width))
  }
  collapsed_annot<-paste(bottom_annot,collapse=", ")
  L<-nchar(collapsed_annot)
  if(L<line_width)return(collapsed_annot)
  vec_annot<-substring(collapsed_annot,1:L,1:L)
  ind<-grep("[",vec_annot,fixed=TRUE)
  aligned_text<-NULL
  #i<-line_width
  i_previous<-1
  while(i_previous<=(L-line_width)){
    temp<-findInterval(line_width*(length(aligned_text)+1)+1,ind)
    aligned_text<-c(aligned_text,substring(collapsed_annot,i_previous,ind[temp]-1))
    i_previous<-ind[temp]
  }
  aligned_text<-c(aligned_text,substring(collapsed_annot,i_previous,L))
  return(aligned_text)
}

########################################################################################################
#' Creates the novel allele text annotation for plots
#'
#' \code{novelAlleleAnnotation} Takes the haplotype data frame
#'
#' @param   novel_allele         data frame with the novel allele cordinates.
#'
#' @return   novel alleles text data frame for plots annotation.
#' @export
novelAlleleAnnotation_geno <- function(novel_allele, new_label, size = 3) {
  if (nrow(novel_allele) != 0) {
    novel_allele$text <- sapply(new_label[novel_allele$alleles],function(s) strsplit(s,'-')[[1]][1])
    novel_allele$text_bottom <- paste(new_label[novel_allele$alleles],novel_allele$alleles)
    novel_allele$pos <- ifelse(novel_allele$freq == 1, 1,
                               ifelse(novel_allele$freq == 2, 0.5,
                                      ifelse(novel_allele$freq == 3, 0.33, 0.25)))
    novel_allele$size = size
    novel_allele <- novel_allele %>% dplyr::ungroup() %>% dplyr::group_by(gene, subject) %>%
      dplyr::mutate(pos = ifelse(n == 1, 0.5,
                          ifelse(n == 2, seq(0.25,1,by = 0.5)[1:max(dplyr::row_number())],
                                 ifelse(n == 3, seq(0.165,1,by = 0.33)[1:max(dplyr::row_number())],
                                        seq(0.125,1,by = 0.25)[1:max(dplyr::row_number())]))))
    return(novel_allele)
  } else {
    return(setNames(data.frame(matrix(ncol = 8, nrow = 0)), c("gene", "alleles", "n", "freq", "text", "pos", "size")))
  }
}

########################################################################################################
#' Write text annotations for heatmap graph
#'
#' \code{Write_text} takes values for plotting text on heatmap
#'
#' @param   NR           number of rows.
#' @param   NC           number of columns.
#' @param   I            row index.
#' @param   J            column index.
#' @param   ALLELE       allele index in the individual gene box.
#' @param   N_ALLELES    number of alleles for individual in gene box.
#' @param   TEXT         annotation text.
#'
#' @return   plotting text annotation.
#' @export
Write_text<-function(NR,NC,I,J,ALLELE,N_ALLELES,TEXT,...){
  STEP_X<-1/(NC-1)
  STEP_Y<-1/(NR-1)
  text(STEP_X*J-STEP_X/2+STEP_X*12/N_ALLELES*(ALLELE-1/2),
       STEP_Y*I,
       TEXT,...)
}
########################################################################################################
#' Draw lk value lines on heatmap
#'
#' \code{draw_segment} takes values for plotting text on heatmap
#'
#' @param   NR           number of rows.
#' @param   NC           number of columns.
#' @param   I            row index.
#' @param   J            column index.
#'
#' @return   plotting lk lines.
#' @export
draw_segment<-function(NR,NC,I,J,...){
  STEP_X<-1/(NC-1)
  STEP_Y<-1/(NR-1)
  points(c(STEP_X*(J-0.5),STEP_X*(J+1.5)),
         c(STEP_Y*(I),STEP_Y*(I+0.5)), type = "l",...)
  points(c(STEP_X*(J-0.5),STEP_X*(J+3.5)),
         c(STEP_Y*(I-0.5),STEP_Y*(I+0.5)), type = "l",...)
  points(c(STEP_X*(J+1.5),STEP_X*(J+5.5)),
         c(STEP_Y*(I-0.5),STEP_Y*(I+0.5)), type = "l",...)
  points(c(STEP_X*(J+3.5),STEP_X*(J+7.5)),
         c(STEP_Y*(I-0.5),STEP_Y*(I+0.5)), type = "l",...)
  points(c(STEP_X*(J+5.5),STEP_X*(J+9.5)),
         c(STEP_Y*(I-0.5),STEP_Y*(I+0.5)), type = "l",...)
  points(c(STEP_X*(J+7.5),STEP_X*(J+11.5)),
         c(STEP_Y*(I-0.5),STEP_Y*(I+0.5)), type = "l",...)
  points(c(STEP_X*(J+9.5),STEP_X*(J+11.5)),
         c(STEP_Y*(I-0.5),STEP_Y*(I)), type = "l",...)
}
########################################################################################################
# finds next dividor for an int number
next_divisor <- function(x,y){
  if(x>y) return(y)
  while(T){
    if(y%%x==0) return(x)
    x <- x+1
  }
}

########################################################################################################
#' Sort multiple genotypes
#'
#' The \code{sortGenotype} function sort multiple genotype tables.
#'
#'
#' @param    geno_table           genoytpe summary table. See details.
#' @param    chain                the IG chain: IGH,IGK,IGL. Default is IGH.
#' @param    gene_sort            if by 'name' the genes in the output are ordered lexicographically,
#' if by 'position' only functional genes are used and are ordered by their chromosomal location. Default is 'position'.
#' @param    removeIGH            if TRUE, 'IGH'\'IGK'\'IGL' prefix is removed from gene names.
#' @param    lk_cutoff            the lK cutoff value to be considerd low for texture layer. Defualt is lK<1.
#'
#' @return
#'
#' A \code{data.frame} of the genotypes.
#'
#' @details
#'
#' A \code{data.frame} created by \code{inferGenotypeBaysian}.
#'
#'
#' @export
sortGenotype <- function(geno_table, chain = c("IGH", "IGK", "IGL"), gene_sort = "position", removeIGH = TRUE, lk_cutoff = 1){
  if (missing(chain)) {
    chain = "IGH"
  }
  chain <- match.arg(chain)
  lk_cutoff = as.numeric(lk_cutoff)
  # select columns
  geno_db <- geno_table[,c("subject", "gene", "GENOTYPED_ALLELES", "k_diff", "Freq_by_Clone")]
  # rename the columns
  names(geno_db)[3:4] <- c("alleles", "K")
  # correct deletion annotations
  geno_db$alleles <- gsub("Deletion","Del",geno_db$alleles)
  # set data.table and correct missing Unk annotations and K
  geno_db <- setDT(geno_db)[CJ("subject" = geno_db$subject, "gene" = geno_db$gene, unique=TRUE), on=c("subject", "gene")]
  geno_db[is.na(geno_db$alleles) , c("alleles","K") := list("Unk", NA_integer_)]
  # set K value for deleted genes
  geno_db$K[grep("Del",geno_db$alleles)] <- NA_integer_
  # expand row, one allele per row
  geno_db <- splitstackshape::cSplit(geno_db, "alleles", sep = ",", direction = "long", fixed = T, type.convert = F)

  # add pseudo genes and orf to color base
  color_pes_orf <- c()
  if(pseudo_genes){
    color_pes_orf <- c(grep("V",PSEUDO[[chain]],value = T),color_pes_orf)
  }
  if(ORF_genes){
    color_pes_orf <- c(unique(grep("OR|NL", geno_db$gene,value = T)),color_pes_orf)
  }

  # sort the data, remove pseudo and orf if needed
  geno_db <- sortDFByGene(DATA = geno_db, chain = chain, method = gene_sort, removeIGH = removeIGH, geno = T,
                          peseudo_remove = pseudo_genes, ORF_remove = ORF_genes)

  geno_db$gene <- factor(geno_db$gene, levels = gsub("IG[H|K|L]", "", GENE.loc[[chain]]))

  # rename genes to numbers
  gene_loc <- 1:length(unique(geno_db$gene)[order(match(unique(geno_db$gene), levels(geno_db$gene)))])
  names(gene_loc) <- unique(geno_db$gene)[order(match(unique(geno_db$gene), levels(geno_db$gene)))]
  geno_db$GENE_LOC <- gene_loc[as.character(geno_db$gene)]

  ######sort the heatmap for plotting
  geno_db_m <- geno_db[, n:=  .N, by = c("subject", "gene")][] # count number of alleles for group
  geno_db_m$ALLELES_G <- geno_db_m$alleles # for grouping
  geno_db_m$text <- ''
  geno_db_m$text_bottom <- geno_db_m$alleles
  # change ambiguous alleles call
  id_nra <- grepl("[0-9][0-9]_[0-9][0-9]", geno_db_m$alleles)
  nra <- F
  if (any(id_nra)) {
    # number ambiguous alleles
    num_text <- paste0('[*',1:length(unique(geno_db_m$alleles[id_nra])),']')
    names(num_text) <- unique(geno_db_m$alleles[id_nra])
    # text for plot
    geno_db_m$text[id_nra] <- num_text[geno_db_m$alleles[id_nra]]
    # text for legend
    geno_db_m$text_bottom[id_nra] <- paste(num_text[geno_db_m$alleles[id_nra]],geno_db_m$alleles[id_nra])
    # change allele to NRA - non reliable allele
    geno_db_m$alleles[id_nra] <- "NRA"
    # indicates that nra exists
    nra <- T
  }
  # create allele palette
  allele_palette <- allelePalette(geno_db_m$alleles)

  # sort novel allele calls for plot
  val_novel <- grep('^[0-9]+[_][A-Z][0-9]+[A-Z]',geno_db_m$alleles, value = T)
  novel <- F
  novel_allele_text <- c()
  novel_symbol <- "\u005E"
  if(length(val_novel)!=0){
    # sort the palettle colors for novel alleles
    id <- grep('^[0-9]+[_][A-Z][0-9]+[A-Z]',names(allele_palette$transper))
    allele_palette$transper[id] <- 1
    # cerate code index for novel allele
    code_allele <- paste0(novel_symbol,1:length(id))
    names(code_allele) <-allele_palette$AlleleCol[id]
    new_allele <- paste0(novel_symbol,1:length(id),'-',allele_palette$AlleleCol[id])
    names(new_allele) <-allele_palette$AlleleCol[id]
    # change the text for plot
    ids <- geno_db_m$alleles %fin% names(new_allele)
    rep <- new_allele[geno_db_m$alleles[ids]]
    rep2 <- code_allele[geno_db_m$alleles[ids]]
    # add new allele code to data
    geno_db_m[ids, c("alleles","text_bottom","text") := list(rep,rep,rep2)]
    # change annotation in legend colors
    allele_palette$AlleleCol[id] <- new_allele
    names(allele_palette$transper)[id] <- new_allele
    # indicates that novel exists
    novel <- T
  }

  geno_db_m$alleles <- factor(geno_db_m$alleles, levels = allele_palette$AlleleCol)
  return(geno_db_m)
}
williamdlees/vdjbasevis documentation built on Sept. 13, 2020, 12:17 a.m.