vdjbasevis: multiple visualizations for genotypes

Documented in genoHeatmap_html

########################################################################################################
#' Graphical output of alleles division by genotypes
#'
#' The \code{genoHeatmap_html} function generates an interactive graphical output of the alleles per gene in multiple samples.
#'
#'
#' @param    geno_table           genoytpe summary table. See details.
#' @param    chain                the IG chain: IGH,IGK,IGL. Default is IGH.
#' @param    gene_sort            if by 'name' the genes in the output are ordered lexicographically,
#' if by 'position' only functional genes are used and are ordered by their chromosomal location. Default is 'position'.
#' @param    removeIGH            if TRUE, 'IGH'\'IGK'\'IGL' prefix is removed from gene names.
#' @param    lk_cutoff            the lK cutoff value to be considerd low for texture layer. Defualt is lK<1.
#' @param    mark_low_lk          if TRUE, a texture is add for low lK values. Defualt is TRUE.
#'
#' @return
#'
#' An interactive heat-map visualization of the genotype inference for multiple samples.
#'
#' @details
#'
#' A \code{data.frame} created by \code{inferGenotypeBaysian}.
#'
#'
#' @export
genoHeatmap_html <- function(geno_table, chain = c("IGH", "IGK", "IGL"), gene_sort = "position", removeIGH = TRUE, lk_cutoff = 1, mark_low_lk = TRUE, n_line = 4, line_length=60, pseudo_genes = FALSE, ORF_genes = FALSE, file = file.path(normalizePath(tempdir()),"genotype_heatmap.html")) {
  if (missing(chain)) {
    chain = "IGH"
  }
  chain <- match.arg(chain)
  lk_cutoff = as.numeric(lk_cutoff)
    # select columns
  geno_db <- geno_table[,c("subject", "gene", "GENOTYPED_ALLELES", "k_diff", "Freq_by_Clone")]
  # rename the columns
  names(geno_db)[3:4] <- c("ALLELES", "K")
  # correct deletion annotations
  geno_db$ALLELES <- gsub("Deletion","Del",geno_db$ALLELES)
  # set data.table and correct missing Unk annotations and K
  geno_db <- setDT(geno_db)[CJ(subject = subject, gene = gene, unique=TRUE), on=.(subject, gene)]
  geno_db[is.na(ALLELES) , c("ALLELES","K") := list("Unk", NA_integer_)]
  # set K value for deleted genes
  geno_db$K[grep("Del",geno_db$ALLELES)] <- NA_integer_
  # expand row, one allele per row
  geno_db <- splitstackshape::cSplit(geno_db, "ALLELES", sep = ",", direction = "long", fixed = T, type.convert = F)

  # add pseudo genes and orf to color base
  color_pes_orf <- c()
  GENE.loc.tmp <- GENE.loc[[chain]]
  if(pseudo_genes){
    color_pes_orf <- c(grep("V",PSEUDO[[chain]],value = T),color_pes_orf)
  }else{
    GENE.loc.tmp <- c(unique(grep("V",PSEUDO[[chain]],value = T))[! grep("V",PSEUDO[[chain]],value = T) %in% GENE.loc.tmp],GENE.loc.tmp)
  }
  if(ORF_genes){
    color_pes_orf <- c(unique(grep("OR|NL", geno_db$gene,value = T)),color_pes_orf)
  }else{
    GENE.loc.tmp <- c(unique(grep("OR|NL", geno_db$gene,value = T))[! unique(grep("OR|NL", geno_db$gene,value = T)) %in% GENE.loc.tmp],GENE.loc.tmp)
  }

  # sort the data, remove pseudo and orf if needed
  geno_db <- sortDFByGene(DATA = geno_db, chain = chain, method = gene_sort, removeIGH = removeIGH, geno = T,
                          peseudo_remove = pseudo_genes, ORF_remove = ORF_genes)


  geno_db$gene <- factor(geno_db$gene, levels = gsub("IG[H|K|L]", "", GENE.loc.tmp))

  # rename genes to numbers
  gene_loc <- 1:length(unique(geno_db$gene)[order(match(unique(geno_db$gene), levels(geno_db$gene)))])
  names(gene_loc) <- unique(geno_db$gene)[order(match(unique(geno_db$gene), levels(geno_db$gene)))]
  geno_db$GENE_LOC <- gene_loc[as.character(geno_db$gene)]

  ######sort the heatmap for plotting
  geno_db_m <- geno_db[, n:=  .N, by = list(subject,gene)][] # count number of alleles for group
  geno_db_m$ALLELES_G <- geno_db_m$ALLELES # for grouping
  geno_db_m$text <- ''
  geno_db_m$text_bottom <- geno_db_m$ALLELES
  # change ambiguous alleles call
  id_nra <- grepl("[0-9][0-9]_[0-9][0-9]", geno_db_m$ALLELES)
  nra <- F
  if (any(id_nra)) {
    # number ambiguous alleles
    num_text <- paste0('[*',1:length(unique(geno_db_m$ALLELES[id_nra])),']')
    names(num_text) <- unique(geno_db_m$ALLELES[id_nra])
    # text for plot
    geno_db_m$text[id_nra] <- num_text[geno_db_m$ALLELES[id_nra]]
    # text for legend
    geno_db_m$text_bottom[id_nra] <- paste(num_text[geno_db_m$ALLELES[id_nra]],geno_db_m$ALLELES[id_nra])
    # change allele to NRA - non reliable allele
    geno_db_m$ALLELES[id_nra] <- "NRA"
    # indicates that nra exists
    nra <- T
  }
  # create allele palette
  allele_palette <- allelePalette(geno_db_m$ALLELES)

  # sort novel allele calls for plot
  val_novel <- grep('^[0-9]+[_][A-Z][0-9]+[A-Z]',geno_db_m$ALLELES, value = T)
  novel <- F
  novel_allele_text <- c()
  novel_symbol <- "\u005E"
  if(length(val_novel)!=0){
    # sort the palettle colors for novel alleles
    id <- grep('^[0-9]+[_][A-Z][0-9]+[A-Z]',names(allele_palette$transper))
    allele_palette$transper[id] <- 1
    # cerate code index for novel allele
    code_allele <- paste0(novel_symbol,1:length(id))
    names(code_allele) <-allele_palette$AlleleCol[id]
    new_allele <- paste0(novel_symbol,1:length(id),'-',allele_palette$AlleleCol[id])
    names(new_allele) <-allele_palette$AlleleCol[id]
    # change the text for plot
    ids <- geno_db_m$ALLELES %fin% names(new_allele)
    rep <- new_allele[geno_db_m$ALLELES[ids]]
    rep2 <- code_allele[geno_db_m$ALLELES[ids]]
    # add new allele code to data
    geno_db_m[ids, c("ALLELES","text_bottom","text") := list(rep,rep,rep2)]
    # change annotation in legend colors
    allele_palette$AlleleCol[id] <- new_allele
    names(allele_palette$transper)[id] <- new_allele
    # indicates that novel exists
    novel <- T
  }

  geno_db_m$ALLELES <- factor(geno_db_m$ALLELES, levels = allele_palette$AlleleCol)


  # samples names and number
  samples <- unique(geno_table$subject)
  samples_n <- length(samples)
  # genes names and number
  genes <- unique(geno_db_m$gene)
  genes_n <- length(genes)

  # order the data by gene loc
  setorderv(geno_db_m, c("subject","GENE_LOC"))
  setkey(geno_db_m, subject)

  # sort data for matrix
  geno_db_m[,line:=12/n]
  allele_code <- sapply(strsplit(gsub("\\^[0-9]+[-]","",allele_palette$AlleleCol),"_",fixed = T), "[[",1, USE.NAMES = T)
  last <- as.numeric(allele_code[length(allele_code)-3])+1
  ids_a <- last:(last+2)
  allele_code[(length(allele_code)-2):length(allele_code)] <- ids_a
  names(allele_code) <- gsub("\\^[0-9]+[-]","",allele_palette$AlleleCol)
  # sort the alleles in gene box
  geno_db_m[,A_CODE:=as.numeric(allele_code[ALLELES])]
  geno_db_m[grep("[0-9]_[0-9]",geno_db_m$ALLELES,perl = T),A_CODE:=allele_code["NRA"]]
  setorderv(geno_db_m, c("subject","GENE_LOC","A_CODE"))

  # duplicate the data by 12 box to gene
  geno_db_m[,id := 1:.N, by = .(subject, gene)]
  geno_db_f = geno_db_m[,.(n_line = 1:line), by = .(subject, gene, GENE_LOC, ALLELES_G, A_CODE,text_bottom,K), nomatch = 0]


  vline <- function(x = 0, color = "white") {
    list(
      type = "line",
      y0 = 0,
      y1 = 1,
      yref = "paper",
      x0 = x,
      x1 = x,
      line = list(color = color)
    )
  }

  kline <- function(NR,NC,X,Y, color = "white") {
    STEP_X<-1/(NC-1)
    STEP_Y<-1/(NR-1)
    y0 = ifelse((Y-1)/NR<0,0,(Y-1)/NR)
    y1 = Y/NR#(Y+0.5)*STEP_Y #ifelse((Y-0.5)*STEP_Y < 0, 0, (Y-0.5)*STEP_Y)
    list(list(
      type = "line",
      y0 = y0,
      y1 = y1,
      yref = "paper",
      x0 = (X+0.5)*STEP_X,
      x1 = (X+3.5)*STEP_X,
      xref = "paper",
      line = list(color = color)
    ),list(
      type = "line",
      y0 = y0,
      y1 = y1,
      yref = "paper",
      x0 = (X+4.5)*STEP_X,
      x1 = (X+7.5)*STEP_X,
      xref = "paper",
      line = list(color = color)
    ),list(
      type = "line",
      y0 = y0,
      y1 = y1,
      yref = "paper",
      x0 = (X+8.5)*STEP_X,
      x1 = (X+11.5)*STEP_X,
      xref = "paper",
      line = list(color = color)
    ))
  }

  col_names <- unique(sapply(strsplit(gsub("\\^[0-9]+[-]","",allele_palette$AlleleCol),"_",fixed = T), "[[",1, USE.NAMES = T))
  mypal <- colorRampPalette(unique(names(allele_palette$AlleleCol)))
  ncols = length(unique(names(allele_palette$AlleleCol)))#+1
  cols <- mypal(ncols)
  zseq <- seq(0,1,length.out=ncols+1)
  colorScale <- data.frame(
    z = c(0,rep(zseq[-c(1,length(zseq))],each=2),1),
    col=rep(cols,each=2)
  )
  colorScale$col <- as.character(colorScale$col,sort(as.numeric(unique(geno_db_f[[5]]))))
  colors_change <- setNames(round(sapply(seq(1,nrow(colorScale),2),function(x) mean(colorScale$z[x:(x+1)])),3),sort(as.numeric(unique(geno_db_f[[5]]))))
  # transform allele codes to matrix, 12 box for each gene. each row is an individual
  m <- matrix(colors_change[as.character(geno_db_f[[5]])],ncol = 12*genes_n,byrow = T,dimnames = list(unique(geno_db_f[[1]]),geno_db_f[[2]][1:(12*genes_n)]))

  allele_code_t <- allele_palette$AlleleCol
  names(allele_code_t) <- allele_code

  geno_db_f[,text:=paste("Individual:",subject,"<br />Gene:",gene,"<br />Allele:",text_bottom,"<br />Kdiff:",round(K,3))]

  conditions.text <- matrix(geno_db_f[[9]], ncol = 12*genes_n, byrow = TRUE)
  #conditions.cols <- matrix(geno_db_f[[9]], ncol = 12*genes_n, byrow = TRUE)


  zmx <- round(max(m))
  zmn <- round(min(m))
  ids_color_scale <- round(sapply(seq(1,nrow(colorScale),2),function(x) mean(colorScale$z[x:(x+1)])),3)*(ncols-1)
  ids_color_scale <- c(ids_color_scale,ids_color_scale[ncols]+(ids_color_scale[ncols]-ids_color_scale[ncols-1]))
  colorbar=list(tickmode='array', tick0=0, dtick=1,tickvals = 1:length(seq(1,nrow(colorScale),2)), ticktext=c("",col_names),
                len = 0.6, outlinecolor="white",bordercolor="white",borderwidth=5,bgcolor="white")


  # add grid lines
  gridlines <- lapply(seq(11.5,genes_n*12,by=12),vline)


  # plot dim
  plot_height <- 500 + 10*nrow(m)
  plot_width <- 100 + 2*ncol(m)
  # create plot
  p <- plotly::plot_ly(z=(m),type = "heatmap",
                       colorscale= colorScale,
                       #colorbar = colorbar,
                       hoverinfo='text',text=conditions.text, width = plot_width, height = plot_height) %>%
    plotly::layout(yaxis = list(dtick = 1, ticktext = rownames(m), tickmode="array", tickvals = 0:(nrow(m)-1)),
                   xaxis = list(dtick = 1, ticktext = unique(colnames(m)), tickmode="array", tickvals = seq(6,12*genes_n,12)))

  # add k lines
  # klines = geno_db_m[geno_db_m$K<lk_cutoff,]
  # if(nrow(klines)>0){
  #   klines[, y:=match(subject,samples)-1] # row index
  #   klines[, yend:=y+0.5] # row index
  #   klines[, x:=(as.numeric(GENE_LOC)-1)*12] # col index
  #   klines[, xend:=x+1] # col index
  #   NR = samples_n
  #   NC = genes_n*12
  #   klines2 <- apply(klines, 1,function(x) kline(NR,NC,as.numeric(x["x"]),as.numeric(x["y"])))
  #
  #   p <- p %>% plotly::layout(shapes = c(gridlines,unlist(klines2,recursive = F)))
  # }

  # add text annotations
  ids_text <- grep('^[0-9]|Del|Unk',geno_db_m$text_bottom,invert = T)
  if(length(ids_text)>0){
    annot = geno_db_m[ids_text,]
    annot[, y:=(match(subject,samples)-1)]
    annot[, x:=((as.numeric(GENE_LOC)-1)*12+as.numeric(id)*(12/n)-1.5 )]
    p <- p %>%  plotly::add_annotations(x = annot$x,
                                        y = annot$y,
                                        text = annot$text,
                                        xref = 'x',
                                        yref = 'y', size = 0.025, showarrow = FALSE, font=list(color='black',size=0.025))
  }


  # save html file
  save_widget <- function(p, f) {
    htmlwidgets::saveWidget(p, f, selfcontained = F)
    mb <- round(file.info(f)$size / 1e6, 3)
    message("File is: ", mb," MB")
  }

  save_widget(plotly::partial_bundle(p%>% plotly::rangeslider()), f=file)
}