R/plot_network.R

Defines functions plot_network

Documented in plot_network

#' Plotting network calculated by turboGliph or GLIPH2
#'
#' @description With \code{plot_network} an automated function is provided to visualize the network generated by
#' \code{turboGliph} or \code{gliph2} with the package visNetwork.
#'
#' @param result_folder character. By default \code{""}. Path to the folder in which the output files of \code{turboGliph} or \code{gliph2} are stored.
#' If specified, the clustering results are loaded directly from the files and the parameter \code{clustering_output} is ignored. 
#' @param show_additional_columns character vector. By default \code{NULL}. By default, the sequence, cluster tag, and cluster score are shown for each node.
#' Here, further column names can be specified, whose information is additionally displayed. All column names of the data frame originally entered under the
#' parameter \code{cdr3_sequences} in \code{turboGliph}, \code{gliph2} or \code{gliph_combined} (sequence specific information) and all column names of the data
#' frame under \code{clustering_output$cluster_properties} (cluster specific information) are available.
#' @param clustering_output list. By default \code{NULL}. If the value of \code{result_folder} is \code{""}, the output list of the functions
#' \code{turboGliph} or \code{gliph2} must be entered here.
#' @param color_info character. By default \code{"total.score"}. \code{color_info} specifies the column name by which the nodes are colored.
#' All column names of the data frame originally entered under the parameter \code{cdr3_sequences} in \code{turboGliph}, \code{gliph2} or \code{gliph_combined}
#' (sequence specific information) and all column names of the data frame under \code{clustering_output$cluster_properties} (cluster specific
#' information) are available. If \code{color_info = "color"} applies, the values
#' from the column "color" are directly interpreted as color values for the respective sequence. If other columns
#' are specified, colors are automatically assigned to the contained values using the viridis palette. For numeric values,
#' purple corresponds to small values and yellow corresponds to high values.
#' Two example options:
#' \itemize{
#' \item "none": All nodes are colored gray.
#' \item "total.score": The nodes are colored according to their total cluster score using the viridis palette. Purple corresponds to small scores of
#' significant clusters. Yellow corresponds to high scores of clusters with little significance.
#' }
#' @param color_palette function. This function specifies the color palette for node coloring. By default, the viridis-palette is used. The function
#' has to expect a number \code{n} as input representing the desired amount of color values and has to return \code{n} color values in a vector.
#' @param local_edge_color color. By default \code{#11c21d}. Specifies the color of edges representing local similarities. The value must specify a color.
#' @param global_edge_color color. By default \code{#ad1717}. Specifies the color of edges representing global similarities. The value must specify a color.
#' @param size_info character. By default \code{NULL}. \code{size_info} specifies the column name after which the node sizes are determined.
#' All column names of the data frame originally entered under the parameter \code{cdr3_sequences } in \code{turboGliph}, \code{gliph2} or \code{gliph_combined}
#' (sequence specific information) and all column names of the data frame under \code{clustering_output$cluster_properties} (cluster specific
#' information) are available. The values of the chosen column have to represent numeric values.
#' @param absolute_size logical. By default \code{FALSE}. If \code{TRUE}, the values from the column of
#' \code{cdr3_sequences} named after \code{size_info} are used as absolute sizes of the nodes. Otherwise,
#' the values are normalized linearly to a range of values for the node size between 12 and 20.
#' @param cluster_min_size numeric. By default 3. Minimal size of a cluster required to be considered for plotting.
#' @param n_cores numeric. Number of cores to use, by default 1. In case of \code{NULL} it will be set to number of cores in your machine minus 2.
#'
#'@return \code{plot_network} returns an object of class 'visNetwork' which visualizes the network from the results of \code{turboGliph} or \code{gliph2}. The resulting graph
#'is interactive. Scrolling zooms, hovering over a node displays additional information about that node, and
#'clicking on a node highlights all direct neighbors. The color and size of the nodes can be adjusted with the parameters.
#'For details, see the parameter information.
#'
#' @examples
#' utils::data("gliph_input_data")
#' res <- turbo_gliph(cdr3_sequences = gliph_input_data[base::seq_len(200),],
#' sim_depth = 100,
#' n_cores = 1)
#'
#' plot_network(clustering_output = res,
#' n_cores = 1)
#'
#' @import viridis foreach grDevices
#' @export

plot_network <- function(clustering_output = NULL,
                         result_folder = "",
                         show_additional_columns = NULL,
                         color_info = "total.score",
                         color_palette = viridis::viridis,
                         local_edge_color = "orange",
                         global_edge_color = "#68bceb",
                         size_info = NULL,
                         absolute_size = FALSE,
                         cluster_min_size = 3,
                         n_cores = 1) {
  clustering_output <- clustering_output
  
  ##################################################################
  ##                         Unit-testing                         ##
  ##################################################################
  
  ### result_folder clustering_output
  if(!base::is.character(result_folder)) stop("result_folder has to be a character object")
  if(base::length(result_folder) > 1) stop("result_folder has to be a single path")
  if(result_folder != ""){
    clustering_output <- load_gliph_output(result_folder = result_folder)
    base::cat("\n")
  }
  
  ### show_additional_columns
  if(!base::is.character(show_additional_columns) && !(base::is.null(show_additional_columns))) base::stop("show_additional_columns must be a character vector representing column names.")
  
  ### color_info
  if(!base::is.character(color_info)) base::stop("color_info must be a character.")
  if(base::length(color_info) > 1) stop("color_info has to be a single column name")
  
  ### color_palette
  if(!base::is.function(color_palette)) base::stop("color_palette must be a function expecting a number as input and returning color values.")
  if(!plotfunctions::isColor(color_palette(1))) base::stop("color_palette must be a function expecting a number as input and returning color values.")
  
  ### local_edge_color
  if(!plotfunctions::isColor(local_edge_color)) base::stop("local_edge_color has to be a color.")
  
  ### global_edge_color
  if(!plotfunctions::isColor(global_edge_color)) base::stop("global_edge_color has to be a color.")
  
  ### size_info
  if(!base::is.character(size_info) && !(base::is.null(size_info))) base::stop("size_info must be either NULL or a character.")
  if(base::length(size_info) > 1  && !(base::is.null(size_info))) stop("size_info has to be a single column name")
  
  ### absolute_size
  if(!base::is.logical(absolute_size)) base::stop("absolute_size has to be logical")
  
  ### cluster_min_size
  if(!base::is.numeric(cluster_min_size)) base::stop("cluster_min_size has to be numeric")
  if(base::length(cluster_min_size) > 1) base::stop("cluster_min_size has to be a single number")
  if(cluster_min_size < 1) base::stop("cluster_min_size must be at least 1")
  cluster_min_size <- base::round(cluster_min_size)
  
  ### n_cores
  if(base::is.null(n_cores)) n_cores <- base::max(1, parallel::detectCores()-2) else {
    if(!base::is.numeric(n_cores)) base::stop("n_cores has to be numeric")
    if(base::length(n_cores) > 1) base::stop("n_cores has to be a single number")
    if(n_cores < 1) base::stop("n_cores must be at least 1")
    
    n_cores <- base::min(n_cores, parallel::detectCores()-2)
  }
  
  #################################################################
  ##                      Input preparation                      ##
  #################################################################
  
  ### Initiate parallelization
  doParallel::registerDoParallel(n_cores)
  
  ### Get cluster_list and cluster_properties with at least cluster_min_size members
  # cluster_list:       contains all members of the cluster and the sequence specific additional information (e.g. patient, count, etc.)
  # cluster_properties: contains cluster specific information like all scores
  parameters <- clustering_output$parameters
  cluster_list <- clustering_output$cluster_list
  if(base::is.null(cluster_list)) base::stop("The specified clustering_output does not contain any clusters.")
  if(base::length(cluster_list) == 0) base::stop("The specified clustering_output does not contain any clusters.")
  cluster_properties <- clustering_output$cluster_properties
  
  hold_ids <- base::which(base::as.numeric(cluster_properties$cluster_size) >= cluster_min_size)
  if(base::length(hold_ids) == 0) base::stop("The specified clustering_output does not contain any clusters with a minimal cluster size of ",
                                            cluster_min_size,
                                            ".")
  cluster_list <- cluster_list[hold_ids]
  cluster_properties <- cluster_properties[hold_ids,]
  
  ### Pool sequence and cluster specific information (identical sequences in differenct clusters are treated as different entities)
  cluster_data_frame <- base::data.frame(foreach::foreach(i = base::seq_along(cluster_list), .combine = rbind) %dopar% {
      temp_df <- cluster_list[[i]]
      temp_adds <- base::unlist(cluster_properties[i,])
      temp_adds <- base::data.frame(base::matrix(base::rep(temp_adds, times = base::nrow(temp_df)), nrow = base::nrow(temp_df), byrow = TRUE), stringsAsFactors = FALSE)
      base::colnames(temp_adds) <- base::colnames(cluster_properties)
      temp_df <- base::cbind(temp_adds, temp_df)
      
      base::return(temp_df)
  }, stringsAsFactors = FALSE)
  cluster_data_frame[] <- base::lapply(cluster_data_frame, base::as.character)
  for(i in base::seq_len(base::ncol(cluster_data_frame))){
    if(base::suppressWarnings(base::any(base::is.na(base::as.numeric(cluster_data_frame[,i])))) == FALSE) cluster_data_frame[,i] <- base::as.numeric(cluster_data_frame[,i])
  }
  cluster_data_frame$ID <- base::seq_len(base::nrow(cluster_data_frame))
  
  # For better visualization insert a line break (<br>) between significant HLA alleles in the same cluster
  if("lowest.hlas" %in% colnames(cluster_data_frame)){
    cluster_data_frame$lowest.hlas <- gsub(", ", "<br>", cluster_data_frame$lowest.hlas)
    cluster_data_frame$lowest.hlas[cluster_data_frame$lowest.hlas != ""] <- paste0("<br>",
                                                                                   cluster_data_frame$lowest.hlas[cluster_data_frame$lowest.hlas != ""])
  }
  
  
  #################################################################
  ##                     Prepare graph edges                     ##
  #################################################################
  
  base::cat("Preparing graph nodes and edges.\n")
  clone_network <- base::c()
  ### Different edge identification procedure in distinct GLIPH versions
  if(!("gliph_version" %in% base::names(parameters))){
    ### Merged function for GLIPH1.0 and GLIPH2.0 options
    
    ### Clustering method as in GLIPH1.0
    if(parameters$clustering_method == "GLIPH1.0"){
      # Get connections between sequences calculated by GLIPH; exclude all singletons
      ori_clone_net <- clustering_output$connections
      ori_clone_net <- ori_clone_net[ori_clone_net[,3] != "singleton",]
      if(base::nrow(ori_clone_net) == 0) base::stop("The specified clustering_output does not contain any clusters.")
      
      # Are patient and v gene dependent restrictions possible?
      patient.info <- FALSE
      vgene.info <- FALSE
      if("patient" %in% base::colnames(cluster_data_frame)) patient.info <- TRUE
      if("TRBV" %in% base::colnames(cluster_data_frame)) vgene.info <- TRUE
      
      ### Get connections between different tuples consisting of CDR3b sequence, v gene and donor
      x <- NULL
      clone_network <- foreach::foreach(x = base::seq_len(base::nrow(ori_clone_net))) %dopar% {
        # Identify all tuples with correpsonding CDR3b sequence
        act_ids1 <- cluster_data_frame$ID[cluster_data_frame$CDR3b == ori_clone_net[x,1]]
        act_ids2 <- cluster_data_frame$ID[cluster_data_frame$CDR3b == ori_clone_net[x,2]]
        
        # First assume connection between all tuples
        comb_ids <- base::expand.grid(act_ids1, act_ids2)
        comb_ids <- base::unique(comb_ids)
        
        # If requested (public_tcrs != "all"), exclude connections between tuples including different donors
        if((parameters$public_tcrs == "none" || parameters$public_tcrs == ori_clone_net[x,3]) && patient.info == TRUE && base::nrow(comb_ids) > 0){
          comb_ids <- comb_ids[cluster_data_frame$patient[comb_ids[,1]] == cluster_data_frame$patient[comb_ids[,2]],]
        }
        
        # If requested (vgene_match != "none"), exclude connections between tuples including different v genes
        if((parameters$vgene_match == "all" || parameters$vgene_match == ori_clone_net[x,3]) && vgene.info == TRUE && base::nrow(comb_ids) > 0){
          comb_ids <- comb_ids[cluster_data_frame$TRBV[comb_ids[,1]] == cluster_data_frame$TRBV[comb_ids[,2]],]
        }
        
        # add type of connection and return connections
        if(base::nrow(comb_ids) > 0){
          var_ret <- base::data.frame(comb_ids, stringsAsFactors = FALSE)
          var_ret <- base::cbind(var_ret, base::rep(ori_clone_net[x,3], base::nrow(var_ret)))
          base::return(base::unlist(base::t(var_ret)))
          base::t(var_ret)
        } else base::return(NULL)
      }
      clone_network <- base::data.frame(base::matrix(base::unlist(clone_network), ncol = 3, byrow = TRUE), stringsAsFactors = FALSE)
      base::colnames(clone_network) <- base::c("from", "to", "label")
      clone_network$from <- base::as.numeric(clone_network$from)
      clone_network$to <- base::as.numeric(clone_network$to)
    }
    ### Clustering method as in GLIPH2.0
    if(parameters$clustering_method == "GLIPH2.0"){
      ### local connections
      if(parameters$local_similarities == TRUE){
        ### Get local connections between different tuples consisting of CDR3b sequence, v gene and donor
        local_clone_network <- foreach::foreach(i = base::which(cluster_properties$type == "local")) %dopar% {
          
          # Get IDs, members and details of current cluster
          temp_ids <- cluster_data_frame$ID[cluster_data_frame$tag == cluster_properties$tag[i]] 
          temp_members <- cluster_data_frame$CDR3b[temp_ids]
          
          # Extract current motif and starting position range for the motif
          if(parameters$structboundaries) temp_members_frags <- base::substr(x = temp_members,start = parameters$boundary_size + 1 ,stop = base::nchar(temp_members) - parameters$boundary_size) else temp_members_frags <- temp_members
          pattern <- base::strsplit(cluster_properties$tag[i], split = "_")[[1]][[1]]
          
          # Search all positions of the motif in the cluster members 
          temp_members_frags <- base::rep(temp_members_frags, each = base::max(base::nchar(temp_members_frags)-base::nchar(pattern)+1))
          temp_subs <- base::substr(temp_members_frags,
                                    start = base::rep(base::seq_len(base::max(base::nchar(temp_members_frags)-base::nchar(pattern)+1)), times = base::length(temp_members)),
                                    stop = base::rep(base::nchar(pattern):base::max(base::nchar(temp_members_frags)), times = base::length(temp_members)))
          temp_which <- base::which(temp_subs == pattern)
          details <- base::data.frame(id = temp_ids[base::floor((temp_which-1)/base::max(base::nchar(temp_members_frags)-base::nchar(pattern)+1))+1],
                                      pos = ((temp_which-1) %% base::max(base::nchar(temp_members_frags)-base::nchar(pattern)+1))+1,
                                      stringsAsFactors = FALSE)
          
          # First assume connections between all cluster members
          if(base::length(temp_members) >= 2){
            combn_ids <- base::t(utils::combn(base::seq_along(temp_members), m = 2))
          } else {
            combn_ids <- base::t(utils::combn(base::rep(1, 2), m = 2))
          }
          combn_ids <- combn_ids[details$id[combn_ids[,1]] != details$id[combn_ids[,2]],]
          if(!base::is.matrix(combn_ids)) combn_ids <- base::matrix(combn_ids, ncol = 2, byrow = TRUE)
          
          temp_df <- base::data.frame(from = details$id[combn_ids[,1]], to = details$id[combn_ids[,2]],
                                      label = base::rep("local", base::nrow(combn_ids)), stringsAsFactors = FALSE)
          
          # Restrict local connections to sequences in which the starting position varies in a certain range of positions
          temp_df <- base::unique(temp_df[base::abs(details$pos[combn_ids[,1]]-details$pos[combn_ids[,2]]) < parameters$motif_distance_cutoff,])
          temp_df <- temp_df[cluster_data_frame$tag[temp_df$from] == cluster_data_frame$tag[temp_df$to],]
          temp_df <- base::t(temp_df)
          base::return(base::unlist(temp_df))
        }
        
        ### If available, set clone network to the local network
        if(base::length(local_clone_network) == 0) parameters$local_similarities == FALSE else {
          local_clone_network <- base::data.frame(base::matrix(base::unlist(local_clone_network), ncol = 3, byrow = TRUE),
                                                  stringsAsFactors = FALSE)
          base::colnames(local_clone_network) <- base::c("from", "to", "label")
          
          clone_network <- local_clone_network
        }
        
        cat(paste0("Restrict local similarities to sequences with motif position varying between ", parameters$motif_distance_cutoff," positions.\n"))
      }
      
      ### global connections
      if(parameters$global_similarities == TRUE){
        # load BlosumVec to determine interchangeable amino acids defined by BLOSUM62
        utils::data("BlosumVec", package = "turboGliph")
        BlosumVec <- BlosumVec
        
        ### Get local connections between different tuples consisting of CDR3b sequence, v gene and donor
        global_clone_network <- foreach::foreach(i = base::which(cluster_properties$type == "global")) %dopar% {
          
          # Get IDs and members of current cluster
          temp_ids <- cluster_data_frame$ID[cluster_data_frame$tag == cluster_properties$tag[i]] 
          temp_members <- cluster_data_frame$CDR3b[temp_ids]
          
          # Assume connections between all cluster members
          if(base::length(temp_members) >= 2){
            combn_ids <- base::t(utils::combn(base::seq_along(temp_members), m = 2))
          } else {
            combn_ids <- base::t(utils::combn(base::rep(1, 2), m = 2))
          }
          temp_df <- base::data.frame(from = temp_ids[combn_ids[,1]], to = temp_ids[combn_ids[,2]],
                                      label = base::rep("global", base::nrow(combn_ids)), stringsAsFactors = FALSE)
          
          # If requested, restrict global connections to sequences with a Hamming distance of 1 and a differing amino acid pair allowed by the BLOSUM62 matrix
          if(parameters$all_aa_interchangeable == FALSE){
            temp_pos <- stringr::str_locate(string = cluster_properties$tag[i], pattern = "%")
            if(parameters$structboundaries == TRUE) temp_pos <- temp_pos+parameters$boundary_size
            temp_df <- base::unique(temp_df[base::paste0(base::substr(cluster_data_frame$CDR3b[temp_df$from], temp_pos, temp_pos),
                                                         base::substr(cluster_data_frame$CDR3b[temp_df$to], temp_pos, temp_pos)) %in% BlosumVec,])
          }
          temp_df <- base::t(temp_df)
          base::return(base::unlist(temp_df))
        }
        if(parameters$all_aa_interchangeable == FALSE) cat("Restrict global connections to sequences with a BLOSUM62 value for the amino acid substitution greater or equal to zero.\n")
        
        ### Append global network to the clone network
        if(base::length(global_clone_network) == 0) parameters$local_similarities == FALSE else {
          global_clone_network <- base::data.frame(base::matrix(base::unlist(global_clone_network), ncol = 3, byrow = TRUE),
                                                   stringsAsFactors = FALSE)
          base::colnames(global_clone_network) <- base::c("from", "to", "label")
          
          if(parameters$local_similarities == FALSE){
            clone_network <- global_clone_network
          } else {
            clone_network <- base::rbind(clone_network, global_clone_network)
          }
        }
      }
      clone_network$from <- base::as.numeric(clone_network$from)
      clone_network$to <- base::as.numeric(clone_network$to)
    }
  } else {
    if(parameters$gliph_version == 1){
      ### Original GLIPH
      # Get connections between sequences calculated by GLIPH; exclude all singletons
      ori_clone_net <- clustering_output$connections
      ori_clone_net <- ori_clone_net[ori_clone_net[,3] != "singleton",]
      if(base::nrow(ori_clone_net) == 0) base::stop("The specified clustering_output does not contain any clusters.")
      
      # Are patient and v gene dependent restrictions possible?
      patient.info <- FALSE
      vgene.info <- FALSE
      if("patient" %in% base::colnames(cluster_data_frame)) patient.info <- TRUE
      if("TRBV" %in% base::colnames(cluster_data_frame)) vgene.info <- TRUE
      
      ### Get connections between different tuples consisting of CDR3b sequence, v gene and donor
      x <- NULL
      clone_network <- foreach::foreach(x = base::seq_len(base::nrow(ori_clone_net))) %dopar% {
        # Identify all tuples with correpsonding CDR3b sequence
        act_ids1 <- cluster_data_frame$ID[cluster_data_frame$CDR3b == ori_clone_net[x,1]]
        act_ids2 <- cluster_data_frame$ID[cluster_data_frame$CDR3b == ori_clone_net[x,2]]
        
        # First assume connection between all tuples
        comb_ids <- base::expand.grid(act_ids1, act_ids2)
        comb_ids <- base::unique(comb_ids)
        
        # If requested (public_tcrs == FALSE), exclude connections between tuples including different donors
        if(parameters$public_tcrs == FALSE && patient.info == TRUE && base::nrow(comb_ids) > 0){
          comb_ids <- comb_ids[cluster_data_frame$patient[comb_ids[,1]] == cluster_data_frame$patient[comb_ids[,2]],]
        }
        
        # If requested (global_vgene == TRUE), exclude global connections between tuples including different v genes
        if(parameters$global_vgene == TRUE && vgene.info == TRUE && ori_clone_net[x,3] == "global" && base::nrow(comb_ids) > 0){
          comb_ids <- comb_ids[cluster_data_frame$TRBV[comb_ids[,1]] == cluster_data_frame$TRBV[comb_ids[,2]],]
        }
        
        # add type of connection and return connections
        if(base::nrow(comb_ids) > 0){
          var_ret <- base::data.frame(comb_ids, stringsAsFactors = FALSE)
          var_ret <- base::cbind(var_ret, base::rep(ori_clone_net[x,3], base::nrow(var_ret)))
          base::return(base::unlist(base::t(var_ret)))
          base::t(var_ret)
        } else base::return(NULL)
      }
      if(parameters$positional_motifs == TRUE) cat("Restrict local similarities to sequences with identical N-terminal motif position.\n")
      if(parameters$public_tcrs == FALSE && patient.info == TRUE) cat("Restrict similarities to sequences obtained from identical donor.\n")
      if(parameters$global_vgene == TRUE && vgene.info == TRUE) cat("Restrict global similarities to sequences with identical v gene.\n")
      clone_network <- base::data.frame(base::matrix(base::unlist(clone_network), ncol = 3, byrow = TRUE), stringsAsFactors = FALSE)
      base::colnames(clone_network) <- base::c("from", "to", "label")
      clone_network$from <- base::as.numeric(clone_network$from)
      clone_network$to <- base::as.numeric(clone_network$to)
    }
    if(parameters$gliph_version == 2){
      ### GLIPH2
      
      ### local connections
      if(parameters$local_similarities == TRUE){
        ### Get local connections between different tuples consisting of CDR3b sequence, v gene and donor
        local_clone_network <- foreach::foreach(i = base::which(cluster_properties$type == "local")) %dopar% {
          
          # Get IDs, members and details of current cluster
          temp_ids <- cluster_data_frame$ID[cluster_data_frame$tag == cluster_properties$tag[i]] 
          temp_members <- cluster_data_frame$CDR3b[temp_ids]
          
          # Extract current motif and starting position range for the motif
          if(parameters$structboundaries) temp_members_frags <- base::substr(x = temp_members,start = parameters$boundary_size + 1 ,stop = base::nchar(temp_members) - parameters$boundary_size) else temp_members_frags <- temp_members
          pattern <- base::strsplit(cluster_properties$tag[i], split = "_")[[1]][[1]]
          
          # Search all positions of the motif in the cluster members 
          temp_members_frags <- base::rep(temp_members_frags, each = base::max(base::nchar(temp_members_frags)-base::nchar(pattern)+1))
          temp_subs <- base::substr(temp_members_frags,
                                    start = base::rep(base::seq_len(base::max(base::nchar(temp_members_frags)-base::nchar(pattern)+1)), times = base::length(temp_members)),
                                    stop = base::rep(base::nchar(pattern):base::max(base::nchar(temp_members_frags)), times = base::length(temp_members)))
          temp_which <- base::which(temp_subs == pattern)
          details <- base::data.frame(id = temp_ids[base::floor((temp_which-1)/base::max(base::nchar(temp_members_frags)-base::nchar(pattern)+1))+1],
                                      pos = ((temp_which-1) %% base::max(base::nchar(temp_members_frags)-base::nchar(pattern)+1))+1,
                                      stringsAsFactors = FALSE)
          
          # First assume connections between all cluster members
          if(base::length(temp_members) >= 2){
            combn_ids <- base::t(utils::combn(base::seq_along(temp_members), m = 2))
          } else {
            combn_ids <- base::t(utils::combn(base::rep(1, 2), m = 2))
          }
          combn_ids <- combn_ids[details$id[combn_ids[,1]] != details$id[combn_ids[,2]],]
          if(!base::is.matrix(combn_ids)) combn_ids <- base::matrix(combn_ids, ncol = 2, byrow = TRUE)
          
          temp_df <- base::data.frame(from = details$id[combn_ids[,1]], to = details$id[combn_ids[,2]],
                                      label = base::rep("local", base::nrow(combn_ids)), stringsAsFactors = FALSE)
          
          # Restrict local connections to sequences in which the starting position varies in a certain range of positions
          temp_df <- base::unique(temp_df[base::abs(details$pos[combn_ids[,1]]-details$pos[combn_ids[,2]]) < parameters$motif_distance_cutoff,])
          temp_df <- temp_df[cluster_data_frame$tag[temp_df$from] == cluster_data_frame$tag[temp_df$to],]
          temp_df <- base::t(temp_df)
          base::return(base::unlist(temp_df))
        }
        
        ### If available, set clone network to the local network
        if(base::length(local_clone_network) == 0) parameters$local_similarities == FALSE else {
          local_clone_network <- base::data.frame(base::matrix(base::unlist(local_clone_network), ncol = 3, byrow = TRUE),
                                                  stringsAsFactors = FALSE)
          base::colnames(local_clone_network) <- base::c("from", "to", "label")
          
          clone_network <- local_clone_network
        }
        
        cat(paste0("Restrict local similarities to sequences with motif position varying between ", parameters$motif_distance_cutoff," positions.\n"))
      }
      
      ### global connections
      if(parameters$global_similarities == TRUE){
        # load BlosumVec to determine interchangeable amino acids defined by BLOSUM62
        utils::data("BlosumVec", package = "turboGliph")
        BlosumVec <- BlosumVec
        
        ### Get local connections between different tuples consisting of CDR3b sequence, v gene and donor
        global_clone_network <- foreach::foreach(i = base::which(cluster_properties$type == "global")) %dopar% {
          
          # Get IDs and members of current cluster
          temp_ids <- cluster_data_frame$ID[cluster_data_frame$tag == cluster_properties$tag[i]] 
          temp_members <- cluster_data_frame$CDR3b[temp_ids]
          
          # Assume connections between all cluster members
          if(base::length(temp_members) >= 2){
            combn_ids <- base::t(utils::combn(base::seq_along(temp_members), m = 2))
          } else {
            combn_ids <- base::t(utils::combn(base::rep(1, 2), m = 2))
          }
          temp_df <- base::data.frame(from = temp_ids[combn_ids[,1]], to = temp_ids[combn_ids[,2]],
                                      label = base::rep("global", base::nrow(combn_ids)), stringsAsFactors = FALSE)
          
          # If requested, restrict global connections to sequences with a Hamming distance of 1 and a differing amino acid pair allowed by the BLOSUM62 matrix
          if(parameters$all_aa_interchangeable == FALSE){
            temp_pos <- stringr::str_locate(string = cluster_properties$tag[i], pattern = "%")
            if(parameters$structboundaries == TRUE) temp_pos <- temp_pos+parameters$boundary_size
            temp_df <- base::unique(temp_df[base::paste0(base::substr(cluster_data_frame$CDR3b[temp_df$from], temp_pos, temp_pos),
                                                         base::substr(cluster_data_frame$CDR3b[temp_df$to], temp_pos, temp_pos)) %in% BlosumVec,])
          }
          temp_df <- base::t(temp_df)
          base::return(base::unlist(temp_df))
        }
        if(parameters$all_aa_interchangeable == FALSE) cat("Restrict global connections to sequences with a BLOSUM62 value for the amino acid substitution greater or equal to zero.\n")
        
        ### Append global network to the clone network
        if(base::length(global_clone_network) == 0) parameters$local_similarities == FALSE else {
          global_clone_network <- base::data.frame(base::matrix(base::unlist(global_clone_network), ncol = 3, byrow = TRUE),
                                                   stringsAsFactors = FALSE)
          base::colnames(global_clone_network) <- base::c("from", "to", "label")
          
          if(parameters$local_similarities == FALSE){
            clone_network <- global_clone_network
          } else {
            clone_network <- base::rbind(clone_network, global_clone_network)
          }
        }
      }
      clone_network$from <- base::as.numeric(clone_network$from)
      clone_network$to <- base::as.numeric(clone_network$to)
    }
  }
  clone_network <- base::unique(clone_network)
  
  ### To ensure that global connections are preferred over local connections for drawing
  clone_network <- clone_network[base::order(clone_network$label, decreasing = TRUE),]
  
  ### Set color of edges
  # Create data frame inlcuding the colors for local and global connections
  leg.col <- base::data.frame(color=base::c(local_edge_color,global_edge_color))
  leg.col[] <- base::lapply(leg.col,base::as.character)
  base::row.names(leg.col) <- c("local", "global")
  
  # Assign the corresponding color to each connection in the clone network
  temp_match <- grr::matches(clone_network$label, base::row.names(leg.col))
  if(base::any(base::is.na(temp_match[,1]))) temp_match <- temp_match[!base::is.na(temp_match[,1]),]
  clone_network$color[temp_match[,1]] <- leg.col[base::row.names(leg.col)[temp_match[,2]],1]
  
  #################################################################
  ##                     Prepare graph nodes                     ##
  #################################################################
  
  ### Create the data frame framework for all node information
  vert.info <- base::data.frame(id = base::unique(base::c(clone_network$from,clone_network$to)),
                                stringsAsFactors = FALSE)
  vert.info$size <- base::rep(20, base::nrow(vert.info))
  vert.info$color <- base::rep("gray", base::nrow(vert.info))
  vert.info$label <- cluster_data_frame$CDR3b[vert.info$id]
  vert.info$title <- base::paste0("<p><b>",vert.info$label, "</b>")
  
  ### If requested, determine the node sizes
  if(!base::is.null(size_info)){
    if(!(size_info %in% base::colnames(cluster_data_frame))) base::stop("Column named ", size_info, " determining node size is not found.")
    if(base::suppressWarnings(base::any(base::is.na(base::as.numeric(cluster_data_frame[,size_info])))) == TRUE)base::stop("Column named ", size_info, " determining node size contains non-numeric values.")
  }
  
  if(base::is.null(size_info)){
    vert.info$size <- base::rep(20, base::nrow(vert.info))
  } else{
    vert.info$size <- cluster_data_frame[vert.info$id, size_info]
    
    if(absolute_size == FALSE){
      vert.info$size <- (vert.info$size-base::min(vert.info$size))/(base::max(vert.info$size)-base::min(vert.info$size))*8+12
    }
  }
  
  ### Determine the node colors
  if(!(color_info %in% base::c("none", "total.score", base::colnames(cluster_data_frame)))){
    base::stop("Column named ", color_info, " determining node color is not found.")
  }
  if(color_info == "color" && !(base::any(plotfunctions::isColor(cluster_data_frame[, color_info]) == FALSE))) base::stop("Column ", color_info, " determining node color has to contain only values that represent colors.")
  
  color.scale = ""
  if("color" %in% base::colnames(cluster_data_frame)){
    # Use the user specified colors
    vert.info$color <- cluster_data_frame$color[vert.info$id]
  } else if(color_info == "total.score"){
    # Color accoring to the total cluster score with a logarithmic scale
    min_score <- base::floor(base::log10(base::min(cluster_data_frame$total.score)))
    max_score <- base::ceiling(base::log10(base::max(cluster_data_frame$total.score)))
    
    node.col <- base::data.frame(logvalue = (base::seq(from = min_score, to = max_score, length.out = 50)), stringsAsFactors = FALSE)
    node.col$value <- 10^node.col$logvalue
    node.col$color <- color_palette(base::nrow(node.col))
    
    vert.info$color <- base::vapply(X=cluster_data_frame$total.score[vert.info$id], FUN = function(x){
      base::return(node.col$color[base::which.min(base::abs(node.col$value - x))])
    }, FUN.VALUE = base::c("#00ff00"))
    
    color.scale <- "log"
  } else if(color_info == "none"){} else {
    ### Automatic coloring based on the desired property
    
    # Coloring for numeric values by determining and using a linear or logarthmic scale
    if(base::is.numeric(cluster_data_frame[, color_info])){
      vals <- base::sort(base::unique(cluster_data_frame[, color_info]))
      lin_model <- stats::lm(vals ~ base::seq_along(vals))
      log_model <- stats::lm(base::log(vals) ~ base::seq_along(vals))
      
      if(base::summary(lin_model)$sigma < base::summary(log_model)$sigma) color.scale <- "linear" else color.scale <- "log"
      
      if(color.scale == "log"){
        min_score <- base::floor(base::log10(base::min(cluster_data_frame$total.score)))
        max_score <- base::ceiling(base::log10(base::max(cluster_data_frame$total.score)))
      } else {
        min_score <- base::floor(base::min(cluster_data_frame$total.score))
        max_score <- base::ceiling(base::max(cluster_data_frame$total.score))
      }
      
      
      node.col <- base::data.frame(logvalue = (base::seq(from = min_score, to = max_score, length.out = 50)), stringsAsFactors = FALSE)
      if(color.scale == "log") node.col$value <- 10^node.col$logvalue else node.col$value <- node.col$logvalue
      node.col$color <- color_palette(base::nrow(node.col))
      
      vert.info$color <- base::vapply(X=cluster_data_frame$total.score[vert.info$id], FUN = function(x){
        base::return(node.col$color[base::which.min(base::abs(node.col$value - x))])
      }, FUN.VALUE = base::c("#00ff00"))
    } else {
      # Coloring for categorial values
      vals <- base::sort(base::as.character(base::unique(cluster_data_frame[, color_info])))
      cols <- color_palette(base::length(vals))
      temp_match <- grr::matches(base::as.character(cluster_data_frame[vert.info$id, color_info]), vals)
      temp_match <- temp_match[!base::is.na(temp_match[,1]),]
      vert.info$color[temp_match[,1]] <- cols[temp_match[,2]]
      
      if(!base::is.null(vals) && !base::is.null(cols)){
        # Create legend only if number of values is managable (<=6)
        if(base::length(vals) <= 6) vert.info$collab <- cluster_data_frame[vert.info$id, color_info]
      }
    }
  }
  
  ### Write the description for every node with the desired information (if possible)
  this_column_names <- base::c()
  if(!("gliph_version" %in% base::names(parameters))){
    this_column_names <- base::c("tag", "total.score", show_additional_columns)
  } else {
    if(parameters$gliph_version == 1) this_column_names <- base::c("tag", "total.score", show_additional_columns) else this_column_names <- base::c("tag", "fisher.score", "total.score", show_additional_columns)
  }
  
  all_column_names <- base::c(base::colnames(cluster_list[[1]]), base::colnames(cluster_properties))
  this_column_names <- this_column_names[this_column_names %in% all_column_names]
  for(actCol in this_column_names){
    vert.info$title <- base::paste0(vert.info$title, "<br>", actCol, ": ",cluster_data_frame[vert.info$id,actCol])
  }
  vert.info$title <- base::paste0(vert.info$title, "</p>")
  vert.info$group <- cluster_data_frame$tag[vert.info$id]
  
  
  ##################################################################
  ##       Convert edge and node information for visNetwork       ##
  ##################################################################
  vertex.info <- vert.info
  edge.info <- clone_network
  layout <- "layout_components"

  ### Node properties
  vertexes <- vertex.info$id
  num.v <- base::length(vertexes)
  v.size <- 8
  v.label <- vertexes
  v.group <- NA
  v.shape <- base::rep("dot")
  v.title <- vertexes
  v.color <- base::rep("gray", num.v)
  v.shadow <- base::rep(FALSE, num.v)
  if ("size" %in% base::names(vertex.info)) v.size = base::as.numeric(vertex.info$size)
  if ("label" %in% base::names(vertex.info)) v.label = base::as.character(vertex.info$label)
  if ("group" %in% base::names(vertex.info)) v.group = base::as.character(vertex.info$group)
  if ("shape" %in% base::names(vertex.info)) v.shape = base::as.character(vertex.info$shape)
  if ("title" %in% base::names(vertex.info)) v.title = base::as.character(vertex.info$title)
  if ("color" %in% base::names(vertex.info)) v.color = base::as.character(vertex.info$color)
  if ("shadow" %in% base::names(vertex.info)) v.shadow = base::as.logical(vertex.info$shadow)
  nodes <- base::data.frame(id = vertexes,
                            color = base::list(background = v.color, border = "black", highlight = "red"),
                            size=v.size,
                            label=v.label,
                            title=v.title,
                            group=v.group,
                            shape=v.shape,
                            shadow=v.shadow)

  ### Edge properties
  eds <- edge.info
  num.e <- base::dim(eds)[1]
  e.label <- NA
  e.length <- base::rep(150,num.e)
  e.width <- base::rep(10,num.e)
  e.color <- base::rep("gray", num.e)
  e.arrows <- base::rep("",num.e)
  e.dashes <- base::rep(FALSE,num.e)
  e.title <- base::rep("",num.e)
  e.smooth <- base::rep(FALSE,num.e)
  e.shadow <- base::rep(FALSE,num.e)
  if ("length" %in% base::names(edge.info)) e.length = base::as.numeric(edge.info$length)
  if ("label" %in% base::names(edge.info)) e.label = base::as.character(edge.info$label)
  if ("width" %in% base::names(edge.info)) e.width = base::as.numeric(edge.info$width)
  if ("color" %in% base::names(edge.info)) e.color = base::as.character(edge.info$color)
  if ("arrows" %in% base::names(edge.info)) e.arrows = base::as.character(edge.info$arrows)
  if ("dashes" %in% base::names(edge.info)) e.dashes = base::as.logical(edge.info$dashes)
  if ("title" %in% base::names(edge.info)) e.title = base::as.character(edge.info$title)
  if ("smooth" %in% base::names(edge.info)) e.smooth = base::as.logical(edge.info$smooth)
  if ("shadow" %in% base::names(edge.info)) e.shadow = base::as.logical(edge.info$shadow)
  edges <- base::data.frame(from = eds$from, to = eds$to,
                            length = e.length,
                            width = e.width,
                            color = e.color,
                            arrows = e.arrows,
                            dashes = e.dashes,
                            title = e.title,
                            smooth = e.smooth,
                            shadow = e.shadow )
  
  ### Legend properties
  ledges <- NULL
  if ("label" %in% base::names(edge.info) && "color" %in% base::names(edge.info)) {
    leg.info <- base::unique(base::cbind(base::as.character(edge.info$label),base::as.character(edge.info$color)))
    ledges <- base::data.frame(color = leg.info[,2], label = leg.info[,1],
                               arrows=base::rep("",base::length(leg.info[,1])), width=base::rep(4,base::length(leg.info[,1])))
  }
  lenodes <- NULL
  if ("color" %in% base::names(vertex.info) && "collab" %in% base::names(vertex.info)) {
    leg.info <- base::unique(base::cbind(base::as.character(vertex.info$collab),base::as.character(vertex.info$color)))
    lenodes <- base::data.frame(label = leg.info[,1], color = leg.info[,2], shape="dot", size=10)
  }
  if(color.scale != ""){
    leg.info <- node.col[base::seq(from = 1, to = base::nrow(node.col), by = (base::nrow(node.col)-1)/5), base::c(1, 3)]
    if(color.scale == "log") leg.info[,1] <- base::paste0("10^(", base::round(leg.info[,1], digits = 3), ")") else leg.info[,1] <- base::as.character(base::round(leg.info[,1], digits = 5))
    lenodes <- base::data.frame(label = leg.info[,1], color = leg.info[,2], shape="dot", size=10)
  }
  
  doParallel::stopImplicitCluster()

  cat("Drawing the graph.\n")
  
  ret <- visNetwork::visLegend(visNetwork::visOptions(visNetwork::visIgraphLayout(visNetwork::visNetwork(nodes = nodes, edges = edges), layout = "layout_components"),
                                                                   highlightNearest = base::list(enabled = TRUE, degree = 1,algorithm = "hierarchical"),
                                                                   selectedBy = base::list(variable = "group", multiple = TRUE), manipulation = TRUE),
                                            addEdges = ledges, addNodes = lenodes, useGroups = FALSE, position = "right",
                                            width=0.15, zoom = FALSE)
  
  ### Closing time!
  return(ret)
}
HetzDra/turboGliph documentation built on Oct. 2, 2022, 2:22 a.m.