R/VDJ_clonal_expansion.R

Defines functions VDJ_clonal_expansion

Documented in VDJ_clonal_expansion

#' Flexible wrapper for clonal expansion barplots by isotype, GEX cluster etc.
#'
#'@description Clonal frequency plot displaying clonal expansion for either T and B cells with Platypus VDJ_build() output. For Platypus v2 plotting of B cell clonotype expansion and isotypes please refer to VDJ_isotypes_per_clone.
#' @param VDJ VDJ dataframe generated using the VDJ_build function.
#' @param celltype Character. Either "Tcells" or "Bcells". If set to Tcells bars will not be colored by default and the parameters treat_incomplete_cells, treat_incomplete_clones, subtypes and species are ignored. The color.by and group.by arguments work identically for both celltypes. If none provided it will detect this param from the celltype column.
#' @param clones numeric value indicating the number of clones to be considered for the clonal expansion plot. Default value is 50. For a standard plot more than 50 is discouraged. When showing only one - possibly rare - isotype via isotypes.to.plot it may be useful to set this number higher (e.g. 100-200)
#' @param subtypes Logical indicating whether to display isotype subtypes or not.
#' @param isotypes.to.plot Character vector. Defaults to "all". This can be set to any number of specific Isotypes, that are to be shown exclusively. For example, to show only clones containing IgG, input "IGHG". If only wanting to check clones with IgA and IgD input c("IGHA","IGHD"). Works equally if subtypes are set to TRUE. Is ignored if color.by is not set to "isotype"
#' @param species Character indicating whether the samples are from "Mouse" or "Human". Default is "Human".
#' @param treat.incomplete.clones Character indicating how to proceed with clonotypes lacking a VDJC (in other words, no cell within the clonotype has a VDJC). "exclude" removes these clonotypes from the analysis. "include" keeps these clonotypes in the analysis. In the plot they will appear has having an unknown isotype.
#' @param treat.incomplete.cells Character indicating how to proceed with cells assigned to a clonotype but missing a VDJC. "proportional" to fill in the VDJ isotype according to the proportions present in of clonotype (in case present proportions are not replicable in the total number of cells e.g. 1/3 in 10 cells, values are rounded to the next full integer and if the new counts exceed the total number of cells, 1 is subtracted from the isotype of highest frequency. If the number is below the number of cell, 1 is added to the isotype with lowest frequency to preserve diversity), "exclude" to exclude them from analysis and rank clonotypes only by the number of cells with a heavy chain. This ranking may deviate from the frequency column in the clonotype table. CAVE: if treat_incomplete_cells is set to "exclude", clonotypes lacking a VDJC entierly will be removed from the analysis. This results in a similar but not identical output as when treat_incomplete_clones is set to true. The two parameters are thereby non-redundant.
#' @param group.by Character. Defaults to "sample_id". Column name of VDJ to split VDJ by. For each unique entry in that column a plot will be generated. Therefore plots can be generated by sample_id, group_id or any other metadata item.To get plots for the whole repertoire set to "none"
#' @param color.by Character. Defaults to "isotype". If set to "isotype" bars are colored by the respective IgH chain or in grey for T cells. This can alternatively be set to any column name of the VDJ. This allows coloring clones by their V_gene usage or by GEX clusters
#' @param variant.plot Logical indicating whether to plot the output showing the variants or not.
#' @param text.size Numeric value indicating the size of the text in the plot. Default is 12.
#' @return Returns a nested list. out[[1]] are plots out[[2]] are raw datatables containing also barcode and CDR3 information
#' @export
#' @examples
#'clonal_out <- VDJ_clonal_expansion(VDJ = Platypus::small_vdj,
#'  celltype = "Bcells", clones = 30,subtypes = FALSE, species = "Mouse"
#'  ,treat.incomplete.clones = "exclude"
#'  ,treat.incomplete.cells = "proportional")
#'clonal_out[[1]] #list of plots
#'clonal_out[[2]] #list of source dataframes


VDJ_clonal_expansion <- function(VDJ,
                                 celltype,
                                 clones,
                                 subtypes,
                                 isotypes.to.plot,
                                 species,
                                 treat.incomplete.clones,
                                 treat.incomplete.cells,
                                 group.by,
                                 color.by,
                                 variant.plot,
                                 text.size){

  #Adding compatibility with input naming scheme
  VDJ.matrix <- VDJ
  VDJ.matrix[is.na(VDJ.matrix)] <- 'unknown'
  VDJ <- NULL

  ClonalRank <- NULL
  Counts <- NULL
  sum_counts <- NULL
  Isotype <- NULL
  Color <- NULL
  pasted_variants <- NULL
  isotype <- NULL
  colors <- NULL
  match_ex_crit <- NULL
  clonotype_id <- NULL
  variant <- NULL
  n <- NULL

  #add a color scheme similar to Seurat's
  ggplotColours <- function(n, h = c(0, 360) + 15){
    if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360/n
    grDevices::hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
  }

  if(missing(clones)) clones <- 50

  if(missing(subtypes)) subtypes <- FALSE
  if(missing(isotypes.to.plot)) isotypes.to.plot <- "all"
  if(missing(species)) species <- "Human"
  if(missing(treat.incomplete.cells)) treat.incomplete.cells <- "proportional"
  if(missing(treat.incomplete.clones)) treat.incomplete.clones <- "exclude"
  if(missing(text.size)) text.size <- 12

  if(!treat.incomplete.cells %in% c("exclude", "proportional")){
    stop("Please set treat.incomplete.cells to either 'proportional' or 'exclude'. 'Proportional' will assign cells of a clonotype missing a VDJ chain proportionally to the isotypes present in that clone (Default). 'exclude' will remove all cells missing a VDJ chain and thereby also alter clonotype frequencies")
  }
  if(!treat.incomplete.clones %in% c("exclude", "include")){
    stop("Please set treat.incomplete.clones to either 'include' or 'exclude'. 'include' will show also clones which of which no cell has a VDJ chain. 'exclude' will remove such clones (default)")
  }

  platypus.version <- "v3"

  if(missing(celltype)){
    if(stringr::str_detect(VDJ.matrix$celltype[1], "B")){celltype = "Bcells"
    } else if(stringr::str_detect(VDJ.matrix$celltype[1], "T")){celltype = "Tcells"
    } else {stop("No celltype found in celltype column. celltype column must contain either 'B cells' or 'T cells'")}
  }

  if(missing(group.by)){
    group.by <- "sample_id"}

  if(missing(variant.plot)){
    variant.plot <- FALSE}

  if(group.by == "none"){
    group.by <- "ungroup"
    VDJ.matrix$ungroup <- 1
  } else if(!group.by %in% names(VDJ.matrix) & group.by != "sample_id" & group.by  == "none"){
    stop("Please provide a valid column name of the VDJ.matrix in group.by")
  }

  if(missing(color.by)){
    color.by <- "isotype"}
  if(!color.by %in% names(VDJ.matrix) & color.by != "isotype"){
    stop("Please provide a valid column name of the VDJ.matrix in color.by, or set to 'isotype' for default (also for Tcells)")
  } else if(color.by %in% names(VDJ.matrix) & color.by != "isotype"){
    unique_colors <- ggplotColours(n = length(unique(VDJ.matrix[,color.by])))
    names(unique_colors) <- unique(VDJ.matrix[,color.by])
  }

  if(celltype == "Bcells"){ ####START Bcells

    #check that dataframe is the input
    if(!inherits(VDJ.matrix,"data.frame")){
      stop("Please provide a VDJ dataframe. (VDJ_GEX_matrix_out[[1]]")
    }
    #set up for subsetting by group.by item
    VDJ.matrix[,group.by] <- as.character(VDJ.matrix[,group.by])
    sample.names <- unique(VDJ.matrix[,group.by])
    sample.names[is.na(sample.names)] <- "NONE"
    VDJ.matrix[is.na(VDJ.matrix[,group.by]),group.by] <- "NONE"
    VDJ.matrix.all <- VDJ.matrix

    #Disabled due to the possibility of having low abundance of isotypes of interest. If filtered by isotype, 50 clones may not be enought to get a decent plot.
    #if(clones<1 | clones > 50){stop("Number of clones must be an integer value between 1 and 50")}

    VDJ_per_clone_output_all <- list()
    clones_per_isotype <- list()
    clones_per_isotype_all <-list()
    output_plot <- list()
    variant_df_list <- list()
    if(subtypes == FALSE | color.by != "isotype"){ #if colored by something else we don't care about subtypes...

      if(variant.plot == FALSE){
        for (i in 1:length(sample.names)){
          #subset VDJ.matrix by group.by
          VDJ.matrix <- subset(VDJ.matrix.all, VDJ.matrix.all[,group.by] == sample.names[i])
          #get clonotype frequency table
          clono_freq <- as.data.frame(table(VDJ.matrix$clonotype_id))
          clono_freq <- clono_freq[order(clono_freq$Freq, decreasing = TRUE),]

          #get essential info from VDJ_GEX_matrix
          curr_rep_iso <- VDJ.matrix[,c("barcode","clonotype_id", "VDJ_cgene", "VDJ_cdr3_aa", "VJ_cdr3_aa")]
          names(curr_rep_iso)[3] <- "isotype"

          if(color.by != "isotype"){ #add color info in case its needed
            curr_rep_iso$colors <- as.character(VDJ.matrix[,color.by]) #get as character
            curr_rep_iso$colors[which(is.na(curr_rep_iso$colors))] <- "None" #replace any NAs which would mess up string counting later

            if(inherits(VDJ.matrix[,color.by],"factor")){ #REORDER Depending on class in VDJ.matrix. This is an attempt at conserving existing factor levels such as in the seurat_clusters column
              curr_rep_iso$colors <- ordered(as.factor(curr_rep_iso$colors),levels = c(levels(VDJ.matrix[,color.by]), "None")) #reorder
            } else {
              curr_rep_iso$colors <- ordered(as.factor(curr_rep_iso$colors),levels = c(as.character(unique(VDJ.matrix[,color.by])), "None"))
            }
          }

          if(treat.incomplete.clones == "exclude"){
            clones_to_del <-c()
            for(k in 1:nrow(clono_freq)){
              if(all(curr_rep_iso[which(curr_rep_iso$clonotype_id == clono_freq[k,1]),"isotype"] == "")){ #detect if no VDJC is available in this clonotype
                clones_to_del <- append(clones_to_del, k)
              }
            }
            if(length(clones_to_del > 0)){
              clono_freq <- clono_freq[-clones_to_del,]} #remove these clones entirely
            clono_freq <- clono_freq[order(clono_freq$Freq, decreasing = T),] #reorder, to make sure
          }

          #iterate over clones and get isotype info
          clones_per_isotype <- list()
          for (j in 1:clones){
            curr_clone <- curr_rep_iso[which(curr_rep_iso$clonotype_id == clono_freq[j,1]),]
            if(nrow(curr_clone) > 0){

              if(color.by == "isotype"){

                #str_split and discard potentially second chain to avoid overcounting in the case of a cell having 2 HC with 2 different isotypes
                curr_clone$isotype <- stringr::str_split(curr_clone$isotype, ";", simplify = TRUE)[,1]

                if(treat.incomplete.cells == "proportional" & stringr::str_detect(paste0(curr_clone$isotype,collapse = ";"), pattern = "IG")==TRUE){ #check that there is at least one IG entry
                  props <- table(curr_clone$isotype[which(stringr::str_detect(curr_clone$isotype, pattern = "IG")==TRUE)]) #getting proportions
                  n_total <- nrow(curr_clone) #getting total number of cells in clonotype
                  props <- round(props / sum(props) * n_total,0) #calculating new number of each isotype to match proportions
                  if(sum(props) > n_total){props[which.max(props)] <- props[which.max(props)] - 1
                  } else if(sum(props) < n_total){props[which.min(props)] <- props[which.min(props)] + 1} #catching rounding derived errors.

                  curr_clone$isotype <- rep.int(names(props), props) #new isotype column with the new number of isotypes. ! these are not in the original order !

                } else if (treat.incomplete.cells == "proportional" & stringr::str_detect(paste0(curr_clone$isotype,collapse = ";"), pattern = "IG")==FALSE){ #if no entry is present

                  curr_clone$isotype <- "None"
                }

                clones_per_isotype[[j]] <- data.frame("Counts"=rep(0, 6), "Color"=rep("", 6), "Isotype"=rep("", 6), "ClonalRank"=rep("", 6), "clonotype_id" = rep(curr_clone$clonotype_id[1],6), "VDJ_cdr3_aa" = rep(curr_clone$VDJ_cdr3_aa[which(curr_clone$VDJ_cdr3_aa != "")][1],6), "VJ_cdr3_aa" = rep(curr_clone$VJ_cdr3_aa[which(curr_clone$VJ_cdr3_aa != "")][1],6), "barcode" = rep(paste0(curr_clone$barcode, collapse = ";"),6))  #to maintain clonotype information

                clones_per_isotype[[j]]$Counts[1] <- sum(stringr::str_count(curr_clone$isotype, "IGHG"))
                clones_per_isotype[[j]]$Counts[2] <- sum(stringr::str_count(curr_clone$isotype, "IGHM"))
                clones_per_isotype[[j]]$Counts[3] <- sum(stringr::str_count(curr_clone$isotype, "IGHA"))
                clones_per_isotype[[j]]$Counts[4] <- sum(stringr::str_count(curr_clone$isotype, "IGHD"))
                clones_per_isotype[[j]]$Counts[5] <- sum(stringr::str_count(curr_clone$isotype, "IGHE"))
                clones_per_isotype[[j]]$Counts[6] <- sum(stringr::str_count(curr_clone$isotype, "None"))

                if(color.by == "isotype"){
                  clones_per_isotype[[j]]$Color <- c("green4", "black", "red", "blue", "purple", "gray")
                  clones_per_isotype[[j]]$Isotype <- c("IGHG", "IGHM", "IGHA", "IGHD", "IGHE", "Unknown")
                } else {
                  clones_per_isotype[[j]]$Isotype <- names(which.max(table(curr_clone$colors)))
                }

                clones_per_isotype[[j]]$ClonalRank <- j

              } else if(color.by != "isotype"){
                color_cur_clone <- unique(curr_rep_iso$colors)
                n_color_cur_clone <- length(unique(curr_rep_iso$colors))

                clones_per_isotype[[j]] <- data.frame("Counts"=rep(0, n_color_cur_clone), "Color"= color_cur_clone, "ClonalRank"=rep("", n_color_cur_clone), "clonotype_id" = rep(curr_clone$clonotype_id[1],n_color_cur_clone), "VDJ_cdr3_aa" = rep(curr_clone$VDJ_cdr3_aa[which(curr_clone$VDJ_cdr3_aa != "")][1],n_color_cur_clone), "VJ_cdr3_aa" = rep(curr_clone$VJ_cdr3_aa[which(curr_clone$VJ_cdr3_aa != "")][1],n_color_cur_clone), "barcode" = rep(paste0(curr_clone$barcode, collapse = ";"),n_color_cur_clone))

                for(k in 1:nrow(clones_per_isotype[[j]])){

                  clones_per_isotype[[j]]$Counts[k] <- stringr::str_count(paste0("/",paste0(curr_clone$colors,collapse = "/ /"),"/"), pattern = paste0("/", as.character(clones_per_isotype[[j]]$Color[k]),"/"))

                }
                clones_per_isotype[[j]]$ClonalRank <- j
              } # end not coloring by isotype but someting else
            } #end if(nrow(curr_clone) > 0)
          }
          clones_per_isotype_all[[i]] <- do.call("rbind", clones_per_isotype)

          if(treat.incomplete.cells == "exclude" & color.by == "isotype"){
            rank_raw <- as.data.frame(clones_per_isotype_all[[i]] %>% dplyr::group_by(ClonalRank) %>% dplyr::summarise(sum_counts = sum(Counts)) %>% dplyr::arrange(dplyr::desc(sum_counts)) %>% dplyr::mutate(rank = 1:length(unique(ClonalRank))))
            clones_per_isotype_all[[i]]$ClonalRank_2 <- 0
            for(l in 1:nrow(rank_raw)){
              clones_per_isotype_all[[i]]$ClonalRank_2[which(clones_per_isotype_all[[i]]$ClonalRank == rank_raw$ClonalRank[l])] <- rank_raw$rank[l]
            }
            clones_per_isotype_all[[i]]$ClonalRank <- clones_per_isotype_all[[i]]$ClonalRank_2
            #reorder the dataframe the sum_counts order by decreasing sum, the clonal rank is there to keep clones that have the same count sum together as a group of rows

            message("New ranking based only on present VDJ chains: ")
            message(unique(clones_per_isotype_all[[i]]$clonotype_id))
          }

          #Delete clones not of interest
          if(color.by == "isotype" & isotypes.to.plot[1] != "all"){

            if(!any(isotypes.to.plot %in% unique(clones_per_isotype_all[[i]]$Isotype))){stop("isotype.to.plot input not found in dataframe. Please check if the isotype is spelled correctly")}
            to_del <- c()
            for(k in 1:length(unique(clones_per_isotype_all[[i]]$ClonalRank))){
              if(sum(clones_per_isotype_all[[i]]$Counts[which(clones_per_isotype_all[[i]]$ClonalRank == unique(clones_per_isotype_all[[i]]$ClonalRank)[k] & clones_per_isotype_all[[i]]$Isotype %in% isotypes.to.plot)]) == 0){
                to_del <- append(to_del, unique(clones_per_isotype_all[[i]]$ClonalRank)[k])
              }
            }
            clones_per_isotype_all[[i]] <- subset(clones_per_isotype_all[[i]], !ClonalRank %in% to_del)
            #redistribute clonal ranks

            rank_raw <- as.data.frame(clones_per_isotype_all[[i]] %>% dplyr::group_by(ClonalRank) %>% dplyr::summarise(sum_counts = sum(Counts)) %>% dplyr::arrange(dplyr::desc(sum_counts)) %>% dplyr::mutate(rank = 1:length(unique(ClonalRank))))
            clones_per_isotype_all[[i]]$ClonalRank_2 <- 0
            for(l in 1:nrow(rank_raw)){
              clones_per_isotype_all[[i]]$ClonalRank_2[which(clones_per_isotype_all[[i]]$ClonalRank == rank_raw$ClonalRank[l])] <- rank_raw$rank[l]
            }
            clones_per_isotype_all[[i]]$ClonalRank <- clones_per_isotype_all[[i]]$ClonalRank_2

            message("New ranking based only on selected isotypes: ")
            message(unique(clones_per_isotype_all[[i]]$clonotype_id))
          }

          if(color.by == "isotype"){
            output_plot[[i]] <- ggplot2::ggplot(clones_per_isotype_all[[i]], ggplot2::aes(fill = Isotype, y=Counts, x=ClonalRank)) + 
              ggplot2::geom_bar(stat="identity", width=0.6, color="black") + 
              ggplot2::theme_bw() + 
              ggplot2::scale_fill_manual(values = ggplot2::scale_fill_manual(values = c("IGHG" = "green4", "IGHM" = "black", "IGHA" = "red3", "IGHD"="blue", "IGHE"="purple", "Unknown"="gray"))) + 
              ggplot2::theme_classic() + 
              ggplot2::ggtitle(paste0(i)) + 
              ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5), text = ggplot2::element_text(size = text.size)) + 
              ggplot2::scale_y_continuous(expand = c(0,0)) + ggplot2::scale_x_continuous(expand = c(0,0.5)) + 
              ggplot2::labs(title = sample.names[[i]], x = "Clonal rank", y = "Number of cells")
          } else {
            output_plot[[i]] <- ggplot2::ggplot(clones_per_isotype_all[[i]], ggplot2::aes(fill = Color, y=Counts, x=ClonalRank)) + ggplot2::geom_bar(stat="identity", width=0.6, color="black") + ggplot2::theme_bw() + ggplot2::theme_classic() + ggplot2::ggtitle(paste0(i)) + ggplot2::theme(text = ggplot2::element_text(size = text.size), plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::scale_y_continuous(expand = c(0,0)) + ggplot2::scale_x_continuous(expand = c(0,0.5)) + ggplot2::labs(title = sample.names[[i]], x = "Clonal rank", y = "Number of cells", fill = color.by)  + ggplot2::scale_fill_manual(values = unique_colors,labels=names(unique_colors))

          }

        }
        names(clones_per_isotype_all) <- sample.names
        return(list(output_plot,clones_per_isotype_all))
      }
      if(variant.plot ==TRUE){
        for (i in 1:length(sample.names)){

          message(paste0("Starting sample ", i, "/", length(sample.names)))

          #subset VDJ.matrix by group.by
          VDJ.matrix <- subset(VDJ.matrix.all, VDJ.matrix.all[,group.by] == sample.names[i])

          #Rank clonotypes

          s_id = VDJ.matrix$clonotype_id
          s_id_num = readr::parse_number(s_id)
          s_id_num = sprintf("%05d", s_id_num)
          s_id_num = paste0("clonotype", s_id_num)
          VDJ.matrix$clonotype_id <- s_id_num
          s_id = NULL
          s_id_num = NULL


          VDJ.matrix$clonotype_id = paste0("old", VDJ.matrix$clonotype_id)

          rank_raw <- as.data.frame(VDJ.matrix %>% dplyr::group_by(clonotype_id) %>% dplyr::summarise(n = n()))
          rank_raw <- rank_raw[order(rank_raw$n, rank_raw$clonotype_id, decreasing = TRUE),]
          #rank_raw <- rank_raw[order(rank_raw$n, decreasing = T),]

          rank_raw$new_clonotype <- 1:length(rank_raw$clonotype_id)
          rank_raw$new_clonotype = sprintf("%05d", rank_raw$new_clonotype)
          rank_raw$new_clonotype = paste0("new", rank_raw$new_clonotype)

          for(z in 1:nrow(rank_raw)){
            VDJ.matrix$clonotype_id <- gsub(rank_raw$clonotype_id[z], rank_raw$new_clonotype[z], VDJ.matrix$clonotype_id) #For VDJ
          }

          VDJ.matrix$clonotype_id <- paste0("clonotype", sprintf("%05d", readr::parse_number(VDJ.matrix$clonotype_id)))

          #get clonotype frequency table
          clono_freq <- as.data.frame(table(VDJ.matrix$clonotype_id))
          clono_freq <- clono_freq[order(clono_freq$Freq, decreasing = TRUE),]

          is.trimmed = VDJ.matrix$VJ_sequence_nt_trimmed

          #Define the variants
          if (is.null(is.trimmed) == FALSE){
            VDJ.matrix$pasted_variants = paste(VDJ.matrix$VDJ_sequence_nt_trimmed, VDJ.matrix$VJ_sequence_nt_trimmed, sep = ";")
            message(paste0("Trimmed sequences found for sample ", sample.names[i]))
            message("Variants are obtained as VDJ_sequence_nt_trimmed;VJ_sequence_nt_trimmed")
          }

          if (is.null(is.trimmed) == TRUE){
            VDJ.matrix$pasted_variants = paste(VDJ.matrix$VDJ_sequence_nt_raw, VDJ.matrix$VJ_sequence_nt_raw, sep = ";")
            message(paste0("Trimmed sequences not found for sample ", sample.names[i]))
            message("Variants are obtained as VDJ_sequence_nt_raw;VJ_sequence_nt_raw")
          }

          #get essential info from VDJ_GEX_matrix
          curr_rep_iso <- VDJ.matrix[,c("barcode","clonotype_id", "VDJ_cgene", "pasted_variants")]
          names(curr_rep_iso)[3] <- "isotype"

          #We only care about the main isotypes, not the subgroups
          curr_rep_iso$isotype <- substr(curr_rep_iso$isotype,1,4)

          if(color.by != "isotype"){ #add color info in case its needed
            curr_rep_iso$colors <- as.character(VDJ.matrix[,color.by]) #get as character
            curr_rep_iso$colors[which(is.na(curr_rep_iso$colors))] <- "None" #replace any NAs which would mess up string counting later

            if(inherits(VDJ.matrix[,color.by],"factor")){ #REORDER Depending on class in VDJ.matrix. This is an attempt at conserving existing factor levels such as in the seurat_clusters column
              curr_rep_iso$colors <- ordered(as.factor(curr_rep_iso$colors),levels = c(levels(VDJ.matrix[,color.by]), "None")) #reorder
            } else {
              curr_rep_iso$colors <- ordered(as.factor(curr_rep_iso$colors),levels = c(as.character(unique(VDJ.matrix[,color.by])), "None"))
            }
          }

          if(treat.incomplete.clones == "exclude"){

            clones_to_del <-c()

            for(k in 1:nrow(clono_freq)){
              if(all(curr_rep_iso[which(curr_rep_iso$clonotype_id == clono_freq[k,1]),"isotype"] == "")){ #detect if no VDJC is available in this clonotype
                clones_to_del <- append(clones_to_del, k)
              }
            }
            if(length(clones_to_del > 0)){

              #Formatting numbers inside the function
              s_id_num = clones_to_del
              s_id_num = sprintf("%05d", s_id_num)
              s_id_num = paste0("clonotype", s_id_num)
              clones_to_del <- s_id_num
              s_id = NULL
              s_id_num = NULL

              s_id = curr_rep_iso$clonotype_id
              s_id_num = readr::parse_number(s_id)
              s_id_num = sprintf("%05d", s_id_num)
              s_id_num = paste0("clonotype", s_id_num)
              curr_rep_iso$clonotype_id <- s_id_num
              s_id = NULL
              s_id_num = NULL

              #Delete the clones that are incomplete
              curr_rep_iso$match = curr_rep_iso$clonotype_id %in% clones_to_del
              curr_rep_iso <- subset(curr_rep_iso, subset = match == FALSE)
              curr_rep_iso$match = NULL

              curr_rep_iso= curr_rep_iso[order(curr_rep_iso$clonotype_id),]

              curr_rep_iso$clonotype_id <- paste0("old", curr_rep_iso$clonotype_id)


              #Re rank the clonotypes
              rank_raw <- as.data.frame(curr_rep_iso %>% dplyr::group_by(clonotype_id) %>% dplyr::summarise(n = n()))
              rank_raw <- rank_raw[order(rank_raw$n, rank_raw$clonotype_id, decreasing = TRUE),]
              rank_raw$new_clonotype <- 1:length(rank_raw$clonotype_id)
              rank_raw$new_clonotype = paste0("newclonotype", sprintf("%05d", rank_raw$new_clonotype))

              #Replace in the dataframe the old clonotypes by the new rank
              for(z in 1:nrow(rank_raw)){
                curr_rep_iso$clonotype_id <- gsub(rank_raw$clonotype_id[z], rank_raw$new_clonotype[z], curr_rep_iso$clonotype_id) #For VDJ
              }

              curr_rep_iso <- curr_rep_iso[order(curr_rep_iso$clonotype_id),]


            }
          }

          #Formatting the clonotypes for the function
          s_id = curr_rep_iso$clonotype_id
          s_id_num = readr::parse_number(s_id)
          s_id_num = sprintf("%05d", s_id_num)
          s_id_num = paste0("clonotype", s_id_num)
          curr_rep_iso$clonotype_id <- s_id_num
          s_id = NULL
          s_id_num = NULL

          options(dplyr.summarise.inform = FALSE)

          #Obtain the variant df depending on coloring
          if(color.by == "isotype"){
            curr_rep_iso %>% dplyr::group_by(pasted_variants, clonotype_id, isotype) %>% dplyr::summarise(n=n()) -> variant_df
          }

          if(color.by != "isotype"){
            curr_rep_iso %>% dplyr::group_by(pasted_variants, clonotype_id, colors) %>% dplyr::summarise(n=n()) -> variant_df
          }

          variant_df= variant_df[order(variant_df$clonotype_id,variant_df$n),]


          #Simplify the variant_df to keep only the number of clones desired
          max_row = max(which(variant_df$clonotype_id == paste0("clonotype", sprintf("%05d", clones))))
          variant_df = variant_df[1:max_row,]

          s_id = variant_df$clonotype_id
          s_id_num = readr::parse_number(s_id)
          variant_df$clonotype_id <- s_id_num
          s_id = NULL
          s_id_num = NULL


          if(treat.incomplete.cells == "proportional" & stringr::str_detect(paste0(variant_df$isotype,collapse = ";"), pattern = "IG")==TRUE){ #check that there is at least one IG entry

            for (clone_number in 1:clones){
              subset_clone_variant_with_isotype =  variant_df[which(variant_df$clonotype_id == clone_number & stringr::str_detect(variant_df$isotype, pattern = "IG")==TRUE),]
              subset_clone_variant = variant_df[which(variant_df$clonotype_id == clone_number),]

              variants <- c()

              if ((nrow(subset_clone_variant_with_isotype) < nrow(subset_clone_variant)) & (nrow(subset_clone_variant_with_isotype) >= 1)){

                for (z in 1:nrow(subset_clone_variant_with_isotype)){

                  variants <- c(variants, rep(subset_clone_variant_with_isotype$isotype[z], subset_clone_variant_with_isotype$n[z]))
                }

                proportional_variants = c()
                for (variantes in 1:length(which(variant_df$clonotype_id == clone_number & stringr::str_detect(variant_df$isotype, pattern = "IG")==FALSE))){
                  proportional_variants= c(proportional_variants, sample(variants, 1))
                }

                variant_df[which(variant_df$clonotype_id == clone_number & stringr::str_detect(variant_df$isotype, pattern = "IG")==FALSE),]$isotype = proportional_variants
              } else if ((nrow(subset_clone_variant_with_isotype) < nrow(subset_clone_variant)) & (nrow(subset_clone_variant_with_isotype) == 0)){

                proportional_variants = c()
                proportional_variants = c(proportional_variants, rep("Unknown", nrow(subset_clone_variant)))

                variant_df[which(variant_df$clonotype_id == clone_number & stringr::str_detect(variant_df$isotype, pattern = "IG")==FALSE),]$isotype = proportional_variants
              }

            }
          }

          if(treat.incomplete.cells == "exclude" & color.by == "isotype"){

            variant_df$match_ex_crit <- variant_df$isotype == ""
            variant_df <- subset(variant_df, subset = match_ex_crit == FALSE)
            variant_df$match_ex_crit = NULL

            variant_df$original_clonotypes = variant_df$clonotype_id

            variant_df$clonotype_id = sprintf("%05d", variant_df$clonotype_id)
            variant_df$clonotype_id = paste0("old", variant_df$clonotype_id)


            rank_raw <- as.data.frame(variant_df %>% dplyr::group_by(clonotype_id) %>% dplyr::summarise(n = sum(n)))
            rank_raw <- rank_raw[order(rank_raw$n, rank_raw$clonotype_id, decreasing = TRUE),]

            rank_raw$new_clonotype <- 1:length(rank_raw$clonotype_id)
            rank_raw$new_clonotype = sprintf("%05d", rank_raw$new_clonotype)
            rank_raw$new_clonotype = paste0("new", rank_raw$new_clonotype)

            for(z in 1:nrow(rank_raw)){
              variant_df$clonotype_id <- gsub(rank_raw$clonotype_id[z], rank_raw$new_clonotype[z], variant_df$clonotype_id) #For VDJ
            }

            variant_df$clonotype_id <- readr::parse_number(variant_df$clonotype_id)
            variant_df <- variant_df[order(variant_df$clonotype_id),]

          }

          #Delete clones not of interest
          if(color.by == "isotype" & isotypes.to.plot[1] != "all"){

            if(!any(isotypes.to.plot %in% unique(variant_df$isotype))){stop("isotype.to.plot input not found in dataframe. Please check if the isotype is spelled correctly")}

            variant_df$match_ex_crit <- variant_df$isotype %in% isotypes.to.plot
            variant_df <- subset(variant_df, subset = match_ex_crit == TRUE)
            variant_df$match_ex_crit = NULL

            variant_df$original_clonotypes = variant_df$clonotype_id

            variant_df$clonotype_id = sprintf("%05d", variant_df$clonotype_id)
            variant_df$clonotype_id = paste0("old", variant_df$clonotype_id)


            rank_raw <- as.data.frame(variant_df %>% dplyr::group_by(clonotype_id) %>% dplyr::summarise(n = sum(n)))
            rank_raw <- rank_raw[order(rank_raw$n, rank_raw$clonotype_id, decreasing = TRUE),]

            rank_raw$new_clonotype <- 1:length(rank_raw$clonotype_id)
            rank_raw$new_clonotype = sprintf("%05d", rank_raw$new_clonotype)
            rank_raw$new_clonotype = paste0("new", rank_raw$new_clonotype)

            for(z in 1:nrow(rank_raw)){
              variant_df$clonotype_id <- gsub(rank_raw$clonotype_id[z], rank_raw$new_clonotype[z], variant_df$clonotype_id) #For VDJ
            }

            variant_df$clonotype_id <- readr::parse_number(variant_df$clonotype_id)
            variant_df <- variant_df[order(variant_df$clonotype_id),]


            message("New ranking based only on selected isotypes: ")
            message(paste0(unique(variant_df$clonotype_id),", "))
          }

          variant_df$variant = 1:length(variant_df$clonotype_id)
          variant_df$variant = as.character(variant_df$variant)
          variant_df_list[[i]] = variant_df


          if(color.by == "isotype"){
            output_plot[[i]] <- ggplot2::ggplot(variant_df_list[[i]], ggplot2::aes(x = clonotype_id, y = n, fill = isotype, color = variant)) + ggplot2::geom_bar(stat="identity", width=0.6, color="white") + ggplot2::theme_bw() + ggplot2::scale_fill_manual("Isotype",values = c("IGHG" = "green4", "IGHM" = "black", "IGHA" = "red3", "IGHD"="blue", "IGHE"="purple", "Unknown"="gray")) + ggplot2::theme_classic() + ggplot2::ggtitle(paste0(i)) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::scale_y_continuous(expand = c(0,0)) + ggplot2::scale_x_continuous(expand = c(0,0.5)) + ggplot2::labs(title = sample.names[[i]], x = "Clonal rank", y = "Number of cells")
          } else {
            output_plot[[i]] <- ggplot2::ggplot(variant_df_list[[i]], ggplot2::aes(x = clonotype_id, y = n, fill = colors, color = variant)) + ggplot2::geom_bar(stat="identity", width=0.6, color="white") + ggplot2::scale_fill_manual(values = grDevices::rainbow(n = length(unique(clones_per_isotype_all[[i]]$colors)))) + ggplot2::theme_classic() + ggplot2::ggtitle(paste0(i)) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::scale_y_continuous(expand = c(0,0)) + ggplot2::scale_x_continuous(expand = c(0,0.5)) + ggplot2::labs(title = sample.names[[i]], x = "Clonal rank", y = "Number of cells")

          }
        }

        clones_per_isotype_all = variant_df_list

        names(clones_per_isotype_all) <- sample.names
        names(output_plot) <- sample.names

        return(list(output_plot,clones_per_isotype_all))
      }

    }

    if(subtypes ==TRUE & color.by == "isotype"){

      if(variant.plot == FALSE){
        for (i in 1:length(sample.names)){

          #subset VDJ.matrix by group.by
          VDJ.matrix <- subset(VDJ.matrix.all, VDJ.matrix.all[,group.by] == sample.names[i])


          #get clonotype frequency table
          clono_freq <- as.data.frame(table(VDJ.matrix$clonotype_id))
          clono_freq <- clono_freq[order(clono_freq$Freq, decreasing = TRUE),]

          #get essential info from VDJ_GEX_matrix
          curr_rep_iso <- VDJ.matrix[,c("barcode","clonotype_id", "VDJ_cgene", "VDJ_cdr3_aa", "VJ_cdr3_aa")]
          curr_rep_iso$VDJ_cgene[is.null(curr_rep_iso$VDJ_cgene)] <- "none"
          names(curr_rep_iso)[3] <- "isotype"
          #no substring here, just splitting on a ; to exclude multiple isotypes of one clone
          curr_rep_iso$isotype <- stringr::str_split(curr_rep_iso$isotype, ";", simplify = TRUE)[,1]

          if(treat.incomplete.clones == "exclude"){
            clones_to_del <-c()
            for(k in 1:nrow(clono_freq)){
              if(all(curr_rep_iso[which(curr_rep_iso$clonotype_id == clono_freq[k,1]),"isotype"] == "")){ #detect if no VDJC is available in this clonotype
                clones_to_del <- append(clones_to_del, k)
              }
            }
            if(length(clones_to_del > 0)){
              clono_freq <- clono_freq[-clones_to_del,]} #remove these clones entirely
            clono_freq <- clono_freq[order(clono_freq$Freq, decreasing = TRUE),] #reorder, to make sure
          }


          #iterate over clones and get isotype info
          clones_per_isotype <- list()
          j <- 1
          for (j in 1:clones){
            curr_clone <- curr_rep_iso[which(curr_rep_iso$clonotype_id == clono_freq[j,1]),]
            if(nrow(curr_clone) > 0){

              #str_split and discard potentially second chain to avoid overcounting in the case of a cell having 2 HC with 2 different isotypes
              curr_clone$isotype <- stringr::str_split(curr_clone$isotype, ";", simplify = TRUE)[,1]

              if(treat.incomplete.cells == "proportional" & stringr::str_detect(paste0(curr_clone$isotype,collapse = ";"), pattern = "IG")==TRUE){ #check that there is at least one IG entry
                props <- table(curr_clone$isotype[which(stringr::str_detect(curr_clone$isotype, pattern = "IG")==TRUE)]) #getting proportions
                n_total <- nrow(curr_clone) #getting total number of cells in clonotype
                props <- round(props / sum(props) * n_total,0) #calculating new number of each isotype to match proportions
                if(sum(props) > n_total){props[which.max(props)] <- props[which.max(props)] - 1
                } else if(sum(props) < n_total){props[which.min(props)] <- props[which.min(props)] + 1} #catching rounding derived errors.

                curr_clone$isotype <- rep.int(names(props), props) #new isotype column with the new number of isotypes. ! these are not in the original order !

              } else if (treat.incomplete.cells == "proportional" & stringr::str_detect(paste0(curr_clone$isotype,collapse = ";"), pattern = "IG")==FALSE){ #if no entry is present

                curr_clone$isotype <- "None"
              }

              clones_per_isotype[[j]] <- data.frame("Counts"=rep(0, 14), "Color"=rep("", 14), "Isotype"=rep("", 14), "ClonalRank"=rep("", 14), "clonotype_id" = rep(curr_clone$clonotype_id[1],14), "VDJ_cdr3_aa" = rep(curr_clone$VDJ_cdr3_aa[which(curr_clone$VDJ_cdr3_aa != "")][1],14), "VJ_cdr3_aa" = rep(curr_clone$VJ_cdr3_aa[which(curr_clone$VJ_cdr3_aa != "")][1],14), "barcode" = rep(paste0(curr_clone$barcode, collapse = ";"),14))  #to maintain clonotype information

              clones_per_isotype[[j]]$Counts[1] <- sum(stringr::str_count(curr_clone$isotype, "IGHG1"))
              clones_per_isotype[[j]]$Counts[2] <- sum(stringr::str_count(curr_clone$isotype, "IGHG2"))
              clones_per_isotype[[j]]$Counts[3] <- sum(stringr::str_count(curr_clone$isotype, "IGHG2A"))
              clones_per_isotype[[j]]$Counts[4] <- sum(stringr::str_count(curr_clone$isotype, "IGHG2B"))
              clones_per_isotype[[j]]$Counts[5] <- sum(stringr::str_count(curr_clone$isotype, "IGHG2C"))
              clones_per_isotype[[j]]$Counts[6] <- sum(stringr::str_count(curr_clone$isotype, "IGHG3"))
              clones_per_isotype[[j]]$Counts[7] <- sum(stringr::str_count(curr_clone$isotype, "IGHG4"))
              clones_per_isotype[[j]]$Counts[8] <- sum(stringr::str_count(curr_clone$isotype, "IGHM"))
              clones_per_isotype[[j]]$Counts[9] <- sum(stringr::str_count(curr_clone$isotype, "IGHA"))
              clones_per_isotype[[j]]$Counts[10] <- sum(stringr::str_count(curr_clone$isotype, "IGHA1"))
              clones_per_isotype[[j]]$Counts[11] <- sum(stringr::str_count(curr_clone$isotype, "IGHA2"))
              clones_per_isotype[[j]]$Counts[12] <- sum(stringr::str_count(curr_clone$isotype, "IGHD"))
              clones_per_isotype[[j]]$Counts[13] <- sum(stringr::str_count(curr_clone$isotype, "IGHE"))
              clones_per_isotype[[j]]$Counts[14] <- sum(stringr::str_count(curr_clone$isotype, "None"))



              clones_per_isotype[[j]]$ClonalRank <- j

              if(color.by == "isotype"){
                clones_per_isotype[[j]]$Isotype <- c("IGHG1", "IGHG2", "IGHG2a", "IGHG2b", "IGHG2c", "IGHG3", "IGHG4", "IGHM", "IGHA", "IGHA1", "IGHA2", "IGHD", "IGHE", "Unknown")

                if (species == "Human"){
                  clones_per_isotype[[j]] <- clones_per_isotype[[j]][-c(3, 4, 5, 9), ]
                }

                if (species == "Mouse"){
                  clones_per_isotype[[j]] <- clones_per_isotype[[j]][-c(2, 7, 10, 11), ]
                }

              } else {
                clones_per_isotype[[j]]$Isotype <- names(which.max(table(curr_clone$colors)))
              }
            } #end if(nrow(curr_clone) > 0)
          }
          clones_per_isotype_all[[i]] <- do.call("rbind",clones_per_isotype)

          if(treat.incomplete.cells == "exclude"){
            rank_raw <- as.data.frame(clones_per_isotype_all[[i]] %>% dplyr::group_by(ClonalRank) %>% dplyr::summarise(sum_counts = sum(Counts)) %>% dplyr::arrange(dplyr::desc(sum_counts)) %>% dplyr::mutate(rank = 1:length(unique(ClonalRank))))
            clones_per_isotype_all[[i]]$ClonalRank_2 <- 0
            for(l in 1:nrow(rank_raw)){
              clones_per_isotype_all[[i]]$ClonalRank_2[which(clones_per_isotype_all[[i]]$ClonalRank == rank_raw$ClonalRank[l])] <- rank_raw$rank[l]
            }

            clones_per_isotype_all[[i]]$ClonalRank <- clones_per_isotype_all[[i]]$ClonalRank_2
            #reorder the dataframe the sum_counts order by decreasing sum, the clonal rank is there to keep clones that have the same count sum together as a group of rows

            message("New ranking based only on present HC chains: ")
            message(unique(clones_per_isotype_all[[i]]$clonotype_id))
          }

          #Delete clones not of interest
          if(color.by == "isotype" & isotypes.to.plot[1] != "all"){
            if(!any(isotypes.to.plot %in% unique(clones_per_isotype_all[[i]]$Isotype))){stop("isotype.to.plot input not found in dataframe. Please check if the isotype is spelled correctly")}
            to_del <- c()
            for(k in 1:length(unique(clones_per_isotype_all[[i]]$ClonalRank))){
              if(sum(clones_per_isotype_all[[i]]$Counts[which(clones_per_isotype_all[[i]]$ClonalRank == unique(clones_per_isotype_all[[i]]$ClonalRank)[k] & clones_per_isotype_all[[i]]$Isotype %in% isotypes.to.plot)]) == 0){
                to_del <- append(to_del, unique(clones_per_isotype_all[[i]]$ClonalRank)[k])

              }
            }
            clones_per_isotype_all[[i]] <- subset(clones_per_isotype_all[[i]], !ClonalRank %in% to_del)
            #redistribute clonal ranks

            rank_raw <- as.data.frame(clones_per_isotype_all[[i]] %>% dplyr::group_by(ClonalRank) %>% dplyr::summarise(sum_counts = sum(Counts)) %>% plyr::arrange(dplyr::desc(sum_counts)) %>% dplyr::mutate(rank = 1:length(unique(ClonalRank))))
            clones_per_isotype_all[[i]]$ClonalRank_2 <- 0
            for(l in 1:nrow(rank_raw)){
              clones_per_isotype_all[[i]]$ClonalRank_2[which(clones_per_isotype_all[[i]]$ClonalRank == rank_raw$ClonalRank[l])] <- rank_raw$rank[l]
            }
            clones_per_isotype_all[[i]]$ClonalRank <- clones_per_isotype_all[[i]]$ClonalRank_2

            message("New ranking based only on selected isotypes: ")
            message(unique(clones_per_isotype_all[[i]]$clonotype_id))
          }

          if (species == "Human"){
            output_plot[[i]] <- ggplot2::ggplot(clones_per_isotype_all[[i]], ggplot2::aes(fill = Isotype, y=Counts, x=ClonalRank)) + ggplot2::geom_bar(stat="identity", width=0.6, color="black") + ggplot2::theme_bw() + ggplot2::scale_fill_manual("Isotype", values = c("IGHG1" = "green","IGHG2" = "green3", "IGHG3"="green4", "IGHG4"="darkgreen", "IGHM" = "black", "IGHA1" = "red", "IGHA2"= "red4", "IGHD"="blue", "IGHE"="purple", "Unknown"="gray")) + ggplot2::theme_classic() + ggplot2::ggtitle(sample.names[i]) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::scale_y_continuous(expand = c(0,0)) + ggplot2::ylab("Number of cells") + ggplot2::xlab("Clonal rank") + ggplot2::scale_x_continuous(expand = c(0,0.5))
          }

          if (species == "Mouse"){
            output_plot[[i]] <- ggplot2::ggplot(clones_per_isotype_all[[i]], ggplot2::aes(fill = Isotype, y=Counts, x=ClonalRank)) + ggplot2::geom_bar(stat="identity", width=0.6, color="black") + ggplot2::theme_bw() + ggplot2::scale_fill_manual("Isotype", values = c("IGHG1" = "lightgreen", "IGHG2a"="green", "IGHG2b" = "green3", "IGHG2c"="green4", "IGHG3"="darkgreen", "IGHM" = "black", "IGHA" = "red", "IGHD"="blue", "IGHE"="purple", "Unknown"="gray")) + ggplot2::theme_classic() + ggplot2::ggtitle(sample.names[i]) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::scale_y_continuous(expand = c(0,0)) + ggplot2::ylab("Number of cells") + ggplot2::xlab("Clonal rank") + ggplot2::scale_x_continuous(expand = c(0,0.5))

          }
        }

        names(clones_per_isotype_all) <- sample.names
        return(list(output_plot,clones_per_isotype_all))
      }
      if(variant.plot ==TRUE){
        for (i in 1:length(sample.names)){

          message(paste0("Starting sample ", i, "/", length(sample.names)))

          #subset VDJ.matrix by group.by
          VDJ.matrix <- subset(VDJ.matrix.all, VDJ.matrix.all[,group.by] == sample.names[i])

          #Rank clonotypes

          s_id = VDJ.matrix$clonotype_id
          s_id_num = readr::parse_number(s_id)
          s_id_num = sprintf("%05d", s_id_num)
          s_id_num = paste0("clonotype", s_id_num)
          VDJ.matrix$clonotype_id <- s_id_num
          s_id = NULL
          s_id_num = NULL


          VDJ.matrix$clonotype_id = paste0("old", VDJ.matrix$clonotype_id)

          rank_raw <- as.data.frame(VDJ.matrix %>% dplyr::group_by(clonotype_id) %>% dplyr::summarise(n = n()))
          rank_raw <- rank_raw[order(rank_raw$n, rank_raw$clonotype_id, decreasing = T),]

          rank_raw$new_clonotype <- 1:length(rank_raw$clonotype_id)
          rank_raw$new_clonotype = sprintf("%05d", rank_raw$new_clonotype)
          rank_raw$new_clonotype = paste0("new", rank_raw$new_clonotype)

          for(z in 1:nrow(rank_raw)){
            VDJ.matrix$clonotype_id <- gsub(rank_raw$clonotype_id[z], rank_raw$new_clonotype[z], VDJ.matrix$clonotype_id) #For VDJ
          }

          VDJ.matrix$clonotype_id <- paste0("clonotype", sprintf("%05d", readr::parse_number(VDJ.matrix$clonotype_id)))

          #get clonotype frequency table
          clono_freq <- as.data.frame(table(VDJ.matrix$clonotype_id))
          clono_freq <- clono_freq[order(clono_freq$Freq, decreasing = TRUE),]

          is.trimmed = VDJ.matrix$VJ_sequence_nt_trimmed

          #Define the variants
          if (is.null(is.trimmed) == FALSE){
            VDJ.matrix$pasted_variants = paste(VDJ.matrix$VDJ_sequence_nt_trimmed, VDJ.matrix$VJ_sequence_nt_trimmed, sep = ";")
            message(paste0("Trimmed sequences found for sample ", sample.names[i]))
            message("Variants are obtained as VDJ_sequence_nt_trimmed;VJ_sequence_nt_trimmed")
          }

          if (is.null(is.trimmed) == TRUE){
            VDJ.matrix$pasted_variants = paste(VDJ.matrix$VDJ_sequence_nt_raw, VDJ.matrix$VJ_sequence_nt_raw, sep = ";")
            message(paste0("Trimmed sequences not found for sample ", sample.names[i]))
            message("Variants are obtained as VDJ_sequence_nt_raw;VJ_sequence_nt_raw")
          }

          #get essential info from VDJ_GEX_matrix
          #get essential info from VDJ_GEX_matrix
          curr_rep_iso <- VDJ.matrix[,c("barcode","clonotype_id", "VDJ_cgene", "pasted_variants")]
          names(curr_rep_iso)[3] <- "isotype"

          #no substring here, just splitting on a ; to exclude multiple isotypes of one clone
          curr_rep_iso$isotype <- stringr::str_split(curr_rep_iso$isotype, ";", simplify = TRUE)[,1]



          if(treat.incomplete.clones == "exclude"){

            clones_to_del <-c()

            for(k in 1:nrow(clono_freq)){
              if(all(curr_rep_iso[which(curr_rep_iso$clonotype_id == clono_freq[k,1]),"isotype"] == "")){ #detect if no VDJC is available in this clonotype
                clones_to_del <- append(clones_to_del, k)
              }
            }
            if(length(clones_to_del > 0)){

              #Formatting numbers inside the function
              s_id_num = clones_to_del
              s_id_num = sprintf("%05d", s_id_num)
              s_id_num = paste0("clonotype", s_id_num)
              clones_to_del <- s_id_num
              s_id = NULL
              s_id_num = NULL

              s_id = curr_rep_iso$clonotype_id
              s_id_num = readr::parse_number(s_id)
              s_id_num = sprintf("%05d", s_id_num)
              s_id_num = paste0("clonotype", s_id_num)
              curr_rep_iso$clonotype_id <- s_id_num
              s_id = NULL
              s_id_num = NULL

              #Delete the clones that are incomplete
              curr_rep_iso$match = curr_rep_iso$clonotype_id %in% clones_to_del
              curr_rep_iso <- subset(curr_rep_iso, subset = match == FALSE)
              curr_rep_iso$match = NULL

              curr_rep_iso= curr_rep_iso[order(curr_rep_iso$clonotype_id),]

              curr_rep_iso$clonotype_id <- paste0("old", curr_rep_iso$clonotype_id)


              #Re rank the clonotypes
              rank_raw <- as.data.frame(curr_rep_iso %>% dplyr::group_by(clonotype_id) %>% dplyr::summarise(n = n()))
              rank_raw <- rank_raw[order(rank_raw$n, rank_raw$clonotype_id, decreasing = TRUE),]
              rank_raw$new_clonotype <- 1:length(rank_raw$clonotype_id)
              rank_raw$new_clonotype = paste0("newclonotype", sprintf("%05d", rank_raw$new_clonotype))

              #Replace in the dataframe the old clonotypes by the new rank
              for(z in 1:nrow(rank_raw)){
                curr_rep_iso$clonotype_id <- gsub(rank_raw$clonotype_id[z], rank_raw$new_clonotype[z], curr_rep_iso$clonotype_id) #For VDJ
              }

              curr_rep_iso <- curr_rep_iso[order(curr_rep_iso$clonotype_id),]


            }
          }

          #Formatting the clonotypes for the function
          s_id = curr_rep_iso$clonotype_id
          s_id_num = readr::parse_number(s_id)
          s_id_num = sprintf("%05d", s_id_num)
          s_id_num = paste0("clonotype", s_id_num)
          curr_rep_iso$clonotype_id <- s_id_num
          s_id = NULL
          s_id_num = NULL

          options(dplyr.summarise.inform = FALSE)

          curr_rep_iso %>% dplyr::group_by(pasted_variants, clonotype_id, isotype) %>% dplyr::summarise(n=n()) -> variant_df

          variant_df= variant_df[order(variant_df$clonotype_id,variant_df$n),]

          #Simplify the variant_df to keep only the number of clones desired
          max_row = max(which(variant_df$clonotype_id == paste0("clonotype", sprintf("%05d", clones))))
          variant_df = variant_df[1:max_row,]

          s_id = variant_df$clonotype_id
          s_id_num = readr::parse_number(s_id)
          variant_df$clonotype_id <- s_id_num
          s_id = NULL
          s_id_num = NULL

          if(treat.incomplete.cells == "proportional" & stringr::str_detect(paste0(variant_df$isotype,collapse = ";"), pattern = "IG")==TRUE){ #check that there is at least one IG entry

            for (clone_number in 1:clones){
              subset_clone_variant_with_isotype =  variant_df[which(variant_df$clonotype_id == clone_number & stringr::str_detect(variant_df$isotype, pattern = "IG")==TRUE),]
              subset_clone_variant = variant_df[which(variant_df$clonotype_id == clone_number),]

              variants <- c()

              if ((nrow(subset_clone_variant_with_isotype) < nrow(subset_clone_variant)) & (nrow(subset_clone_variant_with_isotype) >= 1)){

                for (z in 1:nrow(subset_clone_variant_with_isotype)){

                  variants <- c(variants, rep(subset_clone_variant_with_isotype$isotype[z], subset_clone_variant_with_isotype$n[z]))
                }

                proportional_variants = c()
                for (variantes in 1:length(which(variant_df$clonotype_id == clone_number & stringr::str_detect(variant_df$isotype, pattern = "IG")==FALSE))){
                  proportional_variants= c(proportional_variants, sample(variants, 1))
                }

                variant_df[which(variant_df$clonotype_id == clone_number & stringr::str_detect(variant_df$isotype, pattern = "IG")==FALSE),]$isotype = proportional_variants
              } else if ((nrow(subset_clone_variant_with_isotype) < nrow(subset_clone_variant)) & (nrow(subset_clone_variant_with_isotype) == 0)){

                proportional_variants = c()
                proportional_variants = c(proportional_variants, rep("Unknown", nrow(subset_clone_variant)))

                variant_df[which(variant_df$clonotype_id == clone_number & stringr::str_detect(variant_df$isotype, pattern = "IG")==FALSE),]$isotype = proportional_variants
              }

            }
          }

          if(treat.incomplete.cells == "exclude" & color.by == "isotype"){

            variant_df$match_ex_crit <- variant_df$isotype == ""
            variant_df <- subset(variant_df, subset = match_ex_crit == FALSE)
            variant_df$match_ex_crit = NULL

            variant_df$original_clonotypes = variant_df$clonotype_id

            variant_df$clonotype_id = sprintf("%05d", variant_df$clonotype_id)
            variant_df$clonotype_id = paste0("old", variant_df$clonotype_id)


            rank_raw <- as.data.frame(variant_df %>% dplyr::group_by(clonotype_id) %>% dplyr::summarise(n = sum(n)))
            rank_raw <- rank_raw[order(rank_raw$n, rank_raw$clonotype_id, decreasing = TRUE),]

            rank_raw$new_clonotype <- 1:length(rank_raw$clonotype_id)
            rank_raw$new_clonotype = sprintf("%05d", rank_raw$new_clonotype)
            rank_raw$new_clonotype = paste0("new", rank_raw$new_clonotype)

            for(z in 1:nrow(rank_raw)){
              variant_df$clonotype_id <- gsub(rank_raw$clonotype_id[z], rank_raw$new_clonotype[z], variant_df$clonotype_id) #For VDJ
            }

            variant_df$clonotype_id <- readr::parse_number(variant_df$clonotype_id)
            variant_df <- variant_df[order(variant_df$clonotype_id),]

          }

          if(color.by == "isotype" & isotypes.to.plot[1] != "all"){

            if(!any(isotypes.to.plot %in% unique(variant_df$isotype))){stop("isotype.to.plot input not found in dataframe. Please check if the isotype is spelled correctly")}

            variant_df$match_ex_crit <- variant_df$isotype %in% isotypes.to.plot
            variant_df <- subset(variant_df, subset = match_ex_crit == TRUE)
            variant_df$match_ex_crit = NULL

            variant_df$original_clonotypes = variant_df$clonotype_id

            variant_df$clonotype_id = sprintf("%05d", variant_df$clonotype_id)
            variant_df$clonotype_id = paste0("old", variant_df$clonotype_id)


            rank_raw <- as.data.frame(variant_df %>% dplyr::group_by(clonotype_id) %>% dplyr::summarise(n = sum(n)))
            rank_raw <- rank_raw[order(rank_raw$n, rank_raw$clonotype_id, decreasing = TRUE),]

            rank_raw$new_clonotype <- 1:length(rank_raw$clonotype_id)
            rank_raw$new_clonotype = sprintf("%05d", rank_raw$new_clonotype)
            rank_raw$new_clonotype = paste0("new", rank_raw$new_clonotype)

            for(z in 1:nrow(rank_raw)){
              variant_df$clonotype_id <- gsub(rank_raw$clonotype_id[z], rank_raw$new_clonotype[z], variant_df$clonotype_id) #For VDJ
            }

            variant_df$clonotype_id <- readr::parse_number(variant_df$clonotype_id)
            variant_df <- variant_df[order(variant_df$clonotype_id),]


            message("New ranking based only on selected isotypes: ")
            message(paste0(unique(variant_df$clonotype_id),", "))
          }

          variant_df$variant = 1:length(variant_df$clonotype_id)
          variant_df$variant = as.character(variant_df$variant)
          variant_df_list[[i]] = variant_df

          #Delete clones not of interest

          if (species == "Human"){
            output_plot[[i]] <- ggplot2::ggplot(variant_df_list[[i]], ggplot2::aes(x = clonotype_id, y = n, fill = isotype, color = variant)) + 
              ggplot2::geom_bar(stat="identity", width=0.6, color="white") + 
              ggplot2::theme_bw() + 
              ggplot2::scale_fill_manual("Isotype", values = c("IGHG1" = "green","IGHG2" = "green3", "IGHG3"="green4", "IGHG4"="darkgreen", "IGHM" = "black", "IGHA1" = "red", "IGHA2"= "red4", "IGHD"="blue", "IGHE"="purple", "Unknown"="gray")) + 
              ggplot2::theme_classic() + 
              ggplot2::ggtitle(paste0(i)) + 
              ggplot2::scale_y_continuous(expand = c(0,0)) + 
              ggplot2::scale_x_continuous(expand = c(0,0.5)) + 
              ggplot2::labs(title = sample.names[[i]], x = "Clonal rank", y = "Number of cells") +
              ggplot2::theme(text = ggplot2::element_text(size = 30),
                             plot.title = ggplot2::element_text(hjust = 0.5))

          }

          if (species == "Mouse"){
            output_plot[[i]] <- ggplot2::ggplot(variant_df_list[[i]], ggplot2::aes(x = clonotype_id, y = n, fill = isotype, color = variant)) + 
              ggplot2::geom_bar(stat="identity", width=0.6, color="white") + 
              ggplot2::theme_bw() + 
              ggplot2::scale_fill_manual("Isotype", values = c("IGHG1" = "lightgreen", "IGHG2A"="green", "IGHG2B" = "green3", "IGHG2C"="green4", "IGHG3"="darkgreen", "IGHM" = "black", "IGHA" = "#fb6a4a", "IGHD"="#1874CD", "IGHE"="#B452CD", "Unknown"="gray")) + 
              ggplot2::theme_classic() + 
              ggplot2::ggtitle(paste0(i)) + 
              ggplot2::scale_y_continuous(expand = c(0,0)) + 
              ggplot2::scale_x_continuous(expand = c(0,0.5)) + 
              ggplot2::labs(title = sample.names[[i]], x = "Clonal rank", y = "Number of cells") +
              ggplot2::theme(text = ggplot2::element_text(size = 30),
                             plot.title = ggplot2::element_text(hjust = 0.5))


          }
        }

        clones_per_isotype_all = variant_df_list

        names(clones_per_isotype_all) <- sample.names
        names(output_plot) <- sample.names
        return(list(output_plot,clones_per_isotype_all))
      }


    }

  }else if(celltype == "Tcells"){ ####START T cells

    #check that dataframe is the input
    if(!inherits(VDJ.matrix,"data.frame")){
      stop("Please provide a VDJ matrix dataframe (VDJ_GEX_matrix_output[[1]])")
    }

    #set up for subsetting by group.by item
    VDJ.matrix[,group.by] <- as.character(VDJ.matrix[,group.by])
    sample.names <- unique(VDJ.matrix[,group.by])
    sample.names[is.na(sample.names)] <- "NONE"
    VDJ.matrix[is.na(VDJ.matrix[,group.by]),group.by] <- "NONE"
    VDJ.matrix.all <- VDJ.matrix

    if(clones<1 | clones>50){stop("Number of clones must be an integer value between 1 and 50")}

    VDJ_per_clone_output_all <- list()
    clones_per_isotype <- list()
    clones_per_isotype_all <-list()
    output_plot <- list()

    for (i in 1:length(sample.names)){
      #subset VDJ.matrix by group.by
      VDJ.matrix <- subset(VDJ.matrix.all, VDJ.matrix.all[,group.by] == sample.names[i])
      #get clonotype frequency table
      clono_freq <- as.data.frame(table(VDJ.matrix$clonotype_id))
      clono_freq <- clono_freq[order(clono_freq$Freq, decreasing = TRUE),]

      #get essential info from VDJ_GEX_matrix
      curr_rep_iso <- VDJ.matrix[,c("barcode","clonotype_id", "VDJ_cdr3_aa", "VJ_cdr3_aa")]

      if(color.by[1] != "isotype"){ #add color info in case its needed
        curr_rep_iso$colors <- as.character(VDJ.matrix[,color.by]) #get as character
        curr_rep_iso$colors[which(is.na(curr_rep_iso$colors))] <- "None" #replace any NAs which would mess up string counting later

        if(inherits(VDJ.matrix[,color.by],"factor")){ #REORDER Depending on class in VDJ.matrix. This is an attempt at conserving existing factor levels such as in the seurat_clusters column
          curr_rep_iso$colors <- ordered(as.factor(curr_rep_iso$colors),levels = c(levels(VDJ.matrix[,color.by]), "None")) #reorder
        } else {
          curr_rep_iso$colors <- ordered(as.factor(curr_rep_iso$colors),levels = c(as.character(unique(VDJ.matrix[,color.by])), "None"))
        }

      } else{
        curr_rep_iso$colors <- "none"
      }

      curr_rep_iso$isotype <- "none"

      #iterate over clones and get isotype info
      clones_per_isotype <- list()
      for (j in 1:clones){
        curr_clone <- curr_rep_iso[which(curr_rep_iso$clonotype_id == clono_freq[j,1]),]
        if(nrow(curr_clone) > 0){

          color_cur_clone <- unique(curr_clone$colors)
          n_color_cur_clone <- length(unique(curr_clone$colors))

          clones_per_isotype[[j]] <- data.frame("Counts"=rep(0, n_color_cur_clone), "Color"= color_cur_clone, "ClonalRank"=rep("", n_color_cur_clone), "clonotype_id" = rep(curr_clone$clonotype_id[1],n_color_cur_clone), "VDJ_cdr3_aa" = rep(curr_clone$VDJ_cdr3_aa[which(curr_clone$VDJ_cdr3_aa != "")][1],n_color_cur_clone), "VJ_cdr3_aa" = rep(curr_clone$VJ_cdr3_aa[which(curr_clone$VJ_cdr3_aa != "")][1],n_color_cur_clone), "barcode" = rep(paste0(curr_clone$barcode, collapse = ";"),n_color_cur_clone))


          if(color.by[1] == "isotype"){
            clones_per_isotype[[j]]$Color <- c("None")
            clones_per_isotype[[j]]$Counts <- nrow(curr_clone)

          } else {

            for(k in 1:nrow(clones_per_isotype[[j]])){
              clones_per_isotype[[j]]$Counts[k] <- stringr::str_count(paste0("/",paste0(curr_clone$colors,collapse = "/ /"),"/"), pattern = paste0("/",as.character(clones_per_isotype[[j]]$Color[k]), "/"))
            }
          }

          clones_per_isotype[[j]]$ClonalRank <- j

        } #end if(nrow(curr_clone) > 0)
      }
      clones_per_isotype_all[[i]] <- do.call("rbind", clones_per_isotype)
      if(color.by[1] == "isotype"){
        output_plot[[i]] <- ggplot2::ggplot(clones_per_isotype_all[[i]], ggplot2::aes(fill = Color, y=Counts, x=ClonalRank)) + ggplot2::geom_bar(stat="identity", width=0.6, color="black") + ggplot2::theme_bw() + ggplot2::scale_fill_manual(values = c("gray80")) + ggplot2::theme_classic() + ggplot2::ggtitle(paste0(i)) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::scale_y_continuous(expand = c(0,0)) + ggplot2::scale_x_continuous(expand = c(0,0.5)) + ggplot2::labs(title = sample.names[[i]], x = "Clonal rank", y = "Number of cells")
      } else {
        output_plot[[i]] <- ggplot2::ggplot(clones_per_isotype_all[[i]], ggplot2::aes(fill = Color, y=Counts, x=ClonalRank)) + ggplot2::geom_bar(stat="identity", width=0.6, color="black") + ggplot2::theme_bw() + ggplot2::theme_classic() + ggplot2::ggtitle(paste0(i)) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::scale_y_continuous(expand = c(0,0)) + ggplot2::scale_x_continuous(expand = c(0,0.5)) + ggplot2::labs(title = sample.names[[i]], x = "Clonal rank", y = "Number of cells", fill = color.by)+ ggplot2::scale_fill_manual(values = grDevices::rainbow(n = length(unique(clones_per_isotype_all[[i]]$Color))))

      }
    }
    names(clones_per_isotype_all) <- sample.names
    return(list(output_plot,clones_per_isotype_all))
  }
}

Try the Platypus package in your browser

Any scripts or data that you put into this service are public.

Platypus documentation built on Oct. 18, 2024, 5:08 p.m.