Nothing
#'V(D)J gene usage stacked barplots
#'
#' @description Produces a stacked barplot with the fraction of the most frequently used IgH and IgK/L Vgenes. This function can be used in combination with the VDJ_Vgene_usage_barplot to vizualize V gene usage per sample and among samples.
#' @param VDJ Either (for platypus version "v2") output from VDJ_analyze function. This should be a list of clonotype dataframes, with each list element corresponding to a single VDJ repertoire, OR (for platypus version "v3") the the VDJ matrix output of the VDJ_GEX_matrix() function (normally VDJ.GEX.matrix.output[[1]])
#' @param group.by Character. Defaults to "sample_id". Column name of VDJ to group plot by.
#' @param HC.gene.number Numeric value indicating the top genes to be dispayed. If this number is higher than the total number of unique HC V genes in the VDJ repertoire, then this number is equal to the number of unique HC V genes.
#' @param Fraction.HC Numeric value indicating the minimum fraction of clones expressing a particular HC V gene. If the usage of a particular gene is below this value, then this gene is excluded. If the usage of a particular gene is above this value even in one sample, then this gene is included in the analysis. Default value is set to 0, thus all genes are selected.
#' @param LC.Vgene Logical indicating whether to make a barplot of the LC V gene distribution. Default is set to FALSE.
#' @param LC.gene.number Numeric value indicating the top genes to be dispayed. If this number is higher than the total number of unique LC V genes in the VDJ repertoire, then this number is equal to the number of unique LC V genes.
#' @param Fraction.LC Numeric value indicating the minimum fraction of clones expressing a particular LC V gene. If the usage of a particular gene is below this value, then this gene is excluded. If the usage of a particular gene is above this value even in one sample, then this gene is included in the analysis. Default value is set to 0, thus all genes are selected.
#' @param platypus.version Set according to input format to either "v2" or "v3". Defaults to "v3"
#' @param is.bulk logical value indicating whether the VDJ input was generated from bulk-sequencing data using the bulk_to_vgm function. If is.bulk = T, the VDJ_Vgene_usage_stacked_barplot function is compatible for use with bulk data. Defaults to False (F).
#' @return Returns a list of ggplot objects which show the stacked distribution of IgH and IgK/L V genes for the most used V genes. Returns an empty plot if the Fraction.HC or Fraction.LC that were selected were too high, resulting in the exclusion of all the genes.
#' @export
#' @examples
#' example.vdj.vgene_usage <- VDJ_Vgene_usage_stacked_barplot(
#' VDJ = Platypus::small_vdj, LC.Vgene = TRUE
#' ,HC.gene.number = 15, Fraction.HC = 1, platypus.version = "v3")
#'
VDJ_Vgene_usage_stacked_barplot <- function(VDJ,
group.by,
HC.gene.number,
Fraction.HC,
LC.Vgene,
LC.gene.number,
Fraction.LC,
platypus.version,
is.bulk){
Vgene <- NULL
Percentage <- NULL
VDJ_chain_count <- NULL
VJ_chain_count <- NULL
sample_id <- NULL
Sample <- NULL
Frequency <- NULL
if(missing(is.bulk)) is.bulk <- FALSE
if (missing(Fraction.HC)) Fraction.HC <- 0
if (missing(HC.gene.number)) HC.gene.number <- 10
if (missing(LC.Vgene)) LC.Vgene <- FALSE
if (missing(Fraction.LC)) Fraction.LC <- 0
if (missing(LC.gene.number)) LC.gene.number <- 10
#naming compatibility
VDJ.matrix <- VDJ
VDJ <- NULL
Vgene_usage_plot <- list()
HC_Vgene_usage <- list()
HC_Vgene_usage_top <- list()
HC_Vgene_usage_fraction <- list()
if(missing(platypus.version)) platypus.version <- "v3"
if(platypus.version == "v2"){
clonotype.list <- VDJ.matrix
for (i in 1:length(clonotype.list)){
#Calculate the V gene usage frequencies for each sample on the clonal lebel
HC_Vgene_usage[[i]] <- as.data.frame(table(clonotype.list[[i]]$HC_vgene))
colnames(HC_Vgene_usage[[i]]) <- c("Vgene", "Frequency")
for (j in 1:nrow(HC_Vgene_usage[[i]])){
HC_Vgene_usage[[i]]$Percentage[j] <- HC_Vgene_usage[[i]]$Frequency[j]/sum(HC_Vgene_usage[[i]]$Frequency)*100
}
#Rank based on most used V gene per sample
ranks <- order(-HC_Vgene_usage[[i]]$Frequency)
HC_Vgene_usage[[i]] <- HC_Vgene_usage[[i]][ranks,]
#Find the genes which are used more than Fraction.HC of the time
HC_Vgene_usage_fraction[[i]] <- HC_Vgene_usage[[i]][which(HC_Vgene_usage[[i]]$Percentage > Fraction.HC),]
}
#Find the union of unique gene names between all the samples
HC_Vgene_names <- do.call("rbind",HC_Vgene_usage_fraction)
HC_Vgene_names <- as.vector(unique(HC_Vgene_names$Vgene))
#Filter out all those genes that are used less than the Fraction.HC in all samples
for(i in 1:length(clonotype.list)){
HC_Vgene_usage[[i]] <- HC_Vgene_usage[[i]][HC_Vgene_usage[[i]]$Vgene %in% HC_Vgene_names,]
}
#Determine the mean gene usage between the samples
HC_Vgene_usage_all <- do.call("rbind", HC_Vgene_usage)
unique_Vgene <- unique(HC_Vgene_usage_all$Vgene)
HC_Vgene_usage_all_unique <- data.frame("Vgene"= rep("", length(unique_Vgene)), "Frequency"= rep(0, length(unique_Vgene)))
HC_Vgene_usage_all_unique$Vgene <- unique_Vgene
for (i in 1:length(unique_Vgene)){
HC_Vgene_usage_all_unique[i,"Percentage"] <- mean(HC_Vgene_usage_all[which(HC_Vgene_usage_all$Vgene == HC_Vgene_usage_all_unique$Vgene[i]),]$Percentage)
}
# Rank the V genes based on the most used ones
ranks <- order(-HC_Vgene_usage_all_unique$Frequency)
HC_Vgene_usage_all_unique <- HC_Vgene_usage_all_unique[ranks,]
top_Vgenes <- HC_Vgene_usage_all_unique$Vgene[1:HC.gene.number]
for(i in 1:length(clonotype.list)){
HC_Vgene_usage_top[[i]] <- data.frame("Vgene"=rep("", length(top_Vgenes)), "Frequency"=rep(0, length(top_Vgenes)))
HC_Vgene_usage_top[[i]]$Vgene <- top_Vgenes
for (j in 1:nrow(HC_Vgene_usage_top[[i]])){
if (HC_Vgene_usage_top[[i]]$Vgene[j] %in% HC_Vgene_usage[[i]]$Vgene ==TRUE){
HC_Vgene_usage_top[[i]]$Frequency[j] <- HC_Vgene_usage[[i]][which(HC_Vgene_usage[[i]]$Vgene == paste0(HC_Vgene_usage_top[[i]]$Vgene[j])),]$Frequency
}
}
for (j in 1:nrow(HC_Vgene_usage_top[[i]])){
HC_Vgene_usage_top[[i]]$Percentage[j] <- HC_Vgene_usage_top[[i]]$Frequency[j]/sum(HC_Vgene_usage_top[[i]]$Frequency)*100
}
}
#Assign the sample id
for (i in 1:length(HC_Vgene_usage_top)){
HC_Vgene_usage_top[[i]]$Sample <- as.character(i)
}
#And then bind together for plotting
plotting_df <- do.call("rbind",HC_Vgene_usage_top)
#Take out the NAs introduced because of the HC.gene.number and the Fraction.HC.
plotting_df <- plotting_df[!is.na(plotting_df$Vgene), ]
Vgene_usage_plot[[1]] <- ggplot2::ggplot(plotting_df, ggplot2::aes(fill = Vgene, y=Frequency, x=Sample)) +
ggplot2::geom_bar(position="fill", stat="identity", color="black", width = 0.7) +
ggplot2::theme_bw() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggplot2::ylab("% of unique clones") + ggplot2::scale_y_continuous(expand = c(0,0))
if(LC.Vgene ==TRUE){
LC_Vgene_usage <- list()
LC_Vgene_usage_top <- list()
LC_Vgene_usage_fraction <- list()
for (i in 1:length(clonotype.list)){
LC_Vgene_usage[[i]] <- as.data.frame(table(clonotype.list[[i]]$LC_vgene))
colnames(LC_Vgene_usage[[i]]) <- c("Vgene", "Frequency")
for (j in 1:nrow(LC_Vgene_usage[[i]])){
LC_Vgene_usage[[i]]$Percentage[j] <- LC_Vgene_usage[[i]]$Frequency[j]/sum(LC_Vgene_usage[[i]]$Frequency)*100
}
#Rank based on most used V gene
ranks <- order(-LC_Vgene_usage[[i]]$Frequency)
LC_Vgene_usage[[i]] <- LC_Vgene_usage[[i]][ranks,]
LC_Vgene_usage_fraction[[i]] <- LC_Vgene_usage[[i]][which(LC_Vgene_usage[[i]]$Percentage > Fraction.LC),]
}
#Find the union of unique gene names between all the samples
LC_Vgene_names <- do.call("rbind",LC_Vgene_usage_fraction)
LC_Vgene_names <- as.vector(unique(LC_Vgene_names$Vgene))
#Filter out all those genes that are used less than the Fraction.HC in all samples
for(i in 1:length(clonotype.list)){
LC_Vgene_usage[[i]] <- LC_Vgene_usage[[i]][LC_Vgene_usage[[i]]$Vgene %in% LC_Vgene_names,]
}
LC_Vgene_usage_all <- do.call("rbind", LC_Vgene_usage)
unique_Vgene <- unique(LC_Vgene_usage_all$Vgene)
LC_Vgene_usage_all_unique <- data.frame("Vgene"= rep("", length(unique_Vgene)), "Frequency"= rep(0, length(unique_Vgene)))
LC_Vgene_usage_all_unique$Vgene <- unique_Vgene
for (i in 1:length(unique_Vgene)){
LC_Vgene_usage_all_unique[i,"Percentage"] <- mean(LC_Vgene_usage_all[which(LC_Vgene_usage_all$Vgene == LC_Vgene_usage_all_unique$Vgene[i]),]$Percentage)
}
# Rank the V genes based on the most used ones
ranks <- order(-LC_Vgene_usage_all_unique$Frequency)
LC_Vgene_usage_all_unique <- LC_Vgene_usage_all_unique[ranks,]
top_Vgenes <- LC_Vgene_usage_all_unique$Vgene[1:LC.gene.number]
for(i in 1:length(clonotype.list)){
LC_Vgene_usage_top[[i]] <- data.frame("Vgene"=rep("", length(top_Vgenes)), "Frequency"=rep(0, length(top_Vgenes)))
LC_Vgene_usage_top[[i]]$Vgene <- top_Vgenes
for (j in 1:nrow(LC_Vgene_usage_top[[i]])){
if (LC_Vgene_usage_top[[i]]$Vgene[j] %in% LC_Vgene_usage[[i]]$Vgene ==TRUE){
LC_Vgene_usage_top[[i]]$Frequency[j] <- LC_Vgene_usage[[i]][which(LC_Vgene_usage[[i]]$Vgene == paste0(LC_Vgene_usage_top[[i]]$Vgene[j])),]$Frequency
}
}
for (j in 1:nrow(LC_Vgene_usage_top[[i]])){
LC_Vgene_usage_top[[i]]$Percentage[j] <- LC_Vgene_usage_top[[i]]$Frequency[j]/sum(LC_Vgene_usage_top[[i]]$Frequency)*100
}
}
#Assign the sample id
for (i in 1:length(LC_Vgene_usage_top)){
LC_Vgene_usage_top[[i]]$Sample <- as.character(i)
}
#And then bind together
plotting_df <- do.call("rbind",LC_Vgene_usage_top)
plotting_df <- plotting_df[!is.na(plotting_df$Vgene), ]
Vgene_usage_plot[[1]] <- ggplot2::ggplot(plotting_df, ggplot2::aes(fill = Vgene, y=Frequency, x=Sample)) +
ggplot2::geom_bar(position="fill", stat="identity", color="black", width = 0.7) +
ggplot2::theme_bw() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggplot2::ylab("% of unique clones") + ggplot2::scale_y_continuous(expand = c(0,0))
}
return(Vgene_usage_plot)
} else if(platypus.version == "v3"){
if(is.bulk == FALSE){
#filtering for max 1VDJ 1VJ chain
VDJ.matrix <- subset(VDJ.matrix, VDJ_chain_count == 1 & VJ_chain_count == 1)
}
if(missing(group.by)) group.by <- "sample_id"
if(group.by != "sample_id"){
if(group.by %in% names(VDJ.matrix)){
VDJ.matrix$sample_id <- as.character(VDJ.matrix[,group.by])
if(any(is.na(VDJ.matrix$sample_id)) == TRUE){
VDJ.matrix <- VDJ.matrix[!is.na(VDJ.matrix$sample_id),]
warning(paste0("Filtered out cells with 'NA' in grouping column"))
}
message(paste0("Grouping by: ", group.by))
} else {
warning(paste0("Group_id '",group.by, "' was not found in VDJ. Grouping by 'sample_id'"))}
}
clonotype.list <- list()
for(i in 1:length(unique(VDJ.matrix$sample_id))){
clonotype.list[[i]] <- subset(VDJ.matrix, sample_id == unique(VDJ.matrix$sample_id)[i])
#get unique clones
clonotype.list[[i]] <- clonotype.list[[i]][duplicated(clonotype.list[[i]]$clonotype_id) == FALSE,]
}
names(clonotype.list) <- unique(VDJ.matrix$sample_id)
message(paste0("Sample order: ", paste0(unique(VDJ.matrix$sample_id), collapse = " ; ")))
if(LC.Vgene == FALSE){
Vgene_usage_plot <- list()
HC_Vgene_usage <- list()
HC_Vgene_usage_top <- list()
HC_Vgene_usage_fraction <- list()
for (i in 1:length(clonotype.list)){
#Calculate the V gene usage frequencies for each sample on the clonal lebel
HC_Vgene_usage[[i]] <- as.data.frame(table(clonotype.list[[i]]$VDJ_vgene))
colnames(HC_Vgene_usage[[i]]) <- c("Vgene", "Frequency")
for (j in 1:nrow(HC_Vgene_usage[[i]])){
HC_Vgene_usage[[i]]$Percentage[j] <- HC_Vgene_usage[[i]]$Frequency[j]/sum(HC_Vgene_usage[[i]]$Frequency)*100
}
#Rank based on most used V gene per sample
ranks <- order(-HC_Vgene_usage[[i]]$Frequency)
HC_Vgene_usage[[i]] <- HC_Vgene_usage[[i]][ranks,]
#Find the genes which are used more than Fraction.HC of the time
HC_Vgene_usage_fraction[[i]] <- HC_Vgene_usage[[i]][which(HC_Vgene_usage[[i]]$Percentage > Fraction.HC),]
}
#Find the union of unique gene names between all the samples
HC_Vgene_names <- do.call("rbind",HC_Vgene_usage_fraction)
HC_Vgene_names <- as.vector(unique(HC_Vgene_names$Vgene))
#Filter out all those genes that are used less than the Fraction.HC in all samples
for(i in 1:length(clonotype.list)){
HC_Vgene_usage[[i]] <- HC_Vgene_usage[[i]][HC_Vgene_usage[[i]]$Vgene %in% HC_Vgene_names,]
}
#Determine the mean gene usage between the samples
HC_Vgene_usage_all <- do.call("rbind", HC_Vgene_usage)
unique_Vgene <- unique(HC_Vgene_usage_all$Vgene)
HC_Vgene_usage_all_unique <- data.frame("Vgene"= rep("", length(unique_Vgene)), "Frequency"= rep(0, length(unique_Vgene)))
HC_Vgene_usage_all_unique$Vgene <- unique_Vgene
for (i in 1:length(unique_Vgene)){
HC_Vgene_usage_all_unique[i,"Percentage"] <- mean(HC_Vgene_usage_all[which(HC_Vgene_usage_all$Vgene == HC_Vgene_usage_all_unique$Vgene[i]),]$Percentage)
}
# Rank the V genes based on the most used ones
ranks <- order(-HC_Vgene_usage_all_unique$Frequency)
HC_Vgene_usage_all_unique <- HC_Vgene_usage_all_unique[ranks,]
top_Vgenes <- HC_Vgene_usage_all_unique$Vgene[1:HC.gene.number]
for(i in 1:length(clonotype.list)){
HC_Vgene_usage_top[[i]] <- data.frame("Vgene"=rep("", length(top_Vgenes)), "Frequency"=rep(0, length(top_Vgenes)))
HC_Vgene_usage_top[[i]]$Vgene <- top_Vgenes
for (j in 1:nrow(HC_Vgene_usage_top[[i]])){
if (HC_Vgene_usage_top[[i]]$Vgene[j] %in% HC_Vgene_usage[[i]]$Vgene ==TRUE){
HC_Vgene_usage_top[[i]]$Frequency[j] <- HC_Vgene_usage[[i]][which(HC_Vgene_usage[[i]]$Vgene == paste0(HC_Vgene_usage_top[[i]]$Vgene[j])),]$Frequency
}
}
for (j in 1:nrow(HC_Vgene_usage_top[[i]])){
HC_Vgene_usage_top[[i]]$Percentage[j] <- HC_Vgene_usage_top[[i]]$Frequency[j]/sum(HC_Vgene_usage_top[[i]]$Frequency)*100
}
}
#Assign the sample id
for (i in 1:length(HC_Vgene_usage_top)){
HC_Vgene_usage_top[[i]]$Sample <- unique(VDJ.matrix$sample_id)[i]
}
#And then bind together for plotting
plotting_df <- do.call("rbind",HC_Vgene_usage_top)
#Take out the NAs introduced because of the HC.gene.number and the Fraction.HC.
plotting_df <- plotting_df[!is.na(plotting_df$Vgene), ]
Vgene_usage_plot[[1]] <- ggplot2::ggplot(plotting_df, ggplot2::aes(fill = Vgene, y=Frequency, x=Sample)) +
ggplot2::geom_bar(position="fill", stat="identity", color="black", width = 0.7) + cowplot::theme_cowplot() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggplot2::ylab("% of unique clones") + ggplot2::scale_y_continuous(expand = c(0,0)) #+ ggplot2::ggtitle(paste0("IgH V gene stacked"))
}else if(LC.Vgene ==TRUE){
LC_Vgene_usage <- list()
LC_Vgene_usage_top <- list()
LC_Vgene_usage_fraction <- list()
for (i in 1:length(clonotype.list)){
LC_Vgene_usage[[i]] <- as.data.frame(table(clonotype.list[[i]]$VJ_vgene))
colnames(LC_Vgene_usage[[i]]) <- c("Vgene", "Frequency")
for (j in 1:nrow(LC_Vgene_usage[[i]])){
LC_Vgene_usage[[i]]$Percentage[j] <- LC_Vgene_usage[[i]]$Frequency[j]/sum(LC_Vgene_usage[[i]]$Frequency)*100
}
#Rank based on most used V gene
ranks <- order(-LC_Vgene_usage[[i]]$Frequency)
LC_Vgene_usage[[i]] <- LC_Vgene_usage[[i]][ranks,]
LC_Vgene_usage_fraction[[i]] <- LC_Vgene_usage[[i]][which(LC_Vgene_usage[[i]]$Percentage > Fraction.LC),]
}
#Find the union of unique gene names between all the samples
LC_Vgene_names <- do.call("rbind",LC_Vgene_usage_fraction)
LC_Vgene_names <- as.vector(unique(LC_Vgene_names$Vgene))
#Filter out all those genes that are used less than the Fraction.HC in all samples
for(i in 1:length(clonotype.list)){
LC_Vgene_usage[[i]] <- LC_Vgene_usage[[i]][LC_Vgene_usage[[i]]$Vgene %in% LC_Vgene_names,]
}
LC_Vgene_usage_all <- do.call("rbind", LC_Vgene_usage)
unique_Vgene <- unique(LC_Vgene_usage_all$Vgene)
LC_Vgene_usage_all_unique <- data.frame("Vgene"= rep("", length(unique_Vgene)), "Frequency"= rep(0, length(unique_Vgene)))
LC_Vgene_usage_all_unique$Vgene <- unique_Vgene
for (i in 1:length(unique_Vgene)){
LC_Vgene_usage_all_unique[i,"Percentage"] <- mean(LC_Vgene_usage_all[which(LC_Vgene_usage_all$Vgene == LC_Vgene_usage_all_unique$Vgene[i]),]$Percentage)
}
# Rank the V genes based on the most used ones
ranks <- order(-LC_Vgene_usage_all_unique$Frequency)
LC_Vgene_usage_all_unique <- LC_Vgene_usage_all_unique[ranks,]
top_Vgenes <- LC_Vgene_usage_all_unique$Vgene[1:LC.gene.number]
for(i in 1:length(clonotype.list)){
LC_Vgene_usage_top[[i]] <- data.frame("Vgene"=rep("", length(top_Vgenes)), "Frequency"=rep(0, length(top_Vgenes)))
LC_Vgene_usage_top[[i]]$Vgene <- top_Vgenes
for (j in 1:nrow(LC_Vgene_usage_top[[i]])){
if (LC_Vgene_usage_top[[i]]$Vgene[j] %in% LC_Vgene_usage[[i]]$Vgene ==TRUE){
LC_Vgene_usage_top[[i]]$Frequency[j] <- LC_Vgene_usage[[i]][which(LC_Vgene_usage[[i]]$Vgene == paste0(LC_Vgene_usage_top[[i]]$Vgene[j])),]$Frequency
}
}
for (j in 1:nrow(LC_Vgene_usage_top[[i]])){
LC_Vgene_usage_top[[i]]$Percentage[j] <- LC_Vgene_usage_top[[i]]$Frequency[j]/sum(LC_Vgene_usage_top[[i]]$Frequency)*100
}
}
#Assign the sample id
for (i in 1:length(LC_Vgene_usage_top)){
LC_Vgene_usage_top[[i]]$Sample <- unique(VDJ.matrix$sample_id)[i]
}
#And then bind together
plotting_df <- do.call("rbind",LC_Vgene_usage_top)
plotting_df <- plotting_df[!is.na(plotting_df$Vgene), ]
Vgene_usage_plot[[1]] <- ggplot2::ggplot(plotting_df, ggplot2::aes(fill = Vgene, y=Frequency, x=Sample)) +
ggplot2::geom_bar(position="fill", stat="identity", color="black", width = 0.7) + cowplot::theme_cowplot() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggplot2::ylab("% of unique clones") + ggplot2::scale_y_continuous(expand = c(0,0)) #+ ggplot2::ggtitle(paste0("IgK/L V gene stacked"))
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.