In airr-community/rep-cred: Repertoire Credibility

knitr::opts_chunk$set(
  fig.width = 7,
  fig.asp = 0.7,
  echo = params$echo,
  cache = params$cache,
  warning = FALSE,
  message = FALSE,
  out_dir = params$outdir,
  eval.opts = c('eval', 'echo', 'fig.height', 'fig.width')
)

# Save params to an .RData file. Useful for debugging.
save(params, file = file.path(params$outdir, "params.RData"))

# Load libraries
library(repcred)
library(airr)
library(stringr)
library(ggplot2)
library(data.table)
library(seqinr)
library(kableExtra)
library(dplyr)
# Configuration and default values
gene_data = NA

#Below variables are set to give the orange , green and red colours. These colours were used as it makes the text easier to read and are more muted than the regular red , orange and green.
green = "LightGreen"
amber = "NavajoWhite"
red = "pink"

Input parameters

#SECTION 1
germline_reference <- NULL
if (!is.null(params$genome_file)) {
  germline_reference <-
    unlist(lapply(params$genome_file, tigger::readIgFasta))
} 

printParams(params)

repertoire <- airr::read_rearrangement(params$rep)
# Downsampling unless check box unticked
num_keep <- 5000
if (params$downsample & nrow(repertoire) > num_keep) {
  repertoire <- repertoire[sample(nrow(repertoire), size = num_keep),]
}


#Set up section colours
section_1 = green
section_2 = green
section_3 = green
section_4 = green
section_5 = green
section_6 = green
section_7 = green
section_8 = green
section_9 = green
section_10 = green
section_11 = green

Quality Control Stats

Repcred will run whatever analyses it can with a minimal number of input columns, but can provide more information if more columns are available. Specific issues are noted in the report. You may wish to check that all the columns you expect to be populated are present. Columns that are present but missing data in some rows are also flagged for review.

#SECTION 2
is_compliant <-
  suppressWarnings(airr::validate_rearrangement(repertoire)) # TODO: consider if to display the warning

if (!is_compliant) {
  formattedErrorMeassage("The repertoire is not AIRR compliant. Exiting analysis.")
  knitr::knit_exit()
}

missing_columns <- findMissingColumns(repertoire)
sequence_alignment_available <- TRUE
if (length(missing_columns) > 0) {
    writeLines("<h3> Columns with missing / No data </h3> ")
    if ("sequence_alignment" %in% missing_columns) {
        sequence_alignment_available <- any(!is.na(repertoire[["sequence_alignment"]]))
        if (!sequence_alignment_available) {
            formattedWarningMeassage("Required `sequence_alignment` is empty.")
        }
    }
    #kbl(data.table(column_name = missing_columns))
    writeLines(paste0(missing_columns, collapse = ","))
}

Non-nucleotides in sequence

This chapter reports non-nucleotide characters (anything other than A, C, G, or T) that are found in the annotated sequences. The presence of other characters such as N, X or gap characters may indicate quality issues, although leading Ns may indicate primer masking, which is not a problem.

With Illumina paired-end reads, non-nucleotides in the middle of a sequence may indicate that the reads don’t overlap, for example with long CDR3s. This may be inevitable in some experimental configurations but is something to be aware of during analysis. Leaving primer masking and paired read gaps aside, generally speaking one would expect the majority of reads not to include non-nucleotide symbols. The exact figure will depend upon experimental design, but look for variation between samples and possible correlation with overall read quality as reported by FASTQC, and consider whether such samples meet your experimental objectives.

What does the percentage mean in the analysis? Ideally we would like to see:percentage of reads containing non-nucleotides, and percentage of non-nucleotide symbols across symbols in all reads.

#SECTION 3
check_nucleotides(repertoire) # TODO: this works only on the sequence column. If missing, should it be changed to sequence_alignment?

Statistics

This section provides useful statistics about the repertoire:

repertoire$productive <- as.logical(repertoire$productive)
non_prod <- F
if (any(!is.na(repertoire$productive))) {
  prod_info <- data.table("Category"=factor(repertoire$productive, 
                                 levels = c("TRUE","FALSE")))
  non_prod <- any(!repertoire$productive)
  # non_prod <- T
}

vj_column <- "vj_in_frame" %in% colnames(repertoire) & 
    !any(is.na(repertoire$vj_in_frame))

cols_non_prod_breakdown <-  non_prod & 
    all(c("vj_in_frame","stop_codon") %in% colnames(repertoire)) &
    !any(is.na(repertoire$vj_in_frame)) & 
    !any(is.na(repertoire$stop_codon))

Productive vs. non-productive sequences

Non-productive sequences are those which are unlikely to translate to well-formed receptor protein, for example because of the presence of stop codons or absence of conserved residues at specific locations.

Non-productive sequences are found in biological samples, hence their presence in a repertoire is expected. Nevertheless they can also be listed in a repertoire as a result of technical problems in sequencing or incorrect annotation. Out-of-frame sequences are always non-productive.

If a high level of non-productive sequences is recorded, say >=10%, it is worth examining a sample of non-productive reads to establish the cause. A high number of out-of-frame sequences or sequences with stop codons could indicate a sample preparation or sequencing quality problem. A high number of non-productive in-frame sequences without stop codons could indicate issues with annotation. If you see the latter, analyse some sample sequences with an online service such as IMGT VQUEST or IgBLAST. If the sequences are annotated as productive online, suspect a problem in your annotation toolset.

cat("Skipping this section: non-productive sequences not present in the repertoire.")

#SECTION 4
ggplot2::ggplot(
  prod_info, 
  aes(x = !!as.name("Category"))) +
  ggplot2::geom_bar(stat="count") + 
  ggpubr::theme_pubclean() +
  ggplot2::labs(x = "", y = "Number of sequences") +
  ggplot2::scale_x_discrete(drop = FALSE, labels=c("Productive", "Non-productive", "NA/Not specified"))

knitr::kable(prod_info %>% 
      dplyr::count(!!as.name("Category"), name = "Number of sequences", .drop = F) %>%  
        dplyr::mutate("Category" = as.character(!!as.name("Category")),
                     "Category" =  dplyr::recode(!!as.name("Category"),"TRUE"="Productive",
                                              "FALSE"="Non-productive"),
                     "Category" = tidyr::replace_na(!!as.name("Category"),"NA/Not specified")))

r if(vj_column){"## Percentage of sequences where the V and J region are in-frame"}

writeLines(paste0("Retrived information from the vj_in_frame column, precentage of V-J seuqences in frame: ",formatC(mean(as.logical(repertoire$vj_in_frame))*100,digits = 5),"%"))

r if(cols_non_prod_breakdown){"## Non-productive sequences breakdown"}

### table the columns that indicate non-productive sequences.

## check the cols are booleans. If not change
cols <- c("vj_in_frame","stop_codon")
repertoire[[cols[1]]] <- if(is.logical(repertoire[[cols[1]]])) repertoire[[cols[1]]] else as.logical(repertoire[[cols[1]]])
repertoire[[cols[2]]] <- if(is.logical(repertoire[[cols[2]]])) repertoire[[cols[2]]] else as.logical(repertoire[[cols[2]]])

prod_info <- repertoire[!repertoire$productive, cols, drop=F] %>% 
  dplyr::mutate(vj_not_frame = !(!!as.name("vj_in_frame")),
                both = (!!as.name("vj_not_frame")) & (!!as.name("stop_codon")) ) %>%
  dplyr::select(!!as.name("vj_not_frame"),!!as.name("stop_codon"),!!as.name("both")) %>%
  dplyr::summarise("both_stop_codon" = sum(!!as.name("both")),
                   "only_stop_codon" = sum(!!as.name("stop_codon"))-(!!as.name("both_stop_codon")),
                   "both_vj_not_frame" = sum(!!as.name("both")),
                   "only_vj_not_frame" =  sum(!!as.name("vj_not_frame"))-(!!as.name("both_vj_not_frame")),
                   "Unkown" = sum(!(!!as.name("stop_codon"))& !(!!as.name("vj_not_frame")))) %>%
  tidyr::gather(key = "Category", value="Occurences") %>%
  dplyr::mutate("Both" = grepl("both",!!as.name("Category")),
                "Category" = gsub("(both|only)_","",!!as.name("Category")))
prod_info$Category <- factor(prod_info$Category, levels = c("vj_not_frame","stop_codon","Unkown"))
ggplot2::ggplot(
  prod_info, 
  ggplot2::aes(x = !!as.name("Category"), fill = !!as.name("Both"))) +
  ggplot2::geom_bar(stat="count") + 
  ggpubr::theme_pubclean() +
  ggplot2::labs(x = "", y = "Occurences", fill = "V-J out of frame and stop codon") +
  ggplot2::scale_fill_manual(values = c("gray50","firebrick")) + 
  ggplot2::scale_x_discrete(drop = FALSE, 
                            labels=c("V-J out of frame", 
                                     "Contains stop codons", 
                                     "Unkown"))

knitr::kable(data.frame("Category" = c("Contains stop codons",
                          "V-J out of frame",
                          "V-J out of frame and stop codon",
                          "Unkown"),
           "Occurences" = c(prod_info$Occurences[prod_info$Category=="stop_codon" & !prod_info$Both],
                            prod_info$Occurences[prod_info$Category=="vj_not_frame" & !prod_info$Both],
                            prod_info$Occurences[prod_info$Category=="stop_codon" & prod_info$Both],
                            prod_info$Occurences[prod_info$Category=="Unkown"])) %>% 
      dplyr::mutate("Category" = as.character(!!as.name("Category")),
                    "Category" =  dplyr::recode(!!as.name("Category"),"vj_not_frame"="V-J out of frame",
                                             "stop_codon"="Contains stop codons")))

Sequence length distribution

The input sequence length distribution will depend on the sequencing method and the annotation process. If the input sequence represents the assembled or consensus reads from the sequencer, check that they reflect expectation - e.g. Illumina reads would be expected to max out at the read length, but will have a distributions of lengths below the max because of paired-read overlap. The V(D)J sequence length should always be distributed around a length that is determined by the primer design (i.e. full-length sequences, or partial- length, depending on design).

cat("Skipping this section: `sequence_alignment` is empty.")

length_info <- data.frame(region = c(rep("Input sequence",nrow(repertoire)),
                                     rep("V(D)J sequence",nrow(repertoire))),
                          lengths = c(nchar(repertoire$sequence),
                                      nchar(gsub("[.]","",repertoire$sequence_alignment))))

ggplot2::ggplot(
  length_info, 
  ggplot2::aes(x = !!as.name("lengths"))) +
  ggplot2::geom_histogram(bins = 30) + 
  ggpubr::theme_pubclean() +
  ggplot2::labs(x = "", y = "Frequency") +
  ggplot2::facet_grid(cols = ggplot2::vars(!! ggplot2::sym("region")))


length_info <- length_info %>% dplyr::group_by(!!as.name("region")) %>%
  dplyr::summarise(
    Min = min(!!as.name("lengths"), na.rm = T),
    Max = max(!!as.name("lengths"), na.rm = T),
    Mean = mean(!!as.name("lengths"), na.rm = T),
    Median = median(!!as.name("lengths"), na.rm = T),
    SD = sd(!!as.name("lengths"), na.rm = T),
    "q5" = quantile(!!as.name("lengths"), probs = c(0.05), na.rm = T),
    "q95" = quantile(!!as.name("lengths"), probs = c(0.95), na.rm = T),
  )
knitr::kable(length_info)

Annotation Calls Statistics

Gene deletions are common, so do not expect to see every gene represented in every sample. However, it is worth checking that there are no systematic issues, for example checking that all V-gene families are represented as expected, bearing in mind the primer set that was used. Gaps could indicate that some primers are not amplifying. Unrepresented J-gene families can be indicative of annotation/germline set problems.

v_usage_info <- getGeneAlleleStat(repertoire, reference = germline_reference, call = "v_call")
d_usage_info <- data.frame()
if (any(!is.na(repertoire[["d_call"]]))) {
    d_usage_info <- getGeneAlleleStat(repertoire, reference = germline_reference, call = "d_call")
}
j_usage_info <- getGeneAlleleStat(repertoire, reference = germline_reference, call = "j_call")

appearance_info <- bind_rows(v_usage_info$gene_data,
                             d_usage_info$gene_data,
                             j_usage_info$gene_data)

appearance_info$segment <- substr(appearance_info$gene,1,4)

size <- max(table(appearance_info$segment))

if(!is.null(germline_reference)) {
  genes_not_in_rep <- appearance_info$gene[!appearance_info$in_ref]
  genes_not_in_rep <- genes_not_in_rep[!is.na(genes_not_in_rep)]

  if(length(genes_not_in_rep)!=0){
  print(paste0("The following genes were not found in the reference set supplied: ", paste0(genes_not_in_rep, collapse = ",")))
}

}

Number of unique allele calls per gene

ggplot2::ggplot(appearance_info, 
  ggplot2::aes(x = !!as.name("gene"), y = !!as.name("count_unique_alleles"))) +
  ggplot2::geom_col() + 
  ggpubr::theme_pubclean() +
  #ggplot2::theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  ggplot2::labs(x = "", y = "# of unique alleles") +
  ggplot2::facet_wrap(vars(!! sym("segment")), nrow = 1, 
                      ncol = length(unique(appearance_info[["segment"]])),
                      scales = "free") + ggplot2::coord_flip()

Relative usage

usage_info <- appearance_info %>%
  dplyr::group_by(!!as.name("segment")) %>%
  dplyr::mutate(n = sum(!!as.name("frequency"))) %>%
  dplyr::group_by(!!as.name("segment"),!!as.name("gene")) %>%
  dplyr::summarise("fraction" = !!as.name("frequency")/unique(!!as.name("n")))

ggplot2::ggplot(usage_info, 
  ggplot2::aes(x = !!as.name("gene"), y = !!as.name("fraction"))) +
  ggplot2::geom_col() + 
  ggpubr::theme_pubclean() +
  #ggplot2::theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  ggplot2::labs(x = "", y = "Relative usage") +
  ggplot2::facet_wrap(vars(!! sym("segment")), nrow = 1, 
                      ncol = length(unique(usage_info[["segment"]])),
                      scales = "free") + ggplot2::coord_flip()

# TODO: missing primers, gene frequency. 

## get gene usage. Get the genes with alakazam getGene and calculate the relative usage. If germline is supplied. Display only genes within the germline. Else show all.
# 
# usage_info <- repertoire %>% rowwise() %>%
#   mutate(v_gene = getGene(!!as.name("v_call"), ))
# 
# 
# usage_info <- getGeneAlleleStat(repertoire, reference = germline_reference, call = "v_call")
# 
# ## unique genes and alleles.
# 
# ggplot2::ggplot(
#   data.frame("Category" = c("Unique Genes","Unique Alleles"), 
#                   "Counts" = c(nrow(usage_info$gene_data[usage_info$gene_data$frequency>0,]),
#                                nrow(usage_info$allele_data[usage_info$allele_data$frequency>0,]))), 
#   ggplot2::aes_string(x = "Category", y = "Counts")) +
#   ggplot2::geom_col() + 
#   ggpubr::theme_pubclean() +
#   ggplot2::labs(x = "", y = "Counts")
# 
# ## Allele counts. If reference is supplied showing the zeros?
# 
# ggplot2::ggplot(usage_info$allele_data[usage_info$allele_data$frequency>0,],
#                 ggplot2::aes_string(x="allele",y="frequency" ,fill="frequency")) + 
#   ggplot2::geom_col() + 
#   ggplot2::guides(fill = "none") + 
#   ggpubr::theme_pubclean() +
#   ggplot2::labs(x="Alleles",y="Number of occurences") + coord_flip()
# 
# fasta_genes <- readInGeneNamesIMGTFasta(gene_data)
# gene_freq_table = geneCount(genes_present,fasta_genes)
# freq_data_table=gene_freq_table
# gene_freq_table=gene_freq_table[gene_freq_table$gene_count>0,]
# gene_tables=getUniqueGenes(gene_freq_table)
# genes_only_freq_table = gene_tables[[1]]
# 
# cat("\n")
# print(kable(gene_freq_table))
# cat("\n")
# 
# number_unqiue_alleles = length(gene_freq_table$gene_name)
# number_unqiue_genes = length(genes_only_freq_table$gene_name)
# labels=c("Number of unique alleles" , "Number of unique genes")
# values=c(number_unqiue_alleles,number_unqiue_genes)
# 
# barplot(values,names.arg = labels,col='light blue',ylab="Counts" ,main="Comparison of unique genes to unique alleles" )
# print(number_unqiue_genes)
# print(ggplot(gene_freq_table,aes(x=gene_name,y=gene_count ,fill=gene_count))+geom_bar(stat="identity")+guides(fill = "none")+labs(x="Gene",y="Number of occurences")+coord_flip())
# 
# 
# cat("\n")
# print(kable(gene_tables[[1]]))
# cat("\n")
# allele_link=gene_tables[[2]]
# gene_allele_freq_table=data.frame(table(allele_link$gene,allele_link$allele))
# gene_allele_freq_table = gene_allele_freq_table[gene_allele_freq_table$Freq>0,]
# gene_allele_freq_table$Var1 <- as.factor(gene_allele_freq_table$Var1)
# print(ggplot(gene_allele_freq_table,aes(x=Var1,y=Freq,fill=Var2))+geom_bar(stat="identity")+guides(fill = "none", x=guide_axis(angle=90))+labs(x="Gene",y="Number of alleles present"))
# 
# cat("\n")
# print(kable(allele_link))
# cat("\n")
# 
# writeLines("<h3>Genes not present in the repertoire:</h3>")
# print(kable(getAbsentGeneList(freq_data_table)))

General Sumrep Statistics

These statistics are provided to give a general indication of repertoire characteristics. The analysis is taken from Sumrep.

cat("Skipping this section: `sequence_alignment` is empty.")

### summrep statistics graphs = hot/cold spots, GC content, Mutation and germline

### hot cold, and GC based on the sequence_alignment column

# hot_data <- suppressWarnings(repcred:::getHotspotCountDistribution(repertoire))
# cold_data <- suppressWarnings(repcred:::getColdspotCountDistribution(repertoire))
# gc_data <- suppressWarnings(repcred:::getGCContentDistribution(repertoire))
# 
# sumrep_info <- data.frame(
#   "Category" = c(rep("HotSpot",length(hot_data)),
#                  rep("ColdSpot",length(cold_data)),
#                  rep("GC content",length(gc_data))),
#   "values" = c(hot_data,cold_data,gc_data)
# )
# 
# sumrep_info$Category <- factor(sumrep_info$Category, levels = c("HotSpot","ColdSpot","GC content"))
# 
# ggplot2::ggplot(sumrep_info,
#                 ggplot2::aes(x=1,y=values, color=Category)) + 
#   ggplot2::geom_violin() + 
#   ggplot2::geom_boxplot(width = 0.2, outlier.shape = NA) + 
#   ggplot2::guides(color = "none") + 
#   ggpubr::theme_pubclean() +
#   ggplot2::labs(x="",y="") + 
#   facet_wrap(c("Category"), scales = "free", drop = T)



#SECTION 6
plot(0:1,xaxt='n',yaxt='n',ann=FALSE)
legend("center",c("Mean +- standard dev","5% and 95% quantile points","Min and Max Value points"),fill=c("Dark blue","light blue","red"))

if("sequence_alignment" %in% colnames(repertoire)&!any(is.na(repertoire$sequence_alignment))){
suppressWarnings(repcred:::hotspotCountDist(repertoire))
suppressWarnings(repcred:::coldspotCountDist(repertoire))
suppressWarnings(repcred:::gcContentDistribution(repertoire))
}else{
   section_6 = amber
    writeLines("Missing column : sequence_alignment \n Unable to run statistics : getHotspotCountDistribution , getColspotCountDistribution, getGCContentDistribution")
}


# if("sequence_alignment" %in% colnames(repertoire) & "germline_alignment" %in% colnames(repertoire) &  !any(is.na(repertoire$sequence_alignment))& !any(is.na(repertoire$germline_alignment))){
#  positionDistancesBetweenMutationDistribution(repertoire)
#  distanceFromGermlineToSequenceDistribution(repertoire)  
# }else{
#    if(section_6 == amber){
#       section_6 = red
#    }else{
#       section_6 = amber
#    }
#    writeLines("Missing columns : sequence_alignment , germline_alignment  \n Unable to run statistics : positionDistancesBetweenMutationDistribution , distanceFromGermlineToSequenceDistribution")
# }

#  Pairwise Statistics

# #SECTION 7
# if(isTRUE(full_check)){
#  plot(0:1,xaxt='n',yaxt='n',ann=FALSE)
# legend("center",c("Mean +- standard dev","5% and 95% quantile points","Min and Max Value points"),fill=c("Dark blue","light blue","red"))
# 
# if("sequence_alignment" %in% colnames(repertoire)&!any(is.na(repertoire$sequence_alignment))){
# CDR3pairwiseDistanceInfo(repertoire)
# pairwiseDistDistribution(repertoire)
# nearestNeighbourDistInfo(repertoire)
# }else{
#    section_7 = red
#    writeLines("Missing column : Sequence_alignment \n Unable to run statistics")
# }
# }else{
#    writeLines("Only the basic statistics were run.Please see other sections for the other statistics.")
# }

#  Physiochemical Statistics
# if (all(is.na(repertoire$junction_aa))) {
#     print("Skipping this section: `juncation_aa` is empty.")
# }

#SECTION 8
# plot(0:1,xaxt='n',yaxt='n',ann=FALSE)
# legend("center",c("Mean +- standard dev","5% and 95% quantile points","Min and Max Value points"),fill=c("Dark blue","light blue","red"))
# 
# 
# #polarityDistribution(repertoire)
# #aromaticityDistribution(repertoire)
# #acidityDistribution(repertoire)
# #basicityDistribution(repertoire)
# # bulkinessDistribution(repertoire)
# # chargeDistribution(repertoire)
# aliphaticDistribution(repertoire)
# print(mean(getAliphaticIndexDistribution(repertoire)),nm.rm=TRUE)
# GRAVYDistribution(repertoire)

#  Insertion Length Distributions
#SECTION 9
# plot(0:1,xaxt='n',yaxt='n',ann=FALSE)
# legend("center",c("Mean +- standard dev","5% and 95% quantile points","Min and Max Value points"),fill=c("Dark blue","light blue","red"))
# if("np1_length" %in% colnames(repertoire)&!any(is.na(repertoire$np1_length))){
#  VJinsertionLengthDistribution(repertoire)
#  VDinsertionLengthDistribution(repertoire)
# }else{
#  writeLines("Missing Column : np1_length")  
# }
# if("np2_length" %in% colnames(repertoire)&!any(is.na(repertoire$np2_length))){
# DJinsertionLengthDistribution(repertoire)
# }else{
#    writeLines("Missing Column : np2_length")  
# }

#SECTION 10
#  Prime deletion Distributions Statistics

# plot(0:1,xaxt='n',yaxt='n',ann=FALSE)
# legend("center",c("Mean +- standard dev","5% and 95% quantile points","Min and Max Value points"),fill=c("Dark blue","light blue","red"))
# 
# missing_col_count = 0
# ##
# if("v_3p_del" %in% colnames(repertoire)&!any(is.na(repertoire$v_3p_del))){
#  VGene3PrimeDeletionLengthDistribution(repertoire)  
# missing_col_count = missing_col_count +1
# }else{
#    writeLines("\n Missing column : v_3p_del , Cannot run statistic \n")
# }
# if("v_5p_del" %in% colnames(repertoire)&!any(is.na(repertoire$v_5p_del))){
#  VGene5PrimeDeletionLengthDistribution(repertoire) 
#    missing_col_count = missing_col_count +1
# }else{
#    writeLines("Missing column : v_5p_del , Cannot run statistic \n")
# }
# ##
# if("d_3p_del" %in% colnames(repertoire)&!any(is.na(repertoire$d_3p_del))){
# DGene3PrimeDeletionLengthDistribution(repertoire)
#    missing_col_count = missing_col_count +1
# }else{
#    writeLines("Missing column : d_3p_del , Cannot run statistic \n")
# }
# if("d_5p_del" %in% colnames(repertoire)&!any(is.na(repertoire$d_5p_del))){
# DGene5PrimeDeletionLengthDistribution(repertoire)
#    missing_col_count = missing_col_count +1
# }else{
#    writeLines("Missing column : d_5p_del , Cannot run statistic \n")
# }
# ##
# if("j_3p_del" %in% colnames(repertoire)&!any(is.na(repertoire$j_3p_del))){
# JGene3PrimeDeletionLengthDistribution(repertoire)
#    missing_col_count = missing_col_count +1
# }else{
#    writeLines("Missing column : j_3p_del , Cannot run statistic \n")
# }
# if("j_5p_del" %in% colnames(repertoire)&!any(is.na(repertoire$j_5p_del))){
# JGene5PrimeDeletionLengthDistribution(repertoire)
#    missing_col_count = missing_col_count +1
# }else{
#    writeLines("Missing column : j_5p_del , Cannot run statistic \n")
# }
# 
# if(missing_col_count > 3){
#    section_10=red
# }
# if(missing_col_count > 0){
#    section_10=amber
# }

repertoire_chimera = repertoire[!is.na(repertoire$sequence),]
chimera = nrow(repertoire_chimera)!=0
multiple_vgene = FALSE

r if(chimera){"# Possible Chimerisms\nBelow table shows the comparison between the total number of sequences compares to the total number of unique CDR3 sequences and then compared to the number of CDR3 sequences that have multiple different v-call genes associated with them."}

#SECTION 11

cdr3_seq_info = checkCDR3(repertoire_chimera)

#table(cdr3_seq_info$cdr3_seq)
cdr3_vcalls = getVCalls(cdr3_seq_info,repertoire_chimera,FALSE)
total_num_unique_seq = length(unique(repertoire_chimera$sequence))
##
num_uniq_cdr3_seq = length(unique(cdr3_seq_info$cdr3_seqs))

num_occur_multiple_gene_call <- length(which(data.frame(table(cdr3_seq_info$cdr3_seqs))$Freq > 1))
#kbl(cdr3_vcalls)

####
# 
if (nrow(cdr3_vcalls)>0) {

    freq_table= data.frame(table(as.character(cdr3_vcalls$seq),as.character(cdr3_vcalls$v_call_genes)))
    names(freq_table) <- c("sequence","call","Freq")
    freq_table= freq_table[freq_table$Freq >1,]
    multiple_vgene_freq = data.frame(table(as.character(freq_table$sequence)))
    multiple_vgene_freq = multiple_vgene_freq[multiple_vgene_freq$Freq>1,]
    multiple_vgene <- nrow(multiple_vgene_freq)!=0
    multiple_gene_calls = length(table(freq_table$sequence))
} else {
    multiple_gene_calls <- 0
}

#print(freq_table)
seqs_stats_vals=c(total_num_unique_seq,num_uniq_cdr3_seq,multiple_gene_calls)
labels = c("Unique complete seqs" , "Unique CDR3 seqs" , "Unique CDR3 seqs with multiple single v-calls" )

##
#par(mar=c(8, 13 ,5 ,3))
#barplot(seqs_stats_vals,names.arg = labels,col='dark blue',xlab="Occurences" ,main="CDR3 Sequence comparisons", las=2, horiz=TRUE ,cex.names=0.8)

ggplot(data.frame(label = labels, val = seqs_stats_vals), aes(x = label, y = val)) +
  geom_col() + labs(y = "Occurences", x = "" , title="CDR3 Sequence comparisons") +
  ggplot2::coord_flip() + theme_classic()

if (multiple_vgene) {
    freq_table$call <- as.factor(freq_table$call)
    ggplot(freq_table,
            aes(x=call,y=Freq,fill=sequence))+
            geom_bar(stat="identity")+
            guides(fill = "none", x=guide_axis(angle=90))+
            labs(x="Gene",y="Number of sequences present") + 
            theme_classic() 
}

r if(multiple_vgene){"Below contain CDR3 sequences with more than one v_call:"}

options(width = 2000)
num_to_display=0
if(length(repertoire$sequence)>100000){
   num_to_display = 3
}else{
   num_to_display = 6
}

# if(seq_type == "")
# display_type
repcred:::plotVgeneDist(cdr3_data_table = cdr3_vcalls,
                        num_of_results_to_show = num_to_display, 
                        aa_or_nn = "aa", 
                        freq_table = multiple_vgene_freq)
#kbl(most_chimeric)

#kbl(as.data.table(findAmplificationAreas(repertoire)))
repcred:::addTrafficLighting(c(section_1,section_2,section_3,section_4,section_5,section_6,section_7,section_8,section_9,section_10,section_11))

airr-community/rep-cred documentation built on July 13, 2024, 1 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com