R/genotype_statistics.R

Defines functions write_genotype_file add_hap_stats calc_genotype calc_haplo_details gap_input_sequences make_genotype_db read_input_sequences read_inferred_sequences read_reference_genes read_input_files generate_ogrdb_report report_note report_warn report

Documented in generate_ogrdb_report read_input_files write_genotype_file

# This software is licensed under the CC BY-SA 4.0 licence: https://creativecommons.org/licenses/by-sa/4.0/
#
# Some functions are adapted from TIgGER (https://tigger.readthedocs.io) with thanks to the authors.



#' @importFrom dplyr count filter group_by n rename select summarise summarize tally
#' @importFrom magrittr %>%
#' @importFrom ggplot2 aes element_blank element_text geom_bar geom_line ggplot ggplotGrob labs margin scale_fill_brewer scale_fill_manual scale_x_discrete scale_y_continuous sec_axis theme theme_classic unit

# Functions used in markdown - declared here so that we don't get a NOTE in devtools::check() about unused dependencies
#' @importFrom gridExtra grid.arrange




globalVariables(c('warnings'))


# Timestamped progress report

report = function(x) {
  message(Sys.time(), ':', x, '\n')
}

# Warning messages (echoed to pdf)

pkg.globals = new.env()
pkg.globals$report_notes = c()

report_warn = function(x) {
  warning(x)
  report_note(x)
}

# Note in pdf

report_note = function(xx) {
  for (x in xx) {
    if (!(x %in% pkg.globals$report_notes)) {
      pkg.globals$report_notes = c(pkg.globals$report_notes, x)
    }
  }
}

#' Generate OGRDB reports from specified files.
#'
#' This creates the genotype report
#' (suffixed _ogrdb_report.csv) and the plot file (suffixed _ogrdb_plos.pdf).
#' Both are created in the directory holding the annotated read file, and the
#' file names are prefixed by the name of the annotated read file.
#' @param ref_filename Name of file containing IMGT-aligned reference genes in FASTA format
#' @param inferred_filename Name of file containing sequences of inferred novel alleles, or '-' if none
#' @param species Species name used in field 3 of the IMGT germline header with spaces omitted, if the reference file is from IMGT. Otherwise ''
#' @param filename Name of file containing annotated reads in AIRR, CHANGEO or IgDiscover format. The format is detected automatically
#' @param chain one of IGHV, IGKV, IGLV, IGHD, IGHJ, IGKJ, IGLJ, TRAV, TRAj, TRBV, TRBD, TRBJ, TRGV, TRGj, TRDV, TRDD, TRDJ
#' @param hap_gene The haplotyping columns will be completed based on the usage of the two most frequent alleles of this gene. If NA, the column will be blank
#' @param segment one of V, D, J
#' @param chain_type one of H, L
#' @param plot_unmutated Plot base composition using only unmutated sequences (V-chains only)
#' @param all_inferred Treat all alleles as novel
#' @param format The format for the plot file ('pdf', 'html' or 'none')
#' @param custom_file_prefix custom prefix to use for output files. If not specified, the prefix is taken from the input file name
#' @return None
#' @export
#' @examples
#' # prepare files for example
#' reference_set = system.file("extdata/ref_gapped.fasta", package = "ogrdbstats")
#' inferred_set = system.file("extdata/novel_gapped.fasta", package = "ogrdbstats")
#' repertoire = system.file("extdata/ogrdbstats_example_repertoire.tsv", package = "ogrdbstats")
#' file.copy(repertoire, tempdir())
#' repfile = file.path(tempdir(), 'ogrdbstats_example_repertoire.tsv')
#'
#' generate_ogrdb_report(reference_set, inferred_set, 'Homosapiens',
#'           repfile, 'IGHV', NA, 'V', 'H', FALSE, format='none')
#'
#' #clean up
#' outfile = file.path(tempdir(), 'ogrdbstats_example_repertoire_ogrdb_report.csv')
#' file.remove(repfile)
#' file.remove(outfile)
generate_ogrdb_report = function(ref_filename, inferred_filename, species, filename, chain, hap_gene, segment, chain_type, plot_unmutated, all_inferred=FALSE, format='pdf', custom_file_prefix='') {
  if (format == 'none') {
    return(invisible())
  }

  grDevices::pdf(NULL) # this seems to stop an empty Rplots.pdf from being created.

  report('Processing started')

  report_note(paste0('Repertoire file: ', filename))
  report_note(paste0('Germline reference file: ', ref_filename))
  report_note(paste0('Novel allele file: ', inferred_filename))

  if (!is.na(species)) {
    report_note(paste0('Species: ', species))
  }

  report_note(paste0('Chain: ', chain))
  report_note(paste0('Segment: ', segment))

  if (all_inferred) {
    report_note('All alleles are treated as novel')
  }

  if (plot_unmutated) {
    report_note("Base plots are calculated from unmutated sequences only.")
  }

  rd = read_input_files(ref_filename, inferred_filename, species, filename, chain, hap_gene, segment, chain_type, all_inferred)
  file_base = basename(filename)
  file_splits = strsplit(file_base, '.', fixed=TRUE)[[1]]

  if (custom_file_prefix != '') {
    file_prefix = custom_file_prefix
  } else {
    file_prefix = paste(file_splits[1:length(file_splits)-1], sep='.', collapse='.')
  }

  file_loc = dirname(filename)

  write_genotype_file(file.path(file_loc, paste0(file_prefix, '_ogrdb_report.csv')), segment, chain_type, rd$genotype)
  report(paste0(file_prefix, '_ogrdb_report.csv'))

  all_inferred = FALSE

  report('plotting bar charts')
  barplot_grobs = make_barplot_grobs(rd$input_sequences, rd$genotype_db, rd$inferred_seqs, rd$genotype, segment, rd$calculated_NC)

  report('plotting novel base composition')
  if(segment == 'V' && plot_unmutated) {
    nbgrobs = make_novel_base_grobs(rd$inferred_seqs, rd$input_sequences[rd$input_sequences$SEG_MUT_NC==0,], segment, all_inferred)
  } else {
    nbgrobs = make_novel_base_grobs(rd$inferred_seqs, rd$input_sequences, segment, all_inferred)
  }

  report('plotting haplotyping charts')
  haplo_grobs = make_haplo_grobs(segment, rd$haplo_details)

  report('writing plot file')

  if (format=='html') {
    outfile = file.path(file_loc, paste0(file_prefix, '_ogrdb_plots'))
  } else if (format=='pdf') {
    outfile = file.path(file_loc, paste0(file_prefix, '_ogrdb_plots.pdf'))
  } else {
    outfile = 'none'
  }

  write_plot_file(outfile, rd$input_sequences, nbgrobs$cdr3_dist, nbgrobs$end, nbgrobs$conc, nbgrobs$whole,
                  nbgrobs$triplet, barplot_grobs, haplo_grobs$aplot, haplo_grobs$haplo, paste0(pkg.globals$report_notes, sep='\n', collapse='\n'), format=format)

  return(invisible(NULL))
}

#' Read input files into memory
#' @param ref_filename Name of file containing IMGT-aligned reference genes in FASTA format
#' @param inferred_filename Name of file containing sequences of inferred novel alleles, or '-' if none
#' @param species Species name used in field 3 of the IMGT germline header with spaces omitted, if the reference file is from IMGT. Otherwise ''
#' @param filename Name of file containing annotated reads in AIRR, CHANGEO or IgDiscover format. The format is detected automatically
#' @param chain one of IGHV, IGKV, IGLV, IGHD, IGHJ, IGKJ, IGLJ, TRAV, TRAj, TRBV, TRBD, TRBJ, TRGV, TRGj, TRDV, TRDD, TRDJ
#' @param hap_gene The haplotyping columns will be completed based on the usage of the two most frequent alleles of this gene. If NA, the column will be blank
#' @param segment one of V, D, J
#' @param chain_type one of H, L
#' @param all_inferred Treat all alleles as novel
#' @return A named list containing the following elements:
#' \tabular{ll}{
#' ref_genes  \tab named list of IMGT-gapped reference genes \cr
#' inferred_seqs \tab named list of IMGT-gapped inferred (novel) sequences. \cr
#' input_sequences \tab data frame with one row per annotated read, with CHANGEO-style column names
#'                           One key point: the column SEG_CALL is the gene call for the segment under analysis. Hence if segment is 'V',
#'                           'V_CALL' will be renamed 'SEG_CALL' whereas is segment is 'J', 'J_CALL' is renamed 'SEG_CALL'. This simplifies
#'                           downstream processing.
#'                           Rows in the input file with ambiguous SEG_CALLs, or no call, are removed. \cr
#' genotype_db \tab named list of gene sequences referenced in the annotated reads (both reference and novel sequences) \cr
#' haplo_details \tab data used for haplotype analysis, showing allelic ratios calculated with various potential haplotyping genes \cr
#' genotype \tab data frame containing information provided in the OGRDB genotype csv file \cr
#' calculated_NC \tab a boolean that is TRUE if mutation counts were calculated by this library, FALSE if they were read from the annotated read file \cr
#' }
#' @export
#' @examples
#' # Create the analysis data set from example files provided with the package
#' #(this dataset is also provided in the package as example_rep)
#' reference_set = system.file("extdata/ref_gapped.fasta", package = "ogrdbstats")
#' inferred_set = system.file("extdata/novel_gapped.fasta", package = "ogrdbstats")
#' repertoire = system.file("extdata/ogrdbstats_example_repertoire.tsv", package = "ogrdbstats")
#'
#' example_data = read_input_files(reference_set, inferred_set, 'Homosapiens',
#'        repertoire, 'IGHV', NA, 'V', 'H', FALSE)
read_input_files = function(ref_filename, inferred_filename, species, filename, chain, hap_gene, segment, chain_type, all_inferred) {
  find_template = function(call) {
    tem = ref_genes[call]
    if(is.na(tem))
      tem = inferred_seqs[call]

    return(tem)
  }

  if (basename(filename) == "ogrdbstats_example_repertoire.tsv") {
    report("Using cached example data")
    return(ogrdbstats::example_rep)
  }

  report('reading reference genes')
  ref_genes = read_reference_genes(ref_filename, species, chain, segment)

  report('reading inferred sequences')
  inferred_seqs = read_inferred_sequences(inferred_filename, segment, ref_genes)

  if (all_inferred) {
    report('Treating all sequences as inferred for the purpose of reporting')
    inferred_seqs = c(inferred_seqs, ref_genes[!(names(ref_genes) %in% names(inferred_seqs))])
  }

  report('reading input sequences')
  input_sequences = read_input_sequences(filename, segment, chain_type)
  input_sequences$SEG_REF_IMGT = sapply(input_sequences$SEG_CALL, find_template)

  if (segment == 'V') {
    report('checking and fixing read alignment')
    input_sequences = gap_input_sequences(input_sequences, inferred_seqs, ref_genes)
  }

  report('determining genotype')
  genotype_db = make_genotype_db(input_sequences, inferred_seqs, ref_genes)

  report('calculating haplotype details')
  haplo_details = calc_haplo_details(segment, input_sequences)

  report('calculating genotype')
  c = calc_genotype(segment, chain_type, input_sequences, ref_genes, inferred_seqs, genotype_db, hap_gene, haplo_details)

  return(list('ref_genes'=ref_genes, 'inferred_seqs'=inferred_seqs, 'input_sequences'=c$input_sequences, 'genotype_db'=genotype_db,
           'haplo_details'=haplo_details, 'genotype'=c$genotype, 'calculated_NC'=c$calculated_NC))
}


#' Read the reference set from the specified file
#' @param ref_filename Name of file containing IMGT-aligned reference genes in FASTA format
#' @param species Species name used in field 3 of the IMGT germline header with spaces omitted, if the reference file is from IMGT. Otherwise ''
#' @param chain one of IGHV, IGKV, IGLV, IGHD, IGHJ, IGKJ, IGLJ, TRAV, TRAj, TRBV, TRBD, TRBJ, TRGV, TRGj, TRDV, TRDD, TRDJ
#' @param segment one of V, D, J
#' @return Named list of gene sequences
#' @noRd
read_reference_genes = function(ref_filename, species, chain, segment) {
  # get the reference set

  ref_genes = tigger::readIgFasta(ref_filename, strip_down_name =FALSE)
  set = chain
  region = paste0(segment, '-REGION')

  # process IMGT library, if header is in corresponding format
  if(grepl('|', names(ref_genes)[1], fixed=TRUE)) {
    ref_genes = ref_genes[grepl(species, names(ref_genes),fixed=TRUE)]
    ref_genes = ref_genes[grepl(region, names(ref_genes),fixed=TRUE)]  # restrict to V-D-J regions
    ref_genes = ref_genes[grepl(set, names(ref_genes),fixed=TRUE)]

    gene_name = function(full_name) {
      return(strsplit(full_name, '|', fixed=TRUE)[[1]][2])
    }

    names(ref_genes) = sapply(names(ref_genes), gene_name)
  } else {
    ref_genes = ref_genes[grepl(set, names(ref_genes),fixed=TRUE)]
  }

  # Crude fix for two misaligned human IGHJ reference genes

  if(set == 'IGHJ') {
    for(g in c('IGHJ6*02', 'IGHJ6*03')) {
      if(g %in% names(ref_genes) && stringr::str_sub(ref_genes[g], start= -1) == 'A') {
        ref_genes[g] = paste0(ref_genes[g], 'G')
        report_note(paste0('Modified truncated reference gene ', g, ' to ', ref_genes[g]))
      }
    }
  }

  # Check that the reference set is IMGT-aligned by looking for dots in the V-gene sequences

  misaligned_v = function(ref, name) {
    if(grepl('IG.V', name, fixed=FALSE)) {
      return(!grepl('.', ref, fixed=TRUE))
    } else {
      return(NA)
    }
  }

  misaligned = mapply(misaligned_v, ref_genes, names(ref_genes))

  if(any(misaligned, na.rm=TRUE)) {
    warning('The following reference V-genes are not IMGT aligned:')
    warning(names(ref_genes)[misaligned])
    stop('The reference genes must be IMGT gap-aligned. Please consult the Prerequisites section in the README file for details.\n')
  }

  if(length(ref_genes) < 1) {
    report_warn('Warning: no reference genes were found.')
  }

  return(ref_genes)
}

#' Read inferred sequences from a file. If ungapped, they will be gapped using the closest available reference gene
#' @param inferred_filename Name of file containing sequences of inferred novel alleles, or '-' if none
#' @param segment one of V, D, J
#' @param ref_genes Named list of reference sequences
#' @return Named list of gapped inferred sequences
#' @noRd
read_inferred_sequences = function(inferred_filename, segment, ref_genes) {
  # get the genotype and novel alleles in this set

  if(inferred_filename != '-') {
    inferred_seqs = tigger::readIgFasta(inferred_filename, strip_down_name=FALSE)
  } else {
    inferred_seqs = c()
  }

  # ignore inferred genes that are listed in the reference. gap any that aren't already gapped.

  inferred_seqs = inferred_seqs[!(names(inferred_seqs) %in% names(ref_genes))]

  if(segment == 'V') {
    inferred_seqs = sapply(names(inferred_seqs), imgt_gap_inferred, seqs=inferred_seqs, ref_genes=ref_genes)
  }

  return(inferred_seqs)
}



#' Read annotated input sequences and convert format to CHANGEO-style, filling in as many gaps in what we get from the inference tool as we can
#' @param filename Name of file containing annotated reads in AIRR, CHANGEO or IgDiscover format. The format is detected automatically
#' @param segment one of V, D, J
#' @param chain_type one of H, L
#' @return Data Frame containing sequence annotations, with CHANGEO format headers.
#' @noRd
read_input_sequences = function(filename, segment, chain_type) {
  D_CALL = D_MUT_NC = D_SEQ = D_errors = D_gene = D_region = J_CALL = J_MUT_NC = J_SEQ = J_errors = NULL
  J_nt = SEG_CALL = SEG_MUT_NC = VDJ_nt = V_CALL = V_CALL_GENOTYPED = V_CDR3_start = V_MUT_NC = V_SEQ = NULL
  V_errors = V_gene = V_nt = a_allele = a_gene = aa_diff = aa_substitutions = allelic_percentage = NULL
  assigned_unmutated_frequency = closest_host = closest_reference = v_call = J_gene = CDR3_nt = name = NULL


  # Read the sequences. Changeo format is assumed unless airr or IgDiscover format is identified
  # TODO - check and give nice error message if any columns are missing

  s = utils::read.delim(filename, stringsAsFactors=FALSE)

  # Fallback in airr,changeo if no v_call_genotyped fied is present

  if(!('V_CALL_GENOTYPED' %in% names(s)) && !('v_call_genotyped' %in% names(s)) && ( ('V_CALL' %in% names(s)) || ('v_call' %in% names(s)))) {
    report_warn('No v_call_genotyped field found. Falling back to v_call (please see notes on genotyping).')
    if('V_CALL' %in% names(s))
    {
      s = rename(s, V_CALL_GENOTYPED=V_CALL)
    } else {
      s = rename(s, v_call_genotyped=v_call)
    }
  }

  # Derive CDR3 from junction if it's not present

  junction2cdr3 = function(jn) {
    if(is.na(jn) || nchar(jn) < 7) {
      return(jn)
    }
    return(substr(jn, 3, nchar(jn)-3))
  }

  if((!('CDR3_IMGT' %in% names(s)) && !('CDR3' %in% names(s)) && !('cdr3' %in% names(s))) &&  (('JUNCTION' %in% names(s)) || ('junction' %in% names(s)))) {
    report('Deriving CDR3 from JUNCTION')
    if('V_CALL' %in% names(s))
    {
      s$CDR3 = sapply(s$JUNCTION, junction2cdr3)
    } else {
      s$cdr3 = sapply(s$junction, junction2cdr3)
    }
  }


  if('sequence_id' %in% names(s))
  {
    # airr format - convert wanted fields to changeo
    # TODO - would be better to get the field ranges from the CIGAR string, given that it's a required field
    # https://www.bioconductor.org/packages/devel/bioc/manuals/GenomicAlignments/man/GenomicAlignments.pdf

    # add a dummy D_CALL to light chain repertoires, for ease of processing

    if(chain_type == 'L' && !('d_call' %in% names(s))) {
      s$d_call = ''
    }

    req_names = c('sequence', 'sequence_id', 'v_call_genotyped', 'd_call', 'j_call', 'sequence_alignment', 'cdr3')
    col_names = c('SEQUENCE_INPUT', 'SEQUENCE_ID', 'V_CALL_GENOTYPED', 'D_CALL', 'J_CALL', 'SEQUENCE_IMGT', 'CDR3_IMGT')

    if(segment != 'V') {
      a_seg = tolower(segment)
      req_names = c(req_names, paste0(a_seg, '_sequence_start'), paste0(a_seg, '_sequence_end'), paste0(a_seg, '_germline_start'), paste0(a_seg, '_germline_end'))
      col_names = c(col_names, 'SEG_SEQ_START', 'SEG_SEQ_END', 'SEG_GERM_START', 'SEG_GERM_END')
    }

    if(segment == 'V') {
      # check that SEQUENCE_IMGT has gapped sequences, otherwise find the CDR3 start

      t = grepl(".", s$sequence_alignment, fixed=TRUE)

      if (length(t[t]) < nrow(s) / 2) {
        req_names = c(req_names, 'cdr3_start', 'fwr1_start')
        col_names = c(col_names, 'CDR3_START', 'FWR1_START')
      }
    }

    s = select(s, req_names)
    names(s) = col_names

    if(segment != 'V') {
      s$SEG_SEQ_LENGTH = s$SEG_SEQ_END - s$SEG_SEQ_START + 1
      s$SEG_GERM_LENGTH = s$SEG_GERM_END - s$SEG_GERM_START + 1
    }

    if ('CDR3_START' %in% col_names) {
      s$CDR3_START = s$CDR3_START - s$FWR1_START    # make them indeces into the V sequence
      if (dimnames(sort(table(substring(s$SEQUENCE_IMGT, s$CDR3_START, s$CDR3_START+2)),decreasing=TRUE))[[1]][1] == 'TGT') {
        # we've actually found the junction not the CDR3!
        s$CDR3_START = s$CDR3_START + 3
      }
      s$JUNCTION_START = s$CDR3_START - 3
      names(s)[names(s) == 'SEQUENCE_IMGT'] = 'SEQUENCE'
    }

  } else if('V_gene' %in% names(s)) {
    # IgDiscover format
    #  s = uncount(s, count)  for consistency with IgDiscover results, count each unique record in the file only once, regardless of 'count'

    # add a dummy D_CALL to light chain repertoires, for ease of processing

    if(chain_type == 'L' && !('D_gene' %in% names(s))) {
      s$D_gene = ''
      s$D_errors = ''
      s$D_region = ''
    }

    if(chain_type == 'L' && !('VDJ_nt' %in% names(s))) {
      s$VDJ_nt = s$VJ_nt
    }

    col_names = c('SEQUENCE_ID', 'V_CALL_GENOTYPED', 'D_CALL', 'J_CALL', 'CDR3_IMGT', 'V_MUT_NC', 'D_MUT_NC', 'J_MUT_NC', 'SEQUENCE', 'JUNCTION_START', 'V_SEQ', 'D_SEQ', 'J_SEQ')
    s = select(s, name, V_gene, D_gene, J_gene, CDR3_nt, V_errors, D_errors, J_errors, VDJ_nt, V_CDR3_start, V_nt, D_region, J_nt)
    names(s) = col_names
    # adjust IgDiscover's V_CDR3_START to the 1- based location of the conserved cysteine
    s$JUNCTION_START = s$JUNCTION_START - 2
    s$SEQUENCE = toupper(s$SEQUENCE)

    if(segment == 'V') {
      s = rename(s, SEG_MUT_NC=V_MUT_NC, SEG_SEQ=V_SEQ)
    } else if(segment == 'D') {
      s = rename(s, SEG_MUT_NC=D_MUT_NC, SEG_SEQ=D_SEQ)
    } else {
      s = rename(s, SEG_MUT_NC=J_MUT_NC, SEG_SEQ=J_SEQ)
    }
  }

  if(segment == 'V') {
    s = rename(s, SEG_CALL=V_CALL_GENOTYPED)
  } else if(segment == 'D') {
    s = rename(s, SEG_CALL=D_CALL)
  } else {
    s = rename(s, SEG_CALL=J_CALL)
  }

  if('V_CALL_GENOTYPED' %in% names(s)) {
    if('V_CALL' %in% names(s)) {
      s = subset(s, select = -c('V_CALL'))
    }
    s = rename(s, V_CALL=V_CALL_GENOTYPED)
  }

  s$CDR3_IMGT = toupper(s$CDR3_IMGT)

  #remove sequences with ambiguous calls, or no call, in the target segment

  s = s[!grepl(',', s$SEG_CALL),]
  s = s[!(s$SEG_CALL == ''),]

  # At this point, s contains Changeo-named columns with all data required for calculations

  return(s)
}

#' Build a list of gene sequences that matches the personalised genotype
#' @param input_sequences the input_sequences data frame
#' @param inferred_seqs named list of novel gene sequences
#' @param ref_genes named list of reference genes
#' @return named list of gene sequences
#' @noRd
make_genotype_db = function(input_sequences, inferred_seqs, ref_genes) {
  # Warn if we don't have genotype statistics for any of the inferred alleles
  # this can happen, for example, with Tigger, if novel alleles are detected but do not pass subsequent criteria for being included in the genotype.

  genotype_alleles = unique(input_sequences$SEG_CALL)
  missing = inferred_seqs[!(names(inferred_seqs) %in% genotype_alleles)]

  if(length(missing) >= 1) {
    report_warn(paste('Novel sequence(s)', paste0(names(missing), collapse=' '), 'are not listed in the genotype and will be ignored.', sep=' ', '\n'))
    inferred_seqs = inferred_seqs[(names(inferred_seqs) %in% genotype_alleles)]
  }

  genotype_alleles = genotype_alleles[!(genotype_alleles %in% names(inferred_seqs))]
  genotype_seqs = lapply(genotype_alleles, function(x) {ref_genes[x]})
  genotype_db = stats::setNames(c(genotype_seqs, inferred_seqs), c(genotype_alleles, names(inferred_seqs)))


  # Check we have sequences for all alleles named in the reads - either from the reference set or from the inferred sequences
  # otherwise - one of these two is incomplete!

  if(any(is.na(genotype_db))) {
    missing_alleles = paste0(names(genotype_db[is.na(genotype_db)]), collapse=", ")
    report_warn(paste0("Warning: sequence(s) for allele(s) ", missing_alleles, " can't be found in the reference set or the novel alleles file.\n"))
  }

  # Check that a reasonable number of genes in the reference set are not called in the sequence set

  unused_ref = length(setdiff(names(ref_genes), names(genotype_db)))

  if(unused_ref < (length(names(ref_genes))/10)) {
    report_warn("Warning: Over 90% of reference genes are called in the repertoire. This suggests that either the reference set is incomplete, or the genotype has not been personalised.")
  }

  return(genotype_db)
}

#' Check whether input sequences are gapped, and gap them if necessary
#' @param input_sequences the input_sequences data frame
#' @param inferred_seqs named list of novel gene sequences
#' @param ref_genes named list of reference genes
#' @return a revised input_sequences structure, guaranteed to contain gapped sequences in SEQUENCE_IMGT
#' @noRd
gap_input_sequences = function(input_sequences, inferred_seqs, ref_genes) {
  if(!('SEQUENCE_IMGT' %in% names(input_sequences))) {
    input_sequences$SEQUENCE_IMGT = mapply(imgt_gap, input_sequences$SEQUENCE,input_sequences$CDR3_IMGT, input_sequences$JUNCTION_START, input_sequences$SEG_REF_IMGT)
  } else {
    # remove any sequences that do not have an aligned sequence

    input_sequences$SEQ_LEN=stringr::str_length(input_sequences$SEQUENCE_IMGT)
    count_zero = length(input_sequences$SEQ_LEN[input_sequences$SEQ_LEN==0])

    if(count_zero > 0) {
      report_note(paste0('Warning: removing ', count_zero, ' sequences with no SEQUENCE_IMGT'))
      input_sequences = input_sequences[input_sequences$SEQ_LEN>0,]
    }
  }

  input_sequences$SEQUENCE_IMGT = toupper(input_sequences$SEQUENCE_IMGT)

  return(input_sequences)
}

#' Calculate the data used for haplotype analysis, showing allelic ratios calculated with various potential haplotyping genes
#' @param segment one of V, D, J
#' @param input_sequences the input_sequences data frame
#' @noRd
#' @return A named list containing the following elements:
#' \tabular{ll}{
#'     sa \tab           a variant of the input_sequences data frame, with fields for haplotyping analysis \cr
#'     a_genes \tab       list of potential haplotyping genes (if the segment under analysis is J, these are V genes, otherwise J genes) \cr
#'     a_props \tab       proportion of each allele of each potential haplotyping gene in the input sequences\cr
#'}
calc_haplo_details = function(segment, input_sequences) {
  A_CALL = J_CALL = V_CALL = a_gene = a_allele = NULL

  if(segment == 'V' || segment == 'D') {
    sa = input_sequences[!grepl(',', input_sequences$J_CALL),]            # unique J-calls only
    sa = rename(sa, A_CALL=J_CALL)
  } else {
    sa = input_sequences[!grepl(',', input_sequences$V_CALL),]
    sa = rename(sa, A_CALL=V_CALL)
  }

  sa$a_gene = sapply(sa$A_CALL, function(x) {
      if (grepl('*', x, fixed=TRUE)) {
        strsplit(x, '*', fixed=TRUE)[[1]][[1]]
      } else {
        'X'
      }
    })

  if (nrow(sa) == 0) {
    return(list())
  }

  sa$a_allele = sapply(sa$A_CALL, function(x) {
      if (grepl('*', x, fixed=TRUE)) {
        strsplit(x, '*', fixed=TRUE)[[1]][[2]]
      } else {
        'X'
      }
    })

  if (nrow(sa) == 0) {
    return(list())
  }

  sa$a_gene = factor(sa$a_gene, sort_alleles(unique(sa$a_gene)))

  su = select(sa, A_CALL, a_gene, a_allele)
  a_genes = sort(unlist(unique(su$a_gene)))

  sa = sa[!grepl(',', sa$SEG_CALL),]        # remove ambiguous V-calls
  sa$SEG_CALL = factor(sa$SEG_CALL, sort_alleles(unique(sa$SEG_CALL)))

  # calc percentage of each allele in a gene
  allele_props = function(gene, su) {
    a_gene = a_allele = NULL
    alleles = su %>% filter(a_gene==gene) %>% group_by(a_allele) %>% summarise(count=n())
    alleles$a_gene = gene
    total = sum(alleles$count)
    alleles$percent = 100*alleles$count/total
    return(alleles)
  }

  a_props = do.call('rbind', lapply(a_genes, allele_props, su=su))

  return(list('sa'=sa, 'a_props'=a_props, 'a_genes'=a_genes))
}


#' Build the genotype data required in the OGRDB genotype file
#' @param segment one of V, D, J
#' @param chain_type one of H, L
#' @param s the input_sequences data frame
#' @param ref_genes named list of reference genes
#' @param inferred_seqs named list of novel gene sequences
#' @param genotype_db named list of gene sequences in the personalised genotype
#' @param hap_gene The haplotyping columns will be completed based on the usage of the two most frequent alleles of this gene. If NA, the column will be blank
#' @param haplo_details Data structure created by create_haplo_details
#' @noRd
#' @return A named list containing the following elements:
#' \tabular{ll}{
#'     input_sequences \tab     updated data frame, guaranteed to include mutation counts, which have been calculated if necessary \cr
#'     calculated_NC  \tab      a boolean, TRUE if mutation counts had to be calculated, FALSE otherwise \cr
#'     genotype \tab            data frame containing the information required for the OGRDB genotype file \cr
#'}

calc_genotype = function(segment, chain_type, s, ref_genes, inferred_seqs, genotype_db, hap_gene, haplo_details) {
  GENE = unique_calls = unique_cdrs = SEG_CALL = closest_reference = aa_diff = aa_substitutions = closest_host = aggregate = NULL
  sequences = reference_closest = host_closest = reference_difference = host_difference = reference_nt_diffs = reference_aa_difference = NULL
  reference_aa_subs = nt_diff = nt_substitutions = nt_diff_host = host_nt_diffs = host_aa_difference = host_aa_subs = NULL

  # unmutated count for each allele

  calculated_NC = FALSE

  if(!('SEG_MUT_NC' %in% names(s))) {
    if(segment == 'V') {
      # We take the count up to the 2nd CYS at 310
      # This matches IgDiscover practice and facilitates Tigger's reassignAlles approach which does not re-analyse the junction with the novel V-allele
      s$SEQUENCE_IMGT_TRUNC = sapply(s$SEQUENCE_IMGT, substring, first=1, last=309)
      s$SEG_MUT_NC = unlist(tigger::getMutCount(s$SEQUENCE_IMGT_TRUNC, s$SEG_CALL, genotype_db))
    } else {
      s$SEG_SEQ = mapply(substr, s$SEQUENCE_INPUT, s$SEG_SEQ_START, s$SEG_SEQ_START+s$SEG_SEQ_LENGTH-1)
      s$SEG_REF_SEQ = mapply(substr, s$SEG_REF_IMGT, s$SEG_GERM_START, s$SEG_GERM_START+s$SEG_GERM_LENGTH-1)
      s$SEG_MUT_NC = stringdist::stringdist(s$SEG_SEQ, s$SEG_REF_SEQ, method="hamming")
    }

    calculated_NC = TRUE
    s[,is.na(s$SEG_MUT_NC)]$SEG_MUT_NC = 0
  }

  report('calculated mutation count')

  # make a space-alignment of D sequences for the usage histogram (V and J use the IMGT alignment)

  if(segment == 'D') {
    s$SEG_SEQ_ALIGNED = mapply(paste0, sapply(s$SEG_GERM_START, function(x) {paste(rep(' ', x), collapse='')}), s$SEG_SEQ)
    width = max(stringr::str_length(s$SEG_SEQ_ALIGNED))
    s$SEG_SEQ_ALIGNED = sapply(s$SEG_SEQ_ALIGNED, function(x) {stringr::str_pad(x, width, side='right')})
  }

  genotype = s %>% group_by(SEG_CALL) %>% summarize(sequences = n())
  s0 = s[s$SEG_MUT_NC == 0,] %>% group_by(SEG_CALL) %>% summarize(unmutated_sequences = n())
  genotype = merge(genotype, s0, all=TRUE)

  if(any(is.na(genotype$unmutated_sequences))) {
    genotype[is.na(genotype$unmutated_sequences),]$unmutated_sequences = 0
  }

  total_unmutated = sum(genotype$unmutated_sequences)
  genotype$unmutated_frequency = round(100*genotype$unmutated_sequences/total_unmutated, digits=2)

  s_totals = s %>% group_by(SEG_CALL) %>% summarize(sequences = n())
  genotype = merge(genotype, s_totals, all=TRUE)

  genotype$GENE = sapply(genotype$SEG_CALL, function(x) {if(grepl('*', x, fixed=TRUE)) {strsplit(x, '*', fixed=TRUE)[[1]][1]} else {x}})
  allelic_totals = genotype %>% group_by(GENE) %>% summarise(allelic_total=sum(sequences))
  genotype = merge(genotype, allelic_totals, all=TRUE)
  genotype$allelic_percentage = round(100*genotype$sequences/genotype$allelic_total)

  su=s[s$SEG_MUT_NC==0,]

  if(segment != 'V') {
    genotype$unique_vs = sapply(genotype$SEG_CALL, unique_calls, segment='V_CALL', seqs=s)
    genotype$unique_vs_unmutated = sapply(genotype$SEG_CALL, unique_calls, segment='V_CALL', seqs=su)
  }

  if(segment != 'D' && chain_type == 'H') {
    genotype$unique_ds = sapply(genotype$SEG_CALL, unique_calls, segment='D_CALL', seqs=s)
    genotype$unique_ds_unmutated = sapply(genotype$SEG_CALL, unique_calls, segment='D_CALL', seqs=su)
  }

  if(segment != 'J') {
    genotype$unique_js = sapply(genotype$SEG_CALL, unique_calls, segment='J_CALL', seqs=s)
    genotype$unique_js_unmutated = sapply(genotype$SEG_CALL, unique_calls, segment='J_CALL', seqs=su)
  }

  genotype$unique_cdr3s = sapply(genotype$SEG_CALL, unique_cdrs, segment='CDR3_IMGT', seqs=s)
  genotype$unique_cdr3s_unmutated = sapply(genotype$SEG_CALL, unique_cdrs, segment='CDR3_IMGT', seqs=su)

  genotype$assigned_unmutated_frequency = round(100*genotype$unmutated_sequences/genotype$sequences, digits=2)

  # closest in genotype and in reference (inferred alleles only)
  # Inferred D alleles should be aligned for best match (if this is an allele of an existing D-gene, align against a knon allele of that gene)

  if (length(inferred_seqs) == 0) {
    report_warn('Warning - no inferred sequences found.\n')

    genotype$reference_closest = NA
    genotype$host_closest = NA
    genotype$reference_difference = NA
    genotype$reference_nt_diffs = NA
    genotype$reference_aa_difference = NA
    genotype$reference_aa_subs = NA
    genotype$host_difference = NA
    genotype$host_nt_diffs = NA
    genotype$host_aa_difference = NA
    genotype$host_aa_subs = NA
  } else {
    nearest_ref = data.frame(t(sapply(seq_along(inferred_seqs), find_nearest, ref_genes=ref_genes, prefix='reference', inferred_seqs=inferred_seqs, segment=segment)))
    nearest_ref$SEG_CALL = names(inferred_seqs)
    genotype = merge(genotype, nearest_ref, by='SEG_CALL', all.x=TRUE)

    nearest_ref = data.frame(t(sapply(seq_along(inferred_seqs), find_nearest, ref_genes=ref_genes[names(ref_genes) %in% genotype$SEG_CALL], prefix='host', inferred_seqs=inferred_seqs, segment=segment)))
    nearest_ref$SEG_CALL = names(inferred_seqs)
    genotype = merge(genotype, nearest_ref, by='SEG_CALL', all.x=TRUE)
  }


  genotype$nt_sequence = sapply(genotype$SEG_CALL, function(x) genotype_db[x][[1]])

  genotype = rename(genotype, sequence_id=SEG_CALL, closest_reference=reference_closest, closest_host=host_closest,
                           nt_diff=reference_difference, nt_diff_host=host_difference, nt_substitutions=reference_nt_diffs, aa_diff=reference_aa_difference,
                           aa_substitutions=reference_aa_subs)

  genotype$unmutated_umis = ''
  genotype$nt_sequence_gapped = genotype$nt_sequence
  genotype$nt_sequence = gsub('-', '', genotype$nt_sequence, fixed=TRUE)
  genotype$nt_sequence = gsub('.', '', genotype$nt_sequence, fixed=TRUE)
  genotype = tidyr::unnest(genotype, cols = c(closest_reference, nt_diff, nt_substitutions, aa_diff, aa_substitutions,
                                       closest_host, nt_diff_host, host_nt_diffs, host_aa_difference,
                                       host_aa_subs))

  # If we had to calculate MUT_NC, set the mutated counts to NA for any allele for which we don't have a sequence

  if(calculated_NC && any(is.na(genotype$nt_sequence))) {
    genotype[is.na(genotype$nt_sequence),]$unmutated_frequency = NA
    genotype[is.na(genotype$nt_sequence),]$assigned_unmutated_frequency = NA
    genotype[is.na(genotype$nt_sequence),]$unmutated_sequences = NA
  }

  # Check for duplicate germline sequences

  concat_names = function(x) {
    paste(x, collapse=', ')
  }

  warn_dupes = function(x) {
    report_warn(paste0('Warning: ', x, ' have identical germline sequences.\n'))
  }

  dupes = aggregate(genotype["sequence_id"], by=genotype["nt_sequence"], FUN=concat_names)
  dupes = dupes$sequence_id[grepl(',', dupes$sequence_id, fixed=TRUE)]

  if(length(dupes) > 0) {
    x = lapply(dupes, warn_dupes)
  }

  # Add haplo details
  genotype = add_hap_stats(genotype, hap_gene, haplo_details)

  return(list("input_sequences"=s, "genotype"=genotype, "calculated_NC"=calculated_NC))
}


#' If the haplotyping gene has been specified, add haplotyping ratios to the genotype data frame, provided the ratios for that gene are suitable
#' @param genotype genotype data frame
#' @param hap_gene haplotyping gene for which ratios should be calculated
#' @param haplo_details Data structure created by create_haplo_details
#' @return modified genotype with haplotyping ratios added
#' @noRd
add_hap_stats = function(genotype, hap_gene, haplo_details) {
  a_gene = a_allele = SEG_CALL = haplotyping_ratio = sequence_id = NULL
  sa = haplo_details$sa
  a_props = haplo_details$a_props

  genotype$haplotyping_gene = ''
  genotype$haplotyping_ratio = ''

  if(!is.na(hap_gene)) {
    ap = a_props[a_props$a_gene==hap_gene,]
    ap = ap[order(ap$percent, decreasing=TRUE),]

    if(nrow(ap) < 2 || ap[1,]$percent > 75 || ap[2,]$percent < 20 || (ap[1,]$percent+ap[2,]$percent < 90)) {
      report_note(paste0('Alelleic ratio is unsuitable for haplotyping analysis based on ', hap_gene, '\n'))
    } else
    {
      genotype$haplotyping_gene = hap_gene
      genotype = select(genotype, -c(haplotyping_ratio))

      a1 = ap[1,]$a_allele
      a2 = ap[2,]$a_allele
      report_note(paste0('Haplotyping analysis is based on gene ', hap_gene, ' alleles ', a1, ':', a2, '\n'))
      recs = sa %>% filter(a_gene==hap_gene) %>% filter(a_allele==a1 | a_allele==a2)
      recs = recs %>% select(sequence_id=SEG_CALL, a_allele) %>% group_by(sequence_id, a_allele) %>% summarise(count=n()) %>% tidyr::spread(a_allele, count)
      recs[is.na(recs)] = 0
      names(recs) = c('sequence_id', 'a1', 'a2')
      recs$totals = recs$a1 + recs$a2
      recs$a1 = round(100 * recs$a1/ recs$totals, 0)
      recs$a2 = round(100 * recs$a2/ recs$totals, 0)
      recs$haplotyping_ratio = paste0(' ', recs$a1, ':', recs$a2, ' ')
      recs = recs %>% select(sequence_id, haplotyping_ratio)
      genotype = merge(genotype, recs, all=TRUE)
    }
  }

  return(genotype)
}

#' Write the genotype file required by OGRDB
#' @export
#' @param filename name of file to create (csv)
#' @param segment one of V, D, J
#' @param chain_type one of H, L
#' @param genotype genotype data frame
#' @return None
#' @examples
#' genotype_file = tempfile("ogrdb_genotype")
#' write_genotype_file(genotype_file, 'V', 'H', example_rep$genotype)
#' file.remove(genotype_file)
write_genotype_file = function(filename, segment, chain_type, genotype) {
  closest_reference = closest_host = nt_diff = nt_diff_host = nt_substitutions = aa_diff = NULL
  aa_substitutions = assigned_unmutated_frequency = unmutated_frequency = unmutated_sequences = unmutated_umis = allelic_percentage = unique_ds = sequence_id = NULL
  unique_js = unique_cdr3s = unique_ds_unmutated = unique_js_unmutated = unique_cdr3s_unmutated = haplotyping_gene = haplotyping_ratio = nt_sequence = nt_sequence_gapped = NULL
  closest_host = closest_reference = aa_diff = aa_substitutions = assigned_unmutated_frequency = allelic_percentage = sequences = unique_vs = unique_vs_unmutated = NULL

  genotype = genotype[order_alleles(data.frame(genes=as.character(genotype$sequence_id), stringsAsFactors = FALSE)),]

  if(chain_type == 'H') {
    if(segment == 'V') {
      g = select(genotype, sequence_id, sequences, closest_reference, closest_host, nt_diff, nt_diff_host, nt_substitutions, aa_diff,
               aa_substitutions, assigned_unmutated_frequency, unmutated_frequency, unmutated_sequences, unmutated_umis, allelic_percentage, unique_ds,
               unique_js,unique_cdr3s, unique_ds_unmutated, unique_js_unmutated, unique_cdr3s_unmutated, haplotyping_gene, haplotyping_ratio, nt_sequence, nt_sequence_gapped)
    } else if(segment == 'D') {
      g = select(genotype, sequence_id, sequences, closest_reference, closest_host, nt_diff, nt_diff_host, nt_substitutions, aa_diff,
                 aa_substitutions, assigned_unmutated_frequency, unmutated_frequency, unmutated_sequences, unmutated_umis, allelic_percentage, unique_vs,
                 unique_js,unique_cdr3s, unique_vs_unmutated, unique_js_unmutated, unique_cdr3s_unmutated, haplotyping_gene, haplotyping_ratio, nt_sequence)
    } else if(segment == 'J') {
      g = select(genotype, sequence_id, sequences, closest_reference, closest_host, nt_diff, nt_diff_host, nt_substitutions, aa_diff,
                 aa_substitutions, assigned_unmutated_frequency, unmutated_frequency, unmutated_sequences, unmutated_umis, allelic_percentage, unique_vs,
                 unique_ds,unique_cdr3s, unique_vs_unmutated, unique_ds_unmutated, unique_cdr3s_unmutated, haplotyping_gene, haplotyping_ratio, nt_sequence)
    }
  } else {
    if(segment == 'V') {
      g = select(genotype, sequence_id, sequences, closest_reference, closest_host, nt_diff, nt_diff_host, nt_substitutions, aa_diff,
                 aa_substitutions, assigned_unmutated_frequency, unmutated_frequency, unmutated_sequences, unmutated_umis, allelic_percentage,
                 unique_js,unique_cdr3s, unique_js_unmutated, unique_cdr3s_unmutated, haplotyping_gene, haplotyping_ratio, nt_sequence, nt_sequence_gapped)
    } else if(segment == 'J') {
      g = select(genotype, sequence_id, sequences, closest_reference, closest_host, nt_diff, nt_diff_host, nt_substitutions, aa_diff,
                 aa_substitutions, assigned_unmutated_frequency, unmutated_frequency, unmutated_sequences, unmutated_umis, allelic_percentage, unique_vs,
                 unique_cdr3s, unique_vs_unmutated, unique_cdr3s_unmutated, haplotyping_gene, haplotyping_ratio, nt_sequence)
    }
  }

  g[] = lapply(g, as.character)
  g[is.na(g)] = ''

  utils::write.csv(g, filename, row.names=FALSE)
  return(invisible(NULL))
}
airr-community/ogrdbstats documentation built on Feb. 17, 2025, 5:05 p.m.