R/NestTablesXL_func_20221012.R
In MHCtools: Analysis of MHC Data in Non-Model Species

Documented in NestTablesXL

#' NestTablesXL() function
#'
#' \code{\link{NestTablesXL}} reads the R lists output by the HpltFind() function
#' and translates them to an Excel workbook for more convenient evaluation of the
#' inferred haplotypes and curation of unresolved and incongruent sequences. The
#' workbook contains separate tabs for each nest in the data set and provides an
#' overview of the genotypes of the samples in each nest and the inferred
#' haplotypes.
#'
#' If you publish data or results produced with MHCtools, please cite both of
#' the following references:
#' Roved, J. (2022). MHCtools: Analysis of MHC data in non-model species. Cran.
#' Roved, J. (2024). MHCtools 1.5: Analysis of MHC sequencing data in R. In S.
#' Boegel (Ed.), HLA Typing: Methods and Protocols (2nd ed., pp. 275–295).
#' Humana Press. https://doi.org/10.1007/978-1-0716-3874-3_18
#'
#' @param nest_table is a table containing the sample names of parents and
#'   offspring in each nest. This table should be organized so that the
#'   individual names are in the first column (Sample_ID), and the nest number
#'   is in the second column (Nest). For each nest, the first two rows should be
#'   the parents, followed immediately by the offspring in the subsequent rows,
#'   and then followed by the next nest, and so on. It is assumed that nests are
#'   numbered consecutively beginning at 1. Please use the same table as was used
#'   to generate the haplotypes using HpltFind().
#' @param seq_table seq_table is a sequence table as output by the 'dada2'
#'   pipeline, which has samples in rows and nucleotide sequence variants in
#'   columns. Please use the same table as was used to generate the haplotypes
#'   using HpltFind().
#' @param filepath is a user defined path to the folder where the output files
#'   from the HpltFind() function have been saved.
#' @param path_out is a user defined path to the folder where the output file
#'   will be saved.
#' @return An Excel workbook with individual tabs for each nest in nest_table.
#'   Each tab contains a binary (logical) occurrence matrix with the samples from
#'   each nest in columns and sequences (inherited from seq_table) in rows. The
#'   order of the samples is derived from nest_table, with parents in the two
#'   leftmost columns. Each tab also lists the putative haplotypes inferred by the
#'   HpltFind() function and provides lists of unresolved sequences in haplotypes,
#'   sequences with unidentified decent (i.e., present in parents but not in
#'   offspring), sequences not assigned to haplotypes, and sequences with
#'   unidentified origin (i.e., present in offspring but not in parents).
#'   Note: NestTablesXL() will overwrite any existing file with the output file
#'   name in path_out.
#' @seealso \code{\link{HpltFind}}; for more information about 'dada2' visit
#'   <https://benjjneb.github.io/dada2/>
#' @examples
#' nest_table <- nest_table
#' seq_table <- sequence_table_hplt
#' filepath <- system.file("extdata/HpltFindOut/", package="MHCtools")
#' path_out <- tempdir()
#' NestTablesXL(nest_table, seq_table, filepath, path_out)
#' @importFrom "openxlsx" "addWorksheet" "createWorkbook" "saveWorkbook"
#'   "setColWidths" "writeData" "writeDataTable"
#' @export

NestTablesXL <- function(nest_table, seq_table, filepath, path_out) {

  # Create an Excel workbook
  wb <- createWorkbook()

  # Get the file names of the .Rds output generated by the HpltFind function
  file_names <- dir(filepath)

  # Sort the file names by  nest number
  file_names <- file_names[order(as.numeric(gsub("[^0-9]", "", file_names)))]

  # Define the sequence names
  seq_names <- colnames(seq_table)

  # Define the number of nests in the data set
  No_nests <- max(nest_table$Nest)

  for (i in 1:No_nests) {

    # Create a new sheet in the workbook
    addWorksheet(wb, sheetName = paste(i))

    # Create a vector for the names of the samples in nest i
    Nest_samples <- factor()

    # Get the names of the samples in nest i
    Nest_samples <- nest_table[nest_table[,"Nest"]==i,"Sample_ID"]

    # Create a vector for the names of all the sequences found in any sample from nest i
    Seqs_nest <- factor()

    # Create a list that will contain vectors with the names of the sequences that are found in each sample
    Seqs <- vector("list", length(Nest_samples))

    # Get the names of the sequences that are found in each sample in nest i
    for (j in 1:length(Nest_samples)) {

      # Fetch column numbers for the sequences in sample j in the sequence table
      z <- which(seq_table[paste(Nest_samples[j]),] > 0)

      # Enter the names of the sequences in sample j into the list Seqs
      Seqs[[j]] <- seq_names[z]

      # Enter the names of the sequences in sample j into Seqs_nest
      for (Seq in Seqs[[j]]) {

        # If Seq is not present in Seqs_nest, append Seq to Seqs_nest
        if (Seq %in% Seqs_nest == FALSE) {

          Seqs_nest <- append(Seqs_nest, Seq)

        }

      }

    }

    # Order the sequence names in Seqs_nest
    Seqs_nest <- sort(Seqs_nest)

    # Create a list that will contain vectors with the presence/absense of sequences for each sample
    Seqs_occ <- vector("list", length(Nest_samples))

    # Assign the presence/absence of sequences for each sample to Seqs_occ
    for (j in 1:length(Nest_samples)) {

      Seqs_occ[[j]] <- numeric(length(Seqs_nest))

      for (k in 1:length(Seqs_nest)) {

        # If Seqs_nest[k] is present in Seqs[[j]], assign 1 to Seqs_occ[[j]][k]
        # else assign 0
        if (Seqs_nest[k] %in% Seqs[[j]] == TRUE) {

          Seqs_occ[[j]][k] <- 1

        } else {

          Seqs_occ[[j]][k] <- 0

        }

      }

    }

    # Create an occurrence matrix showing presence/absense of sequences in samples in nest i
    Occ_matrix <- data.frame(sapply(Seqs_occ,c))
    rownames(Occ_matrix) <- Seqs_nest
    colnames(Occ_matrix) <- Nest_samples

    # Add this as a table in the worksheet
    writeDataTable(wb, i, Occ_matrix, startCol = 1, startRow = 1, rowNames = TRUE, colNames = TRUE)

    # Change column width
    setColWidths(wb, i, cols = 1:15, widths = 12)

    # Load the nest lists from the output of HpltFind
    Nest_i <- readRDS(file.path(filepath, file_names[i]))

    # Add "Haplotypes" as title in the worksheet
    writeData(wb, i, x = as.character("Haplotypes"), xy = c(1, (3+length(Seqs_nest))))

    # Add the haplotypes to the worksheet below the occurrence matrix
    Hplt_1 <-  data.frame(Nest_i$Putative_haplotypes[[1]], stringsAsFactors=FALSE)
    colnames(Hplt_1) <- paste0(Nest_samples[1], "A")
    writeDataTable(wb, i, Hplt_1, startRow = (4+length(Seqs_nest)), startCol = 1, rowNames = FALSE, colNames = TRUE)

    Hplt_2 <-  data.frame(Nest_i$Putative_haplotypes[[2]], stringsAsFactors=FALSE)
    colnames(Hplt_2) <- paste0(Nest_samples[1], "B")
    writeDataTable(wb, i, Hplt_2, startRow = (4+length(Seqs_nest)), startCol = 2, rowNames = FALSE, colNames = TRUE)

    Hplt_3 <-  data.frame(Nest_i$Putative_haplotypes[[3]], stringsAsFactors=FALSE)
    colnames(Hplt_3) <- paste0(Nest_samples[2], "A")
    writeDataTable(wb, i, Hplt_3, startRow = (4+length(Seqs_nest)), startCol = 3, rowNames = FALSE, colNames = TRUE)

    Hplt_4 <-  data.frame(Nest_i$Putative_haplotypes[[4]], stringsAsFactors=FALSE)
    colnames(Hplt_4) <- paste0(Nest_samples[2], "B")
    writeDataTable(wb, i, Hplt_4, startRow = (4+length(Seqs_nest)), startCol = 4, rowNames = FALSE, colNames = TRUE)

    # Add "Unresolved sequences" as title in the worksheet
    writeData(wb, i, x = as.character("Unresolved sequences"), xy = c(6, (3+length(Seqs_nest))))

    # Add the unresolved sequences to the worksheet
    Unr_seqs_1 <-  data.frame(Nest_i$Unresolved_seqs[[1]], stringsAsFactors=FALSE)
    colnames(Unr_seqs_1) <- paste(Nest_samples[1])
    writeDataTable(wb, i, Unr_seqs_1, startRow = (4+length(Seqs_nest)), startCol = 6, rowNames = FALSE, colNames = TRUE)

    Unr_seqs_2 <-  data.frame(Nest_i$Unresolved_seqs[[2]], stringsAsFactors=FALSE)
    colnames(Unr_seqs_2) <- paste(Nest_samples[2])
    writeDataTable(wb, i, Unr_seqs_2, startRow = (4+length(Seqs_nest)), startCol = 7, rowNames = FALSE, colNames = TRUE)

    # Add "Unidentified descent" as title in the worksheet
    writeData(wb, i, x = as.character("Unidentified descent"), xy = c(9, (3+length(Seqs_nest))))

    # Add the haplotype sequences not identified in any offspring to the worksheet
    Hplt_1_inc <-  data.frame(Nest_i$Incongruent_seqs[[1]], stringsAsFactors=FALSE)
    colnames(Hplt_1_inc) <- paste0(Nest_samples[1], "A")
    writeDataTable(wb, i, Hplt_1_inc, startRow = (4+length(Seqs_nest)), startCol = 9, rowNames = FALSE, colNames = TRUE)

    Hplt_2_inc <-  data.frame(Nest_i$Incongruent_seqs[[2]], stringsAsFactors=FALSE)
    colnames(Hplt_2_inc) <- paste0(Nest_samples[1], "B")
    writeDataTable(wb, i, Hplt_2_inc, startRow = (4+length(Seqs_nest)), startCol = 10, rowNames = FALSE, colNames = TRUE)

    Hplt_3_inc <-  data.frame(Nest_i$Incongruent_seqs[[3]], stringsAsFactors=FALSE)
    colnames(Hplt_3_inc) <- paste0(Nest_samples[2], "A")
    writeDataTable(wb, i, Hplt_3_inc, startRow = (4+length(Seqs_nest)), startCol = 11, rowNames = FALSE, colNames = TRUE)

    Hplt_4_inc <-  data.frame(Nest_i$Incongruent_seqs[[4]], stringsAsFactors=FALSE)
    colnames(Hplt_4_inc) <- paste0(Nest_samples[2], "B")
    writeDataTable(wb, i, Hplt_4_inc, startRow = (4+length(Seqs_nest)), startCol = 12, rowNames = FALSE, colNames = TRUE)

    # Add "Not assigned to haplotypes" as title in the worksheet
    writeData(wb, i, x = as.character("Not assigned to haplotypes"), xy = c(14, (3+length(Seqs_nest))))

    # Add the sequences found in parents that were not assigned to a haplotype to the worksheet
    Pinc_1 <-  data.frame(Nest_i$Incongruent_seqs$Parents[[1]], stringsAsFactors=FALSE)
    colnames(Pinc_1) <- paste(Nest_samples[1])
    writeDataTable(wb, i, Pinc_1, startRow = (4+length(Seqs_nest)), startCol = 14, rowNames = FALSE, colNames = TRUE)

    Pinc_2 <-  data.frame(Nest_i$Incongruent_seqs$Parents[[2]], stringsAsFactors=FALSE)
    colnames(Pinc_2) <- paste(Nest_samples[2])
    writeDataTable(wb, i, Pinc_2, startRow = (4+length(Seqs_nest)), startCol = 15, rowNames = FALSE, colNames = TRUE)

    # Add "Unidentified origin" as title in the worksheet
    writeData(wb, i, x = as.character("Unidentified origin"), xy = c(17, (3+length(Seqs_nest))))

    # Add the sequences found in offspring, but not identified in any of the parents to the worksheet
    for (j in 1:(length(Nest_samples)-2)) {

      Cinc <-  data.frame(Nest_i$Incongruent_seqs$Chicks[[j]], stringsAsFactors=FALSE)
      colnames(Cinc) <- paste(Nest_samples[(2+j)])
      writeDataTable(wb, i, Cinc, startRow = (4+length(Seqs_nest)), startCol = (16+j), rowNames = FALSE, colNames = TRUE)

    }

  }

  # Save the workbook to a file
  saveWorkbook(wb, file=paste0(path_out, "/Nest_tables_",c(format(Sys.Date(),"%Y%m%d")),".xlsx"), overwrite=TRUE)

}