Nothing
#' Build binary presence/absence matrix of sporulation genes
#'
#' Transforms the output of `sporulation_gene_name()` into a wide-format matrix
#' indicating the presence (1) or absence (0) of each sporulation-associated
#' gene per genome.
#'
#' Gene names are sanitised before reshaping: every character other than an
#' ASCII letter, a digit or `_` is replaced by `_`. Names that collapse to the
#' same identifier therefore share a single column. Genes from the built-in
#' reference list that do not occur in `df` are added as all-zero columns;
#' genes that occur in `df` but are not in the reference list are kept.
#' Any grouping on `df` is dropped before reshaping.
#'
#' @param df A data.frame from `sporulation_gene_name()` with columns
#'   `genome_ID` and `spo_gene_name`.
#'
#' @return A wide-format binary matrix with genomes in rows and genes in
#'   columns: `genome_ID` comes first, followed by the gene columns in sorted
#'   order, each holding 1 (present) or 0 (absent).
#' @import dplyr
#' @importFrom tidyr pivot_wider
#'
#' @examples
#' # Load package
#' library(SpoMAG)
#'
#' # Load example annotation tables
#' file_spor <- system.file("extdata", "one_sporulating.csv.gz", package = "SpoMAG")
#' file_aspo <- system.file("extdata", "one_asporogenic.csv.gz", package = "SpoMAG")
#'
#' # Read files
#' df_spor <- readr::read_csv(file_spor, show_col_types = FALSE)
#' df_aspo <- readr::read_csv(file_aspo, show_col_types = FALSE)
#'
#' # Step 1: Extract sporulation-related genes
#' genes_spor <- sporulation_gene_name(df_spor)
#' genes_aspo <- sporulation_gene_name(df_aspo)
#'
#' # Step 2: Convert to binary matrix
#' bin_spor <- build_binary_matrix(genes_spor)
#' bin_aspo <- build_binary_matrix(genes_aspo)
#'
#' @export
build_binary_matrix <- function(df) {
  # Fail early with a clear message instead of an opaque tidyselect error.
  if (!is.data.frame(df)) {
    stop("`df` must be a data.frame.", call. = FALSE)
  }
  absent_cols <- setdiff(c("genome_ID", "spo_gene_name"), colnames(df))
  if (length(absent_cols) > 0) {
    stop(
      "`df` is missing required column(s): ",
      paste(absent_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Reference gene set: every name listed here is guaranteed to be a column of
  # the result. The spelling (including case) is kept exactly as is.
  required_genes <- c(
    "spo0A", "sigH", "spoIIE", "spoIIIE", "spoIIIJ", "pth", "spoVG", "spoVS", "divIC",
    "divIB", "divIVA", "ftsA", "ftsE", "ftsH", "ftsX", "ftsY", "ftsZ", "jag", "minC",
    "minD", "spo0B", "spo0F", "ald", "obg", "ftsL", "ymcA", "ylbF", "yaaT", "sda",
    "sigE", "sigF", "sigG", "spoIIAA", "spoIIAB", "spoIIGA", "parA", "soj", "parB", "spoIID",
    "spoIIM", "spoIIP", "spoIIQ", "spoIIIAA", "spoIIIAB", "spoIIIAC", "spoiiiAD", "spoIIIAE", "spoIIIAF", "spoIIIAG",
    "spoIIIAH", "spoIIB", "yunB", "sigK", "spoIIID", "spoIVFB", "spoVB", "spoVK", "ftsW", "bofA",
    "alr", "dacB", "spmA", "spmB", "yisY", "ylmC", "ytaF", "ytvI", "ctpB", "spoIVFA",
    "ykvI", "yqfU", "ydca", "ydcc", "yhbh", "spoIIR", "spoIVB", "spoVT", "dacF", "ytfJ",
    "yloC", "rsfA", "fin_yabk", "ymfJ", "yqhG", "ywzB", "spoVAC", "spoVAD", "nfo", "mglA_sspA",
    "sspB", "spoVAA", "spoVAF", "sspH", "tlp", "spoVAEB", "spoVAB", "sspC", "sspI", "sspF",
    "spoVFA", "spoVFB", "spoIVA", "cotJC", "cotSA", "gerM", "safA", "ydhD", "yhaX", "cotJB",
    "yhjR", "spoVID", "cotE", "hsps", "spoVD", "ylbJ", "cwlC_cwlD", "lytH", "yabP", "yabQ",
    "yqfC", "yqfD", "gerA", "gpr", "CSD", "gdh", "ypeB", "gerC", "lgt", "gerD",
    "gerE"
  )

  df_bin <- df %>%
    # Drop any grouping so that `spo_gene_name` can always be rewritten below.
    ungroup() %>%
    select(genome_ID, spo_gene_name) %>%
    mutate(
      # Sanitise BEFORE pivoting: names that collapse to the same identifier
      # (e.g. "cwlC/cwlD" and "cwlC_cwlD") must end up in one column rather
      # than in two columns that share a name.
      spo_gene_name = gsub("[^a-zA-Z0-9_]", "_", as.character(spo_gene_name)),
      present = 1
    ) %>%
    # One row per genome/gene pair, so every pivoted cell has a single value.
    distinct() %>%
    tidyr::pivot_wider(
      names_from = spo_gene_name,
      values_from = present,
      values_fill = list(present = 0)
    )

  # Pad reference genes that were never observed with explicit zero columns.
  # The vector length is spelled out so that zero-row results work as well.
  missing_genes <- setdiff(required_genes, colnames(df_bin))
  for (gene in missing_genes) {
    df_bin[[gene]] <- rep(0, nrow(df_bin))
  }

  # `genome_ID` first, then gene columns in sorted order. List-style `[`
  # selects columns without ever dropping to a vector.
  gene_cols <- sort(setdiff(colnames(df_bin), "genome_ID"))
  df_bin[c("genome_ID", gene_cols)]
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.