inst/RefDB/UHGG_2.0.1/genome_AA.R

# Read and sum amino acid composition of all proteins for each genome 20221022
# Script modified for UHGG from GTDB/genome_AA.R in chem16S 20231230
genome_AA <- function() {

  # Read taxonomy file
  # ./taxonomy.csv.xz has 2350 genomes with Contamination < 2 & Completeness > 95
  # fullset/taxonomy.csv.xz has all 4744 genomes in UHGG v2.0.1
  taxonomy <- read.csv("taxonomy.csv.xz")
  # Get genome names (accessions)
  genome <- taxonomy$genome
  # List xz-ipped files previously downloaded using commands in getMGnify.R
  files <- file.path("genomes", paste0(genome, ".faa.xz"))

  # Loop over FASTA files (one for each genome)
  ifile <- seq_along(files)
  aa <- lapply(ifile, function(i) {
    # Print progress message
    if(i %% 100 == 0) print(i)
    # Read amino acid composition 
    aa <- suppressMessages(canprot::read_fasta(files[i]))
    # Sum amino acid composition
    canprot::sum_aa(aa)
  })
  aa <- do.call(rbind, aa)

  # Put in full genome names (with version suffix .1, .2, etc.)
  aa$organism <- genome
  # Put in species names 20240104
  aa$ref <- taxonomy$species
  # Use generic protein name
  aa$protein <- "MGnify"

  # Save result
  write.csv(aa, "genome_AA.csv", row.names = FALSE, quote = FALSE)

}
jedick/JMDplots documentation built on April 12, 2025, 1:35 p.m.