cogeqc: Systematic quality checks on comparative genomics analyses

Data in inst/extdata/

The gene family file was downloaded from PLAZA Dicots 5.0 and converted to an OrthoFinder-like format with the following code:

library(dplyr)

# Read the PLAZA gene family table, skipping the first two lines of the file.
fam <- readr::read_tsv("~/Downloads/genefamily_data.HOMFAM.csv.gz", skip = 2)

# Keep only the 'ath' and 'bol' rows and use the OrthoFinder column name for
# the family identifier.
fam_brassicaceae <- fam %>%
    filter(species %in% c("ath", "bol")) %>%
    rename(Orthogroup = `#gf_id`)

# Collapse the genes of one species into a single ", "-separated string per
# orthogroup.
#
# families:   table with columns `species`, `Orthogroup`, and `gene_id`.
# species_id: value of `species` to keep (e.g. "ath").
# column:     name of the output column holding the collapsed gene IDs.
#
# Returns a two-column table (Orthogroup, <column>) with one row per
# orthogroup, in order of first appearance in `families`.
collapse_genes <- function(families, species_id, column) {
    collapsed <- families %>%
        filter(species == species_id) %>%
        group_by(Orthogroup) %>%
        mutate(genes = paste(gene_id, collapse = ", ")) %>%
        ungroup() %>%
        # mutate() + distinct() (rather than a grouped summarise()) keeps the
        # orthogroups in their original order, which determines the rows
        # exported below.
        distinct(Orthogroup, .keep_all = TRUE) %>%
        select(Orthogroup, genes)
    names(collapsed)[names(collapsed) == "genes"] <- column
    collapsed
}

fam_ath <- collapse_genes(fam_brassicaceae, "ath", "Ath")
fam_bol <- collapse_genes(fam_brassicaceae, "bol", "Bol")

# Keep orthogroups present in both species; the join key is stated explicitly
# instead of relying on the natural-join default.
fam_final <- inner_join(fam_ath, fam_bol, by = "Orthogroup")

# Export the first 10 orthogroups. head() does not pad the output with NA rows
# when fewer than 10 rows are available.
readr::write_tsv(
    head(fam_final, 10),
    file = here::here("inst", "extdata", "Orthogroups.tsv")
)

# Bash: compress the exported table (stored in inst/extdata/ as Orthogroups.tsv.gz)
cd inst/extdata
gzip Orthogroups.tsv

Here, we will run BUSCO in a Conda environment created with a temporary installation of miniconda. This temporary miniconda installation is only possible thanks to the Herper package.

# Download Ostreococcus tauri's genome into the session's temporary directory
ota_genome <- file.path(tempdir(), "ota_genome.fasta.gz")
download.file(
    "ftp://ftp.psb.ugent.be/pub/plaza/plaza_pico_03/Genomes/ota.fasta.gz",
    destfile = ota_genome
)

# Decompress the download. The path is shell-quoted because system2() hands
# its arguments to a shell, where a temp path containing spaces would
# otherwise be split into several arguments.
system2("gunzip", args = shQuote(ota_genome))

# Point `ota_genome` at the decompressed file. The pattern is anchored so that
# only the trailing ".gz" extension is removed, never a ".gz" that happens to
# occur elsewhere in the temporary path.
ota_genome <- sub("\\.gz$", "", ota_genome)

# Choose BUSCO dataset (passed as `lineage` to run_busco() when BUSCO is run)
dataset <- "chlorophyta_odb10"

# Install miniconda in a temporary directory
# `miniconda_path` lives under tempdir(), so the installation is scoped to this
# R session's temporary directory; `env` is the name of the Conda environment
# that receives the "busco" tool.
library(Herper)
miniconda_path <- file.path(tempdir(), "temp_miniconda")
env <- "busco_env"
install_CondaTools(tools = "busco", 
                   env = env, 
                   pathToMiniConda = miniconda_path)

# Test if it is working
# Calls `busco --list-datasets` via with_CondaEnv() and captures its output
# (stdout = TRUE) -- presumably the command runs with `env` activated; see the
# Herper documentation to confirm.
with_CondaEnv(env,
              system2(command = "busco", args = "--list-datasets", stdout = TRUE),
              pathToMiniConda = miniconda_path)


# Run BUSCO on Ostreococcus tauri's genome
# Genome mode, with the lineage chosen in `dataset`, inside the Conda
# environment `env` of the temporary miniconda installation. Output is labelled
# "ota" and written under ~/Documents; BUSCO datasets are downloaded to
# ~/Documents/busco_datasets.
run_busco(sequence = ota_genome, outlabel = "ota", mode = "genome",
          lineage = dataset, threads = 2, outpath = "~/Documents", 
          download_path = "~/Documents/busco_datasets",
          envname = env, miniconda_path = miniconda_path, force = TRUE)

# Copy the run's short summary into the package as inst/extdata/short_summary.txt
fs::file_copy("~/Documents/ota/run_chlorophyta_odb10/short_summary.txt", 
              here::here("inst", "extdata", "short_summary.txt"))

This file contains the first 1,001 lines (extracted with `head -n 1001`) of the Herbaspirillum seropedicae SmR1 (GCA_000143225) genome, which was downloaded from Ensembl Bacteria.

# Bash
# Write the first 1001 lines of Hse.fa to Hse_subset.fa
head -n 1001 Hse.fa > Hse_subset.fa

Examples of files containing OrthoFinder’s per-species statistics. These files were downloaded from https://bioinformatics.plants.ox.ac.uk/davidemms/public_data/Results_model_species.tar.gz and copied to inst/extdata/ with:

# Copy each OrthoFinder statistics table into inst/extdata/, one file per call.
invisible(lapply(
    c(
        "~/Downloads/Results_orthofinder_example/Comparative_Genomics_Statistics/Statistics_PerSpecies.tsv",
        "~/Downloads/Results_orthofinder_example/Comparative_Genomics_Statistics/Duplications_per_Species_Tree_Node.tsv",
        "~/Downloads/Results_orthofinder_example/Comparative_Genomics_Statistics/Orthogroups_SpeciesOverlaps.tsv"
    ),
    function(stats_file) {
        fs::file_copy(stats_file, here::here("inst", "extdata"))
    }
))

Data in data/

# Build the long-format orthogroup table from the gene family table `fam`:
# keep the 'ath' and 'bol' rows, rename the columns to Orthogroup / Species /
# Gene, and store the result as a plain data frame.
og <- as.data.frame(
    dplyr::rename(
        dplyr::filter(fam, species %in% c("ath", "bol")),
        Orthogroup = `#gf_id`,
        Species = species,
        Gene = gene_id
    )
)

# Capitalise the species abbreviations ("ath" -> "Ath", then "bol" -> "Bol").
og$Species <- gsub("bol", "Bol", gsub("ath", "Ath", og$Species))

# Save `og` as package data, compressed with xz.
usethis::use_data(og, compress = "xz")

# Fetch the InterPro annotation table for 'ath' from PLAZA Dicots 5.0
download.file(
    "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/InterPro/interpro.ath.csv.gz",
    destfile = "~/Downloads/interpro.ath.csv.gz"
)

# Read the tab-separated file (skipping its first 8 lines), then keep columns
# 1, 3, and 4 under descriptive names
interpro_ath <- read.csv(
    "~/Downloads/interpro.ath.csv.gz",
    sep = "\t",
    skip = 8
)
interpro_ath <- interpro_ath[, c(1, 3, 4)]
names(interpro_ath) <- c("Gene", "Annotation", "Description")

# Restrict to genes that occur in the `og` orthogroup table, and drop the
# description column before saving
data(og)
interpro_ath <- interpro_ath[
    interpro_ath$Gene %in% og$Gene,
    c("Gene", "Annotation")
]

# Store as package data
usethis::use_data(interpro_ath, compress = "xz", overwrite = TRUE)

# Fetch the InterPro annotation table for 'bol' from PLAZA Dicots 5.0
download.file(
    "https://ftp.psb.ugent.be/pub/plaza/plaza_public_dicots_05/InterPro/interpro.bol.csv.gz",
    destfile = "~/Downloads/interpro.bol.csv.gz"
)

# Read the tab-separated file (skipping its first 8 lines), then keep columns
# 1, 3, and 4 under descriptive names
interpro_bol <- read.csv(
    "~/Downloads/interpro.bol.csv.gz",
    sep = "\t",
    skip = 8
)
interpro_bol <- interpro_bol[, c(1, 3, 4)]
names(interpro_bol) <- c("Gene", "Annotation", "Description")

# Restrict to genes that occur in the `og` orthogroup table, and drop the
# description column before saving
data(og)
interpro_bol <- interpro_bol[
    interpro_bol$Gene %in% og$Gene,
    c("Gene", "Annotation")
]

# Store as package data
usethis::use_data(interpro_bol, compress = "xz", overwrite = TRUE)

This object contains BUSCO’s summary output for batch mode with the following genomes:

Herbaspirillum seropedicae SmR1 (GCA_000143225)
Herbaspirillum rubrisubalbicans M1 (GCA_001483945)

Both genomes were downloaded from Ensembl Bacteria. After downloading, FASTA files were gunzipped and stored in the directory ~/Documents/Herbaspirillum_genomes.

# Directory holding the gunzipped Herbaspirillum FASTA files; passing a
# directory as `sequence` is what triggers BUSCO's batch mode here
# (NOTE(review): confirm against the run_busco() documentation).
sequence <- "~/Documents/Herbaspirillum_genomes"

# BUSCO datasets are downloaded to a subdirectory of the session's temporary
# directory. The path is built with file.path() rather than by pasting a
# hard-coded separator.
download_path <- file.path(tempdir(), "datasets")

# Run BUSCO in genome mode against the Burkholderiales lineage, writing the
# results to tempdir().
run_busco(
    sequence = sequence,
    outlabel = "Herbaspirillum",
    mode = "genome",
    lineage = "burkholderiales_odb10",
    outpath = tempdir(),
    download_path = download_path
)

# Parse the BUSCO summary found in tempdir() and save it as package data.
batch_summary <- read_busco(tempdir())
usethis::use_data(batch_summary, compress = "xz", overwrite = TRUE)

This is a species tree for model species retrieved from OrthoFinder’s example results, available at https://bioinformatics.plants.ox.ac.uk/davidemms/public_data/Results_model_species.tar.gz.

# Path to the species tree file from the extracted OrthoFinder example results
# (rooted, with node labels, going by the file name)
tree_file <- "~/Downloads/Results_orthofinder_example/Species_Tree/SpeciesTree_rooted_node_labels.txt"
# Parse the tree file with treeio
tree <- treeio::read.tree(tree_file)

# Save `tree` as package data (xz-compressed), replacing any existing copy
usethis::use_data(tree, compress = "xz", overwrite = TRUE)

The synteny network of 107 angiosperms was downloaded from this link, which is associated with the publication "Network-based Microsynteny Analysis Identifies Major Differences and Genomic Outliers in Mammalian and Angiosperm Genomes". To keep the package size small, the network was filtered to keep only Brassica species (B. rapa, B. napus, and B. oleracea).

# Read the headerless network file and keep its third and fourth columns as a
# plain data frame with columns `anchor1` and `anchor2`
synnet <- readr::read_tsv(
    "~/Downloads/107Plant-SynNet-b5s5m25.gz",
    col_names = FALSE
)
synnet <- setNames(
    as.data.frame(synnet[, c(3, 4)]),
    c("anchor1", "anchor2")
)

# List every species abbreviation (first three characters of each anchor ID)
species <- unique(c(
    substr(synnet$anchor1, 1, 3),
    substr(synnet$anchor2, 1, 3)
))
sort(species)

# Keep only edges whose two anchors both start with 'bol', 'bra', or 'bnp'
synnet <- synnet[
    grepl("^(bol|bra|bnp).*", synnet$anchor1) &
        grepl("^(bol|bra|bnp).*", synnet$anchor2),
]
rownames(synnet) <- NULL

# Save the filtered network as package data
usethis::use_data(synnet, compress = "xz", overwrite = TRUE)

almeidasilvaf/cogeqc documentation built on Jan. 29, 2024, 7:20 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

almeidasilvaf/cogeqc
Systematic quality checks on comparative genomics analyses

inst/script/data_creation.md
In almeidasilvaf/cogeqc: Systematic quality checks on comparative genomics analyses

Data in inst/extdata/

Orthogroups.tsv.gz

short_summary.txt (BUSCO output)

Hse_subset.fa

Statistics_PerSpecies.tsv, Duplications_per_Species_Tree_Node.tsv and Orthogroups_SpeciesOverlaps.tsv

Data in data/

og.rda

interpro_ath.rda

interpro_bol.rda

batch_summary.rda (BUSCO batch mode)

tree.rda

synnet.rda

R Package Documentation

Browse R Packages

We want your feedback!

almeidasilvaf/cogeqc Systematic quality checks on comparative genomics analyses

inst/script/data_creation.md In almeidasilvaf/cogeqc: Systematic quality checks on comparative genomics analyses

Data in inst/extdata/

Orthogroups.tsv.gz

short_summary.txt (BUSCO output)

Hse_subset.fa

Statistics_PerSpecies.tsv, Duplications_per_Species_Tree_Node.tsv and Orthogroups_SpeciesOverlaps.tsv

Data in data/

og.rda

interpro_ath.rda

interpro_bol.rda

batch_summary.rda (BUSCO batch mode)

tree.rda

synnet.rda

R Package Documentation

Browse R Packages

We want your feedback!

almeidasilvaf/cogeqc
Systematic quality checks on comparative genomics analyses

inst/script/data_creation.md
In almeidasilvaf/cogeqc: Systematic quality checks on comparative genomics analyses