#' @import dplyr
#' @import dbplyr
#' @import stringr
#' @import readxl
#' @import readr
#' @import tibble
# Notes:
# biocLite("Biostrings")
# library(Biostrings) # To manipulate DNA sequences
# -- The problem of Biostrings is that it writes the file as Linux (LF only),
# -- so one solution is to use sequinr (writes LF/CR)
# -- the other solution is to replace line feed by linefeed + CR [this the chosen solution]
# library(seqinr) # Used to write fasta files - NOT USED
# To do:
# * Use the dbplyr library to build queries...
# source("C:/Users/vaulot/Google Drive/Scripts/R library/dv_function_db.R")
# source("C:/Users/vaulot/Google Drive/Scripts/R library/dv_function_genbank.R")
# source("C:/Users/vaulot/Google Drive/Scripts/R library/dv_function_misc.R")
# pr2_read -------------------------------------------------------
#' @title Reads the whole PR2 database into a data frame
#' @description
#' The database access codes is obtained with the function db_info("pr2_google").
#'
#' **Important**: This gets the whole database, including sequences that have been removed. These sequences must be filtered out before export using pr2 <- pr2 %>% filter (is.na(removed_version)). However the taxo entries that have been removed are already removed.
#' @param taxo_levels_number integer - number of taxonomic levels, 8 or 9
#' @return
#' A data frame with all the columns from pr2.
#' @examples
#' pr2 <- pr2_read(taxo_levels_number = 8)
#' @export
#' @md
#'
pr2_read <- function(taxo_levels_number = 8) {
# This the level set not used (i.e. 8 if use 9 and conversely)
taxo_levels_number_not_used = ifelse(taxo_levels_number == 8, 9, 8)
# Ending for the taxonomic levels ("_8" or "_9")
ending = str_c("_",taxo_levels_number) # Ending of taxonomic fields
ending_removed = str_c("_",taxo_levels_number_not_used) # Ending of taxonomic fields
taxo_levels<- list()
taxo_levels[[8]] = c("kingdom", "supergroup", "division", "class", "order", "family", "genus", "species")
taxo_levels[[9]] = c("domain", "supergroup", "division", "subdivision", "class", "order", "family", "genus", "species")
print("Using pr2_google")
# Info for the Google dabase
pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>%
collect() %>%
rename_with(~ str_replace(.,ending, ""), contains(ending)) %>%
select(-contains(ending_removed))%>%
# select(-contains("edited")) %>%
select(-contains("_id"))
pr2_seq <- tbl(pr2_db_con, "pr2_sequences") %>%
collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
filter (is.na(taxo_removed_version)) %>%
collect()
pr2_taxo <- pr2_taxo %>%
rename_with(~ str_replace(.,ending, ""), contains(ending)) %>%
select(-contains(ending_removed)) %>%
select(any_of(taxo_levels[[taxo_levels_number]])) %>%
distinct() # Very important because some levels are replicated because 2 species_8 may correspond to the same species_9
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata") %>%
collect() %>%
select(-contains("_id"))
pr2_countries <- tbl(pr2_db_con, "pr2_countries") %>%
collect()
pr2_assign_bayes <- tbl(pr2_db_con, "pr2_assign_bayes") %>%
collect()
pr2_assign_silva <- tbl(pr2_db_con, "pr2_assign_silva") %>%
collect() %>%
select(pr2_accession, silva_taxonomy)
# Join the tables and keep only sequences that are not removed
pr2 <- pr2_main %>%
left_join(pr2_taxo, by = c("species"="species")) %>%
left_join(pr2_seq) %>%
left_join(pr2_metadata) %>%
left_join(pr2_countries) %>%
left_join(pr2_assign_bayes) %>%
left_join(pr2_assign_silva) %>%
relocate(!!taxo_levels[[taxo_levels_number]], .after = pr2_accession) %>%
mutate(organelle_code = case_when((organelle == "nucleus" | is.na(organelle)) ~ "",
TRUE ~ str_sub(organelle, 1,4) )) %>%
mutate_at(taxo_levels[[taxo_levels_number]], ~str_c(.x, organelle_code, sep = ":")) %>% # Add the organelle at end of taxo
mutate_at(taxo_levels[[taxo_levels_number]], ~str_replace(.x, ":$", "")) %>% # Remove trailing :
select_if(~!all(is.na(.))) %>% # remove empty columns
select(-gb_sequence) # remove gb_sequence
db_disconnect(pr2_db_con)
return(pr2)
}
# pr2_taxo_read -------------------------------------------------------
#' @title Reads the PR2 taxonomic database into a data frame
#' @description
#' The database access codes is obtained with the function db_info("pr2_google").
#'
#' **Important**: This gets the current taxonomic database and does not include taxa that have been removed.
#' @return
#' A data frame
#' @examples
#' pr2_taxo <- pr2_taxo_read()
#' @export
#' @md
#'
pr2_taxo_read <- function() {
print("Using pr2_google")
# Read the PR2 full tables including removed records
pr2_db_con <- db_connect(db_info("pr2_google"))
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
filter (is.na(taxo_removed_version))%>%
collect() %>%
select(-contains("removed"))
db_disconnect(pr2_db_con)
return(pr2_taxo)
}
# pr2_traits_read -------------------------------------------------------
#' @title Reads the PR2 traits database into a data frame
#' @description
#' The database access codes is obtained with the function db_info("pr2_google").
#'
#' **Important**: This gets the current taxonomic database and does not include taxa that have been removed.
#' @return
#' A data frame
#' @examples
#' pr2_traits <- pr2_traits_read()
#' @export
#' @md
#'
pr2_traits_read <- function() {
print("Using pr2_google")
# Read the PR2 full table including removed records
pr2_db_con <- db_connect(db_info("pr2_google"))
pr2_traits <- tbl(pr2_db_con, "pr2_traits") %>%
collect()
db_disconnect(pr2_db_con)
return(pr2_traits)
}
# pr2_colors_read -------------------------------------------------------
#' @title Reads the PR2 color into a data frame
#' @description
#' The database access codes is obtained with the function db_info("pr2_google").
#'
#' **Important**: This gets the current taxonomic database and does not include taxa that have been removed.
#' @return
#' A data frame with all the columns from pr2_colors.
#' @examples
#' pr2_colors <- pr2_colors_read()
#' @export
#' @md
#'
pr2_colors_read <- function() {
print("Using pr2_google")
# Read the PR2 full tables including removed records
pr2_db_con <- db_connect(db_info("pr2_google"))
pr2_colors <- tbl(pr2_db_con, "pr2_colors") %>%
collect()
db_disconnect(pr2_db_con)
return(pr2_colors)
}
# pr2_sequence_label ------------------------------
#' @title Create a simple label for a sequence
#' @description
#' Create a label to be used as header a fasta file based on the metadata from the table *pr2_metadata*. Only apply to a single character not to a vector.
#' @param sample_type character - one of "culture", "environmental","isolate", "species"
#' @param strain character - strain name
#' @param clone character - clone name
#' @param specimen_voucher character - voucher
#' @return character
#' @examples
#' fasta_label <- pr2_sequence_label(sample_type="culture",strain="RCC115")
#' @export
#' @md
pr2_sequence_label <- function(sample_type, strain, clone, specimen_voucher) {
# Note : must use str_replace_na() to make sure that no value is NA, because if a string is NA then str_c will return NA
sequence_label <- case_when(is.na(sample_type) ~ "",
sample_type == "culture" ~ str_c("strain_",str_replace_na(strain, replacement="")),
sample_type == "environmental" ~ str_c("clone_",str_replace_na(clone, replacement="")),
sample_type == "isolate" ~ "",
sample_type == "specimen" ~ str_c("specimen_",str_replace_na(specimen_voucher, replacement="")),
TRUE ~ "")
return(sequence_label)
}
# pr2_export -----------------------------------------
#' @title Export the PR2 database (one file)
#' @description
#' This will save the pr2 database in variety of format. The files are compressed as .gz
#' * "fasta" - the description line is taylored for different applications (usearch, mothur, dada2, blast). for the application dada2 2 kinds of files can be produced one for otu assignement and one for species
#' * "taxo" - the file is saved with the mothur format
#' * "metadata" - only the metadata
#' * "merged" - the whole database in a single file
#'
#' Notes
#' * The data are NOT filtered in any way, this should be done before saving
#' * The following fields are needed to create the sequence labels :
#' * pr2_sample_type, gb_strain, gb_clone, gb_specimen_voucher
#' * gene, organelle
#' @param pr2_select data frame - the pr2 database or an extract of the pr2 database
#' @param file_name character - full path of file where to save
#' @param file_type character - one of "fasta", "taxo", "metadata", "merged", "merged_excel"
#' @param file_format character - one of "UTAX", "fasta_taxo_short","fasta_taxo_long","dada2", "dada2_species","mothur"
#' @return
#' Write the files in compressed format (.gz)
#' @examples
#' pr2_export(pr2, "C:/Daniel/myfile.fas", "fasta", "mothur", 9)
#' @export
#' @md
pr2_export <- function(pr2_select, file_name, file_type="fasta", file_format="fasta_taxo_long", taxo_levels_number = 9 ) {
# These lines are for testing and should be commented out
# pr2 <- pr2_env_not_updated
# file_name <- "pr2_env_not_updated.fasta"
# level_select="kingdom"
# taxo_select="Eukaryota"
# file_type="fasta"
# file_format="fasta_taxo_long"
pr2_seq_out <- Biostrings::DNAStringSet(pr2_select$sequence) # Store the sequence in a DNAString - not used anymore
# Use sequinr (sequences are stored as list)
# pr2_seq_out <- as.list(pr2_select$sequence)
# Remove any space from Strain, Clone name, Specimen_voucher
pr2_select <- pr2_select %>%
mutate (gb_strain = str_replace_all(gb_strain, " ", "_"),
gb_clone = str_replace_all(gb_clone, " ", "_"),
gb_specimen_voucher = str_replace_all(gb_specimen_voucher, " ", "_"),
# Create a nice label based on the type of sequence
# if the organelle is empty replace by empty string to avoid NA...
sequence_label = pr2_sequence_label (pr2_sample_type,
gb_strain,
gb_clone,
gb_specimen_voucher),
organelle = str_replace_na(organelle, replacement = ""),
name_taxo_short = str_c(pr2_accession,
gene,
organelle,
sequence_label,
species,
sep="|"),
name_dada2_species = str_c(genbank_accession,
str_replace(species,"_"," "),
sep=" ")
)
if (taxo_levels_number == 9) {
pr2_select <- pr2_select %>%
mutate ( name_UTAX = str_c(pr2_accession,";tax=k:",
# kingdom,",d:",
domain,",d:",
supergroup,",p:",
division,",c:",
class,",o:",
order,",f:",
family,",g:",
genus,",s:",
species,
sep=""),
name_taxo_long = str_c(pr2_accession,
gene,
organelle,
sequence_label,
domain,
supergroup,
division,
subdivision,
class,
order,
family,
genus,
species,
sep="|"),
name_dada2 = str_c(domain,
supergroup,
division,
subdivision,
class,
order,
family,
genus,
species,
"",
sep=";")
)
} else {
pr2_select <- pr2_select %>%
mutate ( name_UTAX = str_c(pr2_accession,";tax=k:",
kingdom,",d:",
supergroup,",p:",
division,",c:",
class,",o:",
order,",f:",
family,",g:",
genus,",s:",
species,
sep=""),
name_taxo_long = str_c(pr2_accession,
gene,
organelle,
sequence_label,
kingdom,
supergroup,
division,
class,
order,
family,
genus,
species,
sep="|"),
name_dada2 = str_c(kingdom,
supergroup,
division,
class,
order,
family,
genus,
species,
"",
sep=";")
)
}
switch(file_type,
fasta = {
switch(file_format,
UTAX = {names(pr2_seq_out) <- pr2_select$name_UTAX},
fasta_taxo_short= {names(pr2_seq_out) <- pr2_select$name_taxo_short},
fasta_taxo_long= {names(pr2_seq_out) <- pr2_select$name_taxo_long},
dada2= {names(pr2_seq_out) <- pr2_select$name_dada2},
dada2_species= {names(pr2_seq_out) <- pr2_select$name_dada2_species},
mothur= {names(pr2_seq_out) <- pr2_select$pr2_accession
} )
# When using DNA strings only LF added (Unix convention)
# Set width to max possible so that no carriage return (causes problems with dada2)
Biostrings::writeXStringSet(pr2_seq_out, file_name, compress=TRUE, width = 20000)
# Convert to dos
# unix2dos(file_name)
# next line is when using seqinr
# write.fasta(sequences=pr2_seq_out, names=names(pr2_seq_out), file.out=file_name, as.string = TRUE)
},
taxo = {
# mothur taxonomy file terminates with semi column at end of line
if (taxo_levels_number == 9) {
pr2_select <- pr2_select %>%
mutate(taxo_string = str_c(domain, supergroup, division, subdivision,
class, order, family, genus, species,"",
sep=";") )
}
else {
pr2_select <- pr2_select %>%
mutate(taxo_string = str_c(kingdom, supergroup, division,
class, order, family, genus, species,"",
sep=";") )
}
pr2_select_taxo <- pr2_select %>%
select(pr2_accession, taxo_string)
write.table(pr2_select_taxo, gzfile(file_name),
col.names = FALSE, row.names=FALSE,
sep="\t", quote= FALSE)
},
metadata = {
pr2_select_metadata <- pr2_select %>% select(pr2_accession,
genbank_accession,
starts_with("gb_"),
starts_with("pr2_"),
starts_with("eukref_"),
starts_with("silva_"),
pubmed_id,
metadata_remark)
# write.table(pr2_select_metadata, gzfile(file_name),
# col.names = TRUE, row.names=FALSE,
# sep="\t", quote= FALSE, na="")
write_tsv(pr2_select_metadata, gzfile(file_name))
},
merged = {
write.table(pr2_select, gzfile(file_name),
col.names = TRUE, row.names=FALSE,
sep="\t", quote= FALSE, na="")
},
merged_excel = {
if (taxo_levels_number == 9) {
pr2_select <- pr2_select %>%
arrange(domain, supergroup, division, subdivision, class, order, family, genus, species)
} else {
pr2_select <- pr2_select %>%
arrange(kingdom, supergroup, division, class, order, family, genus, species)
}
pr2_select %>% select(-starts_with("name")) %>%
openxlsx::write.xlsx(file_name, zoom = 90, firstRow = TRUE, firstCol = TRUE)
}
)
}
# pr2_export SQLite -----------------------------------------
#' @title Export the PR2 database to a SQLite file
#' @description
#' This will save the pr2 database in SQLite databas as 4 tables (pr2_main, pr2_sequences, pr2_metadata, pr2_taxo)
#' @param file_name character - full path of file where to save
#' @return
#' TRUE is succesful
#' @examples
#' pr2_export_sqlite("pr2_google.sqlite")
#' @export
#' @md
pr2_export_sqlite <- function(file_name) {
# Read the PR2 full tables including removed records
pr2_db_con <- db_connect(db_info("pr2_google"))
pr2_main <- tbl(pr2_db_con, "pr2_main") %>%
filter (is.na(removed_version))%>%
collect() %>%
rename(species = species_8) %>%
select(-contains("_9"))
pr2_seq <- tbl(pr2_db_con, "pr2_sequences") %>%
collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
filter (is.na(taxo_removed_version))%>%
collect() %>%
rename_with(~ str_replace(.,"_8", ""), contains("_8")) %>%
select(-contains("_9"))
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>%
collect()
# pr2_countries <- tbl(pr2_db_con, "pr2_countries")%>%
# collect()
#
# pr2_assign_silva <- tbl(pr2_db_con, "pr2_assign_silva")%>%
# collect()
# pr2_assign_bayes <- tbl(pr2_db_con, "pr2_assign_bayes")%>%
# collect()
db_disconnect(pr2_db_con)
# Write to local database
pr2_db_con <- db_connect_sqlite(file_name)
copy_to(pr2_db_con, pr2_main, name = "pr2_main", indexes= list("pr2_accession", "genbank_accession", "species"),
temporary=FALSE, overwrite = TRUE)
copy_to(pr2_db_con, pr2_seq, name = "pr2_sequences", indexes= list("pr2_accession"),
temporary=FALSE, overwrite = TRUE)
copy_to(pr2_db_con, pr2_metadata, name = "pr2_metadata", indexes= list( "genbank_accession"),
temporary=FALSE, overwrite = TRUE)
copy_to(pr2_db_con, pr2_taxo, name = "pr2_taxonomy", indexes= list( "species"),
temporary=FALSE, overwrite = TRUE)
# copy_to(pr2_db_con, pr2_countries, name = "pr2_countries", indexes= list( "pr2_country"),
# temporary=FALSE, overwrite = TRUE)
# copy_to(pr2_db_con, pr2_assign_silva, name = "pr2_assign_silva", indexes= list( "pr2_accession"),
# temporary=FALSE, overwrite = TRUE)
# copy_to(pr2_db_con, pr2_assign_bayes, name = "pr2_assign_bayes", indexes= list( "pr2_accession"),
# temporary=FALSE, overwrite = TRUE)
db_disconnect(pr2_db_con)
R.utils::gzip(file_name, overwrite = TRUE, remove=TRUE)
return(TRUE)
}
# pr2_export_all -----------------------------------------
#' @title Export the PR2 database (all files)
#' @description
#' This will save the pr2 database in all the format necessary for the repository on GitHub. It calls \code{link{pr2_export}}
#' @param pr2.env list containing the name of files
#' @return
#' Write the all the files in compressed format (.gz)
#' @examples
#' pr2_export_all(pr2.env)
#' @export
pr2_export_all <- function(pr2.env) {
taxo_levels_number = pr2.env$taxo_levels_number
pr2 <- pr2_read(taxo_levels_number)
pr2_taxonomy <- pr2_taxo_read()
pr2_chimera <- pr2 %>%
filter(!is.na(chimera)) %>%
select_if(~!all(is.na(.))) # remove empty columns
pr2_unassigned <- pr2 %>%
filter(is.na(removed_version)) %>%
filter(is.na(species)) %>%
select_if(~!all(is.na(.))) # remove empty columns
# Filter out sequences that have been removed, quarantined, and sequences without species names
pr2 <- pr2 %>%
filter(is.na(removed_version)) %>%
filter(is.na(quarantined_version)) %>%
filter(!is.na(species)) %>%
select(-contains("chimera")) %>%
select(-contains("bayes")) %>%
select(-contains("boot")) %>%
select_if(~!all(is.na(.))) # remove empty columns
# All sequences (merged file only)
print("Exporting merged file")
pr2_export(pr2, file_name=pr2.env$file_merged_excel, file_type="merged_excel", taxo_levels_number = taxo_levels_number)
# pr2_export(pr2, file_name=pr2.env$file_metadata, file_type="metadata")
# All SSU sequences
print("Exporting all SSU")
#
pr2_export(pr2, file_name=pr2.env$file_SSU_fasta_UTAX, file_type="fasta", file_format="UTAX", taxo_levels_number = taxo_levels_number )
pr2_export(pr2, file_name=pr2.env$file_SSU_fasta_taxo_long, file_type="fasta", file_format="fasta_taxo_long", taxo_levels_number = taxo_levels_number)
# pr2_export(pr2, file_name=pr2.env$file_fasta_taxo_short, file_type="fasta", file_format="fasta_taxo_short", taxo_levels_number = taxo_levels_number)
pr2_export(pr2, file_name=pr2.env$file_SSU_fasta_mothur, file_type="fasta", file_format="mothur", taxo_levels_number = taxo_levels_number)
pr2_export(pr2, file_name=pr2.env$file_SSU_taxo_mothur, file_type="taxo", taxo_levels_number = taxo_levels_number)
pr2_export(pr2, file_name=pr2.env$file_SSU_fasta_dada2, file_type="fasta", file_format="dada2", taxo_levels_number = taxo_levels_number)
# 18S nuclear only - NOT DONE ANYMORE
# print("Exporting 18S")
# #
# pr2_18S <- filter(pr2, gene == "18S_rRNA")
# pr2_export(pr2_18S, file_name=pr2.env$file_18S_fasta_UTAX, file_type="fasta", file_format="UTAX", taxo_levels_number = taxo_levels_number )
#
# pr2_export(pr2_18S, file_name=pr2.env$file_18S_fasta_taxo_long, file_type="fasta", file_format="fasta_taxo_long", taxo_levels_number = taxo_levels_number)
# # pr2_export(pr2, file_name=pr2.env$file_fasta_taxo_short, file_type="fasta", file_format="fasta_taxo_short", taxo_levels_number = taxo_levels_number)
#
# pr2_export(pr2_18S, file_name=pr2.env$file_18S_fasta_mothur, file_type="fasta", file_format="mothur", taxo_levels_number = taxo_levels_number)
# pr2_export(pr2_18S, file_name=pr2.env$file_18S_taxo_mothur, file_type="taxo", taxo_levels_number = taxo_levels_number)
#
# pr2_export(pr2_18S, file_name=pr2.env$file_18S_fasta_dada2, file_type="fasta", file_format="dada2", taxo_levels_number = taxo_levels_number)
# 18S V4 - dada2 only
# print("Exporting 18S V4")
#
# # 1. pcr in silico
# # 2. remove amplicon that are NA
# # 3. replace sequence by amplicon
# pr2_18S_V4 <- data.frame(amplicon = pcr_sequences(pr2.env$primer_V4_fwd,
# pr2.env$primer_V4_rev,
# pr2_18S$sequence,
# pr2.env$primer_V4_mismatches)) %>%
# bind_cols(pr2_18S) %>%
# filter(!is.na(amplicon)) %>%
# mutate(sequence=amplicon)
#
# pr2_export(pr2_18S_V4, file_name=pr2.env$file_18S_V4_fasta_dada2, file_type="fasta", file_format="dada2", taxo_levels_number = taxo_levels_number)
# 16S plastid only - NOT DONE anymore
# print("Exporting 16S plastid")
#
# pr2_16S <- filter(pr2, gene == "16S_rRNA")
# pr2_export(pr2_16S, file_name=pr2.env$file_16S_fasta_UTAX, file_type="fasta", file_format="UTAX", taxo_levels_number = taxo_levels_number)
#
# pr2_export(pr2_16S, file_name=pr2.env$file_16S_fasta_taxo_long, file_type="fasta", file_format="fasta_taxo_long", taxo_levels_number = taxo_levels_number)
#
# pr2_export(pr2_16S, file_name=pr2.env$file_16S_fasta_mothur, file_type="fasta", file_format="mothur", taxo_levels_number = taxo_levels_number)
# pr2_export(pr2_16S, file_name=pr2.env$file_16S_taxo_mothur, file_type="taxo", taxo_levels_number = taxo_levels_number)
#
# pr2_export(pr2_16S, file_name=pr2.env$file_16S_fasta_dada2, file_type="fasta", file_format="dada2", taxo_levels_number = taxo_levels_number)
# R package
print("Exporting R package")
pr2.env$pr2database_directory
save(pr2, file=str_c(pr2.env$pr2database_directory, "data/pr2.rda"))
# Next lines uncomment for version 5.0
# save(pr2_taxonomy, file=str_c(pr2.env$pr2database_directory, "data/pr2_taxonomy.rda"))
# save(pr2_chimera, file=str_c(pr2.env$pr2database_directory, "data/pr2_chimera.rda"))
# save(pr2_unassigned, file=str_c(pr2.env$pr2database_directory, "data/pr2_unassigned.rda"))
# SQLite - Uncomment for version 5.0.0
print("Exporting SQLite")
pr2_export_sqlite(pr2.env$file_sqlite)
}
# pr2_taxo_check ---------------
#' @title Check taxonomy
#' @description
#' This check the taxonomy for the following conditions:
#' * No taxon has 2 different parents
#' * Not taxon appears at different ranks
#'
#' @note
#' The input data frame does not need to be the whole pr2 taxo, can be taxo extracted from an Excel file.
#' Several files are produced:
#'
#' * *taxo_list_(rank_number)_(rank_name).txt* (7 files)
#' * name of each taxon
#' * taxa_number - number of species in this taxon
#' * level of taxon - from 1 to 8 (1 for kingdom or domain)
#' * id of the parent taxon (at rank i-1)
#' * id of the taxon (from 1 to n for each rank)
#' * *taxo_list_2_parents_(rank_number)_(rank_name).txt* (7 files). This is an important file because it list all the taxa that have 2 parents. When all the files are completely empty, then the taxonomic list is OK.
#' * name of the taxon
#' * number of parents
#' * *taxo_level_duplicates.txt*. List the taxa that are found at different ranks. Fro example Cryptophyta can be in the Class column for one species and in the Division levels for all other species.
#' * name
#' * number of times found
#' * taxa number
#' * level at which found
#' * id
#' * id of the parent
#' * *taxo_species_duplicates*. List of species that appear at least twice in the species list.
#'
#' @param pr2_taxo dataframe - should contain eight or nine columns kingdom -> species or domain -> species
#' @param taxo_levels - vector with taxo levels
#' @param dir_taxo character - full path where file are saved (without final //)
#' @return
#' Write several files (see description)
#' @examples
#' pr2_taxo_check (taxo_list, "C:/Daniel/database")
#' @export
#' @md
pr2_taxo_check <- function(pr2_taxo, taxo_levels = pr2.env$taxo_levels, dir_taxo="") {
# Initialize variables
taxo_levels_number <- length(taxo_levels)
taxo_list <- list()
taxo_list_2_parents <- list()
# Extract all taxo levels
# Level 1 - Kingdom or Domain
i <- 1
# print(taxo_levels[[1]])
taxo <- pr2_taxo %>%
group_by (!!as.name(taxo_levels[[1]])) %>%
summarise(taxa_number=n()) %>%
rename(name=!!as.name(taxo_levels[[1]]))
taxo$level<-i
taxo$id<-1:nrow(taxo)
taxo$parent_id<-0
taxo_list[[1]] <- taxo
# Level 2 to 8 or 9
for (i in c(2:taxo_levels_number)) {
taxo_list[[i]] <-pr2_extract_one_taxo_level(pr2_taxo,
taxo_levels[[i-1]],
taxo_levels[[i]],
i,
taxo_list[[i-1]])
write_tsv(taxo_list[[i]], path=str_c(dir_taxo,"/","taxo_list_", i, "_", taxo_levels[[i]], ".txt"))
}
# Find taxo levels that have more than one parent
# Note : it is necessary to examine each file one by one going in descending order and rerun the script each time
for (i in c(2:taxo_levels_number)) {
taxo_list_2_parents[[i]] <- taxo_list[[i]] %>%
group_by (name) %>%
summarize (n_parents=n()) %>%
filter (n_parents>1)
write_tsv(taxo_list_2_parents[[i]],
path=str_c(dir_taxo,"/","taxo_list_2_parents_", i, "_", taxo_levels[[i]], ".txt"))
}
# Find taxo names that are found at different levels
taxo_list_all <- taxo_list[[1]]
for (i in c(2:taxo_levels_number)) {
taxo_list_all <- rbind (taxo_list_all, taxo_list[[i]])
}
taxo_list_duplicate<- taxo_list_all %>%
group_by (name) %>%
summarize (n_found=n()) %>%
filter ((n_found>1))
taxo_list_duplicate<-left_join(taxo_list_duplicate, taxo_list_all)
write_tsv(taxo_list_duplicate, path=str_c(dir_taxo,"/","taxo_level_duplicates", ".txt"))
# Find species that are duplicated in the species list - ERROR HERE TO CORRECT (refer to species hard coded)
cat("Lower taxo level: ", taxo_levels[[taxo_levels_number]])
taxo_species_duplicate<- pr2_taxo %>%
group_by (!!as.name(taxo_levels[[taxo_levels_number]])) %>%
summarize (n_found=n()) %>%
filter (n_found>1)
write_tsv(taxo_species_duplicate, path=str_c(dir_taxo,"/","taxo_species_duplicates", ".txt"))
}
# pr2_extract_one_taxo_level -------------------------------------------------------------------------
#' Extract one level of taxonomy
#'
#' This function returns a data frame that contains:
#'
#' * name = name of the taxon
#' * taxa_number = number of taxa downstream of this taxon
#' * level = number of the level (e.g. genus = level number 7)
#' * parend_id = id of the first parent taxon (sometimes one taxon has more than one taxon)
#' * id = id of the taxon (just the row name since they are ordered alphabetically )
#' @note
#' Here is an example at the genus level
#' name taxa_number level parent_id id
#' Ascampbelliella 1 7 51 1
#' Choreotrichida_XX 1 7 1 2
#' Cyttarocylididae_X 1 7 52 3
#' Cyttarocylis 4 7 52 4
#' Petalotricha 1 7 52 5
#' Codonaria 2 7 53 6
#' Codonella 2 7 53 7
#' @param pr2_taxo dataframe - taxonomy with 8 coumns kigdom -> species (but can be more)
#' @param level_above_name character - name of the taxo level above (e.g. "order")
#' @param level_name character - name the level considered (e.g. "family")
#' @param taxo_above dataframe - the same structure than the output of the function but for the level above (e.g. the order if we are the family level)
#' @examples
#' genus_df <- pr2_extract_one_taxo_level(pr2_taxo, "family", "genus", 7, family_df)
#' @export
#' @md
pr2_extract_one_taxo_level <- function(pr2_taxo, level_above_name, level_name, level_number, taxo_above) {
taxo <- pr2_taxo %>%
group_by(!!as.symbol(level_above_name), !!as.symbol(level_name)) %>%
summarise(n=n()) %>%
transmute(name= !!as.symbol(level_name), number_of_taxa=n)
taxo$level<-level_number
taxo <-merge(taxo, taxo_above, all.x, by.x=level_above_name, by.y="name")
taxo <- taxo %>%
transmute(name, taxa_number=number_of_taxa, level=level.x, parent_id = id) %>%
arrange(name) %>%
rowid_to_column( var = "id")
return(taxo)
}
# pr2_taxo_list -----------------------------------------
#' @title Build a list of taxa
#' @description
#' Construct a list of taxa from the pr2 database
#' @param pr2 - data frame - whole or filtered pr2 data base
#' @param level_select - taxonomic rank
#' @return
#' A character vector
#' @examples
#' pr2_taxo_list(pr2, "class")
#' @export
#' @md
pr2_taxo_list <- function(pr2, level_select) {
taxo_list <- pr2 %>%
distinct_(level_select) %>%
arrange_(level_select)
return(taxo_list)
}
# pr2_taxo_X -----------------------------------------
#' @title Add _X for taxon that are repeated on same line
#' @description
#' If more than 2 ranks on a given line have the same name or is empty, rename with Mytaxon_X, Mytaxon_XX etc.. up to the species name which becomes Mytaxon_XX_sp.
#' @param pr2_taxo Data frame containing columns from kingdom to species (or else see the next parameter). It is not necessary that the columns be ordered.
#' @param taxo_levels A vector of ordered taxonomic ranks such as c("kingdom", "supergroup", "division", "class", "order", "family", "genus", "species")
#' @return
#' A corrected data frame
#' @examples
#' pr2_taxo_clean <- pr2_taxo_X(pr2_taxo, pr2.env$taxo_levels)
#' @export
#' @md
pr2_taxo_X <- function(pr2_taxo, taxo_levels = pr2.env$taxo_levels[[8]]) {
taxo_levels_number <- length(taxo_levels)
for (i in 1:nrow(pr2_taxo)) {
stem = ""
for (j in 1:(taxo_levels_number-1) ){
# Check first if the next level is equal to previous level which is stored in stem
if ((pr2_taxo[i,taxo_levels[j]]==stem) | (pr2_taxo[i,taxo_levels[j]]=="")| is.na(pr2_taxo[i,taxo_levels[j]])) {
# If true then add _XX where the number of X is proportional to the difference in number of columns since the start
pr2_taxo[i,taxo_levels[j]] <- str_c(stem, "_", str_dup("X", j-stem_level))
# If we are at the end (species level) then use _XX_sp. there
if (j == taxo_levels_number-1) {
pr2_taxo[i,taxo_levels[j+1]] <- str_c(stem,"_", str_dup("X", j-stem_level), "_sp.")
}
}
else {
# If the level is not repeated then use it to check the next column
stem <- pr2_taxo[i,taxo_levels[j]]
stem_level <- j
}
}
}
return(pr2_taxo)
}
# pr2_build_taxons ---------------
#' @title Build taxon table (long format)
#' @description
#' This constructs a taxon table (long format) from a taxonomy table (wide format)
#' If add_old_fields = TRUE then the following columns are appended
#' old_species_id, taxon_edited_version, taxon_edited_by, taxon_removed_version, taxon_remark, taxon_reference
#'
#' @note
#' The input data frame does not need to be the whole pr2 taxo, can be taxo extracted from an Excel file.
#'
#' @param pr2_taxonomy dataframe - should contain eight columns kingdom -> species (this can be modified by using the pr2.env list)
#' @param pr2.env list containing the taxon levels
#' @param add_old_fields logical - If add_old_fields = TRUE then the following columns are appended: old_species_id, taxon_edited_version, taxon_edited_by, taxon_removed_version, taxon_remark, taxon_reference
#' @return
#' Data frame pr2_taxons with columns "taxon_id" , "taxon_name", "taxon_level", "parent_id", "old_species_id",
#' "taxon_edited_version", "taxon_edited_by", "taxon_removed_version" ,"taxon_remark", "taxon_reference"
#' @examples
#' pr2_taxons <- pr2_buid_taxons(pr2_taxonomy)
#' @export
#' @md
pr2_buid_taxons <- function(pr2_taxonomy, pr2.env, add_old_fields=FALSE) {
# Initialize variables
taxo_levels_number <- length(pr2.env$taxo_levels)
taxo_list <- list()
taxo_list_2_parents <- list()
# Extract all taxo levels
# Level 1 - Kingdom
i <- 1
taxo <- pr2_taxonomy %>%
group_by (any_of(pr2.env$taxo_levels[1])) %>%
summarise(number_of_taxa=n())
taxo<-taxo %>%
transmute(name=kingdom, taxa_number=number_of_taxa)
taxo$level<-i
taxo$id<-1:nrow(taxo)
taxo$parent_id<-0
taxo_list[[1]] <- taxo
# Level 2 to 8
for (i in c(2:taxo_levels_number)) {
taxo_list[[i]] <-pr2_extract_one_taxo_level(pr2_taxonomy,
pr2.env$taxo_levels[[i-1]],
pr2.env$taxo_levels[[i]],
i,
taxo_list[[i-1]])
}
# Reduce the 8 lists to a single dataframe
pr2_taxons <- taxo_list %>% reduce (bind_rows)
# Get a serial number for the taxons
pr2_taxons <- pr2_taxons %>% rowid_to_column(var="taxon_id")
# Now replace the parent_id by the new id
pr2_taxons_1 <- pr2_taxons %>% mutate(parent_level=level-1)
pr2_taxons_2 <- pr2_taxons %>% select(parent_id_new=taxon_id, level, id)
pr2_taxons_3 <- left_join(pr2_taxons_1, pr2_taxons_2, by = c("parent_level" = "level", "parent_id"="id"))
# Change the names
pr2_taxons_3 <- pr2_taxons_3 %>% select(taxon_id, taxon_name = name, taxon_level = level, parent_id = parent_id_new)
# Left join to recover the old information from the pr2_taxonomy table
if(add_old_fields==TRUE) {
pr2_taxo_1 <- pr2_taxo %>% select(old_species_id = taxo_id,
taxon_name=species,
taxon_edited_version = taxo_edited_version,
taxon_edited_by = taxo_edited_by,
taxon_removed_version = taxo_removed_version,
taxon_remark = taxo_remark,
taxon_reference = reference)
pr2_taxons_3 <- pr2_taxons_3 %>% left_join(pr2_taxo_1)
}
return(pr2_taxons_3)
}
# pr2_build_taxonomy ---------------
#' @title Buikd taxonomy table (wide format) from taxon table (long format)
#' @description
#' This constructs a taxonomy table from a taxon table
#'
#' @note
#' The input data frame does not need to be the whole pr2 taxons, can be taxo extracted from an Excel file.
#'
#' Programming note:
#' This function uses the following trick for mutating the colums with the taxo_level string
#' rank_name = **sym**(pr2.env$taxo_levels[i])
#' mutate (**!!** rank_name **:=** taxon_name)
#' @param pr2_taxons dataframe - should contain at least three columns taxon_id, taxon_name, taxon_level, parent_id
#' @param pr2.env list containing taxo_levels
#' @return
#' Data frame pr2_taxonomy with 9 columns
#' @examples
#' pr2_taxo <- pr2_buil_taxonomy(pr2_taxons, pr2.env)
#' @export
#' @md
pr2_build_taxonomy <- function(pr2_taxons, pr2.env) {
taxo_levels_number <- length(pr2.env$taxo_levels)
taxo_list <- list()
for (i in c(taxo_levels_number:1)){
# i=7
# Needs to transform the taxo_level string into an expression
rank_name = sym(pr2.env$taxo_levels[i])
# Select one level of taxonomy and keep its ID and its Parent ID
taxo_list[[i]] <- pr2_taxons %>%
filter(taxon_level == i) %>%
dplyr::select(taxon_id, taxon_name, parent_id) %>%
mutate (!! rank_name := taxon_name) %>%
select(-taxon_name)
# For the last level (species) just take all the species. For the other levels do a join with next level (e.g. if genus, join to species)
if (i == taxo_levels_number) {
pr2_taxonomy <- taxo_list[[i]]
} else {
pr2_taxonomy <- pr2_taxonomy %>%
left_join(taxo_list[[i]], by = c("parent_id"="taxon_id") ) %>%
select(-parent_id) %>%
rename(parent_id = parent_id.y)
}
}
pr2_taxonomy <- pr2_taxonomy %>%
select_( .dots = c("taxon_id", pr2.env$taxo_levels))
return(pr2_taxonomy)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.