# R/metapr2.R
#
# Defines functions: metapr2_export_asv, metapr2_export_datasets,
# metapr2_export_metadata, metapr2_export_reads_total, metapr2_export_qs

#' @import dplyr
#' @import tidyr
#' @import stringr
#' @import tibble
#' @importFrom rio export

# metapr2_export_asv -------------------------------------------------------
#' @title Exports the metapr2 database
#' @description
#' Exports a range of file and format.
#'    * fasta file with the full taxonomy or just the genus level
#'    * excel file with the full table of the asv and metadata
#'    * phyloseq file (better when only selecting a single data set)
#'    * list of samples with metadata
#'
#' Returns a list with 4 elements
#'    * df = dataframe with all the asv and the stations and read numbers
#'    * ps = phyloseq object  - if phyloseq = TRUE
#'    * fasta = df with 2 columns seq_name and sequence
#'    * samples = sample_list (list of samples with metadata)
#'
#' NOTE: if all export_long_xls, export_wide_xls, export_phyloseq are false, abundances are not exported
#' @param taxo_level The taxonomic level for selection (do not quote), e.g. class or genus
#' @param taxo_name  The name of the taxonomic level selected, e.g. "Chlorophyta", can be a vector c("Chlorophyta", "Haptophyta")
#' @param boot_level The taxonomic level for bootstrap filtering (do not quote), e.g. class_boot or genus_boot
#' @param boot_min  Minimum bootstrap value at the class level, 0 if you want to get all asvs
#' @param assigned_with Program used for assignment - "dada2" or "decipher"
#' @param reference_database Reference database used - "pr2_4.14.0" or "pr2_4.12.0"
#' @param directory  Directory where the files are saved (finish with /)
#' @param dataset_id_selected  Integer vector, e.g. 23 or c(21, 23) or 21:23
#' @param filter_samples Character expression for filter, e.g. "DNA_RNA == 'DNA'" (use single quotes inside double quotes)
#' @param filter_metadata  Character expression for filter, e.g. "substrate == 'sediment trap material'" (use single quotes inside double quotes)
#' @param export_long_xls  If TRUE, an xlsx file is produced containing the final long data frame
#' @param export_wide_xls  If TRUE, an xlsx file is produced containing the final wide data frame
#' @param export_sample_xls  If TRUE, an xlsx file is produced containing the sample list
#' @param export_phyloseq  If TRUE, a phyloseq file is produced and a phyloseq object returned
#' @param export_fasta  If TRUE, a fasta is produced
#' @param taxonomy_full If TRUE, the fasta file contains the full taxonomy (8 levels), if false only contains the species
#' @param use_hash If TRUE, the asvs with identical has will be merged and called by their hash value (sequence_hash)
#' @param sum_reads_min This is the minimum number of reads (summed over the datasets selected) for an asv to be included
#' @return
#' A list with 4 elements
#' @examples
#' # Export all the asv in a single fasta
#'   metapr2_export_asv()
#'
#' # Export as specific data set as a phyloseq file
#'   metapr2_export_asv(dataset_id_selected = 23, export_phyloseq = TRUE)
#'
#' # Export a specific genus as a fasta file and an excel file
#'   asv_set <- metapr2_export_asv(taxo_level = genus, taxo_name=c("Pseudohaptolina","Haptolina"),
#'                                 export_fasta=TRUE, taxonomy_full= FALSE,
#'                                 boot_level = genus_boot, boot_min = 100,
#'                                 export_long_xls = TRUE)
#' @export
#' @md
#'
metapr2_export_asv <- function(taxo_level = kingdom, taxo_name = "Eukaryota",
                               boot_level = class_boot, boot_min = 0,
                               assigned_with = "dada2",
                               reference_database = "pr2_4.14.0",
                               directory = "C:/daniel.vaulot@gmail.com/Databases/_metaPR2/export/",
                               dataset_id_selected = c(1:500),
                               filter_samples = NULL,
                               filter_metadata = NULL,
                               export_long_xls = FALSE,
                               export_wide_xls = FALSE,
                               export_sample_xls = FALSE,
                               export_phyloseq = FALSE,
                               export_fasta = FALSE,
                               taxonomy_full = TRUE,
                               use_hash = FALSE,
                               sum_reads_min = 0) {

  # The 8 taxonomic ranks used by PR2 and their bootstrap columns
  taxo_levels <- c("kingdom", "supergroup", "division", "class", "order", "family", "genus", "species")
  taxo_levels_boot <- str_c(taxo_levels, "_boot")

  # Character label for the selected datasets, used in exported file names
  dataset_id_char <- case_when((500 %in% dataset_id_selected) ~ "all",
                               length(dataset_id_selected) > 5 ~ "several",
                               TRUE ~ str_c(dataset_id_selected, collapse = "_"))

  # Read the database and already filter for selected records -------------------
  metapr2_db <- db_info("metapr2_google")
  metapr2_db_con <- db_connect(metapr2_db)

  # Get all the asv for the selected datasets, excluding chimeras
  # (taxonomic filtration is done later)
  asv_set <- tbl(metapr2_db_con, "metapr2_asv") %>%
    filter(dataset_id %in% dataset_id_selected) %>%
    filter(is.na(chimera)) %>%
    collect()

  # Fall back to defaults for unrecognized assignment options
  if (!(assigned_with %in% c("dada2", "decipher"))) assigned_with <- "dada2"
  if (!(reference_database %in% c("pr2_4.12.0", "pr2_4.14.0"))) reference_database <- "pr2_4.14.0"

  # Replace the taxonomy columns by the assignment computed with the requested
  # program / reference database.
  # BUG FIX: the pr2_4.12.0 branch was previously nested inside the
  # pr2_4.14.0 branch (making it unreachable), its table was never
  # collect()-ed, and the taxonomy join was skipped for pr2_4.12.0.
  if (reference_database == "pr2_4.14.0") {
    if (assigned_with == "dada2") {
      asv_set_updated <- tbl(metapr2_db_con, "metapr2_asv_dada2_pr2_4.14.0") %>%
        collect()
    } else {
      asv_set_updated <- tbl(metapr2_db_con, "metapr2_asv_decipher_pr2_4.14.0") %>%
        collect()
    }
  } else {
    # Only the dada2 assignment is available for pr2_4.12.0
    asv_set_updated <- tbl(metapr2_db_con, "metapr2_asv_dada2_pr2_4.12.0") %>%
      collect()
  }

  # Drop the stored taxonomy/bootstrap columns and attach the updated ones
  asv_set <- asv_set %>%
    select(-any_of(c(taxo_levels, taxo_levels_boot))) %>%
    left_join(asv_set_updated, by = "sequence_hash")

  # Create a label for the taxons selected, used for the file names -------------
  if (length(taxo_name) > 1) {
    taxo_name_label <- str_c(str_sub(taxo_name, 1, 5), collapse = "_")
  } else {
    taxo_name_label <- taxo_name
  }

  # Need to use !! before the local variable: since dbplyr 1.4.0, `df$x` inside
  # a translated filter() is no longer treated as a local variable and raises
  # "Cannot embed a data frame in a SQL query."
  metapr2_asv_abundance <- tbl(metapr2_db_con, "metapr2_asv_abundance") %>%
    filter(asv_code %in% !!asv_set$asv_code) %>%
    collect()

  metapr2_samples <- tbl(metapr2_db_con, "metapr2_samples") %>%
    filter(!is.na(file_code)) %>%   # Remove all samples that have not been processed
    collect()

  # Apply the user-supplied filter expression on samples (character string,
  # e.g. "DNA_RNA == 'DNA'") -- evaluated with rlang, not raw eval(parse())
  if (!is.null(filter_samples)) {
    metapr2_samples <- metapr2_samples %>%
      filter(eval(rlang::parse_expr(filter_samples)))
  }

  metapr2_metadata <- tbl(metapr2_db_con, "metapr2_metadata") %>%
    collect()

  # Apply the user-supplied filter expression on metadata (character string)
  if (!is.null(filter_metadata)) {
    metapr2_metadata <- metapr2_metadata %>%
      filter(eval(rlang::parse_expr(filter_metadata)))
  }

  metapr2_datasets <- tbl(metapr2_db_con, "metapr2_datasets") %>%
    collect()

  db_disconnect(metapr2_db_con)

  # Merging together asvs with same hash value -------------------
  # Keep a single representative per hash for the fasta export

  if (use_hash) {
    asv_fasta <- asv_set %>%
      group_by(sequence_hash) %>%
      dplyr::slice(1) %>%
      mutate(asv_code = sequence_hash) %>%
      select(-dataset_id) %>%
      ungroup()
  } else {
    asv_fasta <- asv_set
  }

  sample_list <- metapr2_samples %>%
    # Use an inner_join here so that if no metadata then sample is not found in sample list
    inner_join(metapr2_metadata) %>%
    filter(dataset_id %in% dataset_id_selected) %>%
    left_join(metapr2_datasets, by = c("dataset_id" = "dataset_id")) %>%
    select(-contains(c("dada2", "primer", "web")), -dataset_path) %>%
    # This removes all the columns that are empty
    select_if(~!all(is.na(.)))

  asv_set <- asv_set %>%
    left_join(metapr2_asv_abundance) %>%
    # Use an inner_join here so that if no sample then sample is not found in asv_set
    inner_join(metapr2_samples) %>%
    # Use an inner_join here so that if no metadata then sample is not found in asv_set
    inner_join(metapr2_metadata) %>%
    left_join(metapr2_datasets, by = c("dataset_id" = "dataset_id")) %>%
    select(-contains(c("dada2", "primer", "paper", "web")), -dataset_path, -asv_id) %>%
    # Next line removes all the columns that are empty
    select_if(~!all(is.na(.)))

  # Merging together asvs with same hash value -------------------

  if (use_hash) {
    asv_set <- asv_set %>%
      mutate(asv_code = sequence_hash)
  }

  # Compute abundance of ASVs --------------------------------------

  asv_set_number <- asv_set %>%
    count(asv_code, wt = n_reads, sort = TRUE, name = "sum_reads_asv") %>%
    filter(sum_reads_asv >= sum_reads_min)

  # Remove the low abundance ASVs ---------------------------------

  asv_set <- asv_set %>%
    filter(asv_code %in% asv_set_number$asv_code)

  asv_fasta <- asv_fasta %>%
    filter(asv_code %in% asv_set_number$asv_code) %>%
    left_join(asv_set_number) %>%
    arrange(desc(sum_reads_asv))

  # Compute the abundance of reads per sample for all and for photosynthetic groups ---

  sample_all_reads <- asv_set %>%
    count(file_code, wt = n_reads, name = "reads_corrected_total")

  # Photosynthetic = plastid-bearing divisions plus chlorarachniophytes
  sample_photo_reads <- asv_set %>%
    filter((division %in% c(
      "Chlorophyta", "Cryptophyta", "Rhodophyta",
      "Haptophyta", "Ochrophyta"
    )) |
      (class == "Filosa-Chlorarachnea")) %>%
    count(file_code, wt = n_reads, name = "reads_corrected_photo")

  # Add corrected counts to asv_set and sample_list -------------------

  asv_set <- asv_set %>%
    left_join(sample_all_reads) %>%
    left_join(sample_photo_reads)

  sample_list <- sample_list %>%
    left_join(sample_all_reads) %>%
    left_join(sample_photo_reads) %>%
    relocate(contains("_corrected_"), .after = reads_total) %>%
    mutate(reads_corrected_photo = replace_na(reads_corrected_photo, 0)) %>%
    filter(!is.na(reads_corrected_total))

  # Filter the ASVs for taxonomic group and minimum bootstrap -------------------
  # taxo_level and boot_level are unquoted column names, embraced with {{ }}

  asv_set <- asv_set %>%
    filter({{taxo_level}} %in% taxo_name) %>%
    filter({{boot_level}} >= boot_min)

  asv_fasta <- asv_fasta %>%
    filter({{taxo_level}} %in% taxo_name) %>%
    filter({{boot_level}} >= boot_min)

  # File names -------------------
  generate_file_name <- function(type, ext = "xlsx") str_c(directory, type, dataset_id_char,
                                                   "_", taxo_name_label, "_", Sys.Date(), ".", ext)

  file_asv_fasta <- generate_file_name("metapr2_asv_set_", "fasta")
  file_asv_xlsx <-  generate_file_name("metapr2_asv_set_")
  file_wide <- generate_file_name("metapr2_wide_asv_set_")
  file_long <- generate_file_name("metapr2_long_asv_set_")
  file_samples <- generate_file_name("metapr2_samples_asv_set_")
  file_phyloseq <- generate_file_name("metapr2_phyloseq_asv_set_", "rds")

  # Export fasta file ------------

  if (taxonomy_full) {
    asv_fasta <- asv_fasta %>%
      dplyr::mutate(seq_name = asv_code)
  } else {
    asv_fasta <- asv_fasta %>%
      mutate(seq_name = str_c(asv_code, species, sep = "|"))
  }

  # Export asv file
  if (export_fasta) {
    fasta_write(asv_fasta, file_asv_fasta, compress = FALSE, taxo_include = TRUE)
    rio::export(asv_fasta, file_asv_xlsx, overwrite = TRUE)
  }

  # Export wide Excel file

  if (export_wide_xls) {
    if (use_hash) {
      # If use_hash, must remove the bootstrap values that may be different
      # for the different asvs sharing the same hash tag
      asv_set_wide <- asv_set %>%
        select(asv_code, kingdom:species, sequence, file_code, n_reads)
    } else {
      asv_set_wide <- asv_set %>%
        select(asv_code, kingdom:species_boot, sequence, sequence_hash, file_code, n_reads)
    }

    # values_fn = mean averages duplicated asv_code x file_code entries
    asv_set_wide <- asv_set_wide %>%
      pivot_wider(names_from = file_code,
                  values_from = n_reads,
                  values_fill = list(n_reads = 0),
                  values_fn = mean)

    rio::export(asv_set_wide, file_wide, overwrite = TRUE)
  }

  # Export long excel file

  if (export_long_xls)   rio::export(asv_set, file_long, overwrite = TRUE)

  if (export_sample_xls) rio::export(sample_list, file_samples, overwrite = TRUE)

  # Export Phyloseq file

  if (export_phyloseq) {
    ## Create the samples, otu and taxonomy tables

    # 1. samples table: row names are labeled by file_code
    samples_df <- asv_set %>%
      select(dataset_id, file_code:last_col(), -n_reads) %>%
      distinct(file_code, .keep_all = TRUE) %>%
      column_to_rownames(var = "file_code")

    # 2. otu table: asv_code x file_code matrix of read counts
    otu <- asv_set %>%
      select(asv_code, file_code, n_reads) %>%
      pivot_wider(names_from = file_code,
                  values_from = n_reads,
                  values_fill = list(n_reads = 0),
                  values_fn = mean) %>%
      column_to_rownames(var = "asv_code")

    # 3. Taxonomy table
    tax <- asv_set %>%
      select(asv_code, kingdom:species) %>%
      distinct(asv_code, .keep_all = TRUE) %>%
      column_to_rownames(var = "asv_code")

    ## Create and save the phyloseq object

    # phyloseq requires matrices for otu and taxonomy tables
    otu_mat <- as.matrix(otu)
    tax_mat <- as.matrix(tax)

    OTU = phyloseq::otu_table(otu_mat, taxa_are_rows = TRUE)
    TAX = phyloseq::tax_table(tax_mat)
    samples = phyloseq::sample_data(samples_df)

    phyloseq_asv <- phyloseq::phyloseq(OTU, TAX, samples)

    saveRDS(phyloseq_asv, file = file_phyloseq)

  }

  # Return from function

  if (export_phyloseq) {
    return(list(df = asv_set, ps = phyloseq_asv, fasta = asv_fasta, samples = sample_list))
  } else {
    return(list(df = asv_set, fasta = asv_fasta, samples = sample_list))
  }
}


# metapr2_export_datasets -------------------------------------------------------
#' @title Exports the metapr2_datasets table
#' @description
#' Exports a data frame with all the available data sets
#' @return
#' A data frame
#' @examples
#' # Export all datasets
#'   datasets <- metapr2_export_datasets()
#' @export
#' @md
#'
metapr2_export_datasets <- function() {

  # Open a connection to the metapr2 database, pull the whole datasets
  # table into memory and close the connection before returning.
  db_con <- db_connect(db_info("metapr2_google"))
  datasets <- collect(tbl(db_con, "metapr2_datasets"))
  db_disconnect(db_con)

  return(datasets)
}

# metapr2_export_metadata -------------------------------------------------------
#' @title Exports the metapr2_metadata table
#' @description
#' Exports a data frame with all the available metadata
#' @return
#' A data frame
#' @examples
#' # Export all datasets
#'   metadata <- metapr2_export_metadata()
#' @export
#' @md
#'
metapr2_export_metadata <- function() {

  # Open a connection to the metapr2 database, pull the whole metadata
  # table into memory and close the connection before returning.
  db_con <- db_connect(db_info("metapr2_google"))
  metadata <- collect(tbl(db_con, "metapr2_metadata"))
  db_disconnect(db_con)

  return(metadata)
}


# metapr2_export_reads_total -------------------------------------------------------
#' @title Exports the total number of reads in each dataset
#' @description
#' Exports  the total number of reads for each file_code
#'
#' Data are save in an excel file
#' @param directory  Directory where the files are saved (finish with /)
#' @param dataset_id_selected  Integer vector, e.g. 23 or c(21, 23) or 21:23
#' @return
#' A data frame with 2 columns:
#' * file_code
#' * reads_total
#' @examples
#' # Export data for all datasets
#'   df <- metapr2_export_reads_total()
#' @export
#' @md
#'
metapr2_export_reads_total <- function(directory = "C:/daniel.vaulot@gmail.com/Databases/_metaPR2/export/",
                               dataset_id_selected = c(1:500)) {

  # Character label for the selected datasets, used in the exported file name
  dataset_id_char <- case_when((500 %in% dataset_id_selected) ~ "all",
                               length(dataset_id_selected) > 5 ~ "several",
                               TRUE ~ str_c(dataset_id_selected, collapse = "_"))

  # Open the database connection
  db_con <- db_connect(db_info("metapr2_google"))

  # ASVs restricted to the selected datasets
  asv_selected <- tbl(db_con, "metapr2_asv") %>%
    filter(dataset_id %in% dataset_id_selected) %>%
    collect()

  # `!!` forces local evaluation of asv_selected$asv_code: since dbplyr 1.4.0
  # a bare df$x inside a translated filter() raises
  # "Cannot embed a data frame in a SQL query."
  abundances <- tbl(db_con, "metapr2_asv_abundance") %>%
    filter(asv_code %in% !!asv_selected$asv_code) %>%
    collect()

  samples <- tbl(db_con, "metapr2_samples") %>% collect()

  db_disconnect(db_con)

  # Sum the reads per sample (file_code)
  reads_per_file <- asv_selected %>%
    inner_join(abundances) %>%
    inner_join(samples) %>%
    filter(!is.na(file_code)) %>%  # Remove empty file codes
    group_by(file_code) %>%
    summarise(reads_total = sum(n_reads))

  # Save to an Excel file stamped with the dataset label and the date
  export(reads_per_file, str_c(directory, "metapr2_reads_total_", dataset_id_char, "_", Sys.Date(), ".xlsx"))

  return(reads_per_file)
}

# metapr2_export_qs -------------------------------------------------------
#' @title Exports the metapr2 database in qs file for shiny app
#' @description
#' Exports a range of file and format.
#'    * fasta file with the full taxonomy or just the genus level
#'    * excel file with the full table of the asv and metadata
#'    * phyloseq file (better when only selecting a single data set)
#'    * list of samples with metadata
#'
#' Returns a list with 4 elements
#'    * df = dataframe with all the asv and the stations and read numbers
#'    * ps = phyloseq object  - if phyloseq = TRUE
#'    * fasta = df with 2 columns seq_name and sequence
#'    * samples = sample_list (list of samples with metadata)
#'
#' NOTE: if all export_long_xls, export_wide_xls, export_phyloseq are false, abundances are not exported
#' @param set_type   "public" (41 sets), "basic" (5 sets only), "all"
#' @param directory  Directory where the files are saved (finish with /)
#'
#' @return
#' TRUE if successful
#' @examples
#' # Export public data set
#'
#'   metapr2_export_qs("public")
#'
#' @export
#' @md
#'
metapr2_export_qs <- function(set_type = "public",
                              directory = "data/") {

# Constants  ----------------------------------

  # Validate set_type up front so an unknown value gives a clear error
  # instead of "object 'datasets_selected' not found" further down
  if (!set_type %in% c("basic", "public", "all")) {
    stop("set_type must be one of 'basic', 'public' or 'all'", call. = FALSE)
  }

  # List to store all global variables saved to global.qs
  global <- list()

  global$taxo_levels <- c("kingdom", "supergroup", "division", "class", "order", "family", "genus", "species", "asv_code")

  # All samples are normalized to 100 with 3 decimals, so that it corresponds to a percent
  global$n_reads_tot_normalized = 100

  # Columns to remove (as well as contains("_boot")) from the exported tables

  cols_to_remove = c("asv_id" , "chimera", "sequence_hash", "asv_remark",
                     "sample_id", "file_name", "metadata_id",
                     "NCBI", "NCBI_run", "sample_name",
                     "sample_concentration","sample_sorting", "sample_sorting_cells",
                     "metadata_code", "replicate",
                     "fraction_name_original", "fraction_min", "fraction_max",
                     "sample_remark", "metadata_code_original",
                     "station_id_num", "year", "time", "O2", "fluorescence",
                     "season", "day_length", "depth_range",
                     "substrate_description", "substrate_description_detailed",
                     "experiment_time" ,
                     "pH" , "ice_type" , "ice_thickness" ,
                     "Secchi_depth",
                     "Chla_0.2_3 um" , "PAR_pct" ,
                     "bact_ml" , "peuk_ml" , "neuk_ml", "crypto_ml",
                     "NO2", "metadata_remark",
                     "dataset_path",
                     "lat_min","lat_max","long_min","long_max",
                     "primers_removed ", "dada2_truncLen","dada2_minLen",
                     "dada2_maxLen","dada2_bigdata","dada2_truncQ",
                     "dada2_maxEE","dada2_max_number_asvs")

  # Data sets selected ----------------------------------

  # For class - 5 sets
  if (set_type == "basic"){
    datasets_selected <- metapr2_export_datasets() %>%
      filter(dataset_id %in% c(1, 34, 35, 205, 206))
    sub_dir = "sets_basic"
  }

  # 41 sets
  if (set_type == "public"){
    datasets_selected <- metapr2_export_datasets() %>%
      filter(!is.na(metapr2_version))
    sub_dir = "sets_public"
  }

  # 58 sets
  if (set_type == "all"){
    datasets_selected <- metapr2_export_datasets() %>%
      filter(!is.na(processing_date),
             gene == "18S rRNA")
    sub_dir = "sets_all"
  }

  cat("Datasets: ", nrow(datasets_selected), "\n")

  # Read metaPR2 datasets ----------------------------------

  # Pull the full ASV set for all eukaryotes: no file exports, hash-merged
  # ASVs, supergroup bootstrap >= 90, at least 100 reads per ASV
  asv_set <- metapr2_export_asv(
    taxo_level = kingdom,
    taxo_name = "Eukaryota",
    dataset_id_selected = datasets_selected$dataset_id,
    filter_samples = "fraction_name != 'femto' &
                   !is.na(file_code)  &
                   file_code !='' &
                   !is.na(DNA_RNA)",
    filter_metadata =  "is.na(experiment_name)",
    export_fasta = FALSE,
    export_wide_xls = FALSE,
    export_sample_xls = FALSE,
    export_phyloseq = FALSE,
    directory = directory,
    taxonomy_full = TRUE,
    boot_min = 90,
    boot_level = supergroup_boot,
    use_hash = TRUE,
    sum_reads_min = 100
  )

  # Summarize information for each data set ----------------------------------

  # Number of samples per dataset
  dataset_samples <- asv_set$df %>%
    select(dataset_id, file_code) %>%
    distinct() %>%
    group_by(dataset_id) %>%
    count(name = "sample_number") %>%
    ungroup()

  # Number of ASVs per dataset
  dataset_asv <- asv_set$df %>%
    select(dataset_id, asv_code) %>%
    distinct() %>%
    group_by(dataset_id) %>%
    count(name = "asv_number") %>%
    ungroup()

  # Mean number of reads per sample in each dataset
  dataset_reads <- asv_set$df %>%
    select(dataset_id, file_code, n_reads) %>%
    group_by(dataset_id, file_code) %>%
    summarize(n_reads = sum(n_reads, na.rm=TRUE)) %>%
    ungroup() %>%
    group_by(dataset_id) %>%
    summarize(n_reads_mean = round(mean(n_reads,  na.rm=TRUE),0)) %>%
    ungroup()

  # Normalize total number of sample reads to 100 ------------------------------
  # Only use the first 10 characters for asv_code (non ambiguous)

  asv_set$df <- asv_set$df %>%
    mutate(asv_code = str_sub(asv_code, 1,10) ) %>%
    select(file_code, asv_code, n_reads) %>%
    group_by(file_code) %>%
    mutate(n_reads_pct = round(n_reads/sum(n_reads)*global$n_reads_tot_normalized, 3)) %>%
    ungroup()

  # Do not transform the phyloseq data for Alpha and Beta diversity analyses

  # Trim the sample table and build a human-readable label per sample
  asv_set$samples <- asv_set$samples %>%
    select(-any_of(cols_to_remove),
           -(processing_pipeline_original:contact_email)
    ) %>%
    mutate(label = str_c(dataset_code,
                         str_replace_na(station_id, ""),
                         str_replace_na(depth_level, ""),
                         str_replace_na(substrate, ""),
                         sep = "-"))

  # Attach the per-dataset summaries computed above
  asv_set$datasets <- datasets_selected %>%
    select(-any_of(cols_to_remove)) %>%
    left_join(dataset_samples) %>%
    left_join(dataset_asv) %>%
    left_join(dataset_reads)

  cat("Datasets: ", nrow(asv_set$datasets), "\n")

  asv_set$fasta <- asv_set$fasta %>%
    select(- seq_name, -any_of(cols_to_remove), -contains("_boot")) %>%
    mutate(asv_code = str_sub(asv_code, 1,10) )

  # Compute derived data -----------------------------------------------------------

  # Metadata groups used by the shiny app selectors

  global$gene_regions <- sort(unique(asv_set$samples$gene_region))
  global$DNA_RNAs <-  sort(unique(asv_set$samples$DNA_RNA))
  global$projects <-  sort(unique(asv_set$samples$project))

  # Reordering the variables ------------------------------------------------

  depth_level_ordered <- c("under ice", "surface", "euphotic",
                           "bathypelagic", "mesopelagic", "land",
                           "composite" ,"bottom" )
  fraction_name_ordered <- c("pico", "pico-nano", "nano",
                             "nano-micro", "micro",
                             "meso" ,"total" )
  substrate_ordered <- c("water", "net", "ice", "sediment trap material", "sediment trap blank",
                         "epibiota", "sediment", "soil" )

  ecosystems_ordered <- c( "oceanic", "coastal","estuarine","freshwater lakes","freshwater rivers","terrestrial")

  # BUG FIX: fct_relevel() has no `levels` argument; the named argument was
  # silently absorbed by `...`. Pass the level vectors positionally.
  asv_set$samples <- asv_set$samples %>%
    mutate(substrate = stringr::str_replace(substrate, "first year ice", "ice"),
           depth_level = forcats::fct_relevel(depth_level, depth_level_ordered),
           fraction_name = forcats::fct_relevel(fraction_name, fraction_name_ordered),
           substrate = forcats::fct_relevel(substrate, substrate_ordered),
           ecosystem = forcats::fct_relevel(ecosystem, ecosystems_ordered)
    )

  # Reorder for the check boxes ---------------------------------------------

  # Returns the unique values of a sample column, in factor-level order
  update_order <- function(variable) {
    values <- dplyr::arrange(asv_set$samples, .data[[variable]])
    values <- dplyr::pull(values, .data[[variable]]) %>%
    unique() %>%
    as.character()
  }

  global$depth_levels <- update_order("depth_level")
  global$fraction_names <- update_order("fraction_name")
  global$substrates <- update_order("substrate")
  global$ecosystems <- update_order("ecosystem")

  # Taxonomy structure --------------------------------------------------------

  global$pr2_taxo <- asv_set$fasta %>%
    select(any_of(global$taxo_levels)) %>%
    distinct() %>%
    arrange(across(any_of(global$taxo_levels)))

  # Colors ------------------------------------------------------------------

  pr2_colors <- dvutils::pr2_colors_read()

  supergroup_colors <- pr2_colors %>%
    filter(palette_name=="daniel",
           taxo_level == "supergroup")

  # Named vector: color hex values named by supergroup
  global$supergroup_colors <- structure(supergroup_colors$color_hex,
                                        .Names=supergroup_colors$taxo_name)

  # Save data using qs --------------------------------------------------------

  qs::qsave(asv_set, str_c(directory, "asv_set.qs"))
  qs::qsave(global, str_c(directory,"global.qs"))

  # Object sizes ------------------------------------------------------------

  # Print the in-memory size of an object in Mb (diagnostics only)
  obj_size <- function(x) {
    cat("Object:",deparse(substitute(x)), "- size: ", round(pryr::object_size(x)/10**6, 2), " Mb \n")
  }

  obj_size(asv_set)
  obj_size(asv_set$df)
  obj_size(asv_set$samples)
  obj_size(asv_set$fasta)

  obj_size(global)
}
# vaulot/dvutils documentation built on Nov. 20, 2021, 11:01 a.m.