# R/1_construct_database.R

# -----------------------
# Import data from folder
# -----------------------
import_data <- function(folder) {
  files <- list.files(folder)
  data <- lapply(paste0(folder,"/",files), read.csv, strip.white = T)
  names(data) <- files
  return(data)
}
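
# Example (illustrative; assumes a folder "inputfiles/Stations" containing CSV files):
# station_data <- import_data("inputfiles/Stations")
# names(station_data)  # one list element per CSV file, named after the file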

# -----------------------
# Get attributes
# -----------------------

# Get required attributes
get_required_att <- function(my_attributes) {
  req_att <- my_attributes %>%
    dplyr::filter(Required_or_Optional == "Required")
  return(req_att)
}

# Get alternative attributes
get_alternative_att <- function(my_attributes) {
  alt_att <- my_attributes %>%
    dplyr::filter(Required_or_Optional == "Required_alternative") %>%
    dplyr::group_by(Group) %>%
    dplyr::group_split()
  return(alt_att)
}

# Get attributes with datatype double
get_doubles_att <- function(my_attributes) {
  doubles_attributes <- my_attributes %>%
    dplyr::filter(Datatype == "Double")
  return(doubles_attributes)
}

# Get attributes with datatype integer
get_int_att <- function(my_attributes){
  int_att <- my_attributes %>%
    dplyr::filter(Datatype == "Integer")
  return(int_att)
}

get_fraction_att <- function(my_attributes){
  frac_att <- my_attributes %>%
    dplyr::filter(Unit == "Fraction")
  return(frac_att)
}

get_boolean_att <- function(my_attributes){
  bool_att <- my_attributes %>%
    dplyr::filter(Datatype == "Boolean")
  return(bool_att)
}

get_predefined_att <- function(my_attributes){
  remove_str <- "Predefined: "
  split_at <- ", "
  units <- my_attributes$Unit %>%
    .[grep(remove_str, .)] %>%
    gsub(remove_str, "", .) %>%
    strsplit(., split_at)
  names(units) <- my_attributes$Attribute[grep(remove_str, my_attributes$Unit)]
  return(units)
}
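
# Example (illustrative): assuming the attributes table lists an attribute Weight_type
# with Unit "Predefined: Entry, Sample", the returned list would contain
# units$Weight_type == c("Entry", "Sample").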

# --------------------------
# Check presence attributes
# --------------------------
are_required_att_present <- function(file, file_name, required_attributes){
  att_is_missing <- !required_attributes$Attribute %in% colnames(file)
  if(TRUE %in% (att_is_missing)){
    stop(paste0("The required attribute(s) ", paste(required_attributes$Attribute[att_is_missing], collapse = ", "),
                " is/are missing in the file ", file_name))
  }
}

are_alternative_required_att_present <- function(file, file_name, alternative_attributes){
  # Is at least one of the alternatives for required fields present?
  if(length(alternative_attributes) > 0){
    present <- FALSE
    for(j in 1:length(alternative_attributes)){
      att_is_missing <- !alternative_attributes[[j]]$Attribute %in% colnames(file)
      if(all(!att_is_missing)){
        present <- TRUE
      }
    }
    if(!present) {
      stop(paste0("In ", file_name," please provide ",paste(alternative_attributes[[1]]$Attribute, collapse = ", ")," or an alternative set of required variables."))
    }
  }
}

are_att_names_valid <- function(file, file_name, my_attributes){
  # Are all provided attributes names (incl. optional) valid?
  att_is_invalid <- !colnames(file) %in% my_attributes$Attribute
  if(TRUE %in% att_is_invalid){
    stop(paste0("The attribute name(s) ",
                paste(colnames(file)[att_is_invalid], collapse = ", ") ,
                " in file ",file_name, " is/are not valid."))
  }
}

# -----------------------
# Check values validity
# -----------------------

are_IDs_unique <- function(file, file_name, ID_column = NULL){
  if(is.null(ID_column) || !(ID_column %in% colnames(file))){
    stop("Unknown or missing ID_column given.")
  }
  ID_is_duplicated <- duplicated(file[,ID_column])
  if(TRUE %in% ID_is_duplicated){
    stop(paste0("The ID(s) ", paste(file[ID_is_duplicated, ID_column], collapse = ", "),
                 " in file ",file_name," is/are duplicated. IDs must be unique."))
  }
}

are_required_att_complete <- function(file, file_name, required_attributes){
  # No NA values are allowed in the required columns.
  columns <- colnames(file) %in% required_attributes$Attribute
  subset <- file[,columns, drop = F]
  if(TRUE %in% is.na(subset)){
    rowi <- unique(which(is.na(subset), arr.ind = T)[,"row"]+1)
    stop(paste0("NA values are not allowed for the required attributes ",
                paste(required_attributes$Attribute, collapse = ", "),
                ". Please check the file ", file_name,
                " row(s): ",paste(sort(unique(rowi)), collapse = ", ")))
  }
}

are_alternative_required_att_complete <- function(file, file_name, alternative_attributes){
  if(length(alternative_attributes) > 0){
    # No NA values are allowed in at least one of the required alternatives
    NA_index <- list()
    for(j in 1:length(alternative_attributes)){
      columns <- colnames(file) %in% alternative_attributes[[j]]$Attribute
      NA_index[[j]] <- unique(which(is.na(file[,columns, drop = F]), arr.ind = TRUE)[,"row"])
    }
    if(TRUE %in% duplicated(unlist(NA_index))){
      stop(paste0("Every entry must have at least one complete set of alternative required fields. ",
                  "Duplicate NAs are found in row(s) ",
                  paste(sort(unlist(NA_index)[duplicated(unlist(NA_index))]+1), collapse = ", "),
                  " in file: ", file_name))
    }
  }
}

are_groups_complete <- function(file, file_name, my_attributes){
  my_groups <- my_attributes %>%
    dplyr::filter(!is.na(Group)) %>%
    dplyr::group_by(Group) %>%
    dplyr::group_split()

  for(my_group in my_groups){
    is_group_included <- colnames(file) %in% my_group$Attribute
    if(TRUE %in% is_group_included &&
       length(which(is_group_included)) != length(my_group$Attribute)){
      stop(paste0("In file ",file_name," the attributes ",
                  paste(my_group$Attribute, collapse = ", "),
                  " are needed but only ",
                  paste(colnames(file)[is_group_included], collapse = ", ")," are given."))
    }
  }
}

are_dates_converted <- function(file, file_name){
  # All dates should be convertible from a string dd/mm/yyyy to R data format.
  is_unconverted_date <- is.na(file$Date)
  if(TRUE %in% is_unconverted_date){
    stop(paste0("The date in row(s) ",paste(sort(which(is_unconverted_date)+1), collapse = ", "),
                " in file ", file_name,
                " cannot be converted. Ensure all dates have the right format: dd/mm/yyyy."))
  }
}

is_time_format_correct <- function(file, file_name){
  is_wrong_time_start <- gsub("[0-2][0-9]:[0-5][0-9]:[0-5][0-9]","", file$Time_start) != ""
  is_wrong_time_stop <- gsub("[0-2][0-9]:[0-5][0-9]:[0-5][0-9]","", file$Time_stop) != ""
  if(TRUE %in% is_wrong_time_stop){
    is_wrong_time_stop <- !all(is.na(file$Time_stop))
  }
  if(TRUE %in% is_wrong_time_start | TRUE %in% is_wrong_time_stop){
    stop(paste0("The time in column 'Time_start' and/or 'Time_stop' and row(s) ",
                paste(sort(which(c(is_wrong_time_start, is_wrong_time_stop))+1), collapse = ", "),
                " in file ", file_name, " should be in the format 'HH:MM:SS'."))
  }
}
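
# Example (illustrative): "08:30:00" matches the expected 'HH:MM:SS' pattern above,
# whereas "8:30" or "99:99:99" would be flagged as wrongly formatted.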

is_Station_objective_correct <- function(file, file_name){
  incomplete <- (file$Station_objective == "Incomplete")
  no_excl <- is.null(file$Excluded)
  no_focus <- is.null(file$Focus)
  if(TRUE %in% incomplete){
    if(no_excl & no_focus){
      stop(paste0("In file ",file_name,", some entries in column 'Station_objective' are 'Incomplete', but the necessary columns 'Excluded' or 'Focus' do not exist."))
    }else if(no_excl){
      incomplete_and_NA <- which(incomplete & is.na(file$Focus))
      if(length(incomplete_and_NA) > 0){
        stop(paste0("In file ",file_name,", the entries in row(s) ",
                    paste(sort(incomplete_and_NA+1), collapse = ", "),
                    " are defined 'Incomplete' but no taxa are provided in the column 'Focus' or 'Excluded'."))
      }
    }else if(no_focus){
      incomplete_and_NA <- which(incomplete & is.na(file$Excluded))
      if(length(incomplete_and_NA) > 0){
        stop(paste0("In file ",file_name,", the entries in row(s) ",
                    paste(sort(incomplete_and_NA+1), collapse = ", "),
                    " are defined 'Incomplete' but no taxa are provided in the column 'Focus' or 'Excluded'."))
      }
    }else{
      incomplete_and_NA <- which(incomplete & is.na(file$Excluded) & is.na(file$Focus))
      if(length(incomplete_and_NA) > 0){
        stop(paste0("In file ",file_name,", the entries in row(s) ",
                    paste(sort(incomplete_and_NA+1), collapse = ", "),
                    " are defined 'Incomplete' but no taxa are provided in the column 'Focus' or 'Excluded'."))
      }
    }
  }
}

are_values_correct_type <- function(file, file_name, type_attributes, func){
  columns <- which(colnames(file) %in% type_attributes$Attribute)
  if(length(columns) > 0){
    for(j in 1:length(columns)){
      if(!func(file[,columns[j]])){
        stop(paste0("The column ", colnames(file)[columns[j]], " in file ",file_name,
                    " must contain only values of datatype ",type_attributes$Datatype[1]))
      }
    }
  }
}

are_booleans_correct <- function(file, file_name, boolean_attributes){
  if(length(boolean_attributes$Attribute) > 0){
    columns <- which(colnames(file) %in% boolean_attributes$Attribute)
    subset <- file[,columns, drop = F]
    if(dim(subset)[2] != 0){
      is_not_boolean <- which(!is.na(subset) & subset != 0 & subset != 1, arr.ind = T)[,"row"]+1
      if(length(is_not_boolean) > 0){
        stop(paste0("In file ",file_name,
                    " the boolean attributes ", paste(colnames(file)[columns], collapse = ", "),
                    " may only be 0, 1, or NA. Please check row(s) ",
                    paste(sort(is_not_boolean), collapse = ", ")))
      }
    }
  }
}

are_fractions_correct <- function(file, file_name, fraction_attributes){
  if(length(fraction_attributes$Attribute) > 0){
    columns <- which(colnames(file) %in% fraction_attributes$Attribute)
    subset <- file[,columns, drop = F]
    is_not_fraction <- which(subset < 0 | subset > 1, arr.ind = T)[,"row"]+1
    if(length(is_not_fraction) > 0){
      stop(paste0("In file ",file_name,
                  " the attributes ", paste(colnames(file)[columns], collapse = ", "),
                  " should lie between 0 and 1. Please check row(s) ",
                  paste(sort(is_not_fraction), collapse = ", ")))
    }
  }
}

are_predefined_atts_correct <- function(file, file_name, predefined_attributes){
  for(i in 1:length(predefined_attributes)){
    att <- names(predefined_attributes)[i]
    if(!att %in% colnames(file)){
      next
    }else{
      subset <- file[,att]
      is_not_predefined <- which(!subset %in% predefined_attributes[[i]] & !is.na(subset))+1
      if(length(is_not_predefined) > 0){
        stop(paste0(
          "In file ",file_name," in column ",att,
          " values exist that are not the predefined values ",
          paste(predefined_attributes[[i]], collapse = ", "),
          " or NA in row(s): ",paste(sort(is_not_predefined), collapse = ", ")
        ))
      }
    }
  }
}

are_species_metadata_consistent <- function(file, file_name){
  n_unique_Expect <- file %>%
    dplyr::group_by(Species_reported, StationID) %>%
    dplyr::filter(Count > -1) %>%
    dplyr::select(Species_reported, StationID) %>%
    dplyr::distinct()

  n_unique_Obs <- file %>%
    dplyr::group_by(Species_reported, StationID) %>%
    dplyr::filter(Count > -1) %>%
    dplyr::select(Species_reported, StationID, Fraction, is_Fraction_assumed) %>%
    dplyr::distinct() %>%
    dplyr::mutate(mistake_count = dplyr::n())

  if(dim(n_unique_Expect)[1] != dim(n_unique_Obs)[1]){
    print(dplyr::filter(n_unique_Obs, mistake_count > 1))
    stop(paste0("In file ", file_name," the reported Fraction and is_Fraction_assumed must be equal for all species in a sample."))
  }
}

is_biomass_complete <- function(file, file_name){
  # WW
  if("WW_g" %in% colnames(file)){
    should_be_complete <- which(!is.na(file$WW_g))
    if(length(should_be_complete) > 0){
      isNA <- is.na(file[should_be_complete,
                         c("Weight_type", "Threshold_scale",
                           "is_Shell_removed", "is_Partial_WW")])
      if(TRUE %in% isNA){
        stop("Wet weight is reported, but the columns Weight_type, Threshold_scale, isWithShell and is_Partial_WW are not filled for all weights.")
      }
    }
    # Entry weight type
    should_be_empty <- which(is.na(file$WW_g) & file$Weight_type == "Entry")
    if(length(should_be_empty) > 0){
      stop(paste0("If WW_g is 'NA' then Weight_type cannot be 'Entry'. Check row(s): ",
                  paste0(should_be_empty+1, collapse = ",")))
    }
    # Sample weight type
    if("Sample" %in% file$Weight_type){
      check <- file %>%
        dplyr::group_by(StationID, Species_reported) %>%
        dplyr::filter(Weight_type == "Sample") %>%
        dplyr::summarise(
          nr_rows = dplyr::n(),
          nr_NAs = length(which(is.na(WW_g)))
        ) %>%
        dplyr::mutate(incorrect_weight_type = ifelse(nr_NAs < nr_rows, FALSE, TRUE)) %>%
        dplyr::filter(incorrect_weight_type == TRUE)
      if(dim(check)[1] > 0){
        print(check)
        stop("For the station-species combination as printed the WW_g of all 'Sample' entries is NA. Either include a value for WW_g or set Weight_type to 'NA'.")
      }
    }
  }


  # AFDW
  if("AFDW_g" %in% colnames(file)){
    should_be_complete <- which(!is.na(file$AFDW_g))
    if(length(should_be_complete) > 0){
      isNA <- is.na(file[should_be_complete,
                         c("Weight_type_AFDW", "Threshold_scale_AFDW",
                           "is_Partial_AFDW")])
      if(TRUE %in% isNA){
        stop("AFDW is reported, but the columns Weight_type_AFDW, Threshold_scale_AFDW, and is_Partial_AFDW are not filled for all weights.")
      }
    }

    # Entry weight type
    should_be_empty <- which(is.na(file$AFDW_g) & file$Weight_type_AFDW == "Entry")
    if(length(should_be_empty) > 0){
      stop(paste0("If AFDW_g is 'NA' then Weight_type_AFDW cannot be 'Entry'. Check row(s): ",
                  paste0(should_be_empty+1, collapse = ",")))
    }
    # Sample weight type
    if("Sample" %in% file$Weight_type_AFDW){
      check <- file %>%
        dplyr::group_by(StationID, Species_reported) %>%
        dplyr::filter(Weight_type_AFDW == "Sample") %>%
        dplyr::summarise(
          nr_rows = dplyr::n(),
          nr_NAs = length(which(is.na(AFDW_g)))
        ) %>%
        dplyr::mutate(incorrect_weight_type = ifelse(nr_NAs < nr_rows, FALSE, TRUE)) %>%
        dplyr::filter(incorrect_weight_type == TRUE)
      if(dim(check)[1] > 0){
        print(check)
        stop("For the station-species combination as printed the AFDW_g of all 'Sample' entries is NA. Either include a value for AFDW_g or set Weight_type_AFDW to 'NA'.")
      }
    }
  }
}

do_measurements_have_units <- function(file, file_name){
  measurements <- which(!is.na(file$Size_value))
  # Report row numbers of the original file (+1 accounts for the header row).
  no_units <- measurements[is.na(file$Size_unit[measurements])]+1
  no_dimensions <- measurements[is.na(file$Size_dimension[measurements])]+1
  if(length(no_units) > 0){
    stop(paste0("In file ",file_name,
                " size units are missing in row(s) ",
                paste(sort(no_units), collapse = ", ")))
  }
  if(length(no_dimensions) > 0){
    stop(paste0("In file ",file_name,
                " size dimensions are missing in row(s) ",
                paste(sort(no_dimensions), collapse = ", ")))
  }

  no_measurements <- which(is.na(file$Size_value))
  but_units <- no_measurements[!is.na(file$Size_unit[no_measurements])]+1
  but_dimensions <- no_measurements[!is.na(file$Size_dimension[no_measurements])]+1
  if(length(but_units) > 0){
    stop(paste0("In file ",file_name,
                " size units are reported without a size value in row(s) ",
                paste(sort(but_units), collapse = ", ")))
  }
  if(length(but_dimensions) > 0){
    stop(paste0("In file ",file_name,
                " size dimensions are reported without a size value in row(s) ",
                paste(sort(but_dimensions), collapse = ", ")))
  }
}

are_measurements_positive <- function(file, file_name){
  # Stations
  tl <- file$Track_length_m_preset
  bd <- file$Blade_depth_cm
  bw <- file$Blade_width_cm
  ts <- file$Tow_speed_knots
  wd <- file$Water_depth_m_cruise
  oc <- file$Odometer_count
  br <- file$Bearing
  if(!is.null(tl)){
    tl <- tl  < 0
    if(TRUE %in% tl){
      stop(paste0("In file ",file_name," column Track_length_cruise_m the values in row(s) ",
                  paste(sort(which(tl)+1), collapse = ", "), " are negative but should be positive."))
    }
  }
  if(!is.null(bd)){
    bd <- bd < 0
    if(TRUE %in% bd){
      stop(paste0("In file ",file_name," column Blade_depth_cm the values in row(s) ",
                  paste(sort(which(bd)+1), collapse = ", "), " are negative but should be positive."))
    }
  }
  if(!is.null(bw)){
    bw <- bw < 0
    if(TRUE %in% bw){
      stop(paste0("In file ",file_name," column Blade_width_cm the values in row(s) ",
                  paste(sort(which(bw)+1), collapse = ", "), " are negative but should be positive."))
    }
  }
  if(!is.null(ts)){
    ts <- ts < 0
    if(TRUE %in% ts){
      stop(paste0("In file ",file_name," column Tow_speed_knots the values in row(s) ",
                  paste(sort(which(ts)+1), collapse = ", "), " are negative but should be positive."))
    }
  }
  if(!is.null(wd)){
    wd <- wd < 0
    if(TRUE %in% wd){
      stop(paste0("In file ",file_name," column Water_depth_m_cruise the values in row(s) ",
                  paste(sort(which(wd)+1), collapse = ", "), " are negative but should be positive."))
    }
  }
  if(!is.null(oc)){
    oc <- oc < 0
    if(TRUE %in% oc){
      stop(paste0("In file ",file_name," column Odometer_count the values in row(s) ",
                  paste(sort(which(oc)+1), collapse = ", "), " are negative but should be positive."))
    }
  }
  if(!is.null(br)){
    br <- abs(br) > 180
    if(TRUE %in% br){
      stop(paste0("In file ",file_name," column Bearing the absolute values in row(s) ",
                  paste(sort(which(br)+1), collapse = ", "),
                  " are larger than 180."))
    }
  }

  # Species
  ct <- file$Count
  sv <- file$Size_value
  ww <- file$WW_g
  aw <- file$AFDW_g
  tw <- file$Threshold_scale
  ta <- file$Threshold_scale_AFDW

  if(!is.null(ct)){
    ct_l <- ct < -1
    if(TRUE %in% ct_l){
      stop(paste0("In file ",file_name," column Count the values in row(s) ",
                  paste(sort(which(ct_l)+1), collapse = ", "), " are negative but should be positive."))
    }
    ct_l <- ct == 0
    if(TRUE %in% ct_l){
      stop(paste0("In file ",file_name," column Count the values in row(s) ",
                  paste(sort(which(ct_l)+1), collapse = ", "), " are zero. Please check for typos or set to 'NA' instead."))
    }
  }
  if(!is.null(sv)){
    sv <- sv < 0
    if(TRUE %in% sv){
      stop(paste0("In file ",file_name," column Size_value the values in row(s) ",
                  paste(sort(which(sv)+1), collapse = ", "), " are negative but should be positive."))
    }
  }
  if(!is.null(ww)){
    ww <- ww < 0
    if(TRUE %in% ww){
      stop(paste0("In file ",file_name," column WW_g the values in row(s) ",
                  paste(sort(which(ww)+1), collapse = ", "), " are negative but should be positive."))
    }
  }
  if(!is.null(aw)){
    aw <- aw < 0
    if(TRUE %in% aw){
      stop(paste0("In file ",file_name," column AFDW_g the values in row(s) ",
                  paste(sort(which(aw)+1), collapse = ", "), " are negative but should be positive."))
    }
  }
  if(!is.null(tw)){
    tw <- tw <= 0
    if(TRUE %in% tw){
      stop(paste0("In file ",file_name," column Threshold_scale the values in row(s) ",
                  paste(sort(which(tw)+1), collapse = ", "), " are zero or negative but should be strictly positive."))
    }
  }
  if(!is.null(ta)){
    ta <- ta <= 0
    if(TRUE %in% ta){
      stop(paste0("In file ",file_name," column Threshold_scale_AFDW the values in row(s) ",
                  paste(sort(which(ta)+1), collapse = ", "), " are zero or negative but should be strictly positive."))
    }
  }
}

#############
# pos2coord #
#############
# source: https://www.r-bloggers.com/array-position-to-matrix-coordinates-conversion/
# Function to go from position in matrix to coordinates of matrix
# position is index from 1 to length(matrix)
# coord is [row, col]
# dim.mat is c(nrow, ncol)
pos2coord <- function(pos=NULL, coord=NULL, dim.mat=NULL) {
  if((is.null(pos) & is.null(coord)) | is.null(dim.mat)){
    stop("must supply either 'pos' or 'coord', and 'dim.mat'")
  }
  if(is.null(pos) & !is.null(coord) & !is.null(dim.mat)){
    pos <- ((coord[,2]-1)*dim.mat[1])+coord[,1]
    return(pos)
  }
  if(!is.null(pos) & is.null(coord) & !is.null(dim.mat)){
    coord <- matrix(NA, nrow=length(pos), ncol=2)
    colnames(coord) <- c("row", "col")
    coord[,1] <- ((pos-1) %% dim.mat[1]) +1
    coord[,2] <- ((pos-1) %/% dim.mat[1]) +1
    return(coord)
  }
}
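
# Example (illustrative): in a 3 x 2 matrix, linear position 5 maps to row 2, column 2:
# pos2coord(pos = 5, dim.mat = c(3, 2))  # returns a 1 x 2 matrix with row = 2, col = 2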

add_missing_columns <- function(df, attributes){
  missing_columns <- attributes[which(!attributes %in% colnames(df))]
  if(length(missing_columns) > 0){
    for(i in 1:length(missing_columns)){
      df$newcol <- NA
      colnames(df)[which(colnames(df) == "newcol")] <- missing_columns[i]
    }
  }
  return(df)
}
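
# Example (illustrative): add_missing_columns(data.frame(A = 1:2), c("A", "B"))
# returns the same data frame plus an extra column B filled with NA values.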

#' Construct initial database
#'
#' This function loads CSV files, checks the data format, and constructs an initial table
#' with only raw data.
#' @param in_folder This folder must contain two subfolders called 'Species' and 'Stations' that contain
#' properly formatted CSV files. These CSV files can be requested from the NIOZ Data Archiving System.
#' Default is "inputfiles".
#' @param out_folder The function will create a new directory to store the newly created data.
#' Default is "data".
#' @param as_CSV Logical. If TRUE (the default), the combined stations and species tables are also
#' written to the out_folder as CSV files.
#' @details Make sure this function is executed in the working directory that contains the in_folder directory.
#' This function checks the data format of the CSV files in the 'Species' and 'Stations' folders
#' inside the in_folder. If the function finds any irregularities, an error is thrown.
#' @return This function does not return any objects. It throws an error if anything is wrong with
#' your initial data. If there are no errors, it stores .rda files (and optionally CSV files) in the newly created folder 'out_folder'.
#' @importFrom magrittr "%>%"
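#' @examples
#' # Illustrative sketch using the default arguments; requires properly formatted
#' # CSV files in the 'Stations' and 'Species' subfolders of in_folder.
#' \dontrun{
#' construct_database(in_folder = "inputfiles", out_folder = "data", as_CSV = TRUE)
#' }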
#' @export
construct_database <- function(in_folder = "inputfiles", out_folder = "data", as_CSV = TRUE){
  # List of tables to import and check
  tables <- list(
    Stations = list(folder = "Stations", att = "attributes_stations.csv"),
    Species = list(folder = "Species", att = "attributes_species.csv")
  )

  # Create folder to store output
  dir.create(out_folder)

  for(table in tables){
    print(paste0("Checking the files in folder ", in_folder, "/", table$folder, ". Importing data."))

    # Read in attribute requirements
    my_attributes <- read.csv(system.file("extdata", table$att, package = "TripleD")) %>%
      dplyr::filter(Attribute != "IGNORE")

    # Read in data
    data <- import_data(paste0(in_folder,"/",table$folder))

    # Subset data requirements
    required_attributes <- get_required_att(my_attributes)
    alternative_attributes <- get_alternative_att(my_attributes)
    doubles_attributes <- get_doubles_att(my_attributes)
    integer_attributes <- get_int_att(my_attributes)
    boolean_attributes <- get_boolean_att(my_attributes)
    fraction_attributes <- get_fraction_att(my_attributes)
    predefined_attributes <- get_predefined_att(my_attributes)

    # Check format requirements for each file
    for(i in 1:length(data)){
      file <- data[[i]]
      file_name <- paste(table$folder, names(data)[i], sep = "/")
      print(paste0("Checking file ", file_name))

      # Check if all cells are filled.
      # Empty cells are read in as "" or NaN and are not allowed.
      NaNs <- is.nan(unlist(file))
      emptystr <- (file == "")
      if(TRUE %in% c(NaNs, emptystr)){
        rowi <- unique(c(
          pos2coord(pos = which(NaNs), dim.mat = dim(file))[,"row"]+1,
          which(file == "", arr.ind = T)[,"row"]+1
        ))
        stop("No empty cells are allowed in the CSV. Check rows ",
             paste(sort(rowi), collapse = ", "))
      }

      # Attribute presence
      are_required_att_present(file, file_name, required_attributes)
      are_alternative_required_att_present(file, file_name, alternative_attributes)
      are_att_names_valid(file, file_name, my_attributes)

      # Correct values and units
      are_required_att_complete(file, file_name, required_attributes)
      are_alternative_required_att_complete(file, file_name, alternative_attributes)
      are_groups_complete(file, file_name, my_attributes)
      are_values_correct_type(file, file_name, doubles_attributes, func = is.double)
      are_values_correct_type(file, file_name, integer_attributes, func = is.integer)
      are_booleans_correct(file, file_name, boolean_attributes)
      are_fractions_correct(file, file_name, fraction_attributes)
      are_predefined_atts_correct(file, file_name, predefined_attributes)
      if(table$folder == "Stations"){
        are_IDs_unique(file, file_name, ID_column = "StationID")
        data[[i]]$Date <- as.Date(data[[i]]$Date, format = "%d/%m/%Y")
        are_dates_converted(file = data[[i]], file_name)
        is_time_format_correct(file, file_name)
        is_Station_objective_correct(file, file_name)
      }
      if(table$folder == "Species"){
        are_IDs_unique(file, file_name, ID_column = "EntryID")
        are_species_metadata_consistent(file, file_name)
        is_biomass_complete(file, file_name)
        do_measurements_have_units(file, file_name)
      }
      are_measurements_positive(file, file_name)
    }
    if(table$folder == "Stations"){
      stations <- dplyr::bind_rows(data, .id = "File")
      # If certain columns do not exist, create them with values NA, to prevent errors down the line.
      stations <- add_missing_columns(stations, as.character(my_attributes$Attribute))
    }
    if(table$folder == "Species"){
      species <- dplyr::bind_rows(data, .id = "File")
      # If certain columns do not exist, create them with values NA, to prevent errors down the line.
      species <- add_missing_columns(species, as.character(my_attributes$Attribute))
    }
  }

  # Booleans 1, 0 to true booleans.
  stations <- stations %>%
    dplyr::mutate(is_Quantitative = ifelse(is_Quantitative == 0, FALSE, TRUE))
  species <- species %>% dplyr::mutate(
    is_Fraction_assumed = ifelse(is_Fraction_assumed == 0, FALSE, TRUE),
    is_ID_confident = ifelse(is_ID_confident == 0, FALSE, TRUE),
    is_Shell_removed = ifelse(is_Shell_removed == 1, TRUE,FALSE),
    is_Partial_WW = ifelse(is_Partial_WW == 1, TRUE, FALSE),
    is_Partial_AFDW = ifelse(is_Partial_AFDW == 1, TRUE, FALSE),
    is_Preserved = ifelse(is_Preserved == 1, TRUE, FALSE)
  )
  species$is_Shell_removed[which(is.na(species$is_Shell_removed))] <- FALSE

  # Test if all StationIDs used in the species files are also present in the stations files.
  missing_stationIDs <- as.character(
    unique(species$StationID[which(!species$StationID %in% stations$StationID)]))
  if(length(missing_stationIDs) > 0){
    stop(paste0("The StationID(s) ", paste(missing_stationIDs, collapse = ", "), " are reported in the species file,
                   but are missing in the stations file, i.e. metadata is missing for these biological data points."))
  }
  # Test if stationIDs are unique over different files.
  ID_is_duplicated <- which(duplicated(stations$StationID))
  if(length(ID_is_duplicated) > 0){
    stop(paste0("The StationID(s)" ,paste(stations$StationID[ID_is_duplicated], collapse = ", "),
                " occur multiple times in differen files, but they must be unique. Please check."))
  }

  # Save
  save(stations, file = paste0(out_folder,"/","stations_initial.rda"))
  save(species, file = paste0(out_folder,"/","species_initial.rda"))
  if(as_CSV){
    write.csv(stations, file = paste0(out_folder, "/stations_initial.csv"))
    write.csv(species, file = paste0(out_folder, "/species_initial.csv"))
  }
  message("All good! (Warnings can probably be ignored. Review if you want to be sure.")
}