R/GeneralErrorsDetection.R

Defines functions GeneralErrorsDetection

Documented in GeneralErrorsDetection

#' General Errors Detection
#'
#' @param Data Dataset (data.frame or data.table). It must contain an 'IdStem'
#'   or an 'IdTree' column holding at least one non-missing value.
#'
#' @details Detect errors
#'   - Remove **duplicated rows**
#'   - Check **missing value** in
#'      X-YTreeUTM/PlotArea/Plot/Subplot/Year/TreeFieldNum/
#'      IdTree/IdStem/Diameter/POM/HOM/Family/Genus/Species/VernName
#'   - Check **missing value** (NA/0) in the measurement variables: "Diameter",
#'      "HOM", "TreeHeight", "StemHeight"
#'   - Check of the **unique association of the IdTree with plot, subplot**
#'      **and TreeFieldNum** (at the site scale)
#'   - Check **duplicated IdTree/IdStem** in a census (at the site scale)
#'   - Check for trees **outside the subplot** (not implemented yet)
#'   - Check **invariant coordinates per IdTree/IdStem**
#'   - Check **fix Plot and Subplot number** (not implemented yet)
#'
#'   The identifier used for the per-individual checks is 'IdStem' when that
#'   column holds at least one value, 'IdTree' otherwise. A check is skipped
#'   when one of the columns it needs is absent from the dataset.
#'
#'   The input is converted to a data.table with `setDT()` and the identifier
#'   and 'Subplot' columns are converted to character with `:=`, i.e. by
#'   reference.
#'
#' @return The input dataset (data.table) with a new *Comment* column with error
#'   type information.
#'
#' @importFrom stats na.omit
#' @importFrom utils capture.output
#'
#' @export
#'
#' @examples
#' library(data.table)
#' data("TestData")
#'
#' Rslt <- GeneralErrorsDetection(TestData)
#'
GeneralErrorsDetection <- function(
  Data
){

  #### Arguments check ####

  # Data
  if (!inherits(Data, c("data.table", "data.frame")))
    stop("Data must be a data.frame or data.table")

  # IdStem or IdTree? ---------------------------------------------------------------------------------------
  # An identifier is usable when its column exists and is not entirely NA.
  # This is validated before the columns are touched, so that the informative
  # error below is the one the user sees.
  HasIdStem <- "IdStem" %in% names(Data) && any(!is.na(Data$IdStem))
  HasIdTree <- "IdTree" %in% names(Data) && any(!is.na(Data$IdTree))

  if (!HasIdStem && !HasIdTree)
    stop("The 'IdStem' or 'IdTree' column is missing in your dataset")

  # If no IdStem take IdTree
  ID <- if (HasIdStem) "IdStem" else "IdTree"
  # ---------------------------------------------------------------------------------------------------------

  #### Function ####

  # In data.table (must happen before any ':=' so that data.frame input works)
  setDT(Data)

  Data[, (ID) := as.character(get(ID))]

  if ("Subplot" %in% names(Data))
    Data[, Subplot := as.character(Subplot)]

  # Check duplicate rows ------------------------------------------------------------------------------------
  # if there are duplicate rows, delete them

  if (anyDuplicated(Data) != 0)
    Data <- unique(Data)


  # Missing values ----------------------------------------------------------------------------------------------------
  # If the column exists, but have NA values

  # Check bota : Family/Genus/Species/ScientificName/VernName
  # Check size : Diameter, POM(?)
  Vars <- c("Plot", "Subplot", "Year", "TreeFieldNum", "IdTree", "IdStem",
            "Diameter", "POM", "HOM", "TreeHeight", "StemHeight",
            "XTreeUTM", "YTreeUTM", "Family", "Genus", "Species", "VernName")

  # Only the columns that exist, in the order of 'Vars'
  for (Var in intersect(Vars, names(Data))) {

    IsMissing <- is.na(Data[[Var]])

    if (!all(IsMissing)) { # if the column is not completely empty

      Data <- GenerateComment(Data,
                              condition = IsMissing,
                              comment = paste0("Missing value in ", Var))

    } # not empty column
  } # Vars loop

  # Data[grepl("Missing value", Comment)] # to check


  # Measurement variables = 0 -----------------------------------------------------------------------------------------

  Vars <- c("Diameter", "HOM", "TreeHeight", "StemHeight")

  for (Var in intersect(Vars, names(Data))) {

    Values <- Data[[Var]]

    # NA values are already reported by the missing value check: make sure
    # they give FALSE here rather than an NA condition.
    Data <- GenerateComment(Data,
                            condition = !is.na(Values) & Values == 0,
                            comment = paste0(Var, " cannot be 0"))
  }


  # Data[grepl("cannot be 0", Comment)] # to check



  # Check duplicated TreeFieldNum in plot-subplot association ---------------------------------------------------------
  # (disabled check, kept for reference)

  # Create "PlotSubNum" = "Site/Year/Plot/Subplot/TreeFieldNum"
  # Data[, PlotSubNum := paste(Site, Year, Plot, Subplot, TreeFieldNum, sep = "/")]
  #
  # # y = 2017
  # # p=1
  # # c= 3
  # duplicated_num <- num <- vector("character")
  #
  # # if any duplicats in this col
  # if(anyDuplicated(Data$PlotSubNum) != 0) {
  #   # For each site
  #   for (s in unique(na.omit(Data$Site))) {
  #     # For each census
  #     for (y in unique(na.omit(Data$Year))) {
  #       # For each plot
  #       for (p in unique(na.omit(Data$Plot))) {
  #         # For each Subplot in this plot
  #         for (c in unique(na.omit(Data[Data$Plot==p, Subplot]))) {
  #
  #           num <- Data[Data$Site == s & Data$Year == y
  #                       & Data$Plot == p & Data$Subplot == c]$TreeFieldNum # all the TreeFieldNum for each Plot-Subplot combination
  #
  #           # if there are several TreeFieldNum per Plot-Subplot combination
  #           if(anyDuplicated(num) != 0){
  #             duplicated_num <- unique(num[duplicated(num)])
  #
  #             Data <- GenerateComment(Data,
  #                                     condition =
  #                                       Data[,Site] == s & Data[,Year] == y
  #                                     & Data[,Plot] == p & Data[,Subplot] == c
  #                                     & Data[,TreeFieldNum] %in% duplicated_num,
  #                                     comment = "Duplicate TreeFieldNum in the same Plot and Subplot")
  #
  #             num <- vector("character")
  #
  #             warning("Duplicate TreeFieldNum(s) (",duplicated_num,") in the same Plot (",p,") and Subplot (",c,"), in ",y,"")
  #
  #           } else {num <- vector("character")}
  #         } # end subplot loop
  #       } # end plot loop
  #     } # end year loop
  #   } # end site loop
  # }
  #
  # Data[, PlotSubNum := NULL]
  # Data[TreeFieldNum == duplicated_num,.(Year = sort(Year), Plot, Subplot, TreeFieldNum, Comment)] # to check (1 duplicate)


  # Check of the unique association of the IdTree with Plot-Subplot-TreeFieldNum, at the site scale -------------------

  AssoCols <- c("IdTree", "Plot", "Subplot", "TreeFieldNum")

  # The check needs all four columns (a dataset may only have 'IdStem')
  if (all(AssoCols %in% names(Data))) {

    # For each site
    for (s in unique(na.omit(Data$Site))) {

      # '%in%' instead of '==' : rows with a missing Site give FALSE, not NA
      correspondances <- na.omit(unique(
        Data[Data$Site %in% s, AssoCols, with = FALSE]
      ))

      CorresIDs <- correspondances[["IdTree"]] # one entry per distinct IdTree-P-SubP-TreeFieldNum combination

      if (anyDuplicated(CorresIDs) != 0) { # an IdTree appears in several combinations

        duplicated_ID <- unique(CorresIDs[duplicated(CorresIDs)]) # identify the Idtree(s) having several P-SubP-TreeFieldNum combinations

        Data <- GenerateComment(Data,
                                condition =
                                  Data$Site %in% s
                                & Data$IdTree %in% duplicated_ID,
                                comment = "Non-unique association of the IdTree with Plot, Subplot and TreeFieldNum")

        # Table shown in the warning: restricted to the current site, like the comment
        DuplicatedID <- unique(
          Data[Data$Site %in% s & Data$IdTree %in% duplicated_ID,
               AssoCols, with = FALSE]
        )
        DuplicatedID <- DuplicatedID[order(IdTree)]

        PrintedLines <- paste0(capture.output(DuplicatedID), "\n")

        warning("Non-unique association of IdTree(s) with Plot, Subplot and TreeFieldNum:\n", PrintedLines, "\n")

      }
    } # end site loop
  }

  # unique(Data[IdTree %in% duplicated_ID,
  #             .(IdTree = sort(IdTree), Plot, Subplot, TreeFieldNum, Comment)]) # to check


  # Check duplicated IdTree/IdStem in a census ------------------------------------------------------------------------
  if ("Year" %in% names(Data)) {

    Keys <- Data[, c(ID, "Year"), with = FALSE]
    DuplicatedKeys <- Keys[duplicated(Keys)]

    if (nrow(DuplicatedKeys) > 0) {

      # "ID/Year" code of the duplicated measurements. It is kept in a local
      # vector so that no temporary column is written in (and then removed
      # from) the dataset.
      DuplicatedIDYear <- paste(DuplicatedKeys[[ID]], DuplicatedKeys[["Year"]], sep = "/")

      IsDuplicatedRow <- function(D)
        paste(D[[ID]], D[["Year"]], sep = "/") %in% DuplicatedIDYear

      Data <- GenerateComment(Data,
                              condition = IsDuplicatedRow(Data),
                              comment = paste0("Duplicated '", ID, "' in the census"))

      # Columns shown in the warning (only those available in the dataset)
      ShownCols <- intersect(c("Year", "Plot", "Subplot", "TreeFieldNum", ID),
                             names(Data))

      DuplicatedRows <- Data[IsDuplicatedRow(Data), ShownCols, with = FALSE]
      DuplicatedRows <- DuplicatedRows[order(get(ID), Year)]

      PrintedLines <- paste0(capture.output(DuplicatedRows), "\n")

      warning("Duplicated '", ID, "' in the census:\n", PrintedLines, "\n")
      warning("If these duplicates are normal in your protocol (several measurements per year), you can leave your dataset like that,
the corrections taking into account only 1 measurement per year will consider the data to correct according to the 'KeepMeas' argument.
If these duplicates are abnormal according to your protocol, we advise you to treat them before applying the corrections.")
    }
  }

  # Data[grepl("Duplicated", Comment)] # to check


  # Check for trees outside the subplot (TODO) ------------------------------------------------------------------------
  # Compare PlotArea with the area of the MCP (Minimum Convex Polygon) of the trees inside the plot.
  # If MCP area > x% of PlotArea -> error


  # Check invariant coordinates per IdTree/IdStem ---------------------------------------------------------------------

  CoordCols <- c("XTreeUTM", "YTreeUTM")

  if (all(CoordCols %in% names(Data))) {

    # For each site
    for (s in unique(na.omit(Data$Site))) {

      CoordIDCombination <- na.omit(unique(
        Data[Data$Site %in% s, c(ID, CoordCols), with = FALSE]
      ))

      CorresIDs <- CoordIDCombination[[ID]] # one entry per distinct ID-XTreeUTM-YTreeUTM combination

      if (anyDuplicated(CorresIDs) != 0) { # an ID appears with several coordinates

        duplicated_ID <- unique(CorresIDs[duplicated(CorresIDs)]) # identify the ID(s) having several X-YTreeUTM combinations

        Data <- GenerateComment(Data,
                                condition =
                                  Data$Site %in% s
                                & Data[[ID]] %in% duplicated_ID,
                                comment = paste0("Different coordinates (XTreeUTM, YTreeUTM) for a same '", ID,"'"))

        # Collapse the IDs: otherwise paste0() returns one message per ID and
        # warning() glues them together without any separator.
        warning(paste0("Different coordinates (XTreeUTM, YTreeUTM) for a same '", ID,"' (",
                       paste(duplicated_ID, collapse = ", "), ")"))

      }
    } # end site loop
  }

  # unique(Data[IdTree %in% duplicated_ID,
  #             .(IdTree = sort(IdTree), XTreeUTM, YTreeUTM, Comment)]) # to check


  # Check fix Plot and Subplot number (TODO) --------------------------------------------------------------------------
  # warn when the number of subplots/plots varies between years


  return(Data)
}
VincyaneBadouard/TreeData documentation built on Jan. 4, 2024, 2:56 a.m.