R/EPDr-standardize_methods.R

# blois_quality -----------------------------------------------

#' Blois quality index for palynological samples
#' 
#' This function apply the quality index described in Blois et al. (2013). 
#' From the Ecography 2013 paper, Appendix 3: "\emph{For each site at a particular 1 kyr
#' time period, site data-quality was calculated as the mean normalized distance
#' of the nearest pollen sample and the nearest chronological control.  We
#' calculated the distance in years of the nearest pollen sample and the nearest
#' chronological control to each 1 kyr time period.  We eliminated sites where
#' the nearest pollen sample was over 2000 years away or the nearest
#' chronological control was over 5000 years away.  For the remaining sites in
#' each 1 kyr period, we created a summary measure of site data-quality by 
#' rescaling the two distances in years to a 0 - 1 scale and calculating the
#' mean.  For example, if the nearest sample to the 1 kyr BP time period at a
#' given site was at 1.050 kyr BP and the nearest chronological control was at
#' 1.100 kyr BP, the raw distances would be 50 years and 100 years, respectively.
#' These equate to scaled values of 0.975 (i.e., 1 - 50/2000) and 0.98 (i.e., 1 -
#' 100/5000) for sample and chronological quality, respectively, with a mean
#' data-quality for this site at the 1 kyr BP time period of 0.9775.}"
#' To replicate the calculation the function allows to specify different maximum
#' distances as parameters of the function.
#' 
#' When \code{x} is an \code{\link[EPDr]{epd.entity-class}} object, it is
#' first transformed into a \code{\link[EPDr]{epd.entity.df-class}} 
#' object by \code{\link[EPDr]{entity_to_matrices}} function.
#' 
#' @param x epd.entity \code{\link[EPDr]{epd.entity.df-class}} or a
#' \code{\link[EPDr]{epd.entity-class}} object.
#' @param chronology numeric Chronology number to look for samples and control points ages.
#' This value become the default chronology in the new object. If not 
#' specified the function check the default chronology in  
#' \code{x}. It can be any of the chronologies in the EPD for that 
#' particular entity, or the one from Giesecke et al. (2013). 
#' @param max_sample_dist numeric Maximum numeric distance in years to be considered
#' to the palynological samples for interpolated or ranged data.
#' @param max_control_dist numeric Maximum numeric distance in years to be considered
#' to the control points (e.g., C14, top, bottom, etc.).
#' @param overwrite logical TRUE or FALSE indicating whether to overwrite
#' blois index in @@agesdf@@dataquality if it already has a 'blois' column.
#'
#' @return \code{\link[EPDr]{epd.entity.df-class}} object with no empty
#' \code{@@agesdf@@dataquality} slot. The default chronology in \code{x@@defaultchron}
#' is changed to the one specified in \code{chronology}. 
#' 
#' @references Blois, J.L., Williams, J.W., Fitzpatrick, M.C., Ferrier, S.,
#' Veloz, S.D., He, F., Liu, Z., Manion, G., and Otto-Bliesner, B. (2013).
#' Modeling the climatic drivers of spatial patterns in vegetation composition
#' since the Last Glacial Maximum. Ecography, 36, 460-473.
#' @references Giesecke, T., Davis, B., Brewer, S., Finsinger, W., 
#' Wolters, S., Blaaw, M., de Beaulieu, J.L., Binney, H., Fyfe, R.M.,
#' Gaillard, M.J., Gil-Romera, G., van der Knaap, W.O. Kunes, P.,
#' Kuhl, N., van Leeuwen, J.F.N, Leydet, M., Lotter, A.F., Ortu, E.,
#' Semmler, M., and Bradshaw, R.H.W (2013). Towards mapping the late
#' Quaternary vegetation change of Europe. Vegetation History and
#' Archaeobotany, 23, 75-86.
#'
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1.qi <- blois_quality(epd.1)
#' epd.1.qi@agesdf@dataquality
#' 
#' epd.1.ran <- intervals_counts(epd.1, tmin = seq(0, 21000, by = 1000),
#'                               tmax = seq(999, 21999, by = 1000))
#' epd.1.ran.qi <- blois_quality(epd.1.ran)
#' epd.1.ran.qi@agesdf@dataquality
#' 
#' t <- c(seq(0, 21000, by = 500))
#' epd.1.int <- interpolate_counts(epd.1, t)
#' epd.1.int.qi <- blois_quality(epd.1.int)
#' epd.1.int.qi@agesdf@dataquality
#' }
#' @rdname blois_quality
#' @exportMethod blois_quality
setGeneric("blois_quality", function(x,
                                     chronology = NULL,
                                     max_sample_dist = 2000,
                                     max_control_dist = 5000,
                                     overwrite = FALSE){
  standardGeneric("blois_quality")
})

#' @rdname blois_quality
setMethod("blois_quality", signature(x = "epd.entity.df"),
          function(x,
                   chronology,
                   max_sample_dist,
                   max_control_dist,
                   overwrite){
            if ("blois" %in% colnames(x@agesdf@dataquality) & overwrite == FALSE){
              stop(paste0("`x` already has blois quality index calculated ",
                          "and 'overwrite = FALSE'. If you need to ",
                          "calculate blois index again, make sure ",
                          "to remove the 'blois' column in the ",
                          "@agesdf@dataquality slot or turn ",
                          "'overwrite = TRUE."))
            }
            if (is.null(chronology)){
              if (!check_defaultchron(x)){
                stop(paste0("The 'x' object has no sample ages ",
                            "and the quality index cannot be calculated."))
              }
              chron_ <- x@defaultchron
            }
            if (chron_ == 9999){
              chronology <- "giesecke"
            }else{
              chronology <- chron_
            }
            index <- which(x@samples@pagedpt$chron_ == chron_)
            sample_ages <- x@samples@pagedpt[index, "agebp"]
            data_ages <- subset(x@agesdf@depthages, select = chronology)
            index <- which(x@chron@agebasis$chron_ == chron_)
            agebasis <- x@chron@agebasis[index, ]
            .mindiff <- function(z, w, max_diff){
              if (length(w) == 0){
                sorted.w <- NA
              }else{
                sorted.w <- sort(w)
              }
              if (length(sorted.w) == 1){
                mindist <- abs(z - sorted.w)
              }else{
                myfun <- stats::stepfun(sorted.w,
                                        0:length(sorted.w))
                indices <- pmin(pmax(1, myfun(z)),
                                length(sorted.w) - 1)
                mindist <- pmin(abs(z - sorted.w[indices]),
                                abs(z - sorted.w[indices + 1]))
              }
              if (!is.null(max_diff)){
                mindist <- pmin(mindist, max_diff)
              }
              return(mindist)
            }
            control_dist <- apply(data_ages,
                                  MARGIN = 2,
                                  FUN = .mindiff,
                                  agebasis[, "age"],
                                  max_control_dist)
            sample_dist <- apply(data_ages,
                                 MARGIN = 2,
                                 FUN = .mindiff,
                                 sample_ages,
                                 max_sample_dist)
            control_dist <- 1 - (control_dist / max_control_dist)
            sample_dist <- 1 - (sample_dist / max_sample_dist)
            data.quality <- (control_dist + sample_dist) / 2
            data.quality <- as.data.frame(data.quality)
            colnames(data.quality) <- "blois"
            if (nrow(x@agesdf@dataquality) == 0){
              x@agesdf@dataquality <- data.quality
            }else{
              if ("blois" %in% colnames(x@agesdf@dataquality) & overwrite == TRUE){
                x@agesdf@dataquality$blois <- data.quality$blois
              }else{
                x@agesdf@dataquality <- cbind(x@agesdf@dataquality,
                                              data.quality)
              }
            }
            return(x)
          })

#' @rdname blois_quality
setMethod("blois_quality", signature(x = "epd.entity"),
          function(x,
                   chronology,
                   max_sample_dist,
                   max_control_dist,
                   overwrite){
            x <- entity_to_matrices(x)
            x <- blois_quality(x,
                               chronology,
                               max_sample_dist,
                               max_control_dist,
                               overwrite)
            return(x)
          })


# check_restriction --------------------------------------------------------

#' Check restrictions on EPDr objects
#' 
#' This function check EPDr objects (\code{\link[EPDr]{epd.entity-class}},
#' and \code{\link[EPDr]{epd.entity.df-class}}), 
#' and return a logical value indicating whether the data are restricted (TRUE) or
#' unrestricted (FALSE).
#'
#' @param x epd.entity \code{\link[EPDr]{epd.entity-class}} or
#' \code{\link[EPDr]{epd.entity.df-class}} object.
#'
#' @return logical value indicating whether the data are restricted (TRUE) or
#' unrestricted (FALSE).
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' check_restriction(epd.1)
#' epd.1046 <- get_entity(1046, epd.connection)
#' check_restriction(epd.1046)
#' }
#' @rdname check_restriction
#' @exportMethod check_restriction
setGeneric("check_restriction", function(x){
  standardGeneric("check_restriction")
})

#' @rdname check_restriction
setMethod("check_restriction", signature(x = "epd.entity"), function(x){
  if (x@entity@pentity$usestatus == "R"){
    return(TRUE)
  }else{
    return(FALSE)
  }
})


# check_defaultchron --------------------------------------------------

#' Check default chronology on EPDr objects
#' 
#' The function check EPDr objects with slot @@defaultchron 
#' (\code{\link[EPDr]{epd.entity-class}}, and
#' \code{\link[EPDr]{epd.entity.df-class}}) to see if there are a default
#' chronology specified.
#'
#' @param x epd.entity \code{\link[EPDr]{epd.entity-class}} or
#' \code{\link[EPDr]{epd.entity.df-class}} objects.
#'
#' @return Logical value indicating whether the object has a default 
#' chronology (TRUE) or not (FALSE).
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' check_defaultchron(epd.1)
#' }
#' @rdname check_defaultchron
#' @exportMethod check_defaultchron
setGeneric("check_defaultchron", function(x){
  standardGeneric("check_defaultchron")
})

#' @rdname check_defaultchron
setMethod("check_defaultchron", signature(x = "epd.entity"), function(x){
  if (length(x@defaultchron) == 0){
    return(FALSE)
  }else{
    if (x@defaultchron == 0){
      return(FALSE)
    }else{
      return(TRUE)
    }
  }
})


# check_valid_defaultchron --------------------------------------------------

#' Check that default chronology on EPDr objects is valid
#' 
#' The function check EPDr objects with slot @@defaultchron 
#' (\code{\link[EPDr]{epd.entity-class}}, and
#' \code{\link[EPDr]{epd.entity.df-class}}) to see if that chronology
#' has ages for the counts samples.
#'
#' @param x epd.entity \code{\link[EPDr]{epd.entity-class}} or
#' \code{\link[EPDr]{epd.entity.df-class}} objects.
#'
#' @return Logical value indicating whether the default chronology has ages 
#' (TRUE) or not (FALSE).
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' check_valid_defaultchron(epd.1)
#' }
#' @rdname check_valid_defaultchron
#' @exportMethod check_valid_defaultchron
setGeneric("check_valid_defaultchron", function(x){
  standardGeneric("check_valid_defaultchron")
})

#' @rdname check_valid_defaultchron
setMethod("check_valid_defaultchron", signature(x = "epd.entity"), function(x){
  chron <- x@defaultchron
  chron_pagedpt <- x@samples@pagedpt$chron
  return(chron %in% chron_pagedpt)
})


# check_counts --------------------------------------------------

#' Check counts data on EPDr objects
#' 
#' The function check \code{\link[EPDr]{epd.entity.df-class}} objects (with
#' slot @@commdf@@counts) to see if there are counts data on it.
#'
#' @param x epd.entity.df \code{\link[EPDr]{epd.entity.df-class}} object.
#'
#' @return Logical value indicating whether the object has a default 
#' chronology (TRUE) or not (FALSE).
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' check_counts(epd.1)
#' epd.763 <- get_entity(763, epd.connection)
#' check_counts(epd.763)
#' }
#' @rdname check_counts
#' @exportMethod check_counts
setGeneric("check_counts", function(x){
  standardGeneric("check_counts")
})

#' @rdname check_counts
setMethod("check_counts", signature(x = "epd.entity.df"), function(x){
  if (nrow(x@commdf@counts) == 0){
    return(FALSE)
  }else{
    if (ncol(x@commdf@counts) == 0){
      return(FALSE)
    }else{
      return(TRUE)
    }
  }
})


# counts_to_percentages -------------------------------------------------------

#' Calculate counts percentages
#'
#' This function transforms the counts matrix (samples x taxa) in
#' the @@commdf@@counts slot of a \code{\link[EPDr]{epd.entity.df-class}}
#' object to percentages relative to the total amount of particles 
#' in each sample (row).
#' 
#' Although the function work with \code{\link[EPDr]{epd.entity.df-class}} and
#' \code{\link[EPDr]{epd.entity-class}} objects, the later are previously 
#' transformed using \code{\link[EPDr]{entity_to_matrices}} without any
#' posterior data manipulation and transformation. A lot of entities have
#' information from different taxonomical groups (terrestrial plants pollen 
#' and algae or fungi spores). Hence, percentages calculated in this base
#' may be meaningless and its use is unrecommended.
#'
#' @param x epd.entity \code{\link[EPDr]{epd.entity.df-class}} or 
#' \code{\link[EPDr]{epd.entity-class}} objects.
#' @param offset numeric Numeric value to avoid dividing by zero 
#' if the total of particles is zero for a particular sample. This may 
#' happen when \code{x} has been filtered by taxa groups. For instance, 
#' if only trees and shrubs ("TRSH") are kept in the \code{x@@commdf@counts} 
#' slot some sites in the last glacial maximum could have no pollen at all, 
#' because all vegetation was herbaceos or small shrubs. The effect of the
#' offset is negligible because it only affect pollen counts of zero. The
#' result is to indicate that when counts are present, the results is 
#' numerical zero and not NA. This is relevant to discriminate sites and
#' times combinations where pollen counts are not available (NA) and not
#' numerically zero.
#' 
#' @return The function returns a \code{\link[EPDr]{epd.entity.df-class}} object, 
#' which is a modified version of \code{x} (or \code{entity_to_matrices(x)}
#' if x is a \code{\link[EPDr]{epd.entity-class}} object), in which
#' values in @@counts slot represent percentages instead of raw counts.
#' 
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1 <- entity_to_matrices(epd.1)
#' epd.1 <- filter_taxagroups(epd.1, c("DWAR", "HERB", "LIAN", "TRSH",
#'                            "UPHE", "INUN")) ## All pollen taxa groups
#' epd.1.percent <- counts_to_percentages(epd.1)
#' head(epd.1.percent@commdf@counts)
#' head(epd.1@commdf@counts)
#' }
#' @rdname counts_to_percentages
#' @exportMethod counts_to_percentages
setGeneric("counts_to_percentages", function(x, offset = 0.0001){
  standardGeneric("counts_to_percentages")
})


#' @rdname counts_to_percentages
setMethod("counts_to_percentages", signature(x = "epd.entity.df"),
          function(x, offset){
            counts <- x@commdf@counts
            countstype <- factor("Percentages",
                                 levels = c("Counts", "Percentages"))
            totals <- rowSums(counts, na.rm = TRUE)
            totals[which(totals == 0)] <- offset
            counts <- (counts / totals) * 100
            x@commdf@counts <- counts
            x@countstype <- countstype
            return(x)
          })

#' @rdname counts_to_percentages
setMethod("counts_to_percentages",
          signature(x = "epd.entity"),
          function(x, offset){
  x <- entity_to_matrices(x)
  x <- counts_to_percentages(x, offset)
  return(x)
})


# entity_to_matrices --------------------------------------------------------

#' Calculate data matrices from \code{\link[EPDr]{epd.entity-class}} objects 
#' 
#' This function takes an \code{\link[EPDr]{epd.entity-class}} object and transforms
#' data from tables in the object to new tables in the form of matrices. 
#' Information in the new tables, are easier to understand and manipulate than
#' those in the native format of the EPD. Transformed data include samples,
#' ages of that samples, and counts for that samples (both biological and 
#' no-biological particles).
#' 
#' The function return a \code{\link[EPDr]{epd.entity.df-class}} object, which 
#' is an expanded version of \code{\link[EPDr]{epd.entity-class}} objects with
#' eight new slots. This new object are intended to be modified for 
#' standardizations but keeping an unmodified version of the data and
#' structure of the \code{\link[EPDr]{epd.entity-class}} object.
#' Four slots contains the data in the new format, while four others store
#' information to track changes made from standardization and modification
#' functions (e.g., \code{\link[EPDr]{counts_to_percentages}}. 
#' 
#' @param x epd.entity Object of class \code{\link[EPDr]{epd.entity-class}} to be 
#' modified.
#' 
#' @return epd.entity.df object. Expanded version of \code{x} with certain data
#' reformated into data matrices instead of the native EPD format.
#' 
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1 <- entity_to_matrices(epd.1)
#' }
#' @rdname entity_to_matrices
#' @exportMethod entity_to_matrices
setGeneric("entity_to_matrices", function(x){
  standardGeneric("entity_to_matrices")
})

#' @rdname entity_to_matrices
setMethod("entity_to_matrices", signature(x = "epd.entity"), function(x){
  samples <- x@samples
  ages <- subset(samples@pagedpt, select = c("sample_", "chron_", "agebp"))
  ages <- unique(ages)
  if (nrow(ages) == 0){
    warning("This core does not have age data.", call. = FALSE)
    ages.cast <- data.frame()
    sample_ <- samples@psamples$sample_
    samplelabel <- as.character(sample_)
    depthcm <- samples@psamples$depthcm[match(sample_,
                                              samples@psamples$sample_)]
    dataquality <- data.frame()
  }else{
    ages.cast <- reshape2::dcast(ages, sample_ ~ chron_,
                                 value.var = "agebp", margin = FALSE)
    sample_ <- samples@psamples$sample_
    samplelabel <- as.character(sample_)
    ages.cast <- ages.cast[match(sample_, ages.cast$sample_), ]
    chronnames <- colnames(ages.cast)[-1]
    chronnames[which(chronnames == 9999)] <- "giesecke"
    ages.cast <- subset(ages.cast, select = -1)
    colnames(ages.cast) <- chronnames
    depthcm <- samples@psamples$depthcm[match(sample_,
                                              samples@psamples$sample_)]
    dataquality <- data.frame()
  }
  samplesdf <- new("samplesdf", sample_ = sample_, samplelabel = samplelabel)
  agesdf <- new("agesdf", depthcm = depthcm, depthages = ages.cast,
                dataquality = dataquality)
  countstype <- factor("Counts", levels = c("Counts", "Percentages"))
  countsprocessing <- factor("Raw",
                             levels = c("Raw", "Interpolated", "Ranged means"))
  taxatype <- factor("Default", levels = c("Default", "Accepted", "Higher"))
  taxaprocessing <- factor("Original", levels = c("Original", "Expanded"))
  if (nrow(samples@pcounts) == 0){
    warning("This core does not have count data.", call. = FALSE)
    counts.cast <- data.frame()
    taxanames <- character(0)
    taxa_ <- numeric(0)
    taxagroupid <- character(0)
    taxaaccepted <- numeric(0)
    taxamhvar <- numeric(0)
    commdf <- new("commdf")
    nopodf <- new("nopodf")
  }else{
    taxa_ <- samples@pvars$var_
    taxanames <- samples@pvars$varname[match(taxa_, samples@pvars$var_)]
    taxagroupid <- samples@pgroup$groupid[match(taxa_, samples@pgroup$var_)]
    taxaaccepted <- samples@pvars$accvar_[match(taxa_, samples@pvars$var_)]
    taxamhvar <- samples@pvars$mhvar_[match(taxa_, samples@pvars$var_)]
    counts.cast <- reshape2::dcast(samples@pcounts,
                                   sample_ ~ var_,
                                   value.var = "count")
    counts.cast[is.na(counts.cast)] <- 0
    counts.cast <- counts.cast[match(sample_, counts.cast$sample_), ]
    counts.cast <- subset(counts.cast, select = -1)
    counts.cast <- counts.cast[match(taxa_, colnames(counts.cast))]
    colnames(counts.cast) <- taxanames
    if (any(taxagroupid == "NOPO")){
      npi <- which(taxagroupid == "NOPO")
      nnpi <- which(taxagroupid != "NOPO")
      commdf <- new("commdf",
                    taxanames = taxanames[nnpi],
                    taxa_ = taxa_[nnpi],
                    taxaaccepted = taxaaccepted[nnpi],
                    taxamhvar = taxamhvar[nnpi],
                    taxagroupid = taxagroupid[nnpi],
                    counts = subset(counts.cast, select = nnpi))
      nopodf <- new("nopodf",
                    varnames = taxanames[npi],
                    var_ = taxa_[npi],
                    varaccepted = taxaaccepted[npi],
                    varmhvar = taxamhvar[npi],
                    vargroupid = taxagroupid[npi],
                    counts = subset(counts.cast, select = npi))
    }else{
      commdf <- new("commdf", taxanames = taxanames,
                    taxa_ = taxa_, taxaaccepted = taxaaccepted,
                    taxamhvar = taxamhvar, taxagroupid = taxagroupid,
                    counts = counts.cast)
      nopodf <- new("nopodf")
    }
  }
  x <- new("epd.entity.df",
           e_ = x@e_,
           postbombzone = x@postbombzone,
           numberofchron = x@numberofchron,
           isingiesecke = x@isingiesecke,
           defaultchron = x@defaultchron,
           taxatype = taxatype,
           taxaprocessing = taxaprocessing,
           countstype = countstype,
           countsprocessing = countsprocessing,
           entity = x@entity,
           site = x@site,
           geochron = x@geochron,
           chron = x@chron,
           samples = x@samples,
           samplesdf = samplesdf,
           agesdf = agesdf,
           commdf = commdf,
           nopodf = nopodf)
  return(x)
})


# filter_taxa -----------------------------------------------

#' Expand EPDr objects with new taxa
#' 
#' This function modifies \code{@@commdf@@counts} slot of
#' (\code{\link[EPDr]{epd.entity.df-class}} objects to filter taxa columns
#' to match \code{taxa}. The function adds empty columns (\code{NA}) if a new taxa is
#' defined in \code{taxa} or removes columns for the taxa not included in \code{taxa}.
#' The function may look useless for a single entity but it is useful when standardizing
#' data across multiple entities.
#'
#' @param x epd.entity \code{\link[EPDr]{epd.entity.df-class}} or
#' \code{\link[EPDr]{epd.entity-class}} object to be modified.
#' @param taxa character Character vector indicating the new 
#' taxa in the \code{@@epd.entity.df} slot. 
#' @param epd.taxonomy data.frame Data frame with the taxonomy from the 
#' EPD as from the \code{\link[EPDr]{get_taxonomy_epd}} function.
#' @param na_value numeric Number indicating the value to be used for
#' taxa not previously present in the entity.
#' 
#' @return \code{\link[EPDr]{epd.entity.df-class}} object with the modified
#' \code{@@commdf@@counts} slot.
#' 
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1@commdf@taxanames
#' colnames(epd.1@commdf@counts)
#' epd.1.ft <- filter_taxa(epd.1, c(epd.1@commdf@taxa_names, "prueba"),
#'                         get_taxonomy_epd(epd.connection))
#' colnames(epd.1.ft@commdf@counts)
#' epd.1.ft@commdf@taxanames
#' head(epd.1.ft@commdf@counts)
#' }
#' @rdname filter_taxa
#' @exportMethod filter_taxa
setGeneric("filter_taxa", function(x, taxa, epd.taxonomy, na_value = 0){
  standardGeneric("filter_taxa")
})

#' @rdname filter_taxa
setMethod("filter_taxa", signature(x = "epd.entity.df",
                                   taxa = "character",
                                   epd.taxonomy = "data.frame"),
          function(x, taxa, epd.taxonomy, na_value){
            site_taxa <- x@commdf@taxanames
            diff_names <- setdiff(taxa, site_taxa)
            if (nrow(x@commdf@counts) == 0){
              new_counts <- data.frame(matrix(ncol = length(taxa),
                                              nrow = 0))
              colnames(new_counts) <- taxa
            }else{
              new_counts <- x@commdf@counts
              new_counts[, diff_names] <- na_value
            }
            new_counts <- subset(new_counts, select = taxa)
            x@taxatype <- factor("Expanded",
                                    levels = c("Samples",
                                               "Accepted",
                                               "Higher"))
            x@commdf@taxanames <- colnames(new_counts)
            index <- match(colnames(new_counts), epd.taxonomy$varname)
            x@commdf@taxa_ <- epd.taxonomy$var_[index]
            x@commdf@taxaaccepted <- epd.taxonomy$accvar_[index]
            x@commdf@taxamhvar <- epd.taxonomy$mhvar_[index]
            x@commdf@taxagroupid <- epd.taxonomy$groupid[index]
            x@commdf@counts <- new_counts
            return(x)
          })


#' @rdname filter_taxa
setMethod("filter_taxa", signature(x = "epd.entity", taxa = "character",
                                   epd.taxonomy = "data.frame"),
          function(x, taxa, epd.taxonomy, na_value){
            x <- entity_to_matrices(x)
            x <- filter_taxa(x, taxa, epd.taxonomy)
            return(x)
          })


# filter_taxagroups --------------------------------------------------------

#' Filter Taxa Groups
#' 
#' This function removes taxa from the slot @@commdf@@counts in the 
#' \code{\link[EPDr]{epd.entity.df-class}} object based on specific groups of taxa.
#' For example, the user can select to work only with pollen from trees and 
#' shrubs (TRSH) or with algae (ALGA). If \code{x} is an 
#' \code{\link[EPDr]{epd.entity-class}} object, this is first converted into a 
#' \code{\link[EPDr]{epd.entity.df-class}} with 
#' \code{\link[EPDr]{entity_to_matrices}}.
#'
#' @param x epd.entity Object of class \code{\link[EPDr]{epd.entity.df-class}}
#' or \code{\link[EPDr]{epd.entity-class}} object.
#' @param taxagroups character Character vector indicating taxa groups 
#' to be selected.
#' 
#' @return The function returns a \code{\link[EPDr]{epd.entity.df-class}} 
#' with information on \code{@@commdf@@counts} slot 
#' only for taxa belonging to the specified taxa groups.
#' 
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1.trsh <- filter_taxagroups(epd.1, "TRSH")
#' str(epd.1@commdf@counts)
#' str(epd.1.trsh@commdf@counts)
#' }
#' @rdname filter_taxagroups
#' @exportMethod filter_taxagroups
setGeneric("filter_taxagroups", function(x, taxagroups){
  standardGeneric("filter_taxagroups")
})

#' @rdname filter_taxagroups
setMethod("filter_taxagroups", signature(x = "epd.entity.df",
                                         taxagroups = "character"),
          function(x, taxagroups){
            commdf <- x@commdf
            index <- which(commdf@taxagroupid %in% taxagroups)
            index <- unique(index)
            commdf@taxanames <- commdf@taxanames[index]
            commdf@taxa_ <- commdf@taxa_[index]
            commdf@taxaaccepted <- commdf@taxaaccepted[index]
            commdf@taxamhvar <- commdf@taxamhvar[index]
            commdf@taxagroupid <- commdf@taxagroupid[index]
            commdf@counts <- subset(commdf@counts, select = index)
            x@commdf <- commdf
            return(x)
          })

#' @rdname filter_taxagroups
setMethod("filter_taxagroups", signature(x = "epd.entity",
                                         taxagroups = "character"),
          function(x, taxagroups){
            x <- entity_to_matrices(x)
            x <- filter_taxagroups(x, taxagroups)
            return(x)
          })


# giesecke_default_chron -----------------------------------------------

#' Make Giesecke the default chronology
#' 
#' This function makes chronologies from Giesecke the default to be used in
#' \code{\link[EPDr]{epd.entity-class}} and \code{\link[EPDr]{epd.entity.df-class}}
#' objects if they are available for that particular entity.
#'
#' Details The function first check whether that entity has chronological
#' data in Giesecke et al. (2013), and then, if available, change the
#' default chronology number from the EPD to the one provided by Giesecke
#' (i.e., x@@defaultchron == 9999)
#'
#' @param x epd.entity \code{\link[EPDr]{epd.entity-class}} or 
#' \code{\link[EPDr]{epd.entity.df-class}} objects.
#'
#' @return The function returns a modified version of the 
#' \code{epd.entity} in which the slot @@defaultchron is changed to 
#' \code{9999} if they recalculated ages for this entity. A numeric code 
#' is used (\code{9999}) because of the object definition is numeric. This 
#' values is used in other functions to grab the right ages from Giesecke.
#' 
#' @references Giesecke, T., Davis, B., Brewer, S., Finsinger, W., 
#' Wolters, S., Blaaw, M., de Beaulieu, J.L., Binney, H., Fyfe, R.M.,
#' Gaillard, M.J., Gil-Romera, G., van der Knaap, W.O. Kunes, P.,
#' Kuhl, N., van Leeuwen, J.F.N, Leydet, M., Lotter, A.F., Ortu, E.,
#' Semmler, M., and Bradshaw, R.H.W (2013). Towards mapping the late
#' Quaternary vegetation change of Europe. Vegetation History and
#' Archaeobotany, 23, 75-86.
#' 
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1@defaultchron
#' epd.1 <- giesecke_default_chron(epd.1)
#' epd.1@defaultchron
#' }
#' @rdname giesecke_default_chron
#' @exportMethod giesecke_default_chron
setGeneric("giesecke_default_chron", function(x){
  standardGeneric("giesecke_default_chron")
})

#' @rdname giesecke_default_chron
setMethod("giesecke_default_chron",
          signature(x = "epd.entity"), function(x){
            if (x@isingiesecke == TRUE){
              x@defaultchron <- 9999
            }
            return(x)
          })


# interpolate_counts -------------------------------------------------------

#' Interpolate counts to specific time periods
#'
#' This function uses data (sample ages and sample counts) from an
#' \code{\link[EPDr]{epd.entity.df-class}} object to estimate by linear 
#' interpolation, loess or smooth splines the counts at specific time 
#' periods defined by the user. This can be used to estimate counts for 
#' the same time periods for multiple entities in the database, 
#' standardizing them for integrative analysis.
#' 
#' @param x epd.entity.df An \code{\link[EPDr]{epd.entity.df-class}} or
#' \code{\link[EPDr]{epd.entity-class}} object as returned by
#' \code{\link[EPDr]{get_entity}} or
#' \code{\link[EPDr]{entity_to_matrices}} functions.
#' @param time numeric Vector with time periods, in the same system 
#' (i.e., cal BP) than "ages" in epd.entity.df, in which counts have 
#' to be estimated.
#' @param chronology numeric Number specifying the chronology from 
#' which ages should be used to calculate the interpolations. If none is
#' provided the function uses the default chronology from the object (see
#' \code{\link[EPDr]{giesecke_default_chron}} or
#' \code{\link[EPDr]{check_defaultchron}}).
#' @param method character Interpolation method, should be an unambiguous 
#' abbreviation of either linear, loess, or sspline. See Details section.
#' @param rep_negt logical logical to indicate whether or not to replace 
#' negative values with zero in the interpolated data.
#' @param span numeric Span for loess, default = 0.25.
#' @param df numeric Degress of freedome for smoothing spline, default is 
#' the lower of 20 or 0.7 * number of samples.
#' @param ...	additional arguments to loess and smooth.spline.
#'
#' @details Data for time periods in \code{time} but not recorded in the 
#' entity are fill with \code{NA}. This is convenient if analysis are 
#' carried out with multiple entities.
#' 
#' Interpolation can be done using linear interpolation between 
#' data points in the original series (default) using 
#' \code{\link[stats]{approxfun}}, using a fitted 
#' \code{\link[stats]{loess}} locally weighted regression, or by 
#' \code{\link[stats]{smooth.spline}}. The latter two methods 
#' will also smooth the data and additional arguments may be passed to 
#' these functions to control the amount of smoothing, or to force replacing
#' negative values with zeros.
#'
#' @return The function returns an \code{\link[EPDr]{epd.entity.df-class}} object,
#' similar to \code{x} in which ages and counts has been modified to the
#' time periods specified in time and the counts estimated for these periods.
#' 
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' t <- c(seq(0, 21000, by = 500))
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1.int <- interpolate_counts(epd.1, t)
#' 
#' epd.3 <- get_entity(3, epd.connection)
#' epd.3.int <- interpolate_counts(epd.3, t, method="linear")
#' epd.3.int <- interpolate_counts(epd.3, t, method="loess")
#' epd.3.int <- interpolate_counts(epd.3, t, method="sspline")
#' }
#' @rdname interpolate_counts
#' @exportMethod interpolate_counts
setGeneric("interpolate_counts", function(x,
  time,
  chronology = NULL,
  method = c("linear",
             "loess",
             "sspline"),
  rep_negt = TRUE,
  span = 0.25,
  df = min(20,
           nrow(x@commdf@counts) *
             0.7),
  ...){
  standardGeneric("interpolate_counts")
})

#' @rdname interpolate_counts
setMethod("interpolate_counts", signature(x = "epd.entity.df",
                                          time = "numeric"),
          function(x, time, chronology, method, rep_negt, span,
                   df, ...){
            if (is.null(chronology)){
              chronology <- x@defaultchron
            }
            if (chronology == 9999){
              chronology <- "giesecke"
            }
            if (nrow(x@agesdf@depthages) == 0){
              stop(paste0("The entity has not ages, and interpolation cannot ",
                          "be performed."))
            }
            if (nrow(x@commdf@counts) == 0){
              stop(paste0("The entity has not counts, and interpolation ",
                          "cannot be performed."))
            }
            sample_ages <- subset(x@agesdf@depthages,
                                  select = as.character(chronology))
            sample_depthcm <- x@agesdf@depthcm
            sample_id <- x@samplesdf@sample_
            sample_counts <- x@commdf@counts
            # remove data from depths whithout ages associated
            index1 <- which(!is.na(sample_ages))
            index2 <- seq_along(sample_id)
            index <- intersect(index2, index1)
            sample_ages <- sample_ages[index, ]
            sample_depthcm <- sample_depthcm[index]
            sample_id <- sample_id[index]
            sample_counts <- sample_counts[index, ]
            # index taxa with less than 2 meassures
            cc_index <- apply(!is.na(sample_counts), MARGIN = 2, FUN = sum)
            cc_index <- cc_index >= 2
            ## set the time bounds
            min_sample_age <- min(sample_ages, na.rm = TRUE)
            max_sample_age <- max(sample_ages, na.rm = TRUE)
            min_time <- time[which(time >= min_sample_age)][1]
            max_time <- time[which(time <= max_sample_age)][length(
              which(time <= max_sample_age))]
            interp_ages <- time[which(time >= min_time & time <= max_time)]
            interp_counts <- as.data.frame(matrix(nrow = length(interp_ages),
                                                  ncol = ncol(sample_counts)))
            interp_depthcm <- numeric(0)
            if (length(interp_ages) != 0){
              method <- match.arg(method)
              if (is.null(method))
                stop("Interpolation method not recognised")
              if (method == "linear") {
                lin.f <- function(y, x, xout) {
                  stats::approx(x, y, xout)$y
                }
                interp_counts[, cc_index] <- apply(sample_counts[, cc_index],
                                                   MARGIN = 2,
                                                   lin.f,
                                                   x = sample_ages,
                                                   xout = interp_ages,
                                                   ...)
                interp_depthcm <- lin.f(sample_depthcm, sample_ages,
                                        xout = interp_ages)
              }
              else if (method == "loess") {
                lo.f <- function(y, x, xout, span, ...) {
                  fit <- stats::loess(y ~ x, span = span, ...)
                  stats::predict(fit, newdata = data.frame(x = xout))
                }
                interp_counts[, cc_index] <- apply(sample_counts[, cc_index],
                                                   MARGIN = 2,
                                                   lo.f,
                                                   x = sample_ages,
                                                   xout = interp_ages,
                                                   span = span,
                                                   ...)
                interp_depthcm <- lo.f(sample_depthcm,
                                       sample_ages,
                                       xout = interp_ages,
                                       span = span,
                                       ...)
              }
              else if (method == "sspline") {
                ss.f <- function(y, x, xout, df, ...) {
                  fit <- stats::smooth.spline(y ~ x, df = df, ...)
                  stats::predict(fit, x = data.frame(x = xout))$y[, 1]
                }
                interp_counts[, cc_index] <- apply(sample_counts[, cc_index],
                                                   MARGIN = 2,
                                                   ss.f,
                                                   x = sample_ages,
                                                   xout = interp_ages)
                interp_depthcm <- ss.f(sample_depthcm,
                                       sample_ages,
                                       xout = interp_ages,
                                       df = df, ...)
              }
              if (rep_negt) {
                interp_counts[interp_counts < 0] <- 0
              }
            }
            colnames(interp_counts) <- colnames(sample_counts)
            output_counts <- as.data.frame(matrix(nrow = length(time),
                                                  ncol = ncol(sample_counts)))
            colnames(output_counts) <- colnames(sample_counts)
            output_depthcm <- rep(NA, length(time))
            output_ages <- time
            index <- which(output_ages %in% interp_ages)
            output_counts[index, ] <- interp_counts
            output_depthcm[index] <- interp_depthcm
            if (chronology == "giesecke"){
              x@defaultchron <- 9999
            }else{
              x@defaultchron <- chronology
            }
            x@countsprocessing <- factor("Interpolated",
                                         levels = c("Samples",
                                                    "Interpolated",
                                                    "Ranged means"))
            x@samplesdf@sample_ <- 10000 + seq_along(output_ages)
            x@samplesdf@samplelabel <- as.character(output_ages)
            x@agesdf@depthages <- data.frame(output_ages)
            colnames(x@agesdf@depthages) <- chronology
            x@agesdf@depthcm <- output_depthcm
            x@commdf@counts <- output_counts
            return(x)
          })

#' @rdname interpolate_counts
setMethod("interpolate_counts", signature(x = "epd.entity", time = "numeric"),
          function(x, time, chronology, method, rep_negt, span,
                   df, ...){
            x <- entity_to_matrices(x)
            x <- interpolate_counts(x, time, chronology, method, rep_negt,
                                    span, df, ...)
            return(x)
          })


# intervals_counts ----------------------------------------------------------

#' Mean counts for specific time intervals
#'
#' This function uses data (sample ages and sample counts) from an
#' \code{\link[EPDr]{epd.entity.df-class}} object to calculate mean counts
#' for samples within specific time intervals defined by the user. This can be
#' used to estimate mean counts for the same time intervals for multiple 
#' entities or cores in the database, standardizing them for 
#' integrative analysis.
#' 
#' Time intervals without sample (data) in the entity are fill with 
#' \code{NA}. This is convenient if analysis are carried out with 
#' multiple entities.
#'
#' @param x epd.entity.df An \code{\link[EPDr]{epd.entity.df-class}} object as 
#' returned by the \code{\link[EPDr]{entity_to_matrices}} function.
#' @param tmin numeric Numeric vector indicating the lower limits 
#' (in years cal. BP) for the time intervals.
#' @param tmax numeric Numeric vector indicating the upper limits 
#' (in years cal. BP) for the time intervals
#' @param newlabels character Character vector with labels for each time 
#' intervals, if none are provided the functions generate them with the 
#' following format \code{tmin}-\code{tmax}.
#' @param chronology numeric Number specifying the chronology from which 
#' ages should be used to calculate the interpolations. If none is provided 
#' the function uses the default chronology from the object (see
#' \code{\link[EPDr]{giesecke_default_chron}}).
#'
#' @return The function returns a \code{\link[EPDr]{epd.entity.df-class}} object, 
#' similar to \code{epd.entity.df} in which ages and counts has been modified
#' to the time intervarls specified and the counts estimated for these periods.
#' 
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1.int <- intervals_counts(epd.1, tmin = seq(0, 21000, by = 1000),
#'                               tmax = seq(999, 21999, by = 1000))
#' epd.3 <- get_entity(3, epd.connection)
#' epd.3.int <- intervals_counts(epd.3, tmin = seq(0, 21000, by = 1000),
#'                               tmax = seq(999, 21999, by = 1000))
#' epd.3.int <- intervals_counts(epd.3, tmin = seq(0, 21000, by = 1000),
#'                               tmax = seq(999, 21999, by = 1000), 2)
#' }
#' @rdname intervals_counts
#' @exportMethod intervals_counts
setGeneric("intervals_counts", function(x,
                                        tmin,
                                        tmax,
                                        chronology = NULL,
                                        newlabels = NULL){
  standardGeneric("intervals_counts")
})

#' @rdname intervals_counts
setMethod("intervals_counts", signature(x = "epd.entity.df",
                                        tmin = "numeric",
                                        tmax = "numeric"),
          function(x, tmin, tmax, chronology, newlabels){
            if (length(tmin) != length(tmax)){
              stop(paste0("length(tmin) != length(tmax). Please, specify ",
                          "two vectors of the same length"))
            }
            if (is.null(newlabels)){
              newlabels <- paste(tmin, "-", tmax, sep = "")
            }
            if (is.null(chronology)){
              chronology <- x@defaultchron
            }
            if (chronology == 9999){
              chronology <- "giesecke"
            }
            sample_ages <- subset(x@agesdf@depthages,
                                  select = as.character(chronology))
            sample_depthcm <- x@agesdf@depthcm
            sample_id <- x@samplesdf@sample_
            sample_counts <- x@commdf@counts
            index1 <- which(!is.na(sample_ages))
            index2 <- seq_along(sample_id)
            index <- intersect(index2, index1)
            sample_ages <- sample_ages[index, ]
            sample_depthcm <- sample_depthcm[index]
            sample_id <- sample_id[index]
            sample_counts <- sample_counts[index, ]
            .is_between <- function(x, a, b) {
              x >= a & x <= b
            }
            index <- mapply(function(a, b, x){
              .is_between(x, a, b)
            }
            , tmin, tmax, MoreArgs = list(sample_ages))
            if (is.vector(index)){
              intervalid <- which(index)
              index <- 1
            }else{
              intervalid <- unlist(apply(index, MARGIN = 1, FUN = which))
              index <- unlist(apply(index, MARGIN = 2, FUN = which))
            }
            output_depthcm <- rep(NA, length(newlabels))
            output_ages <- rep(NA, length(newlabels))
            output_counts <- as.data.frame(matrix(NA, nrow = length(newlabels),
                                                  ncol = ncol(sample_counts)))
            colnames(output_counts) <- colnames(sample_counts)
            if (length(index) == 0 | !all(index %in% x@samplesdf@sample_)){
            }else{
              range.counts <- sample_counts[index, ]
              range.depthcm <- sample_depthcm[index]
              range.ages <- sample_ages[index]
              range_means <- apply(range.counts, MARGIN = 2,
                                   FUN = function(x, y, z){
                                     stats::aggregate(x, by = list(y = y),
                                                      FUN = z, na.rm = TRUE)
                                   }
                                   , intervalid, mean)
              range_means <- lapply(range_means, function(x){
                x$x
              })
              range_means <- do.call(cbind, range_means)
              range_depth_means <- stats::aggregate(range.depthcm,
                                                    by = list(y = intervalid),
                                                    FUN = mean, na.rm = TRUE)
              range_depth_means <- range_depth_means$x
              range_ages_means <- stats::aggregate(range.ages,
                                                   by = list(y = intervalid),
                                                   FUN = mean,
                                                   na.rm = TRUE)
              range_ages_means <- range_ages_means$x
              output_counts[unique(intervalid), ] <- range_means
              output_depthcm[unique(intervalid)] <- range_depth_means
              output_ages[unique(intervalid)] <- range_ages_means
            }
            x@countsprocessing <- factor("Ranged means",
                                         levels = c("Samples",
                                                    "Interpolated",
                                                    "Ranged means"))
            x@samplesdf@sample_ <- 20000 + seq_along(newlabels)
            x@samplesdf@samplelabel <- newlabels
            if (chronology == "giesecke"){
              x@defaultchron <- 9999
            }else{
              x@defaultchron <- chronology
            }
            x@agesdf@depthages <- data.frame(output_ages)
            colnames(x@agesdf@depthages) <- chronology
            x@agesdf@depthcm <- as.numeric(output_depthcm)
            x@commdf@counts <- output_counts
            return(x)
          })

#' @rdname intervals_counts
setMethod("intervals_counts", signature(x = "epd.entity",
                                        tmin = "numeric", tmax = "numeric"),
          function(x, tmin, tmax, chronology, newlabels){
            x <- entity_to_matrices(x)
            x <- intervals_counts(x, tmin, tmax, chronology, newlabels)
            return(x)
          })

# table_by_taxa_age -----------------------------------------------

#' Tabulate counts by taxa and age
#' 
#' This function tabulates data from the \code{@@commdf@@counts} slot 
#' in EPDr objects (\code{\link[EPDr]{epd.entity.df-class}} or calculated 
#' them from a \code{\link[EPDr]{epd.entity-class}}) to summarize information 
#' for particular taxa at particular age or time intervals (samples). 
#' This function is useful to reshape the data to be plotted and mapped 
#' by \code{link[ggplot2]{ggplot}}. It was written to be used by 
#' \code{\link[EPDr]{map_taxa_age}} function.
#' 
#' @param x epd.entity.df \code{\link[EPDr]{epd.entity.df-class}} object where 
#' to extract the data from, or a \code{\link[EPDr]{epd.entity-class}} that is 
#' automatically transformed into a \code{\link[EPDr]{epd.entity.df-class}} and 
#' then tabulated.
#' @param taxa character Character vector indicating the taxa to be included 
#' in the table. Several taxa can be specified but the function returns data 
#' for only one taxa by summing all counts, and the taxa name specified 
#' in the output is the first one in \code{taxa}. This is useful when
#' you want to combine data from different taxa in the same genus,
#' for instance.
#' @param sample_label character Character vector indicating the ages or time 
#' intervals to be included in the table.
#'
#' @return Data frame with six columns: \code{e_}, \code{londd}, \code{latdd},
#' \code{count}, \code{sample_label}, and \code{taxa_label}.
#' \itemize{
#' \item{"e_"}{Entity identification number.}
#' \item{"londd"}{Longitude of the site in decimal degrees.}
#' \item{"latdd"}{Latitude of the site in decimal degrees.}
#' \item{"count"}{The count of that taxon in that particular sample
#' (age or time interval).}
#' \item{"sample_label"}{The sample (age or time interval) at which the
#' particulates were counted.}
#' \item{"taxa_label"}{The taxa that has been counted.}
#' }
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host = "localhost", database = "epd",
#'                                user = "epdr", password = "epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1 <- filter_taxagroups(epd.1, c("HERB", "TRSH", "DWAR",
#'                                       "LIAN", "HEMI", "UPHE"))
#' epd.1 <- giesecke_default_chron(epd.1)
#' 
#' epd.1 <- interpolate_counts(epd.1, seq(0, 22000, by = 1000))
#' epd.taxonomy <- getTaxonomyEPD(epd.connection)
#' epd.1 <- taxa_to_acceptedtaxa(epd.1, epd.taxonomy)
#' 
#' table_by_taxa_age(epd.1, "Cedrus", 
#'                   as.character(seq(0, 21000, by=1000)))
#' }
#' @rdname table_by_taxa_age
#' @exportMethod table_by_taxa_age
setGeneric("table_by_taxa_age", function(x, taxa, sample_label){
  standardGeneric("table_by_taxa_age")
})

#' @rdname table_by_taxa_age
setMethod("table_by_taxa_age", signature(x = "epd.entity.df",
                                         taxa = "character",
                                         sample_label = "character"),
          function(x, taxa, sample_label){
            if (!all(taxa %in% x@commdf@taxanames)){
              stop("taxa has to be a valid taxon in x@commdf@counts
                   and x@commdf@taxanames")
            }
            if (!all(sample_label %in% x@samplesdf@samplelabel)){
              stop("sample_label has to be valid sample labels in
                   x@commdf@counts and x@samplesdf@samplelabel")
            }
            e_ <- extract_e(x)
            londd <- x@site@siteloc$londd
            latdd <- x@site@siteloc$latdd
            sample_index <- which(x@samplesdf@samplelabel %in% sample_label)
            taxa_index <- which(x@commdf@taxanames %in% taxa)
            count <- x@commdf@counts[sample_index, taxa_index]
            if (class(count) == "data.frame"){
              count <- apply(count, MARGIN = 1, FUN = sum)
            }
            if (length(count) == 0){
              count <- NA
            }
            output <- data.frame(e_, londd, latdd, count,
                                 sample_label, taxa[[1]])
            colnames(output) <- c("e_", "londd", "latdd", "count",
                                  "sample_label", "taxa")
            return(output)
            })

#' @rdname table_by_taxa_age
setMethod("table_by_taxa_age", signature(x = "epd.entity",
                                         taxa = "character",
                                         sample_label = "character"),
          function(x, taxa, sample_label){
            x <- entity_to_matrices(x)
            table_by_taxa_age(x, taxa, sample_label)
            return(x)
          })


# taxa_to_acceptedtaxa -----------------------------------------------

#' Change taxa to accepted taxa names
#' 
#' This function modifies the taxa names in the \code{@@commdf@@counts} slot 
#' of \code{\link[EPDr]{epd.entity.df-class}} objects. More specifically, this 
#' function compares the taxa name with the taxonomy of the EPD
#' to use the accepted names. If these changes result in duplicated columns 
#' of the same taxa their values are unified by summing them.
#'
#' @param x epd.entity.df Object of class \code{\link[EPDr]{epd.entity.df-class}} 
#' with a \code{@@commdf@@counts} slot.
#' @param epd.taxonomy data.frame Data frame with the taxonomy 
#' from the EPD as from the \code{\link[EPDr]{get_taxonomy_epd}} function.
#'
#' @return \code{\link[EPDr]{epd.entity.df-class}} object with new taxa names.
#' 
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1 <- entity_to_matrices(epd.1)
#' epd.1@commdf@taxanames
#' colnames(epd.1@commdf@counts)
#' epd.1.acc <- taxa_to_acceptedtaxa(epd.1, get_taxonomy_epd(epd.connection))
#' epd.1.acc@commdf@taxanames
#' colnames(epd.1.acc@commdf@counts)
#' }
#' @rdname taxa_to_acceptedtaxa
#' @exportMethod taxa_to_acceptedtaxa
setGeneric("taxa_to_acceptedtaxa", function(x, epd.taxonomy){
  standardGeneric("taxa_to_acceptedtaxa")
})

#' @rdname taxa_to_acceptedtaxa
setMethod("taxa_to_acceptedtaxa", signature(x = "epd.entity.df",
                                            epd.taxonomy = "data.frame"),
          function(x, epd.taxonomy){
            taxa_ <- x@commdf@taxa_
            taxa_acc <- x@commdf@taxaaccepted
            if (length(taxa_) == 0){
              warning(paste0("Entity has no count data for any taxon. ",
                             "Returning the original object in the format ",
                             "of an 'epd.entity.df' object."))
              return(x)
            }
            ii <- match(taxa_acc,
                        epd.taxonomy$var_)
            index <- unique(ii)
            x@taxatype <- factor("Accepted",
                                 levels = c("Samples", "Accepted",
                                            "Higher"))
            x@commdf@taxanames <- epd.taxonomy$varname[index]
            x@commdf@taxa_ <- epd.taxonomy$var_[index]
            x@commdf@taxaaccepted <- epd.taxonomy$accvar_[index]
            x@commdf@taxamhvar <- epd.taxonomy$mhvar_[index]
            x@commdf@taxagroupid <- epd.taxonomy$groupid[index]
            counts <- x@commdf@counts
            if (nrow(counts) == 0){
              counts <- counts[, seq_along(index)]
            }else{
              colnames(counts) <- taxa_acc
              t_counts <- as.data.frame(t(counts))
              t_counts <- stats::aggregate(t_counts, by = list(ii), FUN = sum)
              t_counts <- subset(t_counts, select = -1)
              counts <- as.data.frame(t(t_counts))
            }
            colnames(counts) <- epd.taxonomy$varname[index]
            x@commdf@counts <- counts
            return(x)
          })

#' @rdname taxa_to_acceptedtaxa
setMethod("taxa_to_acceptedtaxa", signature(x = "epd.entity",
                                            epd.taxonomy = "data.frame"),
          function(x, epd.taxonomy){
            x <- entity_to_matrices(x)
            x <- taxa_to_acceptedtaxa(x, epd.taxonomy)
            return(x)
          })


# taxa_to_highertaxa -----------------------------------------------

#' Change taxa to higher taxa level
#' 
#' This function modifies the taxa names in the \code{@commdf@@counts} slot
#' of \code{\link[EPDr]{epd.entity.df-class}} objects. More specifically, this 
#' function compares the taxa name with the taxonomy of the EPD to use 
#' the higher taxa names. If these changes result in duplicated columns of 
#' the same taxa their values are unified by summing them.
#'
#' @param x epd.entity \code{\link[EPDr]{epd.entity.df-class}} object.
#' @param epd.taxonomy data.frame Data frame with the taxonomy from 
#' the EPD as from the \code{\link[EPDr]{get_taxonomy_epd}} function.
#' @param rm_null_mhvar logical Logical value indicating whether to remove
#' or not those taxa that has no value for higher level taxa.
#'
#' @return Object of class \code{\link[EPDr]{epd.entity.df-class}} with new 
#' taxa names.
#' 
#' @examples
#' \dontrun{
#' epd.connection <- connect_to_epd(host="localhost", database="epd",
#'                                  user="epdr", password="epdrpw")
#' epd.1 <- get_entity(1, epd.connection)
#' epd.1 <- entity_to_matrices(epd.1)
#' epd.1@commdf@taxanames
#' colnames(epd.1@commdf@counts)
#' epd.1.hn <- taxa_to_highertaxa(epd.1, get_taxonomy_epd(epd.connection))
#' epd.1.hn@commdf@taxanames
#' colnames(epd.1.hn@commdf@counts)
#' }
#' @rdname taxa_to_highertaxa
#' @exportMethod taxa_to_highertaxa
setGeneric("taxa_to_highertaxa",
           function(x, epd.taxonomy, rm_null_mhvar = FALSE){
  standardGeneric("taxa_to_highertaxa")
})

#' @rdname taxa_to_highertaxa
setMethod("taxa_to_highertaxa", signature(x = "epd.entity.df",
                                          epd.taxonomy = "data.frame"),
          function(x, epd.taxonomy, rm_null_mhvar){
            if (!is.logical(rm_null_mhvar)){
              stop(paste0("'rm_null_mhvar' of the wrong format. It has to be ",
                          "TRUE or FALSE. Check ?taxa_to_highertaxa"))
            }
            taxa_ <- x@commdf@taxa_
            taxa_mhvar <- x@commdf@taxamhvar
            if (!rm_null_mhvar){
              na_index <- which(is.na(taxa_mhvar))
              taxa_mhvar[na_index] <- taxa_[na_index]
            }
            if (length(taxa_) == 0){
              warning(paste0("Entity has no count data for any taxon. ",
                             "Returning the original object in the format ",
                             "of an 'epd.entity.df' object."))
              return(x)
            }
            ii <- match(taxa_mhvar, epd.taxonomy$var_)
            index <- unique(stats::na.omit(ii))
            x@taxatype <- factor("Higher", levels = c("Samples",
                                                         "Accepted",
                                                         "Higher"))
            x@commdf@taxa_ <- epd.taxonomy$var_[index]
            x@commdf@taxanames <- epd.taxonomy$varname[index]
            x@commdf@taxaaccepted <- epd.taxonomy$accvar_[index]
            x@commdf@taxamhvar <- epd.taxonomy$mhvar_[index]
            x@commdf@taxagroupid <- epd.taxonomy$groupid[index]
            counts <- x@commdf@counts
            if (nrow(counts) == 0){
              counts <- counts[, seq_along(index)]
            }else{
              colnames(counts) <- taxa_mhvar
              t_counts <- as.data.frame(t(counts))
              t_counts <- stats::aggregate(t_counts, by = list(ii), FUN = sum)
              t_counts <- subset(t_counts, select = -1)
              counts <- as.data.frame(t(t_counts))
            }
            colnames(counts) <- epd.taxonomy$varname[index]
            x@commdf@counts <- counts
            return(x)
          })

#' @rdname taxa_to_highertaxa
setMethod("taxa_to_highertaxa", signature(x = "epd.entity",
                                          epd.taxonomy = "data.frame"),
          function(x, epd.taxonomy, rm_null_mhvar){
            x <- entity_to_matrices(x)
            x <- taxa_to_highertaxa(x, epd.taxonomy, rm_null_mhvar)
            return(x)
          })


#' # FUNCTIONNAME -----------------------------------------------
#' 
#' #' Title TBW
#' #' 
#' #' Description TBW
#' #'
#' #' Details TBW
#' #'
#' #' @param parameter TBW
#' #'
#' #' @return TBW
#' #' @examples
#' #' \dontrun{
#' #' TBW
#' #' }
#' #' @rdname TBW
#' #' @exportMethod TBW
#' setGeneric("functionname", function(param){
#'   standardGeneric("functionname")
#' })
#' 
#' #' @rdname functionname
#' setMethod("functionname", signature(param="paramclass"), function(param){
#'   
#' })
dinilu/EPDr documentation built on Aug. 22, 2019, 1:03 p.m.