R/waterdepths_calculations.R

#' make the water data long
#'
#' @param df the water data to make long
#'
#' @return long water data
#' @importFrom magrittr "%>%"
#' @export
longwater <- function(df, which_columns = "measure") {
  inputnames <- c("site_brom.id", "site", "trt.name", "date", "depth.centre.measure.first", 
    "depth.leafa.measure.first", "depth.leafb.measure.first", "depth.centre.water.first", 
    "depth.leafa.water.first", "depth.leafb.water.first")
  
  inputcheck <- assertthat::has_name(df, inputnames)
  
  if (!all(inputcheck)) stop("the leaf.waterdepths names are wrong!")
  if (length(inputcheck) != ncol(df)) stop("leaf.waterdepths missing columns!")
  
  
  measures  <- df %>%
    tidyr::gather("data_name", "depth", starts_with("depth"), convert = TRUE) %>%
    tidyr::separate(data_name, into = c("depth_word", "leaf",
                                        "watered_first","first")) %>%
    dplyr::select(-depth_word, -first) %>%
    ## filter out measurements from argentina
    ## ONLY WHERE water first is no 
    ## in other words, where they measured depth first.
    ## all depth should be measured AFTER watering.
    dplyr::filter(watered_first == which_columns)
  
  correct_names <- c("site_brom.id", "site", "trt.name",
                     "date", "leaf", "watered_first", 
                     "depth")
  
  ## check the names
  namecheck <- assertthat::has_name(measures, correct_names)
  
  if (!all(namecheck)) stop("the column names are wrong!")
  if (length(namecheck) != ncol(measures)) stop("output missing columns!")
  
  return(measures)
}



#' Group or summarize water data (long form)
#'
#' @param data a data.frame created by \code{longwater()}
#' @param aggregate_leaves should leaves be aggregated? defaults to \code{FALSE}
#'
#' @return if \code{aggregate = FALSE}, a data.frame of the same size as the original, but with groups defined for further processing. If \code{aggregate_leaves = TRUE}, output has one observation per bromeliad (rather than one observation per leaf within bromeliad)
#' @export
#' @importFrom magrittr "%>%"
group_or_summarize <- function(data, aggregate_leaves = FALSE){
  impt_names <- c("site", "watered_first", "trt.name",
                  "leaf", "site_brom.id")
  test_for_names <- impt_names %in% names(data)

  missing_names <- impt_names[!test_for_names]

  if (length(missing_names) > 0) {
    stop(
      sprintf(
        "missing names %s",
        paste0(missing_names, collapse = ", ")
      )
    )
  }

  if (aggregate_leaves) {
    data %>%
      dplyr::select(-leaf) %>%
      dplyr::group_by(site, watered_first, trt.name, site_brom.id, date) %>%
      dplyr::summarise(ndepth = sum(!is.na(depth)),
                depth = mean(depth, na.rm = TRUE))
  } else {
    data %>%
      dplyr::group_by(site, watered_first, trt.name, site_brom.id, leaf)
  }
}

#' Make the support file
#'
#' @param allsites combined site.info tab
#' @param phys combined bromeliad.physical tab
#' @examples 
#' sites <- combine_tab(c("Argentina", "French_Guiana"),"site.info")
#' phys <- combine_tab(c("Argentina", "French_Guiana"),"bromeliad.physical")
#' sup_file <- make_support_file(sites,phys)
#' sup_file
#' @return A dataframe containing support file: site.name, trt.name, temporal.block, start_block and finish_block. Start_block is equivalent to start.water.addition from the site.info tab, and finish_block is equivalent to last.day.sample
#' @export
make_support_file <- function(allsites, phys){
  ## get the data

  ## to get the start and the end date of the experiment for
  ## each site:
  start_finish <- allsites %>%
    dplyr::select(site.name,
                  start_block = start.water.addition,
                  finish_block = last.day.sample)

  ## Get the exact date that each bromeliad started. This is
  ## based on the temporal block.
  which_block <- phys %>%
    dplyr::select(site, trt.name, temporal.block)

  ## Join these two data frames, so that we can add the
  ## RELATIVE  start time of each block to the ABSOLUTE
  ## start time of each experiment
  support <- dplyr::left_join(which_block, start_finish, by = c("site" = "site.name"))

  ## recode blocks as numbers
  block_days_start <- c("a" = 0, "b" = 1, "c" = 2)
  block_days_finish <- c("a" = 2, "b" = 1, "c" = 0)

  ## add the constants to the start and finish date.
  ### the start date for each block is always positive (after the beginning)
  ### the end date is always earlier
  final_support <- support %>%
    dplyr::mutate(start_block = start_block + lubridate::days(block_days_start[temporal.block]),
           finish_block = finish_block - lubridate::days(block_days_finish[temporal.block]))

  test_final <- final_support %>% 
    dplyr::mutate(exp_duration = as.numeric(finish_block - start_block)) %>% 
    dplyr::select(site, exp_duration) %>% 
    dplyr::filter(exp_duration != 65 | is.na(exp_duration)) %>% 
    dplyr::distinct()
  
  ## perhaps print out the test_final or perhaps some other measurement.
  
  return(final_support)
  
}


#' Filter water data (long format)
#'
#' This function filters long-format water data, by applying \code{filter_centre_leaf}
#'  and \code{filter_naonly_groups}
#'
#' @param Data long-format water data
#' @param rm_centre should the centre leaf be used? defaults to TRUE
#'
#' @return ungrouped data.frame
#' @export
filter_long_water <- function(Data, rm_centre = TRUE){
  ## filter the output of longwater,
  ### remove central leaf (if required)
  ### get rid of any group without any numbers ie all NA
  Data %>%
    ## filter out centre
    filter_centre_leaf(centre_filter = rm_centre) %>% ## add argument here
    dplyr::group_by(site, watered_first) %>%
    filter_naonly_groups %>%
    dplyr::ungroup(.)
}



#' fill in the missing timeline
#'
#' This function fills in the complete sequence of dates,
#' from the beginning to the end of a site.
#'
#' @param filtered_water_data a pre-filtered water dataset. acts as a "template" for the grouping variables to include
#' @param sitedata contents of the site.info tab
#' @param physdata contents of the physical.info tab
#'
#' @return a long dataset that contains a consecutive series of dates
#' @export
make_full_timeline <- function(filtered_water_data, sitedata, physdata){
  supp <- make_support_file(allsites = sitedata,
                            phys = physdata)

  ## here, we need to make this as similar as possible to the output of longwater
  ## we remove depth and date, the only columns which make the rows unique
  ## (that is, the only columns which need filling in)
  simple_long <- filtered_water_data %>%
    dplyr::select(-depth, -date) %>%
    dplyr::distinct(.) %>%
    dplyr::left_join(supp)

  ## now create a long dataset
  ## by making a date column
  ## that spans from start_block and goes to finish_block
  simple_long %>%
    #filter(!is.na(finish_block)) %>%
    dplyr::group_by(site_brom.id, site,
                    trt.name, leaf, watered_first,
                    temporal.block) %>%
    dplyr::do(dplyr::data_frame(date = seq(from = .$start_block,
                             to = .$finish_block,
                             by = "days")))
}

#' Calculate the hydrological variables
#'
#' @param waterdata the leaf.waterdepths tab
#' @param sitedata the site.info tab
#' @param physicaldata the bromeliad.physical
#' @param rm_centre remove centre? defaults to TRUE
#' @param aggregate_leaves aggregate leaves? defaults to FALSE
#' @examples 
#' leafwater <- combine_tab(c("Argentina", "French_Guiana"),"leaf.waterdepths")
#' sites <- combine_tab(c("Argentina", "French_Guiana"),"site.info")
#' phys <- combine_tab(c("Argentina", "French_Guiana"),"bromeliad.physical")
#' hydro <- hydro_variables(waterdata = leafwater,
#'                           sitedata = sites,
#'                       physicaldata = phys)
#' hydro
#' @return The hydrological variables
#' @export
#' @importFrom magrittr "%>%"
hydro_variables <- function(waterdata, sitedata, physicaldata,
                            rm_centre = TRUE, aggregate_leaves = FALSE,
                            .columns = "measure", dohydro = TRUE){
  ## should point out to ppl that you contrl the output by
  ## manipulating waterdata
  if (any(is.na(waterdata$site))) {
    message("removing NA values in the 'site' column. Perhaps somebody has added blank rows to a dataset! please check")
    waterdata <- data.frame(waterdata[!is.na(waterdata$site),])
  }
  
  ## find macae
  assertthat::assert_that(assertthat::has_name(waterdata, "site"))
  find_macae <- waterdata[["site"]] == "macae"
  meas_macae <- ifelse(find_macae, "water", "measure")
  ## split the list
  ## set arguments for that list
  water_split <- split(waterdata, meas_macae)
  
  ## Map
  longwater_type <- function(water_split, x){
    force(x)
    longwater(water_split, which_columns = x)
  }
  answer <- Map(longwater_type,
                water_split,
                unique(meas_macae))
  
  long_water <- dplyr::bind_rows(answer)
  
  filtered_long_water <- long_water %>%
    filter_long_water(rm_centre = rm_centre)

  long_dates <- make_full_timeline(filtered_water_data = filtered_long_water,
                                   sitedata,
                                   physicaldata)

  ## if aggregating leaves, remove from long_dates
  ## otherwise they will be found in output
  if (aggregate_leaves) {
    long_dates <- long_dates %>%
      dplyr::ungroup(.) %>%
      dplyr::select(-leaf)
  }

  ## combining the original data with data
  ## that has been "filled in" with fun
  filled_in <- filtered_long_water %>%
    ## filter out NA groups
    group_or_summarize(aggregate_leaves = aggregate_leaves) %>%
    dplyr::left_join(long_dates, .)

  ## not everybody measured the centre! if we are keeping (not removing) the centre we need to filter out the NA groups again.
  if (!rm_centre) {
    filled_in <- filter_naonly_groups(filled_in)
  }

  ## if leaves have been aggregated, they are not around to
  ## be used for grouping or arranging
  if (aggregate_leaves) {
    sorted_water <- filled_in %>%
      dplyr::arrange(site, date, trt.name) %>%
      dplyr::group_by(site, trt.name)
  } else {
    sorted_water <- filled_in %>%
      dplyr::arrange(site, date, trt.name, leaf) %>%
      dplyr::group_by(site, site_brom.id, trt.name, leaf)
  }

  if (isTRUE(dohydro)){
    hydrovars <- dplyr::do(sorted_water, water_summary_calc(.$depth))
    
    message("if last_dry and last_wet is NA, we use 65 days (end of experiment)\nif n_driedout and n_overflow are NA, they are 0")
    ret <- hydrovars %>% 
      tidyr::replace_na(list(last_dry = 65, last_wet = 65, n_driedout = 0, n_overflow = 0))
    
    
    return(ret)   
  } else {
    ret <- sorted_water
  }
  
  return(ret)
 
}


overflow <- function(dep){
  ## find the maximum
  md <- max(dep, na.rm = TRUE)
  ## a leaf is full if it is 10 ml below maximum
  full <- md - 10
  ## relative to the number of observations:
  n_obs <- sum(!is.na(dep)) 
  
  if (md <= 10) {
    ov <- 0
  } else {
    ov <- sum(dep > full, na.rm = TRUE)
  }
  
  ov/n_obs
}

driedout <- function(dep){ 
  ## find the maximum 
  md <- max(dep, na.rm = TRUE)
  ## define dryness
  dry <- 5
  ## relative to the number of observations:
  n_obs <- sum(!is.na(dep)) 
  
  ## if max is less than 10 everything is dry
  if (md <= 10) {
    dr <- n_obs
  } else {
    dr <- sum(dep <= dry, na.rm = TRUE)
  }
  
  dr/n_obs
}


#' Calculate water depth measurements
#'
#' @param depth depth measurements
#' @details text describing parameter inputs in more detail.
#' \itemize{
#'  \item{"max.depth"}{ the maximum depth}
#'  \item{"min.depth"}{ the minimum depth}
#'  \item{"mean.depth"}{ mean depth}
#'  \item{"var.depth"}{ variance in depth}
#'  \item{"sd.depth"}{ standard deviation in depth}
#'  \item{"cv.depth"}{ coefficient of variation in depth}
#'  \item{"net_fluct"}{ net fluctuation in depth}
#'  \item{"total_fluct"}{ total fluctuation in depth}
#'  \item{"amplitude"}{ max.depth - mean.depth}
#'  \item{"wetness"}{mean.depth / max.depth}
#' }
#'
#' @return a 1 x n row \code{tbl_df}
#' @export
water_summary_calc <- function(depth){
  ## modify functions to remove the na values
  noNA <- function(f, x) f(x, na.rm = TRUE)
  ## check that it looks like mm
  ## must be sorted by date
  ## merge with support file -- must be 63 long
  var_measures <- dplyr::data_frame(
    len.depth = length(depth),
    n.depth = sum(!is.na(depth)),
    mean.depth = noNA(mean, depth),
    max.depth = noNA(max, depth),
    min.depth = noNA(min, depth),
    amplitude = max.depth - min.depth,
    net_fluc = sum(depth[-1] - depth[-length(depth)], na.rm = TRUE),
    tot_fluc = sum(abs(depth[-1] - depth[-length(depth)]), na.rm = TRUE),
    sd.depth = noNA(sd, depth),
    cv.depth = 100*(sd.depth/mean.depth),
    wetness = mean.depth / max.depth,
    prop.overflow.days = overflow(depth),
    prop.driedout.days = driedout(depth)
  )
  #message(sprintf("trying %s", unique(.site_brom.id)))
  extreme <- get_last_extremity(depth)
  
  #message("Everything is OK / Tudo otimo / Esta bien")
  cbind(var_measures, extreme)
}


# extreme events --------------------------------------

check_increasing <- function(vec){
  all_but_last <- vec[-length(vec)]
  all_but_frst <- vec[-1]
  
  compares <- Map(`<`, all_but_last, all_but_frst)
  
  res <- all(unlist(compares))
  
  if (any(vec < 0) | any(is.na(vec))) {
    res <- NA
  }
  return(res)
}

extreme_vector <- function(vec, bounds){
  if (bounds[1] != 0) stop("first boundary should be 0")
  
  if (max(vec, na.rm = TRUE) != bounds[4]) stop("last boundary should be maximum")
  
  ans <- cut(vec, breaks = bounds,
             labels = c("driedout",
                        "normal",
                        "overflow"),
             include.lowest = TRUE)

  as.character(ans)
  
}


find_bounds_wet_overflow <- function(depth){
  ## dep should be 65
  maxdep <- max(depth, na.rm = TRUE)
  full <- maxdep - 10
  empty <- 5

  
  boundaries <- c(0, empty, full, maxdep)
  
  return(boundaries)
}

## takes a vector and returns a data_frame
extremity <- function(dep){

  boundaries <- find_bounds_wet_overflow(dep)
  
  bigger <- check_increasing(boundaries)
  
  if (!isTRUE(bigger)) {
    warning(sprintf("boundaries are %s. These are not increasing!
                    Probably this leaf was too dry.
                    Answer is drought, forever",
                    paste(boundaries,
                          collapse = ", ")))
    ## ok we define such leaves as being in drought forevere
    ## err at least as long as a measurement was taken
    ev <- ifelse(!is.na(dep),"driedout",NA)
    pr <- rev(seq_along(dep))
              
    ## could add a "overflow" at 66 here
    final <- dplyr::data_frame(event = ev,
                               prior = pr)
  } else if (all(is.na(dep))) {
    warning("depth measurements completely absent. Did you choose the correct measurement time?")
    final <- dplyr::data_frame(event = NA,
                               prior = NA)
  } else {
    
    event_vec <- extreme_vector(dep, boundaries)
    
    ## just the words pls
    df_extreme <- dplyr::data_frame(event = event_vec,
                                    prior = rev(seq_along(event)))
    
    ## now filter for extreme events
    final <- df_extreme %>% 
      dplyr::filter(event %in% c("driedout", "overflow"))
  }
  Final <- final %>% 
    dplyr::filter(!is.na(event))
  return(Final)
}

last_extremity <- function(df){
  ## check event has only those levels
  ## check prior is integer
  
  ## find the most recent extreme event
  res <- most_recent(df)

  
  res2 <- df %>% 
    dplyr::group_by(event) %>% 
    dplyr::tally(.) %>% 
    dplyr::mutate(s = 1, 
                  event = paste0("n_", event)) %>% 
    tidyr::spread(event, n, fill = 0) %>% 
    dplyr::select(-s)
  
  
  if (nrow(res) == 1 & nrow(res2) == 1) {
    cbind(res, res2)
  } else if (nrow(res) == 0 | nrow(res) == 0 ) {
    dplyr::data_frame(event = NA,
                      prior = NA,
                      n_driedout = NA,
                      n_overflow = NA
                      )
  } else {
    stop("wtf")
  }
  
}

get_last_extremity <- . %>% 
  extremity() %>% 
  dplyr::do(last_extremity(.))

most_recent <- function(extreme_df){
  last_dry <- suppressWarnings(min(extreme_df$prior[extreme_df$event == "driedout"]))
  last_wet <- suppressWarnings(min(extreme_df$prior[extreme_df$event == "overflow"]))
  
  last_dry <- ifelse(is.finite(last_dry), last_dry, NA)
  last_wet <- ifelse(is.finite(last_wet), last_wet, NA)
  
  long_dry <- longest_stretch(extreme_df, "driedout")
  long_wet <- longest_stretch(extreme_df, "overflow")
  
  long_dry <- ifelse(is.finite(long_dry), long_dry, NA)
  long_wet <- ifelse(is.finite(long_wet), long_wet, NA)
  
  dplyr::data_frame(last_dry = last_dry,
                    last_wet = last_wet,
                    long_dry = long_dry,
                    long_wet = long_wet)
}

longest_stretch <- function(extreme_df, filt){
  lagged <- extreme_df %>% 
    dplyr::filter(event == filt) %>% 
    dplyr::mutate(l = lag(prior) - prior)

  #lag of 1 means consecutive
  grps <- rle(lagged$l)
  consec <- grps$value == 1
  max(grps$lengths[consec], na.rm = TRUE) + 1
}

summarize_hydro <- function(hydro_data){
  hydro_data %>% 
    dplyr::select(-leaf) %>% 
    dplyr::group_by(site, site_brom.id, trt.name) %>% 
    dplyr::summarise_each(dplyr::funs(median))
}
SrivastavaLab/bwgtools documentation built on May 9, 2019, 1:54 p.m.