R/combining.R

### these are all the functions which combine data.
### Contains the Master Function combine_tab and its helpers,
### including functions which reshape data or create new columns


#' Obtain data for all sites
#'
#' This function reads data from the same tab for all the sites (via \code{get_all_sites()}) , then combines thme with \code{dplyr::rbind_all()}
#'
#' @param sheetname The name of the sheet you'd like to have (\code{"leaf.waterdepths"}, \code{"bromeliad.physical"}, \code{"bromeliad.final.inverts"}, \code{"site.info"}, \code{"site.weather"}, \code{"bromeliad.initial.inverts"}, \code{"bromeliad.terrestrial"}, \code{"terrestrial.taxa"}).
#' @param .sites The sites you want. defaults to all of them
#'
#' @return data.frame of all bromeliad.physical tabs
#' @examples
#' combine_tab("French_Guiana","site.info")
#' @export
combine_tab <- function(.sites =  c("Argentina","Cardoso", "Colombia",
                                    "French_Guiana", "Macae", "PuertoRico",
                                    "CostaRica"),
                        sheetname){

  ## get all the site data
  site_data <- get_all_sites(sheetname = sheetname, sites = .sites)
  ########

  ## make names unique
  site_data <- lapply(site_data, which_names_doubled)

  #### ending the cleaning
  ## does the first dataset downloaded have the names "site" and "bromeliad.id"?
  is_site_brom_pres <- find_site_brom(site_data[[1]])
  ## if there are site and bromeliad columns, fuse them.
  if (is_site_brom_pres) {
    site_data <- lapply(site_data, brom_id_maker)
  }

  ## if this is invertebrates, gather them.
  if (sheetname %in% c("bromeliad.final.inverts",
                       "bromeliad.initial.inverts")) {
    site_data <- lapply(site_data, invert_to_long,
                        category_vars = c("site", "trt.name",
                                          "abundance.or.biomass",
                                          "site_brom.id"))
  }

  if(!names_all_same(site_data)) stop("names are different!")

  ## finally, rbind all
  allsite <- dplyr::bind_rows(site_data)
  return(allsite)
}

#' Tidy wide invert data into long format
#'
#' @param insect_data the dataset to tidy. In the same shape as the bromeliad.inverts.final tab
#' @param category_vars those variables which define groups (ie the names of every variable that is NOT the name of an invertebrate species). Must be a character vector.
#'
#' @return tbl.df containing invertebrate data in long format
#' @importFrom magrittr "%>%"
#' @export
invert_to_long <- function(insect_data, category_vars){

  data_names <- names(insect_data)

  # are all categories present?
  if (!all(category_vars %in% data_names)) {
    stop("missing a category")
  }

  # what names are *not* the categorical vars?

  insect_names <- setdiff(data_names, category_vars)

  # gather in all species names
  # spread out the two kinds of measurements
  # split the treatments into numbers
  long_out <- insect_data %>%
    tidyr::gather_("species", "quantity", insect_names, convert = TRUE)%>%
    tidyr::spread(abundance.or.biomass, quantity)%>%
    tidyr::separate(trt.name, c("mu", "k"), "k")%>%
    dplyr::mutate(mu = readr::parse_number(mu), k = readr::parse_number(k))

  has_biomass <- assertthat::has_name(long_out, "biomass")
  if (has_biomass) {
    zeros_same <- identical(which(long_out$abundance==0), which(long_out$biomass == 0))
    
    ## is there even biomass measurements?
    biomass_absent <- all(is.na(long_out$biomass))
    
    countryname <- unique(insect_data[["site"]])
    if(!zeros_same & !biomass_absent) warning(sprintf("there are inconsistencies between the abundance and biomass columns in %s", countryname))
  
    # remove the zeros
    long_final <- long_out %>%
      dplyr::filter(abundance != 0 | biomass != 0)
  } 
    long_final <- long_out %>% 
      dplyr::filter(abundance != 0)
    
  return(long_final)
}


#' Find and edit duplicate names in dataset
#'
#' @param df data frame which might have duplicate names
#'
#' @return data.frame with unique names. if the names were already unique, it is the same. otherwise the names are passed through \code{make.names} and a message is given
#' @export
which_names_doubled <- function(df){
  df_names <- df %>%
    names

  dup_names <- df_names %>%
    table %>%
    Filter(function(x) x > 1, .) %>%
    names

  if (length(dup_names) > 0) {
    dup_names %>%
      paste0(collapse = ", ") %>%
      sprintf("these names were duplicates: %s", .) %>%
      warning(.)
  }

  names(df) <- make.names(names(df), unique = TRUE)
  df
}


#' Merge functional groups to invert data
#'
#' @param insect_data data.frame of invert observations. must be long format (ie output of \code{invert_to_long})
#' @param trait_data bwg_names data. output of \code{get_bwg_names}
#'
#' @return merged data
#' @importFrom magrittr "%>%"
#' @export
merge_func <- function(insect_data, trait_data){

  merged <- dplyr::left_join(insect_data, trait_data, by = c("species" = "bwg_name"))

  return(merged)

}

#' summarize functional groups
#'
#' @param merged_data data formed by merging insect data to trait data
#' @param grps using the formula, indicate the grouping factors required
#' @examples 
#' invert_data <- combine_tab(c("Argentina", "French_Guiana", "Colombia","Macae", "PuertoRico","CostaRica"),"bromeliad.final.inverts")
#' bwg_names <- get_bwg_names()
#' invert_traits <- merge_func(invert_data, bwg_names)
#' # Functional Groups
#' func_groups <- sum_func_groups(invert_traits,
#'                               grps = list(~site,
#'                                           ~site_brom.id,
#'                                           ~functional_group))
#' func_groups
#' library(ggplot2) #require the 'ggplot2' package
#' ## plot by functional group abundance
#' func_groups %>%
#'  ggplot(aes(x = as.factor(functional_group), y = total_abundance)) +
#'  geom_point(position = position_jitter(width = 0.25), alpha = 0.5) +
#'  stat_summary(fun.data = "mean_cl_boot", colour = "red", size = 0.6) +
#'  facet_wrap(~site, scales = "free_y") +
#'  ggtitle("Functional group abundance")
#' # Trophic level
#' # To summarize by trophic level group, simply switch \code{~functional_group} to \code{~predation}:
#' predprey <- sum_func_groups(invert_traits,
#'                          grps = list(~site,
#'                                      ~site_brom.id,
#'                                      ~predation))
#' predprey
#' ## plot by trophic level abundance
#' predprey %>%
#'  ggplot(aes(x = as.factor(predation), y = total_abundance)) +
#'  geom_point(position = position_jitter(width = 0.25), alpha = 0.5) +
#'  stat_summary(fun.data = "mean_cl_boot", colour = "red", size = 0.6) +
#'  facet_wrap(~site, scales = "free_y") +
#'  ggtitle("Trophic level abundance")
#' @return summarized data. NOTE that this data will be grouped!
#' @importFrom magrittr "%>%"
#' @export
#'
sum_func_groups <- function(merged_data, grps = list(~site, ~site_brom.id, ~predation, ~functional_group)){
  merged_data %>%
    dplyr::group_by_(.dots = grps) %>%
    dplyr::summarize(total_abundance = sum(abundance),
              total_biomass = sum(biomass),
              total_taxa = n())
}



#' Summarize functional groups still farther into trophic ranks
#'
#' @param func_sums must be a grouped tbl.df, the groups must be bromeliad.id and predation, in that order
#'
#' @return summarized data
#' @importFrom magrittr "%>%"
#' @export
sum_trophic <- function(func_sums){

  test_groups <- identical(dplyr::groups(func_sums),
                           lapply(list("site","site_brom.id",
                                       "predation"),
                                  as.name))

  if(!test_groups) stop("the input must be grouped by bromeliad.id and predation, in that order")

  func_sums %>%
    dplyr::summarise_each(dplyr::funs(sum), total_abundance, total_biomass, total_taxa)
}

#' Are the columns site and bromeliad.id found in this data.frame?
#'
#' @param df data frame to check for
#'
#' @return are those column names present? TRUE or FALSE
find_site_brom <- function(df){
  has_site <- assertthat::has_name(df, "site")
  has_brom <- assertthat::has_name(df, "bromeliad.id")

  has_site & has_brom
}

#' Check that names are all identical
#'
#' @param datalist list of dataframes whose names must all be identical
#'
#' @return are the names identical? TRUE or FALSE
#' @export
names_all_same <- function(datalist){
  intersectnames <- datalist %>%
    lapply(names) %>%
    Reduce(intersect, .)

  identical(intersectnames, names(datalist[[1]]))
}
SrivastavaLab/bwgtools documentation built on May 9, 2019, 1:54 p.m.