R/data_prep.R

Defines functions data_prep

Documented in data_prep

#' Mangrove Data Preparation
#'
#' Reads a single Excel workbook whose tabs contain data gathered in a
#' plot-method survey of mangrove stands. The plot tabs are stacked into one
#' data frame, the key columns are copied to standard names, DBH is derived
#' from GBH (GBH / pi) wherever DBH is blank, and the records are split into
#' trees (DBH >= 5) and saplings (DBH < 5). A short summary is printed to the
#' console.
#'
#' @param excelpath String where the excel file is located. Required.
#' @param nontabs Number of tabs in the excel file which are not part of the
#'   plot data. These tabs must be located after all tabs with necessary data.
#' @param location Name of the general location where the survey took place.
#'   When given, a "LOCATION" column is added and used as the first level.
#' @param levels List of column names set for varying spatial levels.
#' @param sitename Column name indicating the name of the site. Default is "SITE NAME".
#' @param site Column name indicating the tag or code for the site. Default is "SITE".
#' @param plotnumber Column name indicating the plot tag for each site. Default is "PLOT #".
#' @param plotsize Column name indicating the area of plots. Default is "Plot size".
#' @param species Column name indicating the type of species. Default is "Species".
#' @param height Column name indicating the height measured. Default is "Height (m)". Optional.
#' @param dbh Column name indicating measured diameter at breast height. Default is "DBH (cm)". Optional IF GBH is present.
#' @param gbh Column name indicating measured girth at breast height. Default is "GBH (cm)". Optional IF DBH is present.
#'
#' @return Called mainly for its side effects: assigns 'm.data', a consolidated
#'   data frame of observations with DBH of at least 5, and 'm.saps', the
#'   observations with DBH below 5, in the global environment. The sapling
#'   data frame is also returned invisibly.
#'
#' @keywords data preparation
#'
#' @export
data_prep <- function(excelpath = excelpath,
                      nontabs = 0,
                      location = NA,
                      sitename = 'SITE NAME',
                      site = 'SITE',
                      plotnumber = 'PLOT #',
                      plotsize = 'Plot size',
                      species = 'Species',
                      gbh = 'GBH (cm)',
                      dbh = 'DBH (cm)',
                      height = 'Height (m)',
                      levels = list()) {

  # The self-referential default in the signature is kept only for interface
  # compatibility; it can never be evaluated, so fail early and readably.
  if (missing(excelpath)) {
    stop("'excelpath' must be supplied.", call. = FALSE)
  }
  if (!is.character(excelpath) || length(excelpath) != 1L || is.na(excelpath)) {
    stop("'excelpath' must be a single file path string.", call. = FALSE)
  }
  if (!file.exists(excelpath)) {
    stop("Excel file does not exist: ", excelpath, call. = FALSE)
  }
  if (!is.numeric(nontabs) || length(nontabs) != 1L || is.na(nontabs)) {
    stop("'nontabs' must be a single number.", call. = FALSE)
  }

  # Number of leading tabs that contain plot data.
  n_tabs <- length(readxl::excel_sheets(excelpath)) - nontabs
  if (n_tabs < 1) {
    stop("No plot tabs left to read after excluding 'nontabs'.", call. = FALSE)
  }

  # Read every plot tab; everything is kept as character for now so tabs with
  # differently-typed columns can be stacked.
  sheets <- lapply(seq_len(n_tabs), function(tab) {
    dplyr::mutate_all(readxl::read_excel(excelpath, sheet = tab), as.character)
  })

  # Fail with a clear message when a mandatory column is absent from all tabs.
  sheet_cols <- unique(unlist(lapply(sheets, colnames)))
  absent <- setdiff(c(sitename, site, plotnumber, plotsize, species), sheet_cols)
  if (length(absent) > 0) {
    stop("Column(s) not found in the plot tabs: ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  # An all-NA placeholder row guarantees the standard columns exist even when
  # the workbook lacks e.g. GBH or DBH. It is removed again below.
  placeholder <- data.frame("SITE NAME" = NA, "SITE" = NA,
                            "PLOT #" = NA, "Plot size" = NA,
                            "GBH" = NA, "DBH" = NA, "Height" = NA,
                            check.names = FALSE)

  # Later tabs come first, matching the historical row order of the output.
  man_data <- as.data.frame(
    dplyr::bind_rows(c(rev(sheets), list(placeholder)))
  )

  # Copy user-named columns to the standard names used downstream.
  man_data[["SITE NAME"]] <- man_data[[sitename]]
  man_data[["SITE"]] <- man_data[[site]]
  man_data[["PLOT #"]] <- man_data[[plotnumber]]
  man_data[["Plot size"]] <- as.numeric(man_data[[plotsize]])
  man_data[["Species"]] <- man_data[[species]]
  if (gbh %in% colnames(man_data)) {
    man_data[["GBH"]] <- as.numeric(man_data[[gbh]])
  }
  if (dbh %in% colnames(man_data)) {
    man_data[["DBH"]] <- as.numeric(man_data[[dbh]])
  }
  if (height %in% colnames(man_data)) {
    man_data[["Height (m)"]] <- as.numeric(man_data[[height]])
  }

  # Removes the placeholder row (and any other row without a site code).
  man_data <- tidyr::drop_na(man_data, "SITE")

  # Level names must be a plain character vector to be usable for column
  # selection, whether or not a location is supplied.
  levels <- as.character(unlist(levels))

  # Includes a column with the general location of the sites if indicated,
  # and adds it as the outermost level.
  if (length(location) == 1L && !is.na(location)) {
    man_data[["LOCATION"]] <- rep(location, nrow(man_data))
    levels <- c("LOCATION", levels)
  }

  # Derive DBH from girth wherever DBH is blank.
  dbh_values <- as.numeric(man_data[["DBH"]])
  gbh_values <- as.numeric(man_data[["GBH"]])
  blank_dbh <- is.na(dbh_values)
  dbh_values[blank_dbh] <- gbh_values[blank_dbh] / pi

  # IF both GBH and DBH are not present for some row, the function stops.
  if (anyNA(dbh_values)) {
    stop("Blanks detected in both DBH and GBH columns.")
  }
  man_data[["DBH"]] <- dbh_values

  # Saplings are entries with DBH below 5; everything else (including a DBH
  # of exactly 5) is kept in the main data.
  is_sapling <- man_data[["DBH"]] < 5
  man_saps <- man_data[is_sapling, , drop = FALSE]
  man_data <- man_data[!is_sapling, , drop = FALSE]

  # Removes all data columns that contain at least one NA/NaN.
  man_data <- man_data[, colSums(is.na(man_data)) == 0, drop = FALSE]

  # This will print out the summary results to the console
  cat("\n ---------------------------------------------------")
  cat("\n Mangrove data preparation summary")
  cat("\n ---------------------------------------------------\n\n")

  ## Prints out column names
  cat("Column names: ")
  cat("\n ")
  print(names(man_data))

  ## Prints out list of sites
  cat("\n List of sites:")
  cat("\n ")
  print(unique(man_data[["SITE"]]))

  ## Prints out list of species
  cat("\n List of species observed:")
  cat("\n ")
  print(unique(man_data[["Species"]]))

  # Columns to export. Height is included only when a complete height column
  # survived the NA filter above.
  id_cols <- c(levels, "SITE NAME", "SITE", "PLOT #")
  tree_cols <- c(id_cols, "Plot size", "Species", "DBH")
  if ("Height (m)" %in% colnames(man_data)) {
    tree_cols <- c(tree_cols, "Height (m)")
  }
  man_data <- man_data[, tree_cols, drop = FALSE]
  man_saps <- man_saps[, c(id_cols, "Species", "DBH"), drop = FALSE]

  ## Outputs back to the global environment a data frame
  assign("m.data", man_data, pos = .GlobalEnv)
  assign("m.saps", man_saps, pos = .GlobalEnv)
}
ppcadelina/bucs documentation built on April 4, 2020, 5:52 a.m.