R/FluxCal.R

Defines functions FluxCal

Documented in FluxCal

#' @title Calculate CO2 and CH4 gas fluxes
#'
#' @description A function to calculate CO2 and CH4 gas fluxes from the data loaded by the function \code{\link{LoadLGR}}
#' or \code{\link{LoadOther}}.
#' It takes a time cue data frame (argument \code{df_cue}), either created by the function \code{\link{SelCue}} or prepared by the
#' user following the format of example files "Time_&_Ta_1.csv" and "Time_&_Ta_2.csv" at
#'  https://github.com/junbinzhao/FluxCalR/tree/master/inst/extdata
#' to separate the measurements and then calculate the fluxes for all the measurements at once.
#' Note that the header for the time cue column must be either \strong{"Start"} or \strong{"End"}.
#' Based on the time cues and window width provided for the calculation, the function will automatically scan over data that cover
#'  1.5x (default) length of the window and calculate the fluxes based on the best linear regression (i.e., largest R2).
#'  After the calculations are done, a graph with regression lines plotted on the CO2 and/or CH4 concentration time series can be
#'  drawn for checkup purposes.
#'
#' @param data A dataframe generated by the function \code{\link{LoadLGR}} or \code{\link{LoadOther}}.
#' @param win A number indicates the window width for the flux calculation, unit: minutes.
#' @param vol A number indicates volume of the chamber; unit: dm^3 or l.
#' @param area A number indicates base area of the chamber; unit: m^2.
#' @param pa A number indicates the air pressure during measurements; unit: atm. Default: 1.
#' @param cal A string, either "CO2_CH4" (default),"CO2" or "CH4", indicates which gas flux it is calculated for.
#' @param df_cue A data frame that includes "Start" and/or "End" time (HH:MM:SS) of each measurement.
#' The header for the time must be either \strong{"Start"} or \strong{"End"}. This data frame can either be created by the
#' function \code{\link{SelCue}} or be prepared by the user
#' (see example files "Time_&_Ta_1.csv" and "Time_&_Ta_2.csv" at https://github.com/junbinzhao/FluxCalR/tree/master/inst/extdata).
#' @param cue_type A string, either "Start", "End" (default) or "Start_End", indicates if start, end or both time in the
#' data frame assigned to \code{df_cue} will be used as the cues . When "Start_End" is chosen, both "Start" and "End" columns
#' have to be present in the data frame for \code{df_cue} and the flux with the largest R2 within the range will be calculated
#' without considering the argument \code{ext}.
#' @param other A vector of strings indicates the names of other columns in the data frame for the \code{df_cue} argument
#' that need to be passed along to the final output data frame. Default: NULL.
#' @param df_Ta A data frame contains a column "Ta" with the air temperature values (ideally, this is temperature measured inside
#'  of the chamber during the flux measurement; unit: degree C. This can be the same data frame as in \code{df_cue}.
#'  See example files "Time_&_Ta_1.csv" and "Time_&_Ta_2.csv" at https://github.com/junbinzhao/FluxCalR/tree/master/inst/extdata).
#'  Note the row number of the data frame must be the same as the number of flux measurements.
#' Default: NULL, then the temperature used is either the average ambient air temperature measured by the LGR analyzer
#' (column "AmbT_C") or, if the data measured by other analyzers, Ta input from function \code{\link{LoadOther}}.
#' @param f A number indicates the sampling (recording) frequency of the data (unit: second). If not provided (default), the
#' frequency is computed based on the difference of timestamps of the first two rows in the data.
#' @param ext A number indicates a range of how many times of the window width (\code{win}) should the calculation scan through to
#' choose the regression with the largest R2. Default: 1.5. This argument is ignored when \code{df_cue} is "Start_End".
#' @param metric A string to indicate the evaluation metric for selecting the regression. Possible options: "R2" (default) and
#' "RMSE".
#' @param output A string includes output directory and file name (.csv) to export the calculated fluxes.
#' Default: a file named "Flux_output.csv" with calculated fluxes will be created under the current work directory.
#' FALSE, do not create a file.
#' @param digits An integer indicates the number of decimal digits to be used for the calculated fluxes and slopes.
#' @param check_plot A logic value indicates whether a checking plot should be drawn after the calculation. Default: TRUE.
#'
#' @return A data frame with 9 columns, including number of measurement ("Num"), date of measurement ("Date"), start and end time
#' for each flux calculation ("Start" and "End"), gas name ("Gas", either CO2 or CH4), slope ("Slope") and R2 ("R2") of the
#' regressions, air temperature used for calculation ("Ta") and the calculated fluxes ("Flux", unit: umol m-2 s-1).
#' As default, a copy of the data frame will be saved as "Flux_output.csv" under the work directory (or as what is provided in the
#' argument \code{output}) and a graph with regression lines plotted on the CO2 and/or CH4 concentration time series will pop up
#' for checkup purposes.
#'
#' @importFrom grDevices dev.new dev.off rgb
#' @importFrom graphics abline axis.POSIXct lines par plot text
#' @importFrom stats lm
#'
#' @examples
#' \dontrun{
#' library(FluxCalR)
#' #### data from LGR
#' # get the directory of the example LGR raw data
#' example_data1 <- system.file("extdata", "Flux_example_1_LGR.txt", package = "FluxCalR")
#' example_data1
#'
#' # load the data
#' Flux_lgr <- LoadLGR(file = example_data1,
#'                     time_format = "mdy_HMS")
#'
#' # manually select the end of each measurement as time cues
#' time_cue <- SelCue(Flux_lgr,flux = "CO2",cue = "End",save = FALSE)
#'
#' # calculate the fluxes over a 3-minute window using the manually selected cues
#' Flux_output1 <- FluxCal(data = Flux_lgr,
#'                         win = 3,
#'                         vol = 208, area = 0.26,
#'                         df_cue = time_cue,
#'                         cue_type = "End",
#'                         output = FALSE) # don't create a output file
#' Flux_output1
#'
#' ## input the time cues from a prepared file and calculate the fluxes over a 3-minute window
#' # get directory of the example file with time cues and Ta
#' Example_cue1 <- system.file("extdata", "Time_&_Ta_1.csv", package = "FluxCalR")
#' Time_Ta1 <- read.csv(Example_cue1)
#' # this is how this file should look like
#' head(Time_Ta1)
#' Flux_output2 <- FluxCal(data = Flux_lgr,
#'                         win = 3,
#'                         vol = 208, area = 0.26,
#'                         df_cue = Time_Ta1,
#'                         cue_type = "Start_End", # use both start and end time as time cues
#'                         other = c("Plot","Light_Dark"), # pass other columns into the final output
#'                         df_Ta = Time_Ta1) # use separately measured air temperature for calculation
#' Flux_output2
#'
#' #### data from other sources
#' # get the directory of the example data
#' example_data2 <- system.file("extdata", "Flux_example_2_other.csv", package = "FluxCalR")
#' example_data2
#'
#' # load the data
#' Flux_other <- LoadOther(file = example_data2,
#'                         time = "Date_time",
#'                         time_format = "mdy_HMS",
#'                         CO2 = "CO2_PPM",
#'                         Ta = "Tem_C")
#'
#' ## input the time cues from a prepared file and calculate the fluxes over a 3-minute window
#' # get directory of the example file with time cues and Ta
#' Example_cue2 <- system.file("extdata", "Time_&_Ta_2.csv", package = "FluxCalR")
#' Time_Ta2 <- read.csv(Example_cue2)
#' head(Time_Ta2)
#' Flux_output3 <- FluxCal(data = Flux_other,
#'                         cal = "CO2", # only calculate CO2 flux
#'                         win = 3,
#'                         vol = 208, area = 0.26,
#'                         df_cue = Time_Ta2,
#'                         cue_type = "Start",
#'                         other = c("Plot","Light_Dark"),
#'                         output = FALSE) # don't create a output file
#' Flux_output3
#' }
#'
#' @export
## Flux calculation function ---------------
FluxCal <- function(data,
                    win,
                    vol,
                    area,
                    pa = 1,
                    cal = "CO2_CH4",
                    df_cue,
                    cue_type = "End",
                    other = NULL,
                    df_Ta = NULL,
                    f = NULL,
                    ext = 1.5,
                    metric = "R2",
                    output = "Flux_output.csv",
                    digits = 3,
                    check_plot = TRUE
) {
  # check arguments
  cal <- match.arg(cal,c("CO2_CH4","CO2","CH4"))
  cue_type <- match.arg(cue_type,c("End","Start","Start_End"))
  metric <- match.arg(metric,c("R2","RMSE"))
  # # if volumn or base area of the chamber are not specified
  # if (!is.numeric(vol) | !is.numeric(area)){
  #   stop("Error: both 'vol' and 'area' have to be specified!")
  # }
  # ext has to be >=1
  if (ext < 1) {
    stop("Error: 'ext' argument can not be < 1 !")}
  # define the pipe from the package "magrittr"
  `%>%` <- magrittr::`%>%`
  # calculate the sampling frequency based on the timestamps (unit: seconds)
  if (is.null(f)){
    f <- round(as.numeric(difftime(data$Time[2],data$Time[1],units = "secs")))
  }

  # constants
  R_index <- 0.08205783 # universal gas constant; unit: L*atm*K^-1*mol^-1
  win_f <- win*60/f # the number of rows for the defined window

  # add one column as row index and one as time (HH:MM:SS)
  data <- cbind(Row=row(data)[,1],data) %>%
    dplyr::mutate(time=paste(lubridate::hour(Time),
                             sprintf("%02d",lubridate::minute(Time)),
                             sprintf("%02d",floor(lubridate::second(Time))),sep = ":"))

  ### get the row index for the moving window in calculation ---------
  mov <- ext-1 # the extend of moving

  # # extract the date
  date_m <- strftime(data$Time[1],format="%y-%m-%d","UTC")
  # create the index
  In <- vector()

  # get the timestamp cues that are closest to what in the data
  if (cue_type == "End"){ ### if end time is given
    df_cue$End <- lubridate::ymd_hms(paste(date_m,df_cue$End,sep = "_")) # add date of measurement to time cues
    # match the closest
    for(i in seq_len(nrow(df_cue))){
      In[i] <- birk::which.closest(data$Time,df_cue$End[i])
    }
    In1 <- In # start moving
    In2 <- In1-mov*win_f # end of moving (backwards)
    # return an error if the indices includes NAs
    if (anyNA(In1)) {
      stop(paste0("Time matching fails: check if time cue of the measurement No.",
                  which(is.na(In1)),which(is.na(In2)),
                  " in the 'df_cue' is correct; also, please split the dataset if the period covers several days!"))
    }
  } else {
    if (cue_type == "Start"){ ### if start time is given
      df_cue$Start <- lubridate::ymd_hms(paste(date_m,df_cue$Start,sep = "_")) # add date of measurement to time cues
      # match the closest
      for(i in seq_len(nrow(df_cue))){
        In[i] <- birk::which.closest(data$Time,df_cue$Start[i])
      }
      In1 <- In+ext*win_f
      In2 <- In1-mov*win_f
      # return an error if the indices includes NAs
      if (anyNA(In1)) {
        stop(paste0("Time matching fails: check if time cue of the measurement No.",
                    which(is.na(In1)),which(is.na(In2)),
                    " in the 'df_cue' is correct; also, please split the dataset if the period covers several days!"))
      }
    } else { ### if both start and end is given
      df_cue$End <- lubridate::ymd_hms(paste(date_m,df_cue$End,sep = "_")) # add date of measurement to time cues
      df_cue$Start <- lubridate::ymd_hms(paste(date_m,df_cue$Start,sep = "_"))
      # match the closest
      for(i in seq_len(nrow(df_cue))){
        In[i] <- birk::which.closest(data$Time,df_cue$End[i])
      }
      In1 <- In
      # recycle In
      In <- vector()
      #
      for(i in seq_len(nrow(df_cue))){
        In[i] <- birk::which.closest(data$Time,df_cue$Start[i])
      }
      In2 <- In+win_f
      # return an error if the indices includes NAs
      if (anyNA(In1)) {
        stop(paste0("Time matching fails: check if time cue of the measurement No.",
                    which(is.na(In1)),which(is.na(In2)),
                    " in the 'df_cue' is correct; also, please split the dataset if the period covers several days!"))
      }
      if (anyNA(In2)) {
        stop(paste0("Time matching fails: check if time cue of the measurement No.",
                    which(is.na(In1)),which(is.na(In2)),
                    " in the 'df_cue' is correct; also, please split the dataset if the period covers several days!"))
      }
      # return an error if the window length is larger than the range between start and end
      if (any(In1-In2<0,na.rm = TRUE)){
        stop(paste0("Error: differences between start and end time must be larger than the window size (win)!"))
      }
    }
  } # here get two time cues In1 (End) and In2 (Start)!!

  # function to calculate R2, slopes and fluxes ----------
  CalFUN <- function(flux = "CO2") {
    dft <- data.frame(matrix(-999,nrow(df_cue),9)) # for record the data
    names(dft) <- c("Num","Date","Start","End","Gas","Slope","metric","Ta","Index")
    ########## 1. calculate the max R2 and slopes of the regression winthin the window with an extended range
    for (a in seq_len(nrow(df_cue))){
      # the window is moving backwards from the end point
      for (b in In1[a]:In2[a]){
        if (flux == "CO2"){ # for CO2
          Slm <- try(summary(lm(data$X.CO2.d_ppm[(b-win_f):b]~data$Row[(b-win_f):b])),silent = TRUE)
        } else { # for CH4
          Slm <- try(summary(lm(data$X.CH4.d_ppm[(b-win_f):b]~data$Row[(b-win_f):b])),silent = TRUE)
        }
        # if no data is provided
        if (class(Slm)=="try-error" | is.na(Slm$r.squared)){
          stop(paste0("Error: make sure ",flux," data are provided and specified in the correct way (See the help)."))
        } else {
          dft[a,"Num"] <- a # the Number of measurements
          # define argument for "if" based on the evaluation metric
          met <- ifelse(metric == "R2",
                        Slm$r.squared > dft[a,"metric"], # metric == "R2"
                        sqrt(mean(Slm$residuals^2)) < dft[a,"metric"] # metric == "RMSE"
                        )
          # if the new regression is better than the one already saved
          if (met | dft[a,"metric"] == -999) {
            dft[a,"Date"] <- paste0(lubridate::year(data$Time[1]),"-", # the date
                               lubridate::month(data$Time[1]),"-",
                               lubridate::day(data$Time[1]))
            dft[a,"Start"] <- strftime(data$Time[(b-win_f)],format="%H:%M:%S","UTC") # the start time of the slope
            dft[a,"End"] <- strftime(data$Time[b],format="%H:%M:%S","UTC") # the end time of the slope
            dft[a,"Gas"] <- flux # CO2 or CH4
            dft[a,"Slope"] <- try(round(Slm$coefficients[2]/f,digits = digits),silent = TRUE) # slope as against 1s
            dft[a,"metric"] <- try(ifelse(metric == "R2",
                                          round(Slm$r.squared,digits = 2), # metric == "R2"
                                          round(sqrt(mean(Slm$residuals^2)),digits = 2) # metric == "RMSE"
                                          ),
                                   silent = TRUE) # R2 or RMSE
            dft[a,"Ta"] <- round(mean(data$AmbT_C[(b-win_f):b]),digits=2) # temperautre
            dft[a,"Index"] <- b # output the row index at the END of the slope for plotting the graphs
          } # end of if for R2 or RMSE
        }
      } # end b loop
    } # end a loop, end of calculate the slopes

    # change the name for the metric
    names(dft)[7] <- metric

    ####### 2. determine which temperature will be used for flux calculation
    if (is.null(df_Ta)){ # if no Ta provided, use the ones measured by analyzer
      dft <- data.frame(dft,Tk=dft$Ta+273.2)
    } else { # if Ta is provided as a data frame, use the column "Ta"
      dft <- data.frame(dft,Tk=df_Ta$Ta+273.2) %>%
        dplyr::mutate(Ta=Tk-273.2) # replace the Ta with the provided values
    }

    ######## 3. calculate the flux
    dft <- dft %>%
      dplyr::mutate(Flux=try(round(((Slope*vol)/(R_index*Tk)/area)*pa,digits = digits),silent=TRUE)) # umol m-2 s-1

    # when other meta data need to be passed along from the df_cue data frame
    if (!is.null(other)){
      dft <- try(data.frame(dft,df_cue[,c(other)]),silent = TRUE)
      if (class(dft)=="try-error")
        stop("Error: please check the 'other' argument is properly specified as column names (see the help)!")
      # rename the variables passed along
      names(dft)[(ncol(dft)-length(other)+1):ncol(dft)] <- other
    }

    ######### 4. plot the result if required
    # if (check_plot == TRUE){ # if checking plot is needed, then make a graph
    #   # determine if the CO2 or CH4 column is used for ploting and ylim range
    #   if (flux=="CO2"){ # for CO2
    #     var <- "X.CO2.d_ppm"
    #     if (is.null(ylim_CO2)){
    #       ylim <- range(data$X.CO2.d_ppm)
    #     } else {
    #       ylim <- ylim_CO2
    #     }
    #   } else { # for CH4
    #     var <- "X.CH4.d_ppm"
    #     if (is.null(ylim_CH4)){
    #       ylim <- range(data$X.CH4.d_ppm)
    #     } else {
    #       ylim <- ylim_CH4
    #     }
    #   }
    #   # plot the data against the row index
    #   try(plot(data[,var],
    #            ylab=paste0(flux," readings in ppm"),
    #            xlab="Time",bty="n", xaxt="n",
    #            ylim=ylim),silent=TRUE)
    #   # plot the regression lines
    #   for (i in seq_len(nrow(dft))){
    #     Slm <- try(lm(data[(dft[i,"Index"]-win_f):dft[i,"Index"],var]~
    #                     data$Row[(dft[i,"Index"]-win_f):dft[i,"Index"]]),silent=TRUE)
    #     try(lines(data$Row[(dft[i,"Index"]-win_f):dft[i,"Index"]], Slm$fitted.values,
    #               col="green", lwd=3),silent=TRUE)
    #     try(text(data[dft[i,"Index"]-win_f,"Row"],data[dft[i,"Index"]-win_f,var],
    #              labels = paste(dft[i,"Num"]),
    #              col="red",cex=1.2,pos = 3),silent=TRUE)
    #   }
    #   # add x-axis as time
    #   par(new=T)
    #   try(plot(data[,"Time"],data[,var],
    #            type = "n",axes = F, xlab = "", ylab = ""),silent=TRUE)
    #   # add time interval ticks
    #   c <- try(lubridate::pretty_dates(data$Time,n=10),silent=TRUE)
    #   try(axis.POSIXct(1, at= c,format = "%H:%M"),silent=TRUE)
    # }
    return(dft)
  } # end of the function

  ###### calculate the flux using the function and, if required, output the file and plot the result ------------------
  # if (check_plot == TRUE){ # if the checking plot is needed, create a window for plotting
  #   if (cal == "CO2_CH4"){ # both CO2 and CH4 are calculated, plot 2 graphs
  #     dev.new(width = 16,height = 12,noRStudioGD=TRUE)
  #     par(mfrow=c(2,1),mar=c(2,1,2,1),xpd=NA,oma=c(4,4,1,1))
  #   } else { # only CO2 or CH4 is calculated, plot 1 graph
  #     dev.new(width = 16,height = 5,noRStudioGD=TRUE)
  #     par(mar=c(0.5,1,0.5,1),xpd=NA,oma=c(4,4,1,1))
  #   }
  # }

  # function for plotting check plot with "plotly"
  # data points
  plotck <- function(flux,dft) {
    # determine if the CO2 or CH4 column is used for ploting and ylim range
      if (flux=="CO2"){ # for CO2
        var <- "X.CO2.d_ppm"
      } else { # for CH4
        var <- "X.CH4.d_ppm"
      }
    # scatter plot
    p <- plotly::plot_ly(x = data[,"Time"],
                         y = data[,var],
                         mode = "markers",
                         type = "scatter",
                         color = I(rgb(0,0,0,alpha = 0.5)),
                         ) %>%
      plotly::layout(xaxis = list(title = ""),
                     yaxis = list(title = paste0(flux," readings in ppm")),
                     showlegend = FALSE)
    # add regression lines
    for (i in seq_len(nrow(dft))){
      Slm <- try(lm(data[(dft[i,"Index"]-win_f):dft[i,"Index"],var]~
                    data$Row[(dft[i,"Index"]-win_f):dft[i,"Index"]]),silent=TRUE)
      # add regression lines
      p <- try(plotly::add_lines(p,
                                 x = data$Time[(dft[i,"Index"]-win_f):dft[i,"Index"]],
                                 y = Slm$fitted.values,
                                 line = list(width = 3,color="chartreuse")),
                   silent=TRUE)
    } # end of regression line loop

    # add index to all regression line
    p <- try(plotly::add_text(p,
                              x = data[dft[,"Index"]-win_f,"Time"],
                              y = data[dft[,"Index"]-win_f,var],
                              text = paste(dft[,"Num"]),
                              textfont = list(color = "red"),
                              textposition = "top"),
             silent=TRUE)
    return(p)
  } # end of check plot function

  # function to create a temporary HTML file to show plotly check plot
  htmlplotly <- function(p_=pp){
    tempDir <- tempfile()
    dir.create(tempDir)
    htmlFile <- file.path(tempDir, "index.html")
    # add the plotly graph to the html file
    htmlwidgets::saveWidget(plotly::as_widget(p_), htmlFile)
    # view the graph
    viewer <- getOption("viewer")
    viewer(htmlFile)
  } # end of HTML file for plotly

  # calculate and plot
  if (cal == "CO2_CH4"){ # both CO2 and CH4 are calculated
    df_CO2 <- CalFUN(flux = "CO2")
    df_CH4 <- CalFUN(flux = "CH4")
    # plot the check plot
    if (check_plot == TRUE) {
      p_CO2 <- plotck(flux = "CO2",dft = df_CO2)
      p_CH4 <- plotck(flux = "CH4",dft = df_CH4)
      pp <- plotly::subplot(p_CO2,p_CH4,
                            nrows = 2,
                            titleY = TRUE)
      htmlplotly()
    } # end of plotting check plot

    # the output
    dfoutput <- rbind(df_CO2,df_CH4) %>%
      dplyr::select(-Index,-Tk) # don't output the row index and Tk
  } else { # only CO2 or CH4 is calculated
    df_var <- CalFUN(flux = cal)
    # plot the check plot
    if (check_plot == TRUE){
      pp <- plotck(flux = cal,dft = df_var)
      htmlplotly()
    }

    # the output
    dfoutput <- df_var %>%
      dplyr::select(-Index,-Tk) # don't output the row index and Tk
  }
  # check if the data frame needs to be output as a file
  if (assertthat::is.string(output)){
    utils::write.csv(dfoutput,file = output,row.names = F)
  }

  return(dfoutput)
}
## function end here#####
junbinzhao/R-Package-FluxCalR documentation built on May 25, 2021, 5:49 a.m.