Nothing
#' @name process_gfdata
#' @title Process Preliminary and Finalized 'GreenFeed' Data
#'
#' @description Processes and calculates daily and weekly averages of 'GreenFeed' data.
#' Handles data filtering, aggregation, and summarization to facilitate further analysis.
#'
#' @param data a data frame with preliminary or finalized 'GreenFeed' data
#' @param start_date a character string representing the start date of the study (format: "dmy")
#' @param end_date a character string representing the end date of the study (format: "dmy")
#' @param param1 an integer representing the number of records per day to be consider for analysis
#' @param param2 an integer representing the number of days with records per week to be consider for analysis
#' @param min_time an integer representing the minimum number of minutes for a records to be consider for analysis (default: 2 minutes)
#' @param cutoff an integer specifying the range for identifying outliers (default: 3 SD)
#'
#' @return A list of three data frames:
#' \item{filtered_data }{data frame with filtered 'GreenFeed' data}
#' \item{daily_data }{data frame with daily processed 'GreenFeed' data}
#' \item{weekly_data }{data frame with weekly processed 'GreenFeed' data}
#'
#' @examples
#' file <- system.file("extdata", "StudyName_GFdata.csv", package = "greenfeedr")
#' datafile <- readr::read_csv(file)
#'
#' gf_data <- process_gfdata(
#' data = datafile,
#' start_date = "2024-05-13",
#' end_date = "2024-05-25",
#' param1 = 2,
#' param2 = 3,
#' min_time = 2
#' )
#' head(gf_data)
#'
#' @export process_gfdata
#'
#' @import dplyr
#' @importFrom dplyr %>%
#' @import readxl
#' @importFrom stats weighted.mean
#' @importFrom stats sd
utils::globalVariables(c(
"EndTime", "CH4GramsPerDay", "CO2GramsPerDay", "O2GramsPerDay", "H2GramsPerDay",
"nDays", "nRecords", "TotalMin"
))
process_gfdata <- function(data, start_date, end_date,
param1, param2, min_time = 2, cutoff = 3) {
# Check date format
start_date <- ensure_date_format(start_date)
end_date <- ensure_date_format(end_date)
# Ensure param1 and param2 are defined
if (missing(param1) || missing(param2)) {
stop("Please define 'param1' (minimum records per day), 'param2' (minimum days per week), and min_time (minimum minutes per visit)")
} else {
message(paste("Using param1 =", param1, ", param2 =", param2, ", min_time =", min_time))
}
# Function to read and process each file
process_data <- function(data) {
if (ncol(data) >= 25) { # Preliminary data always have 25 columns or more
names(data)[1:14] <- c(
"RFID",
"AnimalName",
"FeederID",
"StartTime",
"EndTime",
"GoodDataDuration",
"HourOfDay",
"CO2GramsPerDay",
"CH4GramsPerDay",
"O2GramsPerDay",
"H2GramsPerDay",
"H2SGramsPerDay",
"AirflowLitersPerSec",
"AirflowCf"
)
data <- data %>%
## Remove leading zeros from RFID column to match with IDs
dplyr::mutate(RFID = gsub("^0+", "", RFID)) %>%
## Remove records with unknown ID and airflow below 25 l/s (threshold value recommended by C-Lock Inc.)
dplyr::filter(
RFID != "unknown",
AirflowLitersPerSec >= 25
) %>%
## Mark records with invalid gas values as NA, instead of removing them
dplyr::mutate(
CH4GramsPerDay = ifelse(CH4GramsPerDay <= 0, NA, CH4GramsPerDay),
CO2GramsPerDay = ifelse(CO2GramsPerDay <= 0, NA, CO2GramsPerDay),
O2GramsPerDay = ifelse(O2GramsPerDay <= 0, NA, O2GramsPerDay),
H2GramsPerDay = ifelse(H2GramsPerDay <= 0, NA, H2GramsPerDay)
) %>%
## Change format of 'StartTime' and 'GoodDataDuration' columns
dplyr::mutate(
day = as.Date(StartTime),
## Suppress warnings from coercion issues with GoodDataDuration
GoodDataDuration = suppressWarnings(
case_when(
# HH:MM:SS format
nchar(GoodDataDuration) == 8 ~ as.numeric(substr(GoodDataDuration, 1, 2)) * 60 +
as.numeric(substr(GoodDataDuration, 4, 5)) +
as.numeric(substr(GoodDataDuration, 7, 8)) / 60,
# YYYY-MM-DD HH:MM:SS format
nchar(GoodDataDuration) > 8 ~ as.numeric(substr(GoodDataDuration, 12, 13)) * 60 +
as.numeric(substr(GoodDataDuration, 15, 16)) +
as.numeric(substr(GoodDataDuration, 18, 19)) / 60,
TRUE ~ NA_real_
)
),
GoodDataDuration = round(GoodDataDuration, 2)
)
} else { # Finalized data always have less than 21 columns
names(data) <- c(
"FeederID",
"AnimalName",
"RFID",
"StartTime",
"EndTime",
"GoodDataDuration",
"CO2GramsPerDay",
"CH4GramsPerDay",
"O2GramsPerDay",
"H2GramsPerDay",
"H2SGramsPerDay",
"AirflowLitersPerSec",
"AirflowCf",
"WindSpeedMetersPerSec",
"WindDirDeg",
"WindCf",
"WasInterrupted",
"InterruptingTags",
"TempPipeDegreesCelsius",
"IsPreliminary",
"RunTime"
)
data <- data %>%
## Remove leading zeros from RFID column to match with IDs
dplyr::mutate(RFID = gsub("^0+", "", RFID)) %>%
## Remove records with unknown ID and airflow below 25 l/s (threshold value recommended by C-Lock Inc.)
dplyr::filter(
RFID != "unknown",
AirflowLitersPerSec >= 25
) %>%
## Mark records with invalid gas values as NA, instead of removing them
dplyr::mutate(
CH4GramsPerDay = ifelse(CH4GramsPerDay <= 0, NA, CH4GramsPerDay),
CO2GramsPerDay = ifelse(CO2GramsPerDay <= 0, NA, CO2GramsPerDay),
O2GramsPerDay = ifelse(O2GramsPerDay <= 0, NA, O2GramsPerDay),
H2GramsPerDay = ifelse(H2GramsPerDay <= 0, NA, H2GramsPerDay)
) %>%
## Change columns format
dplyr::mutate(
## Create a column with date
day = parse_date_time(sub(" .*", "", StartTime),
orders = c("Y-m-d", "m/d/y", "d/m/y", "y/m/d", "d-b-Y", "b/d/Y", "m/d/Y", "Y/m/d", "Ymd", "mdy")
),
## Extract hours, minutes, and seconds from GoodDataDuration
GoodDataDuration = round(
as.numeric(substr(GoodDataDuration, 1, 2)) * 60 + # Hours to minutes
as.numeric(substr(GoodDataDuration, 4, 5)) + # Minutes
as.numeric(substr(GoodDataDuration, 7, 8)) / 60, # Seconds to minutes
2
),
## Create a column that contains daytime (extract the time part from StartTime (HH:MM:SS))
HourOfDay = round(
as.numeric(format(as.POSIXct(StartTime, format = "%m/%d/%y %H:%M"), "%H")) + # Extract hours
as.numeric(format(as.POSIXct(StartTime, format = "%m/%d/%y %H:%M"), "%M")) / 60, # Extract minutes and convert to fraction of an hour
2
)
)
}
}
# Combine files into one data frame
df <- process_data(data)
# Filtering data
df <- df %>%
## Filter by conditions where CH4 and CO2 must be within range, but allow O2 and H2 to be NA
dplyr::filter(
dplyr::if_all(
c(CH4GramsPerDay, CO2GramsPerDay),
~ filter_within_range(.x, cutoff)
),
## Retain records with valid CH4 and CO2, even if O2 or H2 are NA
!is.na(CH4GramsPerDay) & !is.na(CO2GramsPerDay),
## Filter by minimum time of records
GoodDataDuration >= min_time,
## Filter by start and end of study
day >= start_date & day <= end_date
)
# Calculation of average daily gas production
daily_df <- df %>%
## Group by animal and date
dplyr::group_by(RFID, day) %>%
## Compute weighted mean of all gases
dplyr::summarise(
n = n(),
across(
c(CH4GramsPerDay, CO2GramsPerDay, O2GramsPerDay, H2GramsPerDay),
~ weighted.mean(.x, GoodDataDuration, na.rm = TRUE),
.names = "{.col}"
),
minutes = sum(GoodDataDuration, na.rm = TRUE),
.groups = "drop"
) %>%
## Filter by number of records per day (=param1)
dplyr::filter(n >= param1) %>%
## Compute week based on the minimum date
dplyr::mutate(week = floor(as.numeric(difftime(day, as.Date(start_date), units = "weeks"))) + 1) %>%
## Select columns
dplyr::select(RFID, week, day, n, minutes, CH4GramsPerDay, CO2GramsPerDay, O2GramsPerDay, H2GramsPerDay)
# Calculation of average weekly gas production
weekly_df <- daily_df %>%
## Group by animal and week
dplyr::group_by(RFID, week) %>%
## Compute number of days with records, total records and minutes, and the weighted mean of all gases
dplyr::summarise(
nDays = n(),
nRecords = sum(n),
TotalMin = round(sum(minutes), 2),
across(
c(CH4GramsPerDay, CO2GramsPerDay, O2GramsPerDay, H2GramsPerDay),
~ weighted.mean(.x, minutes, na.rm = TRUE),
.names = "{.col}"
),
.groups = "drop" # Un-group after summarizing
) %>%
## Filter by number of days with records (=param2)
dplyr::filter(nDays >= param2) %>%
## Select columns
dplyr::select(RFID, week, nDays, nRecords, TotalMin, CH4GramsPerDay, CO2GramsPerDay, O2GramsPerDay, H2GramsPerDay)
# Description of mean, sd, and CV for weekly gases
message(paste0("CH4: ", round(mean(weekly_df$CH4GramsPerDay, na.rm = TRUE), 2), " +- ", round(stats::sd(weekly_df$CH4GramsPerDay, na.rm = TRUE), 2)))
message(paste0("CH4 CV = ", round(stats::sd(weekly_df$CH4GramsPerDay, na.rm = TRUE) / mean(weekly_df$CH4GramsPerDay, na.rm = TRUE) * 100, 1), "%"))
message(paste0("CO2: ", round(mean(weekly_df$CO2GramsPerDay, na.rm = TRUE), 2), " +- ", round(stats::sd(weekly_df$CO2GramsPerDay, na.rm = TRUE), 2)))
message(paste0("CO2 CV = ", round(stats::sd(weekly_df$CO2GramsPerDay, na.rm = TRUE) / mean(weekly_df$CO2GramsPerDay, na.rm = TRUE) * 100, 1), "%"))
message(paste0("O2: ", round(mean(weekly_df$O2GramsPerDay, na.rm = TRUE), 2), " +- ", round(stats::sd(weekly_df$O2GramsPerDay, na.rm = TRUE), 2)))
message(paste0("O2 CV = ", round(stats::sd(weekly_df$O2GramsPerDay, na.rm = TRUE) / mean(weekly_df$O2GramsPerDay, na.rm = TRUE) * 100, 1), "%"))
message(paste0("H2: ", round(mean(weekly_df$H2GramsPerDay, na.rm = TRUE), 2), " +- ", round(stats::sd(weekly_df$H2GramsPerDay, na.rm = TRUE), 2)))
message(paste0("H2 CV = ", round(stats::sd(weekly_df$H2GramsPerDay, na.rm = TRUE) / mean(weekly_df$H2GramsPerDay, na.rm = TRUE) * 100, 1), "%"))
# Return a list of data frames
return(list(
filtered_data = df,
daily_data = daily_df,
weekly_data = weekly_df
))
message("List with data sets created.")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.