R/ts_feature_correction.R
In PVplr: Performance Loss Rate Analysis Pipeline

Documented in plr_decomposition plr_kmeans_test plr_saturation_removal

#removes values above sat_limit*power_threshold

#' Removing Saturated Data
#'
#' @description Tests for readings which may indicate saturation of the system.
#' Removes values above the power saturation limit (calculated by multiplying
#' sat_limit and power_thresh).
#'
#' @param df A dataframe containing pv data.
#' @param var_list A list of the dataframe's standard variable names, obtained from
#' the output of \code{\link{plr_variable_check}}.
#' @param sat_limit An upper limit on power saturation. This is multiplied by the
#'  power threshold, and power values above this point are filtered from
#'  the dataframe. The value depends on the system's inverter.
#' @param power_thresh An upper limit on power.
#'
#' @return Returns passed data frame with rows removed which contain power values above the specified threshold
#' 
#' @examples 
#' # build var_list
#' var_list <- plr_build_var_list(time_var = "timestamp",
#'                                power_var = "power",
#'                                irrad_var = "g_poa",
#'                                temp_var = "mod_temp",
#'                                wind_var = NA)
#' # Clean Data
#' test_dfc <- plr_cleaning(test_df, var_list, irrad_thresh = 100,
#'                          low_power_thresh = 0.01, high_power_cutoff = NA)
#'                          
#' test_dfc_removed_saturation <- plr_saturation_removal(test_dfc, var_list,
#'                                                       sat_limit = 3000, power_thresh = 0.99)
#'
#' @importFrom rlang .data
#' @export
plr_saturation_removal <- function(df, var_list, sat_limit, power_thresh = 0.99) {
  
  df <- dplyr::filter(df, df[, var_list$power_var] <= as.numeric(as.character(sat_limit)) * power_thresh)
  return(df)

}

#' Statistical k-means Test
#' 
#' @description The method builds linear models by day, identifies outliers,
#' and performs 2-means clustering by slopes. If the lower identified
#' cluster is significantly less than the higher mean, and constitutes less
#' than 25\% of the data, it is identified as soiled and returned. Otherwise, the
#' outlier points are identified as soiled and returned.
#' 
#' @param df A df containing pv data. Should be 'cleaned' by \code{\link{plr_cleaning}}.
#' @param var_list A list of the dataframe's standard variable names, obtained from
#' the output of \code{\link{plr_variable_check}}.
#' @param mean_ratio This scales the higher identified cluster's mean for comparison.
#' Higher values will be more likely to identify the second mean as soiled, and vice versa.
#' Values should range from 0 to 1.
#' @param plot optional; Boolean; whether to return the box plot generated by the method
#' to identify outliers.
#' @param file_path optional; location to store the boxplot if plot is set TRUE.
#' Note this is not necessary if you select to plot - only if you wish to save it.
#' @param file_name optional; name of file to save boxplot if plot is set to TRUE.
#' @param set_cutoff Defaults to FALSE; pass a numeric value to cut off all slopes
#' less than the cutoff value. This bypasses entirely the outlier and clustering
#' calculuations to remove slope values you believe to be soiled.
#'
#' @return The method returns a dataframe containing the values that should be
#' removed. If you want to discard them, try using dplyr::filter().
#' 
#' 
plr_kmeans_test <- function(df, var_list, mean_ratio = 0.7, plot = FALSE, 
                            file_path, file_name, set_cutoff = FALSE) {
  
  #adjust dates to char
  df$day <- sapply(as.Date(df[, var_list$time_var]) - as.Date(df[, var_list$time_var][1]), as.character)
  
  slopes <- NULL
  rr <- NULL
  for (i in unique(df$day)) {
    temp <- subset(df, df$day == i) #using .data$ here creates an error
    if (nrow(temp) > 3) {
      mod <- lm(iacp ~ poay - 1, data = temp)
      # mod <- lm(iacp ~ poay, data = temp)
      slope <- as.numeric(stats::coefficients(mod)[1])
      rs <- summary(mod)$adj.r.squared
      slopes <- rbind(c(slope, i, as.character(as.Date(temp$tmst[1]))), slopes) #adds to top
      rr <- rbind(rs, rr)
    }
  }
  # temp <- dat %>% dplyr::group_by(day) %>% summarise(lm(iacp ~ poay - 1)$coefficients[1])
  
  slopes <- as.data.frame(slopes, row.names = FALSE) # make dataframe
  slopes$rr <- as.numeric(rr) # cast as numeric (will make non-numerics NA)
  slopes <- slopes[stats::complete.cases(slopes), ] # filters out all NA's
  slopes$V1 <- as.numeric(as.character(slopes$V1)) # forces numeric
  
  
  if (set_cutoff != FALSE) {
    slopes$set_cut <- ifelse(slopes$V1 > set_cutoff, TRUE, FALSE)
    days_to_remove <- as.character(slopes$V2[which(slopes$set_cut == FALSE)])
  } else {
    
    # cluster
    try <- cluster::pam(slopes$V1, k = 2)
    slopes$cluster <- try$clustering
    
    # outlier check
    
    box <- graphics::boxplot(slopes$V1)
    slopes$out <- ifelse(slopes$V1 %in% box$out, TRUE, FALSE)
    box_out <- length(which(slopes$out == TRUE))
    box_in <- length(which(slopes$out == FALSE))
    out_days <- as.character(slopes$V2[which(slopes$out == TRUE)])
    # plot(slopes$V1, col = as.factor(slopes$out))
    
    # clust <- kmeans(slopes$V1, 2)
    # test <- data.frame(x = as.numeric(as.character(slopes$V1)),
    #                    y = slopes$cluster,
    #                    z = as.numeric(as.character(slopes$V2)),
    #                    rr = slopes$rr)
    #
    mean1 <- mean(slopes$V1[which(slopes$cluster == 1)])
    mean2 <- mean(slopes$V1[which(slopes$cluster == 2)])
    
    # plot(test$x, col = test$y)
    # ggplot(test, aes(x = z, y = x)) +
    # geom_point(aes(color = rr)) + 
    # labs(x = 'day', y = 'slope', title = paste0('Daily slope dist, site ', file))
    
    if (plot == TRUE) {
      grDevices::png(filename = paste0(file_path, file_name, ".png"))
      # print(ggplot(test, aes(x = z, y = x)) + 
      # geom_point(aes(color = rr)) +
      # labs(x = "day", y = "slope", title = paste0("Daily slope dist, site ", file)))
      print(plot(slopes$V1, col = slopes$cluster, xlab = "days", ylab = "slope", 
                 main = paste0("Daily slope dist, site ", file_name)))
      grDevices::dev.off()
    }
    
    #identifying meaningful clusters
    if (mean1 > mean2) {
      mean_clear <- mean1
      mean_soil <- mean2
      clear_clust <- 1
    } else {
      mean_clear <- mean2
      mean_soil <- mean1
      clear_clust <- 2
    }
    
    clear_days <- slopes$V2[which(slopes$cluster == clear_clust)]
    day_ratio <- length(clear_days) / length(unique(df$day))
    soil_days <- as.character(slopes$V2[which(slopes$cluster != clear_clust)])
    
    days_to_remove <- NULL
    if (mean_ratio * mean_clear > mean_soil && day_ratio > 0.75) {
      #df <- df[which(df$day %in% clear_days),]
      days_to_remove <- soil_days
    } else if (box_out / box_in <= 0.15) {
      days_to_remove <- out_days
    }
  }
  
  return(days_to_remove)
  
}

# seems to find days with power readings outside of correlation threshold
# and assume the system failed those days. Removes outliers. It has the option
# of calling kmeans test at one point.


#' Decompose Seasonality from Data
#' 
#' @description Decomposes seasonality from a dataframe that has already
#' passed through a PLR Determination test, e.g. \code{\link{plr_xbx_model}}. This method has
#' the option of creating plot and data files.
#' 
#' @param data a dataframe containing PV data that has undergone a power
#' predictive model, e.g. \code{\link{plr_xbx_model}}.
#' @param freq the frequency of seasonality. This is typically 4 but depends
#' on the location of the system.
#' @param power_var name of the power variable, e.g. iacp
#' @param time_var name of the time variable, e.g. tvar
#' @param plot boolean indicating if you wish to save a plot.
#' @param plot_file location to save the plot, if the plot param is given TRUE.
#' @param title the title of the plot created if the plot param is given TRUE.
#' @param data_file location to save data. Currently non-functional.
#' 
#' @return Dataframe containing decomposed time series features
#' 
#' @examples 
#' #' # build var_list
#' var_list <- plr_build_var_list(time_var = "timestamp",
#'                                power_var = "power",
#'                                irrad_var = "g_poa",
#'                                temp_var = "mod_temp",
#'                                wind_var = NA)
#' # Clean Data
#' test_dfc <- plr_cleaning(test_df, var_list, irrad_thresh = 100,
#'                          low_power_thresh = 0.01, high_power_cutoff = NA)
#' # Perform power modeling step
#' test_xbx_wbw_res <- plr_xbx_model(test_dfc, var_list, by = "week",
#'                                   data_cutoff = 30, predict_data = NULL)
#'                                   
#' test_xbx_wbw_decomp <- plr_decomposition(test_xbx_wbw_res, freq = 4,
#'                                          power_var = 'power_var', time_var = 'time_var',
#'                                          plot = FALSE, plot_file = NULL, title = NULL, 
#'                                          data_file = NULL)
#' 
#' @importFrom stlplus stlplus
#' @importFrom stats lm
#' 
#' @export
plr_decomposition <- function(data, freq, power_var, time_var, plot = FALSE, 
                              plot_file = NULL, title = NULL, data_file = NULL) {
  
  # require(stlplus)
  # require(padr)
  # require(ggplot2)
  
 # if (!("time_var" %in% colnames(data))) {
 #  stop("column time_var not detected. Pass the data through a PLR determining test.")
  
  total.age <- 1:max(data[, time_var], na.rm = TRUE)
  power.sum <- as.data.frame(total.age)
  
  # tibble object does not work with preexisting logic
  # convert to dataframe
  data <- as.data.frame(data)
  
  #add NA rows where there is no monthly data
  #collect the power and error data
  for (j in power.sum$total.age) {
    if (j %in% data[, time_var]) {
      power.sum$power[j] <- data[which(data[, time_var] == j), power_var]
      power.sum$sigma[j] <- data$sigma[which(data[, time_var] == j)]
    } else {
      power.sum$power[j] <- NA
      power.sum$sigma[j] <- NA
    }
  }
  
  #If data is missing at the start is NA, remove it and start from the existing values
  while (is.na(power.sum$power[1])) {
    power.sum <- power.sum[-1, ]
  }
  #Same for NAs at the end of the time series
  while (is.na(power.sum$power[length(power.sum$power)])) {
    power.sum <- power.sum[-length(power.sum$power), ]
  }
  
  #define time series object of the predicted power, frequency is 12 months/year
  ts.power <- stats::ts(power.sum$power, frequency = freq)
  #stlplus to decompose with NA months
  stl.model <- stlplus(x = ts.power, s.window = "periodic")
  stl.data <- stl.model[[1]]
  #define column showing if trend month was interpolated from an NA month
  stl.data$interpolated <- ifelse(is.na(stl.data$raw), TRUE, FALSE)
  stl.data$age <- power.sum$total.age[1]:utils::tail(power.sum$total.age, n = 1)
  stl.data$sigma <- power.sum$sigma
  stl.data$operating <- 1:length(stl.data$age)
  
  stl.data$power <- ifelse(stl.data$interpolated == TRUE, NA, stl.data$trend)
  Rd <- lm(power ~ operating, data = stl.data, weights = 1 / sigma)
  
  if (plot == TRUE) {
    
    grDevices::png(filename = plot_file)
    #plot the trend of the predicted power, color by if the month had raw data, add sigma error bar
    plot(stl.data$operating,
         stl.data$trend,
         col = ifelse(stl.data$interpolated == TRUE, "green", "blue"),
         ylim = c(0, 1.1 * max(stl.data$trend)),
         xlab = "Age (Pseudo Months)",
         ylab = "Predicted Power (KW) Trend",
         main = title)
    graphics::arrows(stl.data$operating,
           stl.data$trend - stl.data$sigma,
           stl.data$operating,
           stl.data$trend + stl.data$sigma,
           length = 0.03,
           angle = 90,
           code = 3,
           col = "blue")
    
    stl.data$power <- ifelse(stl.data$interpolated == TRUE, NA, stl.data$trend)
    #define power change trendline
    Rd <- lm(power ~ operating, data = stl.data, weights = 1 / sigma)
    graphics::abline(Rd)
    
    grDevices::dev.off()
    
  }
  
  #change rate (%/yr) is defined as slope/intercept*12
  Rc <- summary(Rd)$coefficients[[2]] / summary(Rd)$coefficients[[1]] * 12 * 100
  
  return(stl.data)
  #write.csv(stl.data, data_file)
}