R/preprocessing.R

Defines functions clean_Temp_2 clean_Temp clean_eload create_date_var to_extract to_exclude convert_15min_to_1_hour time_format

Documented in clean_eload clean_Temp clean_Temp_2 convert_15min_to_1_hour create_date_var time_format to_exclude to_extract

#-------------------------------------------------------------------------------
# RMV2.0 (version 1.1.0)
# LBNL MV 2.0 Toolbox
# Samir Touzani, PhD
#-------------------------------------------------------------------------------


#' Convert the timestamps into the default format
#'
#' \code{time_format} This function convert the actual timestamps format into "%m/%d/%y %H:%M" format
#'
#'
#' @param data A data frame that contains time column
#' @param format A character string that defines the actual format of the timestamps.
#' Use the description of the base R function \emph{strptime} to define the format.
#' @return A data frame with timestamps converted into "%m/%d/%y %H:%M" format
#'
#' @export


time_format <- function(Data,format){
  if (is.null(format)){
    stop("The original time format is not indicated")
  }
  dts <- as.POSIXct(strptime(Data$time, format = format))
  Data$time <- format(dts,"%m/%d/%y %H:%M")
  return(Data)
}




#' Convert 15 minute interval data into hourly interval data
#'
#' \code{convert_15min_to_1_hour} This function convert 15 minute interval data into hourly interval data
#'
#'
#' @param Data A dataframe that contains 15 minutes interval data
#' @param kWh A logical if TRUE it means that the interval data are energy data
#' @return A dataframe with hourly interval data
#'
#' @export

convert_15min_to_1_hour <- function(Data,kWh=TRUE){
  missing_hours <- 0
  dts <- as.POSIXct(strptime(Data$time, format = "%m/%d/%y %H:%M"))
  Data$dts <- dts
  dts_1 <- as.POSIXct(strptime(Data$time, format = "%m/%d/%y %H"))
  Data$days_hours <- dts_1
  days_hours <- unique(Data$days_hours)
  N_days_hours <- length(days_hours)
  Data_hourly <-as.data.frame(matrix(nr=N_days_hours, nc=3))
  names(Data_hourly)<- c("days_hours","eload","Temp")
  Data_hourly$days_hours <- days_hours
  for (k in 1:N_days_hours){
    k_day_hour <- days_hours[k]
    idx_k_day_hour <- which(Data$days_hours == k_day_hour)
    if (length(idx_k_day_hour)==4){
      if (kWh){
       Data_hourly$eload[k] <- sum(Data$eload[idx_k_day_hour])
      }
      else{Data_hourly$eload[k] <- mean(Data$eload[idx_k_day_hour])}
      Data_hourly$Temp[k] <- mean(Data$Temp[idx_k_day_hour])
    }
    else{missing_hours <- missing_hours + 1}
  }
  Data_hourly$days_hours<-format(Data_hourly$days_hours,format="%m/%d/%y %H:%M")
  names(Data_hourly)<-c("time","eload","Temp")
  warning_text <- paste("This data has",missing_hours,"hours with missing values", sep = " ")
  print(warning_text)
  return(Data_hourly)
}


#' Exclude data from given intervals
#'
#' \code{to_exclude} This function exclude data from given intervals
#'
#'
#' @param Data A dataframe
#' @param intervals_to_exclude a path to the file with intervals to exclude
#' @return A dataframe containing the remaining observations
#'
#' @export

to_exclude <- function(Data,intervals_to_exclude){
  intervals <- read.csv(file = intervals_to_exclude, header=T, row.names = NULL, stringsAsFactors = F)
  Data$dts <- as.POSIXct(strptime(Data$time, format = "%m/%d/%y %H:%M"))
  if (dim(intervals)[1]>=1){
   for (i in 1:dim(intervals)[1]){
    start_time <- as.POSIXct(strptime(intervals$start[i], format = "%m/%d/%y %H:%M"))
    end_time <- as.POSIXct(strptime(intervals$end[i], format = "%m/%d/%y %H:%M"))
    if (length(which(Data$dts > start_time & Data$dts < end_time))!=0){
     Data <- Data[-which(Data$dts > start_time & Data$dts < end_time),]
    }
   }
  }
  Data <- dplyr::select(Data,time,eload,Temp)
  return(Data)
}

#' Extract data from given intervals
#'
#' \code{to_extract} This function extract data from given intervals
#'
#'
#' @param Data A dataframe containing all observations
#' @param intervals_to_extract A path to the file with intervals to extract
#' @param start The start date of the interval to exclude
#' @param end The end date of the interval to exclude
#' @return a to_extract object, which is a list with the following components:
#' \describe{
#'   \item{Data}{a dataframe containing the remaining observations}
#'   \item{Data_new}{a dataframe containing the extracted observations}
#' }
#'
#' @export

to_extract <- function(Data,
                       intervals_to_extract = NULL,
                       start = NULL,
                       end = NULL){
  if (length(intervals_to_extract) != 0){
    intervals <- read.csv(file = intervals_to_extract,
                          header=T, row.names = NULL,
                          stringsAsFactors = F)
  }
  else if (length(start) != 0 && length(end) != 0) {
    intervals <- as.data.frame(matrix(nr=1,nc=2))
    names(intervals) <-  c("start","end")
    intervals$start <- start
    intervals$end <- end
  }
  else(return())

  Data$dts <- as.POSIXct(strptime(Data$time, format = "%m/%d/%y %H:%M"))
  Data_new <- as.data.frame(matrix(nr=1,nc=4))
  names(Data_new) <- c("time","eload","Temp","dts")
  if (dim(intervals)[1]>=1){
   for (i in 1:dim(intervals)[1]){
    start_time <- as.POSIXct(strptime(intervals$start[i],
                                      format = "%m/%d/%y %H:%M"))
    end_time <- as.POSIXct(strptime(intervals$end[i],
                                    format = "%m/%d/%y %H:%M"))
    if (length(which(Data$dts > start_time & Data$dts < end_time))!=0){
     Data_i <- Data[which(Data$dts >= start_time & Data$dts <= end_time),]
     Data <- Data[-which(Data$dts > start_time & Data$dts < end_time),]
     Data_new <- rbind(Data_new,Data_i)
    }
   }
   Data_new <- Data_new[order(Data_new$dts),]
  }
  Data <- dplyr::select(Data,time,eload,Temp)
  Data_new <- dplyr::select(Data_new,time,eload,Temp)
  results <- list(Data=Data,Data_new=Data_new)
  return(results)
}

#' Create an input variable corresponding to given intervals
#'
#'
#' \code{create_date_var} This function Create a binary input variable corresponding
#' to given intervals if the time stamps correspond to a date within the given
#' interval then the value of the input variables will be equal to 1 and if not it
#' will be equal to 0
#'
#'
#' @param Data A dataframe of pre or post-installation data.
#' @param intervals_path A path to the file with intervals to extract
#' @param start A vector of start date of each interval
#' @param end A vector of end date of each interval. Note that start vector and
#' end vector should have the same length
#' @param var_name A string character that correspond to the name of the new
#' input variable
#' @return a dataframe with the additional input variable
#'
#' @export

create_date_var <- function(Data,
                            intervals_path = NULL,
                            start = NULL,
                            end = NULL,
                            var_name = "date_off"){
  Data$var_add <- 0
  if (length(intervals_path) != 0){
    intervals <- read.csv(file = intervals_path,
                          header=T, row.names = NULL,
                          stringsAsFactors = F)
  }
  else if (length(start) != 0 && length(end) != 0) {
    intervals <- as.data.frame(matrix(nr=length(start),nc=2))
    names(intervals) <-  c("start","end")
    intervals$start <- start
    intervals$end <- end
  }
  else(return())

  Data$dts <- as.POSIXct(strptime(Data$time, format = "%m/%d/%y %H:%M"))
  if (dim(intervals)[1]>=1){
   for (i in 1:dim(intervals)[1]){
    start_time <- as.POSIXct(strptime(intervals$start[i],
                                      format = "%m/%d/%y %H:%M"))
    end_time <- as.POSIXct(strptime(intervals$end[i],
                                    format = "%m/%d/%y %H:%M"))
    if (length(which(Data$dts > start_time & Data$dts < end_time))!=0){
     Data$var_add[which(Data$dts >= start_time & Data$dts <= end_time)] <- 1
    }
   }
  }
  Data <- dplyr::select(Data,time,eload,Temp,var_add)
  names(Data) <- c("time","eload","Temp",var_name)
  return(Data)
}


# #' Count the number of days
# #'
# #' \code{number_of_days} This function compute the number of days for  which the data are available
# #'
# #'
# #' @param Data A dataframe that contains time as column
# #' @return a numeric corresponding to the number of days
# #'
# #' @export
#
#
# number_of_days <- function(Data){
#   Data$time <- as.POSIXct(strptime(Data$time,"%m/%d/%y %H:%M"))
#   Data$date <- as.Date(Data$time)
#   num <- length(unique(Data$date))
#   return(num)
# }


#' Clean the elaod data
#'
#' \code{clean_eload} This function removes the observations, which have negative eload values or with eload
#' values higher or lower than some upper and lower thresholds. The upper threshold is defined as
#' \emph{n} percent higher than the quantile corresponding to the given upper probability \emph{U_ptresh} and the lower
#'  threshold is defined as \emph{n} percent lower than the quantile corresponding to the given lower probability \emph{L_ptresh}.
#'
#'
#' @param data A dataframe of training or prediction data.
#' @param n An integer that correspond to a multiplicative coefficient that is used
#' to define the thresholding value. The default value is 0.2 which correspond to \emph{20} percent.
#' @param L_ptresh A numeric that correspond to the probability of the lower quantile used for filtering
#' @param U_ptresh A numeric that correspond to the probability of the upper quantile used for filtering
#'

#' @return A dataframe that correspond to the cleaned data
#'
#' @export

clean_eload <- function(Data,n = .2, L_ptresh = 0.005, U_ptresh = 0.995){
  # exclude negative values
  Data <- dplyr::filter(Data, eload >= 0)
  # exclude values higher than n*ptresh
  U_tresh <- as.numeric(quantile(Data$eload,probs = c(U_ptresh),na.rm = T))
  Data <- dplyr::filter(Data, eload < (1+n)*U_tresh)
  # exclude the observations when the eload is n times lower than L_tresh
  L_tresh <- as.numeric(quantile(Data$eload,probs = c(L_ptresh),na.rm = T))
  Data <- dplyr::filter(Data, eload > (L_tresh-(L_tresh*n)))
  return(Data)
}

#' Clean the Temperature data
#'
#' \code{clean_Temp} This function remove the observations which have Temperature values higher or lower
#'  than some predefined extreme values.
#'
#'
#' @param Data A dataframe of pre or post-installation data.
#' @param maxT A numeric that correspond to the temperature above which the corresponding
#' observations will be excluded
#' @param minT A numeric that correspond to the temperature below which the corresponding
#' observations will be excluded

#' @return A dataframe that correspond to the cleaned pre or post-installation data
#'
#' @export

clean_Temp <- function(Data, maxT = 130, minT= -80){
  Data <- dplyr::filter(Data, Temp <= maxT)
  Data <- dplyr::filter(Data, Temp >= minT)
  return(Data)
}


#' Clean the Temperature data
#'
#' \code{clean_Temp_2} This function remove the period of time where temperature
#' observations do not change.
#'
#'
#' @param data A dataframe of pre or post-installation data.
#' @param thresh A numeric that correspond to the number of similar successive
#' temperature values
#' @return A dataframe that correspond to the cleaned pre or post-installation data
#'
#' @export


clean_Temp_2 <- function(Data, thresh = 5){
  count <- rle(Data$Temp)$lengths
  idx <- c(1)
  l <- 1
  for(i in 1:length(count)){
    count_i <- count[i]
    if(count_i == 1){
      idx <- c(idx, idx[l] +1)
      l <- l + 1}
    else if(count_i > 1 & count_i <= thresh) {
      k <- l
      for(j in 1:count_i){
        idx <- c(idx, idx[l] +1)
        l <- l +1}
    }
    else if(count_i > thresh) {
      idx <- c(idx, idx[l] + count_i)
      l <- l + 1}
  }
  Data <- Data[idx,]
  return(Data)
}
LBNL-ETA/RMV2.0 documentation built on Nov. 9, 2020, 5:44 a.m.