Report start at r Sys.time()

# remove(list=ls())
Sys.setenv(TZ='Etc/GMT-1') # sets the environment on italy?s time zone

# ..... Libraries .....................................................................................................................................

library(devtools)
install_github("bridachristian/DataQualityCheckEuracAlpEnv")
library("DataQualityCheckEuracAlpEnv")

library(zoo)
library(timeSeries)
library(knitr)
library(ggplot2)
library(dplyr)
library(plyr)
library(imputeTS)
library(reshape2)
library(kableExtra)
library(pander)


# .....................................................................................................................................................

# ..... Input section .................................................................................................................................

scheduling_dir <- params$scheduling_dir                  # <-- schelduling directory: for files to be processed
report_dir <-  params$report_dir                    # <-- report directory: where to put reports and files whith overlaps
output_dir <- params$output_dir                  # <-- output directory: where processed files go
support_dir <- params$support_dir       # <-- support directory: where to read support files

write_output <- params$write_output                                             # if write_output == TRUE => write csv is admitted, if == FALSE not!

FILE <- params$file                                      # <-- Write here the file or the list of file  that you want to analyze!

RANGE_FILE = params$RANGE_FILE

DATA_FROM_ROW <- params$DATA_FROM_ROW                                                # <-- Row number of first data
HEADER_ROW_NUMBER <- params$HEADER_ROW_NUMBER                                          # <-- Row number of header
DATETIME_HEADER <- params$DATETIME_HEADER                               # <-- header corresponding to TIMESTAMP
DATETIME_FORMAT <- params$DATETIME_FORMAT                           # <-- datetime format. Use only: y -> year, m -> month, d -> day, H -> hour, M -> minute
DATETIME_SAMPLING <- params$DATETIME_SAMPLING
RECORD_HEADER <- params$RECORD_HEADER                                   # <-- header corresponding to RECORD

# ~~~ Folders ~~~~

# scheduling_dir <- "H:/Projekte/Klimawandel/Experiment/data/2order/DataQualityCheckEuracAlpEnv/Data/Input/"                   # <-- schelduling directory: for files to be processed
# 
# # setwd(scheduling_dir)
# 
# report_dir <- "H:/Projekte/Klimawandel/Experiment/data/2order/DataQualityCheckEuracAlpEnv/Report/"                    # <-- report directory: where to put reports and files whith overlaps
# output_dir <- "H:/Projekte/Klimawandel/Experiment/data/2order/DataQualityCheckEuracAlpEnv/Data/Output/"               # <-- output directory: where processed files go
# support_dir <- "H:/Projekte/Klimawandel/Experiment/data/2order/DataQualityCheckEuracAlpEnv/Data/Support_files/"       # <-- support directory: where to read support files
# 
# write_output <- FALSE                                             # if write_output == TRUE => write csv is admitted, if == FALSE not!
# 
# # ~~~ Files ~~~~
# 
# files <- files_in_scheduling_dir(SCHEDULING_DIR = scheduling_dir)
# # cat("Which of this files you want analyze? \n ",files)         # <-- Here we show files available for data quality check
# 
# FILE <- "M3.dat"                                      # <-- Write here the file or the list of file  that you want to analyze!
# # cat("Selected files: \n", FILE)
# 
# RANGE_FILE = "Range.csv"
# 
# # ~~~ Datetime ~~~~
# DATA_FROM_ROW <- 5                                             # <-- Row number of first data
# 
# HEADER_ROW_NUMBER <- 2                                         # <-- Row number of header
# 
# DATETIME_HEADER <- "TIMESTAMP"                                 # <-- header corresponding to TIMESTAMP
# DATETIME_FORMAT <- "yyyy-mm-dd HH:MM"                          # <-- datetime format. Use only: y -> year, m -> month, d -> day, H -> hour, M -> minute
# DATETIME_SAMPLING <- "15 min"
# 
# RECORD_HEADER <- "RECORD"                                      # <-- header corresponding to RECORD

# .....................................................................................................................................................

# ..... Body ..........................................................................................................................................


if(check_empty_file(SCHEDULING_DIR = scheduling_dir, FILE = FILE) == TRUE){
  # writeLines(paste(FILE,"WARNING: NO DATA FOUND!!!",sep = " "))
  flag_empty = 1
}else{

  flag_empty = 0

  data_import <- read_data(FILE_PATH = scheduling_dir, FILE_NAME = FILE, 
                           DATETIME_HEADER = DATETIME_HEADER, DATETIME_FORMAT = DATETIME_FORMAT,
                           DATA_FROM_ROW = DATA_FROM_ROW, HEADER_ROW_NUMBER = HEADER_ROW_NUMBER)  
  header <- data_import [[1]]
  header_colnames <- data_import [[2]]
  data <- data_import [[3]]

  original <- data
  mydata <- data

  deletes_duplcated <- deletes_duplcated_data(DATA = mydata,DATETIME_HEADER = DATETIME_HEADER)         # <- Deletes identical rows if found
  mydata <- deletes_duplcated [[1]]                                                                                                        
  duplicated_data <- deletes_duplcated [[2]]

  overlap <- detect_overlap(DATA = mydata,DATETIME_HEADER = DATETIME_HEADER, RECORD_HEADER = RECORD_HEADER)          # <- Detect overlap


  if(length(overlap) != 0){

    flag_overlap = 1
    # stop(paste("Overlapping data in files:", FILE))
    overlap[,1]<- overlap[,1] + DATA_FROM_ROW - 1
    colnames(overlap)[1]= "File Row"

  }else{

    flag_overlap = 0

    missing  <- missing_dates(DATA = mydata, DATETIME_HEADER = DATETIME_HEADER, RECORD_HEADER = RECORD_HEADER,DATETIME_SAMPLING = DATETIME_SAMPLING)  # <- fill missing dates with NA
    mydata <- missing[[1]]
    missing_index_date <- missing[[2]]

    range <- exclude_out_of_range(DATA = mydata,DATETIME_HEADER = DATETIME_HEADER, SUPPORT_DIR = support_dir, RANGE_FILE = RANGE_FILE) # <- Substitute with NA data out of phisical range
    mydata <- range[[1]]
    check_out_of_range <- range[[2]]
    variable_not_in_range <-range[[3]]

    mydata <- time_to_char(DATA = mydata, DATETIME_HEADER = DATETIME_HEADER, DATETIME_FORMAT = DATETIME_FORMAT)
  }
}


# ..... Output ..........................................................................................................................................

if(write_output == TRUE){

  #~~~~~~~~~~
  colnames(header) = header[1,]

  out_my = mydata
  colnames(out_my) = colnames(header)

  out_mydata=rbind(header[-1,],out_my)
  write.csv(out_mydata,paste(output_dir,"DQCok_",substring(FILE,1,nchar(FILE)-4),".csv",sep = ""),quote = F,row.names = F)

  #~~~~~~~~~~
  colnames(duplicated_data) = colnames(header)

  out_duplicated_data=rbind(header[-1,],duplicated_data)
  write.csv(out_duplicated_data,paste(output_dir,"Duplicated_",substring(FILE,1,nchar(FILE)-4),".csv",sep = ""),quote = F,row.names = F)
}

INPUT info:

File selected: r FILE

You have also select these parameters:

The folders are:

Range file in support folder is called: r RANGE_FILE

You decide: r if(write_output == TRUE){paste("to write output file in output folder")} else{paste(("to don't write output file in output folder"))}

OUTPUT:

Info empty file:

r if(flag_empty == 1){paste("Error:", FILE,"without observation")}else{paste(FILE, "has some obseravtion.")}

Duplicated rows

More details on: r paste(output_dir,"Duplicated_",substring(FILE,1,nchar(FILE)-4),".csv",sep = "")

Overlap

r if(flag_overlap == 1){paste("In file:", FILE, " there are the following ovelap:")}else{paste("In", FILE, "there isn't any overlap!")}

if(flag_overlap == 1){
  kable(overlap, format = "markdown",align = "c")
}

Missing dates

w = which(colnames(mydata) == DATETIME_HEADER)
time_tot <- mydata[,w]
time_tot <- as.POSIXct(time_tot, format = "%Y-%m-%d %H:%M:%S")
time_missing <- missing_index_date[,2]

df_missing <- data.frame(time_tot,rep("Dates in original file",times = length(time_tot)))
colnames(df_missing) = c("time","Status")
df_missing[which(time_tot %in% time_missing ),2] = "Missing dates filled"
y = rep(1, times = length(time_tot))

Status_num = rep(1,times = length(time_tot))
Status_num[which(time_tot %in% time_missing )] = 0

df_missing = cbind(df_missing, y,Status_num)


theme_new = theme_bw()+theme(axis.title.y=element_blank(),
                             axis.text.y=element_blank(),
                             axis.ticks.y=element_blank(),
                             axis.text.x = element_text(angle = 90, hjust = 1))


ggplot(df_missing,aes(x = time, y=y ,colour = Status)) +  
  geom_point(aes(size = Status,shape = Status) )+
  scale_colour_manual(values = c("#7CAE00","#F8766D"))+ 
  scale_shape_manual(values = c(20,124))+
  scale_x_datetime(name = DATETIME_HEADER,date_breaks = "2 weeks" )+
  theme_new+
  ggtitle("Time distribution of missing dates")

Statistics of missing dates:

Status_num_NA=df_missing
Status_num_NA = Status_num_NA[,-c(2,3)]

differ = c(0,diff(Status_num_NA$Status_num))
start = which(differ == -1)
end  = which(differ == 1) - 1
gap_lenght = end - start + 1

date_start = Status_num_NA$time[start]
date_end = Status_num_NA$time[end]

statistic_missing = data.frame(date_start,date_end,gap_lenght)
colnames(statistic_missing) = c("From", "To", "Number of Record")

kable(statistic_missing, format = "markdown",align = "c")

Values out of admitted range

for(j in 1:ncol(check_out_of_range)){
  if(colnames(check_out_of_range)[j] ==  DATETIME_HEADER){}else{

    # ~~~~~~ preparation data for ggplot ~~~~~~ 

    df_tmp = check_out_of_range[,c(which(colnames(check_out_of_range)==DATETIME_HEADER), j)]
    df_factor = df_tmp

    df_factor[,2]=as.character(df_factor[,2])

    y = rep(1,times = nrow(df_factor))
    df_factor = cbind(df_factor,y)
    colnames(df_factor) = c("time", "Variable", "y") 
    df_factor$y[which(df_factor$Variable == -1)] = 0.9
    df_factor$y[which(df_factor$Variable == 1)] = 1.1
    # colnames(df_factor) =c("time","Variable","y")

    # ~~~~~~ preparation data for statistic table ~~~~~~ 

    c_oor = colnames(check_out_of_range)[j]

    df_oor = check_out_of_range[,c(which(colnames(check_out_of_range)==DATETIME_HEADER), j)]
    colnames(df_oor)= c("time", "variable")

    over_range = rep(0, times = nrow(df_oor))

    if(length(which(df_oor$variable == 1)) != 0){
      over_range[which(df_oor$variable == 1)] = 1
    }

    under_range = rep(0, times = nrow(df_oor))

    if(length(which(df_oor$variable == -1)) != 0){
      under_range[which(df_oor$variable == -1)] = 1
    }

    df_oor2 = data.frame(df_oor$time, under_range, over_range)

    diff_df = cbind(df_oor$time,as.data.frame(rbind(c(0,0),apply(df_oor2[,-1],2, diff))))

    if(any(unique(df_oor[,2])!= 0)){

      # ~~~~~~ preparation data for ggplot ~~~~~~ 

      theme_new2 = theme_bw()+theme(axis.title.y=element_blank(),
                                    axis.text.y=element_blank(),
                                    axis.ticks.y=element_blank(),
                                    axis.text.x = element_text(angle = 90, hjust = 1),
                                    legend.title = element_blank())

      # p2 = ggplot(melt_df_ggplot, aes(y=variable, x=time))+
      #   geom_tile(aes(fill = value))+
      #   scale_fill_manual(values = c("green", "red", "red"))
      # 
      # p2

      set_limits = data.frame(as.POSIXct(c(df_factor$time[1], df_factor$time[nrow(df_factor)%/%2], df_factor$time[nrow(df_factor)])), c(-1,0,1), c(0,0,0))
      colnames(set_limits) = colnames(df_factor)

      df_factor = rbind(df_factor,set_limits)

      df_factor$Variable = factor(df_factor$Variable, ordered = TRUE)
      df_factor$Variable <- factor( df_factor$Variable, levels=rev(levels( df_factor$Variable)))


      p1 = ggplot(df_factor,aes(x = time, y = y,colour = Variable))+
        geom_point(aes(size = Variable,shape = Variable ))+
        scale_colour_manual(values = c("#0000FF","#7CAE00","#F8766D"),labels = c("Above upper limit", "In the range","Below lower limit"))+ 
        scale_shape_manual(values = c(124,20,124),labels =c("Above upper limit", "In the range","Below lower limit"))+
        scale_size_manual(values = c(10,1,10),labels = c("Above upper limit", "In the range","Below lower limit"))+
        scale_x_datetime(name = DATETIME_HEADER,date_breaks = "2 weeks" )+
        theme_new2 +
        ggtitle(colnames(check_out_of_range)[j])+
        scale_y_continuous(limits = c(0.7,1.3))

      print(p1)

      # ~~~~~~ preparation data for statistic table ~~~~~~ 

      under_start_oor = which(diff_df$under_range == 1)
      under_end_oor  = which(diff_df$under_range == -1) - 1


      if(length(under_start_oor) != 0 & length(under_end_oor) != 0){
        if(under_end_oor[1] < under_start_oor[1]){
          under_start_oor = c(1,under_start_oor)
        }
        if(under_start_oor[length(under_start_oor)] > under_end_oor[length(under_end_oor)] ){
          under_end_oor = c(under_end_oor,nrow(diff_df))
        }
      }

      under_gap_lenght_oor = under_end_oor - under_start_oor + 1
      under_date_start_oor = df_oor[under_start_oor,1]
      under_date_end_oor = df_oor[under_end_oor,1]

      under_statistic_oor = data.frame(rep("Under lower limit", times = length(under_date_start_oor)), under_date_start_oor,under_date_end_oor,under_gap_lenght_oor)
      colnames(under_statistic_oor) = c(" ","From", "To", "Number of Record")

      # over

      over_start_oor = which(diff_df$over_range == 1)
      over_end_oor  = which(diff_df$over_range == -1) - 1


      if(length(over_start_oor) != 0 & length(over_end_oor) != 0){
        if(over_end_oor[1] < over_start_oor[1]){
          over_start_oor = c(1,over_start_oor)
        }
        if(over_start_oor[length(over_start_oor)] > over_end_oor[length(over_end_oor)] ){
          over_end_oor = c(over_end_oor,nrow(diff_df))
        }
      }

      over_gap_lenght_oor = over_end_oor - over_start_oor + 1
      over_date_start_oor = df_oor[over_start_oor,1]
      over_date_end_oor = df_oor[over_end_oor,1]


      over_statistic_oor = data.frame(rep("Above upper limit", times = length(over_date_start_oor)), over_date_start_oor,over_date_end_oor,over_gap_lenght_oor)
      colnames(over_statistic_oor) = c(" ","From", "To", "Number of Record")


      if(nrow(under_statistic_oor) == 0){
        statistic_oor = over_statistic_oor
      }else{ 
        if(nrow(over_statistic_oor) == 0){
          statistic_oor = under_statistic_oor
        }else{
          statistic_oor= rbind(under_statistic_oor,over_statistic_oor)
        }
      }


      statistic_oor = statistic_oor[order(statistic_oor$From),] 

       # cat(paste("Statistics of data out of range:",c_oor))
      print(kable(statistic_oor, format = "html",align = "c",row.names = F)%>%
              kable_styling() %>%
              scroll_box( height = "200px") ) 

    }
  }
}

Report end at r Sys.time()



bridachristian/DataQualityCheckEuracAlpEnv documentation built on Oct. 27, 2019, 5:55 p.m.