Report start at r Sys.time()

# remove(list=ls())
Sys.setenv(TZ='Etc/GMT-1') # sets the environment to Italy's time zone
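# Note (an assumption worth double-checking): "Etc/GMT-1" follows the POSIX
# sign convention, so it actually means UTC+1 (Italian standard time, no DST).
# A quick interactive check:
# format(as.POSIXct("2019-01-01 12:00", tz = "UTC"), tz = "Etc/GMT-1", usetz = TRUE)
# # "2019-01-01 13:00:00 +01"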

# ..... Libraries .....................................................................................................................................

library(devtools)
install_github("bridachristian/DataQualityCheckEuracAlpEnv")    # installs/updates the package from GitHub (requires devtools)
library("DataQualityCheckEuracAlpEnv")

library(zoo)
library(timeSeries)
library(knitr)
library(ggplot2)
library(dplyr)
library(plyr)
library(imputeTS)
library(reshape2)
library(kableExtra)
library(pander)


# .....................................................................................................................................................

# ..... Input section .................................................................................................................................

# ~~~ Folders ~~~~

# scheduling_dir <- "H:/Projekte/Klimawandel/Experiment/data/2order/scheduling/"                              # <-- scheduling directory: for files to be processed
scheduling_dir <- "H:/Projekte/Klimawandel/Experiment/data/2order/DataQualityCheckEuracAlpEnv/Data/Input/"                   # <-- scheduling directory: for files to be processed
# scheduling_dir <- report_dir                                                                                # <-- scheduling directory: for files that the script found with overlaps and that you fixed manually

setwd(scheduling_dir)

report_dir <- "H:/Projekte/Klimawandel/Experiment/data/2order/DataQualityCheckEuracAlpEnv/Report/"                    # <-- report directory: where to put reports and files with overlaps
output_dir <- "H:/Projekte/Klimawandel/Experiment/data/2order/DataQualityCheckEuracAlpEnv/Data/Output/"               # <-- output directory: where processed files go
support_dir <- "H:/Projekte/Klimawandel/Experiment/data/2order/DataQualityCheckEuracAlpEnv/Data/Support_files/"       # <-- support directory: where to read support files

write_output <- TRUE                                             # if write_output == TRUE the processed csv files are written; if FALSE no output is written

# ~~~ Files ~~~~

files <- files_in_scheduling_dir(SCHEDULING_DIR = scheduling_dir)
# cat("Which of this files you want analyze? \n ",files)         # <-- Here we show files available for data quality check

FILE <- "S3.dat"                                           # <-- Write here the file or the list of file  that you want to analyze!
# cat("Selected files: \n", FILE)

RANGE_FILE = "Range.csv"

# ~~~ Datetime ~~~~
DATA_FROM_ROW <- 5                                             # <-- Row number of the first data row

HEADER_ROW_NUMBER <- 2                                         # <-- Row number of the header row

DATETIME_HEADER <- "TIMESTAMP"                                 # <-- header corresponding to TIMESTAMP
DATETIME_FORMAT <- "yyyy-mm-dd HH:MM"                          # <-- datetime format. Use only: y -> year, m -> month, d -> day, H -> hour, M -> minute
DATETIME_SAMPLING <- "15 min"                                  # <-- sampling interval of the time series (used to fill the missing dates)

RECORD_HEADER <- "RECORD"                                      # <-- header corresponding to RECORD
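
# A minimal sketch (the helper 'to_strptime' below is hypothetical, not part of
# the package) of how a "yyyy-mm-dd HH:MM" token string maps onto R's strptime
# codes, which is presumably what read_data() does internally with DATETIME_FORMAT:
to_strptime <- function(fmt) {
  for (p in list(c("yyyy", "%Y"), c("mm", "%m"), c("dd", "%d"),
                 c("HH", "%H"), c("MM", "%M"))) {
    fmt <- sub(p[1], p[2], fmt, fixed = TRUE)   # replace each token once, case-sensitive
  }
  fmt
}
# to_strptime(DATETIME_FORMAT)   # "%Y-%m-%d %H:%M"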

# .....................................................................................................................................................

# ..... Body ..........................................................................................................................................


if(check_empty_file(SCHEDULING_DIR = scheduling_dir, FILE = FILE) == TRUE){
  # writeLines(paste(FILE,"WARNING: NO DATA FOUND!!!",sep = " "))
  flag_empty = 1
}else{

  flag_empty = 0

  data_import <- read_data(FILE_PATH = scheduling_dir, FILE_NAME = FILE, 
                           DATETIME_HEADER = DATETIME_HEADER, DATETIME_FORMAT = DATETIME_FORMAT,
                           DATA_FROM_ROW = DATA_FROM_ROW, HEADER_ROW_NUMBER = HEADER_ROW_NUMBER)  
  header <- data_import [[1]]
  header_colnames <- data_import [[2]]
  data <- data_import [[3]]

  original <- data
  mydata <- data

  deletes_duplcated <- deletes_duplcated_data(DATA = mydata,DATETIME_HEADER = DATETIME_HEADER)         # <- Deletes identical rows if found
  mydata <- deletes_duplcated [[1]]                                                                                                        
  duplicated_data <- deletes_duplcated [[2]]
  duplicated_data <- time_to_char(DATA = duplicated_data, DATETIME_HEADER = DATETIME_HEADER, DATETIME_FORMAT = DATETIME_FORMAT)

  overlap <- detect_overlap(DATA = mydata,DATETIME_HEADER = DATETIME_HEADER, RECORD_HEADER = RECORD_HEADER)          # <- Detect overlap
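
  # Sketch of the idea behind detect_overlap (an assumption, not the package
  # implementation): an overlap is a block where the timestamp and/or the RECORD
  # counter jump backwards, e.g. after a logger memory re-read. Rows where such
  # a backward jump starts could be flagged roughly like this:
  # which(diff(as.numeric(mydata[[RECORD_HEADER]])) < 0)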

  if(length(overlap) != 0){

    flag_overlap = 1
    # stop(paste("Overlapping data in files:", FILE))
    overlap[,1]<- overlap[,1] + DATA_FROM_ROW - 1
    colnames(overlap)[1]= "File Row"

  }else{

    flag_overlap = 0

    missing  <- missing_dates(DATA = mydata, DATETIME_HEADER = DATETIME_HEADER, RECORD_HEADER = RECORD_HEADER,DATETIME_SAMPLING = DATETIME_SAMPLING)  # <- fill missing dates with NA
    mydata <- missing[[1]]
    missing_index_date <- missing[[2]]

    range <- exclude_out_of_range(DATA = mydata,DATETIME_HEADER = DATETIME_HEADER, SUPPORT_DIR = support_dir, RANGE_FILE = RANGE_FILE) # <- Substitutes data outside the physical range with NA
    mydata <- range[[1]]
    check_out_of_range <- range[[2]]
    variable_not_in_range <- range[[3]]
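
    # Assumed flag convention for check_out_of_range (consistent with how it is
    # used in the report section further down):
    #    1 -> value above the upper limit given in Range.csv
    #    0 -> value inside the admitted range
    #   -1 -> value below the lower limit
    # The corresponding values in mydata are replaced by NA.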

    mydata <- time_to_char(DATA = mydata, DATETIME_HEADER = DATETIME_HEADER, DATETIME_FORMAT = DATETIME_FORMAT)
  }
}


# ..... Output ..........................................................................................................................................

if(write_output == TRUE && flag_empty == 0 && flag_overlap == 0){   # write output only when the file has data and no overlap was detected

  #~~~~~~~~~~
  colnames(header) = header[1,]

  out_my = mydata
  colnames(out_my) = colnames(header)

  out_mydata=rbind(header[-1,],out_my)

  out_filename =  paste(substring(FILE,1,nchar(FILE)-4),
                        "_",
                        substring(mydata[nrow(mydata),which(colnames(mydata) == DATETIME_HEADER)],1,4),
                        substring(mydata[nrow(mydata),which(colnames(mydata) == DATETIME_HEADER)],6,7),
                        substring(mydata[nrow(mydata),which(colnames(mydata) == DATETIME_HEADER)],9,10),
                        # "_",
                        substring(mydata[nrow(mydata),which(colnames(mydata) == DATETIME_HEADER)],12,13),
                        substring(mydata[nrow(mydata),which(colnames(mydata) == DATETIME_HEADER)],15,16),
                        sep = "")

  write.csv(out_mydata,paste(output_dir,"DQCok_",out_filename,".csv",sep = ""),quote = F,row.names = F)

  #~~~~~~~~~~
  colnames(duplicated_data) = colnames(header)

  out_duplicated_data=rbind(header[-1,],duplicated_data)
  write.csv(out_duplicated_data,paste(output_dir,"Duplicated_",out_filename,".csv",sep = ""),quote = F,row.names = F)
}

INPUT info:

File selected: r FILE

You have also selected these parameters:

The folders are:

The range file in the support folder is called: r RANGE_FILE

You decide: r if(write_output == TRUE){paste("to write the output file to the output folder")} else{paste("not to write the output file to the output folder")}

OUTPUT:

Info empty file:

r if(flag_empty == 1){paste("Error:", FILE, "contains no observations!")}else{paste(FILE, "has some observations.")}

Duplicated rows

More details in: r paste(output_dir,"Duplicated_",substring(FILE,1,nchar(FILE)-4),".csv",sep = "")

Overlap

r if(flag_overlap == 1){paste("In file", FILE, "there are the following overlaps:")}else{paste("In", FILE, "there isn't any overlap!")}

if(flag_overlap == 1){
  # kable(overlap, format = "markdown",align = "c")
  kable(overlap, format = "html",align = "c",row.names = F)%>%
    kable_styling() %>%
    scroll_box( height = "200px") 

}

Missing dates

if(flag_overlap == 0){
  w = which(colnames(mydata) == DATETIME_HEADER)
  time_tot <- mydata[,w]
  time_tot <- as.POSIXct(time_tot, format = "%Y-%m-%d %H:%M:%S")
  time_missing <- missing_index_date[,2]

  df_missing <- data.frame(time_tot,rep("Dates in original file",times = length(time_tot)), stringsAsFactors = FALSE)   # keep Status as character so the label below can be assigned
  colnames(df_missing) = c("time","Status")
  df_missing[which(time_tot %in% time_missing ),2] = "Missing dates filled"
  y = rep(1, times = length(time_tot))

  Status_num = rep(1,times = length(time_tot))
  Status_num[which(time_tot %in% time_missing )] = 0

  df_missing = cbind(df_missing, y,Status_num)


  theme_new = theme_bw()+theme(axis.title.y=element_blank(),
                               axis.text.y=element_blank(),
                               axis.ticks.y=element_blank(),
                               axis.text.x = element_text(angle = 90, hjust = 1))


  ggplot(df_missing,aes(x = time, y=y ,colour = Status)) +  
    geom_point(aes(size = Status,shape = Status) )+
    scale_colour_manual(values = c("#7CAE00","#F8766D"))+ 
    scale_shape_manual(values = c(20,124))+
    scale_x_datetime(name = DATETIME_HEADER,date_breaks = "2 weeks" )+
    theme_new+
    ggtitle("Time distribution of missing dates")
}

Statistics of missing dates:

if(flag_overlap == 0){
  Status_num_NA=df_missing
  Status_num_NA = Status_num_NA[,-c(2,3)]

  differ = c(0,diff(Status_num_NA$Status_num))
  start = which(differ == -1)
  end  = which(differ == 1) - 1
  gap_length = end - start + 1

  date_start = Status_num_NA$time[start]
  date_end = Status_num_NA$time[end]

  statistic_missing = data.frame(date_start,date_end,gap_length)
  colnames(statistic_missing) = c("From", "To", "Number of Records")

  # kable(statistic_missing, format = "markdown",align = "c")
  kable(statistic_missing, format = "html",align = "c",row.names = F)%>%
    kable_styling() %>%
    scroll_box( height = "200px") 
}
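
The gap table above is built from diff() of a 0/1 presence indicator: a value of -1 marks the first missing timestamp of a gap and a value of +1 marks the first timestamp after it. A minimal standalone illustration with made-up values (not data from this report):

status <- c(1, 1, 0, 0, 0, 1, 1, 0, 1)              # 1 = date present, 0 = date filled as missing
differ <- c(0, diff(status))
start  <- which(differ == -1)                        # first row of each gap
end    <- which(differ ==  1) - 1                    # last row of each gap
data.frame(From = start, To = end, Records = end - start + 1)   # gaps: rows 3-5 and row 8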

Values out of admitted range

if(flag_overlap == 0){
  for(j in 1:ncol(check_out_of_range)){
    if(colnames(check_out_of_range)[j] != DATETIME_HEADER){                   # skip the timestamp column itself

      # ~~~~~~ preparation data for ggplot ~~~~~~ 

      df_tmp = check_out_of_range[,c(which(colnames(check_out_of_range)==DATETIME_HEADER), j)]
      df_factor = df_tmp

      df_factor[,2]=as.character(df_factor[,2])

      y = rep(1,times = nrow(df_factor))
      df_factor = cbind(df_factor,y)
      colnames(df_factor) = c("time", "Variable", "y") 
      df_factor$y[which(df_factor$Variable == -1)] = 0.9
      df_factor$y[which(df_factor$Variable == 1)] = 1.1
      # colnames(df_factor) =c("time","Variable","y")

      # ~~~~~~ preparation data for statistic table ~~~~~~ 

      c_oor = colnames(check_out_of_range)[j]

      df_oor = check_out_of_range[,c(which(colnames(check_out_of_range)==DATETIME_HEADER), j)]
      colnames(df_oor)= c("time", "variable")

      over_range = rep(0, times = nrow(df_oor))

      if(length(which(df_oor$variable == 1)) != 0){
        over_range[which(df_oor$variable == 1)] = 1
      }

      under_range = rep(0, times = nrow(df_oor))

      if(length(which(df_oor$variable == -1)) != 0){
        under_range[which(df_oor$variable == -1)] = 1
      }

      df_oor2 = data.frame(df_oor$time, under_range, over_range)

      diff_df = cbind(df_oor$time,as.data.frame(rbind(c(0,0),apply(df_oor2[,-1],2, diff))))

      if(any(unique(df_oor[,2]) != 0, na.rm = TRUE)){                         # na.rm guards against possible NA flags

        # ~~~~~~ preparation data for ggplot ~~~~~~ 

        theme_new2 = theme_bw()+theme(axis.title.y=element_blank(),
                                      axis.text.y=element_blank(),
                                      axis.ticks.y=element_blank(),
                                      axis.text.x = element_text(angle = 90, hjust = 1),
                                      legend.title = element_blank())

        # p2 = ggplot(melt_df_ggplot, aes(y=variable, x=time))+
        #   geom_tile(aes(fill = value))+
        #   scale_fill_manual(values = c("green", "red", "red"))
        # 
        # p2

        set_limits = data.frame(as.POSIXct(c(df_factor$time[1], df_factor$time[nrow(df_factor)%/%2], df_factor$time[nrow(df_factor)])), c(-1,0,1), c(0,0,0))
        colnames(set_limits) = colnames(df_factor)

        df_factor = rbind(df_factor,set_limits)

        df_factor$Variable = factor(df_factor$Variable, ordered = TRUE)
        df_factor$Variable <- factor( df_factor$Variable, levels=rev(levels( df_factor$Variable)))


        p1 = ggplot(df_factor,aes(x = time, y = y,colour = Variable))+
          geom_point(aes(size = Variable,shape = Variable ))+
          scale_colour_manual(values = c("#0000FF","#7CAE00","#F8766D"),labels = c("Above upper limit", "In the range","Below lower limit"))+ 
          scale_shape_manual(values = c(124,20,124),labels =c("Above upper limit", "In the range","Below lower limit"))+
          scale_size_manual(values = c(10,1,10),labels = c("Above upper limit", "In the range","Below lower limit"))+
          scale_x_datetime(name = DATETIME_HEADER,date_breaks = "2 weeks" )+
          theme_new2 +
          ggtitle(colnames(check_out_of_range)[j])+
          scale_y_continuous(limits = c(0.7,1.3))

        print(p1)

        # ~~~~~~ preparation data for statistic table ~~~~~~ 

        under_start_oor = which(diff_df$under_range == 1)
        under_end_oor  = which(diff_df$under_range == -1) - 1


        if(length(under_start_oor) != 0 & length(under_end_oor) != 0){
          if(under_end_oor[1] < under_start_oor[1]){
            under_start_oor = c(1,under_start_oor)
          }
          if(under_start_oor[length(under_start_oor)] > under_end_oor[length(under_end_oor)] ){
            under_end_oor = c(under_end_oor,nrow(diff_df))
          }
        }

        under_gap_length_oor = under_end_oor - under_start_oor + 1
        under_date_start_oor = df_oor[under_start_oor,1]
        under_date_end_oor = df_oor[under_end_oor,1]

        under_statistic_oor = data.frame(rep("Below lower limit", times = length(under_date_start_oor)), under_date_start_oor,under_date_end_oor,under_gap_length_oor)
        colnames(under_statistic_oor) = c(" ","From", "To", "Number of Records")

        # over

        over_start_oor = which(diff_df$over_range == 1)
        over_end_oor  = which(diff_df$over_range == -1) - 1


        if(length(over_start_oor) != 0 & length(over_end_oor) != 0){
          if(over_end_oor[1] < over_start_oor[1]){
            over_start_oor = c(1,over_start_oor)
          }
          if(over_start_oor[length(over_start_oor)] > over_end_oor[length(over_end_oor)] ){
            over_end_oor = c(over_end_oor,nrow(diff_df))
          }
        }

        over_gap_length_oor = over_end_oor - over_start_oor + 1
        over_date_start_oor = df_oor[over_start_oor,1]
        over_date_end_oor = df_oor[over_end_oor,1]


        over_statistic_oor = data.frame(rep("Above upper limit", times = length(over_date_start_oor)), over_date_start_oor,over_date_end_oor,over_gap_length_oor)
        colnames(over_statistic_oor) = c(" ","From", "To", "Number of Records")


        if(nrow(under_statistic_oor) == 0){
          statistic_oor = over_statistic_oor
        }else{ 
          if(nrow(over_statistic_oor) == 0){
            statistic_oor = under_statistic_oor
          }else{
            statistic_oor= rbind(under_statistic_oor,over_statistic_oor)
          }
        }


        statistic_oor = statistic_oor[order(statistic_oor$From),] 

        # cat(paste("Statistics of data out of range:",c_oor))
        print(kable(statistic_oor, format = "html",align = "c",row.names = F)%>%
                kable_styling() %>%
                scroll_box( height = "200px") ) 

      }
    }
  }

}

Report end at r Sys.time()


