Report start at r Sys.time()

input_dir <- params$input_dir
output_dir_data <- params$output_dir_data
output_dir_report <- params$output_dir_report
project_dir <- params$project_dir
data_from_row <- params$data_from_row
header_row_number <- params$header_row_number
datetime_header <- params$datetime_header
datetime_format <- params$datetime_format
datetime_sampling <- params$datetime_sampling
record_header <- params$record_header
range_file <- params$range_file
write_output_files <- params$write_output_files
write_output_report <- params$write_output_report
file <- params$file
start_date <- params$start_date
# ~~~ Default directory ~~~~

# range_dir <- paste(project_dir, "Data/Support_files/Range/",sep = "")


# .....................................................................................................................................................

# ..... Define flags ..................................................................................................................................

flag_empty = NA
flag_error_df = NA
flag_date = NA
flag_duplicates_rows = NA
flag_overlap = NA
flag_missing_dates = NA
flag_range_variable_to_set = NA
flag_range_variable_new = NA
flag_out_of_range = NA

# ..... Body ..........................................................................................................................................

if(check_empty_file(INPUT_DATA_DIR = input_dir, FILE_NAME = file) == TRUE){

  flag_empty = 1

}else{

  flag_empty = 0

  data_import <- read_data(INPUT_DATA_DIR = input_dir, FILE_NAME = file, 
                           DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format,
                           DATA_FROM_ROW = data_from_row, HEADER_ROW_NUMBER = header_row_number)  
  header = data_import [[1]]
  header_colnames = data_import [[2]]
  data = data_import [[3]]
  flag_error_df = data_import [[4]]

  rm(data_import)

  if(flag_error_df == 0){
    time_data = data[,which(colnames(data)==datetime_header)]

    if(is.na(start_date)){
      original <- data
      mydata <- data    
      flag_date = 0
      rm(data)

    }else{

      if(as.POSIXct(start_date, tz = 'Etc/GMT-1') < time_data[length(time_data)]){

        if(as.POSIXct(start_date, tz = 'Etc/GMT-1') < time_data[1]){
          original <- data
          mydata <- data
          first_row <- data.frame(t(rep(NA,times = ncol(data))))
          colnames(first_row) = colnames(data)

          fd = as.POSIXct(seq(from = as.POSIXct(start_date, tz = "Etc/GMT-1"), by = datetime_sampling,length.out = 2)[2],tz = "Etc/GMT-1")
          first_row[,which(colnames(data) == datetime_header)] = as.POSIXct(first_row[,which(colnames(data) == datetime_header)])
          first_row[1,which(colnames(data) == datetime_header)] <- fd
          first_row[which(colnames(data) == record_header)] = -1

          mydata <- rbind(first_row,data)
          flag_date = 0
          rm(data)

        }else{
          original = data[(which(time_data == as.POSIXct(start_date, tz = 'Etc/GMT-1'))+1):nrow(data),]      # possible issues in data subset!!! to check
          mydata = data[(which(time_data == as.POSIXct(start_date, tz = 'Etc/GMT-1'))+1):nrow(data),]
          flag_date = 0
          rm(data)
        }

      } else {

        flag_date = 1
      }
    }


    if(flag_date == 0){
      deletes_duplcated <- deletes_duplcated_data(DATA = mydata,DATETIME_HEADER = datetime_header)         # <- Deletes identical rows if found
      mydata = deletes_duplcated [[1]]
      duplicated_data = deletes_duplcated [[2]]
      duplicated_data = time_to_char(DATA = duplicated_data, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)



      rm(deletes_duplcated)

      if(unique(as.character(duplicated_data[1,])) == "---"){
        flag_duplicates_rows = 0
      } else{
        flag_duplicates_rows = 1
      }

      data_in_old_files <- deletes_old_datetime(DATA = mydata,DATETIME_HEADER = datetime_header)  
      mydata = data_in_old_files [[1]]
      old_data = data_in_old_files[[2]]

      rm(data_in_old_files)

      orig_wihtout_dupli = mydata

      lastdate_orig = format(orig_wihtout_dupli[nrow(orig_wihtout_dupli), which(colnames(orig_wihtout_dupli) == datetime_header)],datetime_format)

      out_lastdate_orig = paste(substring(lastdate_orig,1,4),
                                substring(lastdate_orig,6,7),
                                substring(lastdate_orig,9,10),
                                substring(lastdate_orig,12,13),
                                substring(lastdate_orig,15,16),
                                sep = "")
      orig_filename = paste(substring(file, 1, nchar(file)-4),"_", out_lastdate_orig,"_raw.dat", sep = "")

      if(file.exists(paste(output_dir_data,orig_filename,sep = ""))){
        j=0
        repeat{
          j=j+1
          out_filename_orig_new = paste(substring(orig_filename,1, nchar(orig_filename)-8),"_raw_",j,".dat",sep = "")
          if(!file.exists(paste(output_dir_data,out_filename_orig_new,sep = ""))){
            break
          }
        }
      } else {
        out_filename_orig_new = orig_filename
      }

      write.csv(orig_wihtout_dupli,paste(output_dir_data,out_filename_orig_new,sep = ""),quote = F,row.names = F, na = "NaN")      # <- write "origina data"

      overlap <- detect_overlap(DATA = mydata,DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header)          # <- Detect overlap


      if(length(overlap) != 0){

        flag_overlap = 1
        overlap[,1]<- overlap[,1] + data_from_row - 1
        colnames(overlap)[1]= "File Row"

      }else{

        flag_overlap = 0

        missing  <- missing_dates(DATA = mydata, DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling)  # <- fill missing dates with NA
        mydata = missing[[1]]
        missing_index_date = missing[[2]]

        rm(missing)


        range <- exclude_out_of_range(DATA = mydata,DATETIME_HEADER = datetime_header, RANGE_DIR = range_dir, RANGE_FILE = range_file) # <- Substitute with NA data out of phisical range
        mydata = range[[1]]
        check_out_of_range = range[[2]]
        variable_new = range[[3]]
        variable_to_set = range[[4]]

        rm(range)
        # ..... Flags .....................................................................................................................................

        if(length(variable_to_set) != 0){
          flag_range_variable_to_set = 1
        }else{
          flag_range_variable_to_set = 0
        }

        if(length(variable_new) != 0){
          flag_range_variable_new = 1
        }else{
          flag_range_variable_new = 0
        }


        if(1 %in% unique(unlist(apply(X = check_out_of_range[,-which(colnames(check_out_of_range) == datetime_header)],MARGIN = 2, unique)))){
          flag_out_of_range = 1
        }else{
          if(-1 %in% unique(unlist(apply(X = check_out_of_range[,-which(colnames(check_out_of_range) == datetime_header)],MARGIN = 2, unique)))){
            flag_out_of_range = 1
          }else{
            flag_out_of_range = 0
          }
        }


        time_tot = as.POSIXct(mydata[,which(colnames(mydata) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1')
        time_missing = missing_index_date[,2]

        if(length(which(time_tot %in% time_missing )) == 0){
          flag_missing_dates = 0      # No missing dates
        }else{
          flag_missing_dates = 1      # YES missing dates
        }

        mydata <- time_to_char(DATA = mydata, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)

      }
    }
  }
}



# ..... Output ..........................................................................................................................................

if(flag_empty == 0){
  if(flag_error_df == 0){
    if(flag_date == 0){
      if(flag_overlap == 0){
        if(write_output_files == TRUE){
          #~~~~~~~~~~
          colnames(header) = header[1,]

          out_my = mydata
          colnames(out_my) = colnames(header)

          out_mydata=rbind(header[-1,],out_my)

          out_date = paste(substring(mydata[nrow(mydata),which(colnames(mydata) == datetime_header)],1,4),
                           substring(mydata[nrow(mydata),which(colnames(mydata) == datetime_header)],6,7),
                           substring(mydata[nrow(mydata),which(colnames(mydata) == datetime_header)],9,10),
                           # # "_",
                           # substring(mydata[nrow(mydata),which(colnames(mydata) == datetime_header)],12,13),
                           # substring(mydata[nrow(mydata),which(colnames(mydata) == datetime_header)],15,16),
                           sep = "")
          out_filename_data = paste(substring(file,1,nchar(file)-4),"_",out_date,".csv",sep = "")

          if(file.exists(paste(output_dir_data,out_filename_data,sep = ""))){
            j=0
            repeat{
              j=j+1
              out_filename_data_new = paste(substring(out_filename_data,1, nchar(out_filename_data)-4),"_",j,".csv",sep = "")
              if(!file.exists(paste(output_dir_data,out_filename_data_new,sep = ""))){
                break
              }
            }
          } else {
            out_filename_data_new = out_filename_data
          }



          write.csv(out_mydata,paste(output_dir_data,out_filename_data_new,sep = ""),quote = F,row.names = F, na = "NaN")

          rm(out_my)
          rm(out_mydata)

          #~~~~~~~~~~

          # out_filename_dupli = paste("Duplicated_",substring(file,1,nchar(file)-4),"_",out_date,".csv",sep = "")
          # 
          # output_dir_data_DQC_OK = paste(output_dir_data, "DQC_OK/",sep = "")
          # ifelse(test = !dir.exists(output_dir_data_DQC_OK),yes = dir.create(output_dir_data_DQC_OK),no = FALSE)
          # 
          # output_dir_data_duplicated = paste(output_dir_data, "Duplicated/",sep = "")
          # ifelse(test = !dir.exists(output_dir_data_duplicated),yes = dir.create(output_dir_data_duplicated),no = FALSE)

          # out_duplicated = duplicated_data
          # colnames(out_duplicated) = colnames(header)
          # 
          # out_duplicated_data=rbind(header[-1,],out_duplicated)
          # write.csv(out_duplicated_data,paste(output_dir_data_duplicated,out_filename_dupli,sep = ""),quote = F,row.names = F, na = "NaN")
          # rm(out_duplicated)
          # rm(out_duplicated_data)
        }
      }
    }
  }
}


flags_names = c("flag_empty","flag_error_df","flag_date","flag_duplicates_rows","flag_overlap","flag_missing_dates","flag_range_variable_to_set","flag_range_variable_new","flag_out_of_range")
flags_df = data.frame(flags_names, rep(NA,times = length(flags_names)))
colnames(flags_df) = c("flag_names", "value")

for(i in 1: nrow(flags_df)){
  if(exists(flags_names[i])){
    flags_df$value[i] = eval(parse(text = flags_names[i]))
  }
}
# 
# flag_empty = flags_df$value[1]
# flag_error_df = flags_df$value[2]
# flag_date = flags_df$value[3]
# flag_duplicates_rows = flags_df$value[4]
# flag_overlap = flags_df$value[5]
# flag_missing_dates = flags_df$value[6]
# flag_range_variable_to_set = flags_df$value[7]
# flag_range_variable_new = flags_df$value[8]
# flag_out_of_range = flags_df$value[9]

INPUT info:

File selected: r file

You have also select these parameters:

The folders are:

Range file in r range_dir is called: r range_file

You decide: r if(write_output_files == TRUE){paste("to write output file here:", output_dir_data)} else{paste(("to don't write output file"))}

And: r if(write_output_report == TRUE){paste("to write output report here:", output_dir_report)} else{paste(("to don't write output report"))}

OUTPUT:

Reminder (for developer): no problem => no message

Is the file empty? r if(flag_empty == 0){paste("NO: all right!")}else{paste("YES: check it!")}

r if(flag_empty == 0){paste("Ok:",file,"is not empty")}else{paste("Error:",file,"is empty")}

There are problems in data file structure? r if(is.na(flag_error_df)){paste("- - -")}else{if(flag_error_df == -1){paste("YES, there are more header than data!")}else{if(flag_error_df == 0){paste("NO: all right!")}else{if(flag_error_df == 1){paste("YES: there are some data without header!")}}}}

r if(is.na(flag_error_df)){paste("Before continuing, solve previous problems!")}else{if(flag_error_df == -1){paste("Check input file! Why there is more headers than data?")}else{if(flag_error_df == 0){paste("There isn't any problems")}else{if(flag_error_df == 1){paste("Possible problems: 1. row shifted above. 2. new data with old headers. 3. other file manipulation errors")}}}}

The file is already process? r if(is.na(flag_date)){paste("- - -")}else{if(flag_date == 0){paste("NO: all right!")}else{if(flag_date == 1){paste("YES: File already process!")}}}

r if(is.na(flag_date)){paste("Before continuing, solve previous problems!")}

There are duplicated rows? r if(is.na(flag_duplicates_rows)){paste("- - -")}else{if(flag_duplicates_rows == 0){paste("NO: all right!")}else{if(flag_duplicates_rows == 1){paste("YES: there are duplicated rows!")}}}

r if(is.na(flag_duplicates_rows)){paste("Before continuing, solve previous problems!")}else{if(flag_duplicates_rows == 0){paste("There isn't any duplicated row")}else{if(flag_duplicates_rows == 1){paste("Attention: there are duplicated row in input file. See:",paste(output_dir_data,"Duplicated_",substring(file,1,nchar(file)-4),".csv",sep = ""))}}}

There are overlaps? r if(is.na(flag_overlap)){paste("- - -")}else{if(flag_overlap == 0){paste("NO: all right!")}else{if(flag_overlap == 1){paste("YES: there are overlaps!")}}}

r if(is.na(flag_overlap)){paste("Before continuing, solve previous problems!")}else{if(flag_overlap == 0){paste("There isn't any overlap!")}else{if(flag_overlap == 1){paste("There are the following ovelap:")}}}

if(flag_empty == 0){
  if(flag_error_df == 0){
    if(flag_date == 0){
      if(flag_overlap == 1){

        datatable(overlap,rownames = F)

        # kable(overlap, format = "html",align = "c",row.names = F)%>%
        #   kable_styling() %>%
        #   scroll_box( height = "200px")

      }
    }
  }
}
if(flag_empty == 0){
  if(flag_error_df == 0){
    if(flag_date == 0){
      if(flag_overlap == 0){

        time_tot <- as.POSIXct(mydata[,which(colnames(mydata) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1' )
        time_missing <- missing_index_date[,2]
        # time_missing <-as.character(time_missing)

        df_missing <- data.frame(time_tot,rep("Dates in original file",times = length(time_tot)))
        colnames(df_missing) = c("time","Status")
        df_missing[which(time_tot %in% time_missing ),2] = "Missing dates filled"
        y = rep(1, times = length(time_tot))

        Status_num = rep(1,times = length(time_tot))
        Status_num[which(time_tot %in% time_missing )] = 0

        df_missing = cbind(df_missing, y,Status_num)

        rm(time_tot)
        rm(time_missing)
        rm(y)
        rm(Status_num)


      }
    }
  }
}

There are missing dates? r if(is.na(flag_missing_dates)){paste("- - -")}else{if(flag_missing_dates == 0){paste("NO: all right!")}else{if(flag_missing_dates == 1){paste("YES: there are missing dates!")}}}

r if(is.na(flag_missing_dates)){paste("Before continuing, solve previous problems!")}

if(flag_empty == 0){
  if(flag_error_df == 0){
    if(flag_date == 0){
      if(flag_overlap == 0){
        if(flag_missing_dates == 1){
          theme_new = theme_bw()+theme(axis.title.y=element_blank(),
                                       axis.text.y=element_blank(),
                                       axis.ticks.y=element_blank(),
                                       axis.text.x = element_text(angle = 90, hjust = 1))

          tot_missing =length(which(df_missing$Status_num == 0))

          ggplot(df_missing,aes(x = time, y=y ,colour = Status)) +
            geom_point(aes(size = Status,shape = Status) )+
            scale_colour_manual(values = c("#7CAE00","#F8766D"))+
            scale_shape_manual(values = c(20,124))+
            # scale_x_datetime(name = datetime_header,date_breaks = "2 weeks" )+
            theme_new+
            # ggtitle("Time distribution of missing dates")+
            labs(title="Time distribution of missing dates",
                 subtitle=paste("Total missing dates: ", tot_missing,sep = ""))
        }
      }
    }
  }
}

r if(!is.na(flag_missing_dates)){if(flag_missing_dates == 0){paste("There isn't any missing dates!")}else{if(flag_missing_dates == 1){paste("Statistics of missing dates:")}}}

if(flag_empty == 0){
  if(flag_error_df == 0){
    if(flag_date == 0){
      if(flag_overlap == 0){
        if(flag_missing_dates == 1){
          Status_num_NA=df_missing
          Status_num_NA = Status_num_NA[,-c(2,3)]

          differ = c(0,diff(Status_num_NA$Status_num))
          start = which(differ == -1)
          end  = which(differ == 1) - 1
          gap_lenght = end - start + 1

          date_start = Status_num_NA$time[start]
          date_end = Status_num_NA$time[end]

          if(length(date_end) != 0){
            date_end_tmp = as.POSIXct("1990-01-01 00:00")    # this for cycle is to fix a bug on time difference
            for(k in 1:length(date_end)){

              date_end_tmp[k] =  seq.POSIXt(date_end[k], by = datetime_sampling, length. =  2)[2]
            }
            gap_hour = difftime(time1 = date_end_tmp,time2 = date_start,units = "hours")
          }else{
            gap_hour = numeric(0)
          }


          statistic_missing = data.frame(date_start,date_end,gap_lenght,gap_hour)
          colnames(statistic_missing) = c("From", "To", "Number of Record", "Hours")
          statistic_missing[,1:2] = format(statistic_missing[,1:2], format = datetime_format)

          datatable(statistic_missing,rownames = F)

        }
      }
    }
  }
}

Do you need to update the range file? r if(is.na(flag_range_variable_to_set) | is.na(flag_range_variable_new) ){paste("- - -")}else{if(flag_range_variable_to_set == 0 & flag_range_variable_new == 0){paste("NO, the range file is up to date!")}else{if(flag_range_variable_to_set == 1 | flag_range_variable_new == 1){paste("YES, you need to update the range of the following variables.")}}}

if(flag_empty == 0){
  if(flag_error_df == 0){
    if(flag_date == 0){
      if(flag_overlap == 0){

        if(flag_range_variable_new == 1){
          df_variable_new = data.frame(variable_new, rep("New variable. Set threshold in range file",times = length(variable_new)))
          colnames(df_variable_new) = c("Variable", "Warning")

        }else{
          df_variable_new = data.frame("---", rep("Any new variables added in range file",times = 1))
          colnames(df_variable_new) = c("Variable", "Warning")
        }

        # if(flag_range_variable_to_set == 1){
        #   df_variable_to_set = data.frame(variable_to_set, rep("Remember: update threshold in range file and disables flag *to_set* ", times = length(variable_to_set)))
        #   colnames(df_variable_to_set) = c("Variable", "Warning")
        #   
        #   
        # }else{
        #   df_variable_to_set = data.frame("---", rep("All variables in range file are up to date",times = 1))
        #   colnames(df_variable_to_set) = c("Variable", "Warning")
        #   # print("Any variables to set in range file.")
        # }

        # df_to_print = rbind(df_variable_new,df_variable_to_set)
        df_to_print = df_variable_new

        # if(flag_range_variable_to_set == 1 | flag_range_variable_new == 1){
        #   datatable(df_to_print,rownames = F)
        # }

        datatable(df_to_print,rownames = F)



      }
    }
  }
}

There are values are out of physical range? r if(is.na(flag_out_of_range)){paste("- - -")}else{if(flag_out_of_range == 0){paste("NO: all right!")}else{if(flag_out_of_range == 1){paste("YES, the following variables contain data out of range!")}}}

if(flag_empty == 0){
  if(flag_error_df == 0){
    if(flag_date == 0){
      if(flag_overlap == 0){

        for(j in 1:ncol(check_out_of_range)){
          gc(reset = T)
          if(colnames(check_out_of_range)[j] !=  datetime_header){

            # ~~~~~~ preparation data for ggplot ~~~~~~

            df_tmp = check_out_of_range[,c(which(colnames(check_out_of_range)==datetime_header), j)]
            df_data = mydata[,c(which(colnames(check_out_of_range)==datetime_header), j)]

            df_tmp_new = cbind(df_tmp, df_data[,2])
            df_tmp_new[is.na(df_data[,2]),3] = 1
            df_tmp_new[!is.na(df_data[,2]),3] = 0
            df_tmp_new[which(df_tmp[,2] != 0),3] = 0
            colnames(df_tmp_new)[3] = "na_flag"

            df_factor = df_tmp

            df_factor[,2]=as.character(df_factor[,2])

            y = rep(1,times = nrow(df_factor))
            df_factor = cbind(df_factor,y)
            colnames(df_factor) = c("time", "Variable", "y")
            df_factor$y[which(df_factor$Variable == -1)] = 0.9
            df_factor$y[which(df_factor$Variable == 1)] = 1.1
            # colnames(df_factor) =c("time","Variable","y")

            # ~~~~~~ preparation data for statistic table ~~~~~~

            c_oor = colnames(check_out_of_range)[j]

            df_oor0 = df_tmp_new[,c(1,3)]
            colnames(df_oor0)= c("time", "variable")

            nan = rep(0, times = nrow(df_oor0))

            how_many_nan = length(which(df_oor0$variable == 1))

            if(length(which(df_oor0$variable == 1)) != 0){
              nan[which(df_oor0$variable == 1)] = 1
            }

            df_oor = df_tmp
            colnames(df_oor)= c("time", "variable")

            over_range = rep(0, times = nrow(df_oor))

            how_many_over_range = length(which(df_oor$variable == 1))

            if(length(which(df_oor$variable == 1)) != 0){
              over_range[which(df_oor$variable == 1)] = 1
            }

            under_range = rep(0, times = nrow(df_oor))

            how_many_under_range = length(which(df_oor$variable == -1))
            if(length(which(df_oor$variable == -1)) != 0){
              under_range[which(df_oor$variable == -1)] = 1
            }

            df_oor2 = data.frame(df_oor$time, under_range, over_range, nan)

            df_oor3 = rbind(df_oor2[1,],df_oor2,df_oor2[nrow(df_oor2),])


            df_oor3[1,2] = 0 ; df_oor3[1,3] = 0; df_oor3[1,4] = 0
            df_oor3[nrow(df_oor3),2] = 0 ; df_oor3[nrow(df_oor3),3] = 0; df_oor3[nrow(df_oor3),4] = 0

            a=as.data.frame(rbind(c(0,0),apply(df_oor3[,-1],2, diff)))
            time_a =c(df_oor$time[1],df_oor$time,df_oor$time[nrow(df_oor)])
            diff_df = cbind(time_a,a)

            if(any(unique(df_oor[,2])!= 0)){

              # ~~~~~~ preparation data for ggplot ~~~~~~

              theme_new2 = theme_bw()+theme(axis.title.y=element_blank(),
                                            axis.text.y=element_blank(),
                                            axis.ticks.y=element_blank(),
                                            axis.text.x = element_text(angle = 90, hjust = 1),
                                            legend.title = element_blank())


              # set_limits = data.frame(as.POSIXct(c(df_factor$time[1], df_factor$time[nrow(df_factor)%/%2], df_factor$time[nrow(df_factor)])), c(-1,0,1), c(0,0,0))

              set_limits = data.frame(rep(as.POSIXct("1900-01-01 00:00",format = "%Y-%m-%d %H:%M", tz = "Etc/GMT-1"),times = 4),
                                      c(-1,0,2,1), c(0.9,1,1,1.1))
              colnames(set_limits) = colnames(df_factor)

              df_tmp_new[,2] = rep(2, times = nrow(df_tmp_new))
              colnames(df_tmp_new) = colnames(df_factor)
              df_na = df_tmp_new[which(df_tmp_new[,3] == 1),]

              df_factor = rbind(set_limits,df_factor)
              df_factor = rbind(df_factor, df_na)
              df_factor = df_factor[order(df_factor$time),]

              df_factor$Variable =as.numeric(df_factor$Variable)

              df_factor$Variable = factor(df_factor$Variable, ordered = TRUE)
              df_factor$Variable <- factor( df_factor$Variable, levels=rev(levels( df_factor$Variable)))





              p1 = ggplot(df_factor,aes(x = time, y = y,colour = Variable))+
                geom_point(aes(size = Variable,shape = Variable ))+
                scale_colour_manual(values = c("#777777","#F8766D","#7CAE00","#0000FF"),labels = c("NaN", "Above upper limit", "In the range","Below lower limit"))+
                scale_shape_manual(values = c(124,124,20,124),labels =c("NaN","Above upper limit", "In the range","Below lower limit"))+
                scale_size_manual(values = c(7,10,1,10),labels = c("NaN","Above upper limit", "In the range","Below lower limit"))+
                theme_new2 +
                scale_y_continuous(limits = c(0.7,1.3))+
                scale_x_datetime(limits = as.POSIXct(c(df_factor$time[5], df_factor$time[nrow(df_factor)]),tz = "Etc/GMT-1") )+
                labs(title=paste(colnames(check_out_of_range)[j]),
                     subtitle=paste("Total NaN: ", how_many_nan, "\n",
                                    "Total Above upper limit: ", how_many_over_range, "\n",
                                    "Total Below lower limit: ", how_many_under_range, sep = ""))

              print(p1)

              # ~~~~~~ preparation data for statistic table ~~~~~~

              # NaN
              nan_start_oor = which(diff_df$nan == 1) - 1
              nan_end_oor  = which(diff_df$nan == -1) - 2


              if(length(nan_start_oor) != 0 & length(nan_end_oor) != 0){
                if(nan_end_oor[1] < nan_start_oor[1]){
                  nan_start_oor = c(1,nan_start_oor)
                }
                if(nan_start_oor[length(nan_start_oor)] > nan_end_oor[length(nan_end_oor)] ){
                  nan_end_oor = c(nan_end_oor,nrow(diff_df))
                }
              }

              nan_gap_lenght_oor = nan_end_oor - nan_start_oor + 1
              nan_date_start_oor = df_oor[nan_start_oor,1]
              nan_date_end_oor = df_oor[nan_end_oor,1]

              if(length(nan_date_end_oor) != 0){
                nan_date_end_oor_tmp = as.POSIXct("1990-01-01 00:00")    # this for cycle is to fix a bug on time difference
                for(k in 1:length(nan_date_end_oor)){

                  nan_date_end_oor_tmp[k] =  seq.POSIXt(nan_date_end_oor[k], by = datetime_sampling, length. =  2)[2]
                }

                nan_gap_hour_oor = difftime(time1 = nan_date_end_oor_tmp,time2 = nan_date_start_oor,units = "hours")
              }else{
                nan_gap_hour_oor = numeric(0)
              }

              nan_statistic_oor = data.frame(rep("NaN", times = length(nan_date_start_oor)),
                                             nan_date_start_oor,
                                             nan_date_end_oor,
                                             nan_gap_lenght_oor,
                                             nan_gap_hour_oor)
              colnames(nan_statistic_oor) = c(" ","From", "To", "Number of Record", "Hours")

              # under

              under_start_oor = which(diff_df$under_range == 1) - 1
              under_end_oor  = which(diff_df$under_range == -1) - 2


              if(length(under_start_oor) != 0 & length(under_end_oor) != 0){
                if(under_end_oor[1] < under_start_oor[1]){
                  under_start_oor = c(1,under_start_oor)
                }
                if(under_start_oor[length(under_start_oor)] > under_end_oor[length(under_end_oor)] ){
                  under_end_oor = c(under_end_oor,nrow(diff_df))
                }
              }

              under_gap_lenght_oor = under_end_oor - under_start_oor + 1
              under_date_start_oor = df_oor[under_start_oor,1]
              under_date_end_oor = df_oor[under_end_oor,1]

              if(length(under_date_end_oor) != 0){
                under_date_end_oor_tmp = as.POSIXct("1990-01-01 00:00")    # this for cycle is to fix a bug on time difference
                for(k in 1:length(under_date_end_oor)){

                  under_date_end_oor_tmp[k] =  seq.POSIXt(under_date_end_oor[k], by = datetime_sampling, length. =  2)[2]
                }

                under_gap_hour_oor = difftime(time1 = under_date_end_oor_tmp,time2 = under_date_start_oor,units = "hours")
              }else{
                under_gap_hour_oor = numeric(0)
              }

              under_statistic_oor = data.frame(rep("Under lower limit", times = length(under_date_start_oor)),
                                               under_date_start_oor,
                                               under_date_end_oor,
                                               under_gap_lenght_oor,
                                               under_gap_hour_oor)
              colnames(under_statistic_oor) = c(" ","From", "To", "Number of Record", "Hours")

              # over

              over_start_oor = which(diff_df$over_range == 1) - 1
              over_end_oor  = which(diff_df$over_range == -1) - 2


              if(length(over_start_oor) != 0 & length(over_end_oor) != 0){
                if(over_end_oor[1] < over_start_oor[1]){
                  over_start_oor = c(1,over_start_oor)
                }
                if(over_start_oor[length(over_start_oor)] > over_end_oor[length(over_end_oor)] ){
                  over_end_oor = c(over_end_oor,nrow(diff_df))
                }
              }

              over_gap_lenght_oor = over_end_oor - over_start_oor + 1
              over_date_start_oor = df_oor[over_start_oor,1]
              over_date_end_oor = df_oor[over_end_oor,1]

              if(length(over_date_end_oor) != 0){
                over_date_end_oor_tmp = as.POSIXct("1990-01-01 00:00")    # this for cycle is to fix a bug on time difference
                for(k in 1:length(over_date_end_oor)){

                  over_date_end_oor_tmp[k] =  seq.POSIXt(over_date_end_oor[k], by = datetime_sampling, length. =  2)[2]
                }
                over_gap_hour_oor = difftime(time1 = over_date_end_oor_tmp,time2 = over_date_start_oor,units = "hours")
              }else{
                over_gap_hour_oor =  numeric(0)
              }

              over_statistic_oor = data.frame(rep("Above upper limit", times = length(over_date_start_oor)), over_date_start_oor,over_date_end_oor,over_gap_lenght_oor, over_gap_hour_oor)
              colnames(over_statistic_oor) = c(" ","From", "To", "Number of Record", "Hours")


              if(nrow(under_statistic_oor) == 0){
                statistic_oor_under_over = over_statistic_oor
              }else{
                if(nrow(over_statistic_oor) == 0){
                  statistic_oor_under_over = under_statistic_oor
                }else{
                  statistic_oor_under_over= rbind(under_statistic_oor,over_statistic_oor)
                }
              }

              if(nrow(nan_statistic_oor) == 0){
                statistic_oor = statistic_oor_under_over
              }else{
                if(nrow(under_statistic_oor) == 0 & nrow(over_statistic_oor) == 0){
                  statistic_oor = nan_statistic_oor
                }else{
                  statistic_oor = rbind(nan_statistic_oor,statistic_oor_under_over)
                }
              }


              statistic_oor = statistic_oor[order(statistic_oor$From),]
              statistic_oor[,2:3] = format(statistic_oor[,2:3], format = datetime_format) 

              print(htmltools::tagList(datatable(statistic_oor)))

              # if(nrow(statistic_oor) == 1){
              #   print(kable(statistic_oor, format = "html",align = "c",row.names = F)%>%
              #           kable_styling() %>%
              #           scroll_box( height = "120px") )
              # }else{
              #   if(nrow(statistic_oor) == 2){
              #     print(kable(statistic_oor, format = "html",align = "c",row.names = F)%>%
              #             kable_styling() %>%
              #             scroll_box( height = "150px") )
              #   }else{
              #     if(nrow(statistic_oor) == 3){
              #       print(kable(statistic_oor, format = "html",align = "c",row.names = F)%>%
              #               kable_styling() %>%
              #               scroll_box( height = "200px") )
              #     }else{
              #       if(nrow(statistic_oor) >  3){
              #         print(kable(statistic_oor, format = "html",align = "c",row.names = F)%>%
              #                 kable_styling() %>%
              #                 scroll_box( height = "220px") )
              #       }
              #     }
              #   }
              # }



            }
            rm(df_tmp)
            rm(df_factor)
            rm(y)
            rm(c_oor)
            rm(df_oor)
            rm(over_range)
            rm(how_many_over_range)
            rm(under_range)
            rm(how_many_under_range)
            rm(df_oor2)
            rm(df_oor3)
            rm(a)
            rm(time_a)
            rm(diff_df)
          }
        }

      }
    }
  }
}

*End Report - r Sys.time() *



bridachristian/DataQualityCheckEuracAlpEnv documentation built on Oct. 27, 2019, 5:55 p.m.