R/DQC_function.R


#' @title DQC_function
#'
#' @description This function checks a data file, evaluating structural problems, missing and overlapping data, and out-of-range data, and writes different data files for different purposes.
#' It is the core of the DQC scripts. It can be applied to single files or run operationally for near-real-time monitoring.
#' This function uses several helper functions and processes to manage data appending and structured storage.
#'
#' @param input_dir character: the directory from which the input data file is read
#' @param output_dir_data character: the directory where total data are stored
#' @param output_dir_raw character: the directory where raw data are stored
#' @param output_dir_report NOT USED! character: the directory where report/alert files are stored
#' @param project_dir NOT USED! character: the main directory used to distinguish different projects
#' @param data_from_row numeric: the number of the first data row in the file (header excluded!)
#' @param header_row_number numeric: the number of the row containing the real header, i.e. the column names
#' @param datetime_header character: the name of the column containing the datetime. For example: TIMESTAMP
#' @param datetime_format character: the datetime format in POSIXct style
#' @param datetime_sampling character: the time sampling interval. For example: 15 min
#' @param record_header character: the name of the column containing the record number. For example: RECORD
#' @param sep character: the field separator used in the input file
#' @param range_file character: the name of the file containing the limits for out-of-range data  # NOTE --> add range_dir to the inputs of DQC_function and adapt all scripts!
#' @param write_output_files logical: flag indicating whether the total/raw files must be written
#' @param write_output_report NOT USED! logical: flag indicating whether the report files must be written
#' @param file_name character: the name of the file to process
#' @param station_name character: the name of the station to process. It can be extracted from file_name with naming rules
#' @param start_date POSIXct: the date of the first data record to check
#' @param logger_info_file character: the name of the file used to compare headers and highlight changes
#' @param record_check numeric: flag indicating whether to check that the records are continuous (record_check = 1) or not (record_check = 0)
#' @param mail_file_alert character: the name of the file that indicates whether to check out-of-range values and how to manage mail sending
#' @param use_alert_station_flag logical: flag to decide whether to use the station flags in the range file # --> usage to be clarified
#' @param use_realtime_station_flag logical: flag to decide whether to use the real-time station flags in the range file # --> usage to be clarified
#' @param header_check logical: flag indicating whether to check header and logger information (based on the new station file standard) (TRUE --> check all!)
#'
#' @return A list containing: 1. a dataframe of the complete header; 2. a dataframe of data column names; 3. a dataframe of data; 4. a list of errors; 5. a dataframe of data with out-of-range values removed
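#'
#' @examples
#' # Hedged usage sketch (not from the original source): all paths, file names and
#' # format values below are hypothetical placeholders. Note that range_dir and
#' # mail_dir are currently expected to exist in the calling environment
#' # (see the NOTE on range_file above).
#' \dontrun{
#' dqc_result <- DQC_function(input_dir = "data/incoming/",
#'                            output_dir_data = "data/total/Files_dat/",
#'                            output_dir_raw = "data/raw/",
#'                            output_dir_report = "data/reports/",
#'                            project_dir = "project/",
#'                            data_from_row = 5,
#'                            header_row_number = 2,
#'                            datetime_header = "TIMESTAMP",
#'                            datetime_format = "%Y-%m-%d %H:%M",
#'                            datetime_sampling = "15 min",
#'                            record_header = "RECORD",
#'                            sep = ",",
#'                            range_file = "range_limits.csv",
#'                            write_output_files = TRUE,
#'                            write_output_report = FALSE,
#'                            file_name = "StationA_2024.dat",
#'                            station_name = "StationA",
#'                            start_date = NA,
#'                            logger_info_file = "logger_info.csv",
#'                            record_check = 1,
#'                            output_dir_raw = "data/raw/",
#'                            use_alert_station_flag = TRUE,
#'                            mail_file_alert = "mail_alert.csv",
#'                            use_realtime_station_flag = FALSE,
#'                            header_check = TRUE)
#' }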

DQC_function= function(input_dir,
                       output_dir_data,
                       output_dir_report,
                       project_dir,
                       data_from_row,
                       header_row_number,
                       datetime_header,
                       datetime_format,
                       datetime_sampling,
                       record_header,
                       range_file,
                       write_output_files,
                       write_output_report,
                       file_name,
                       station_name,
                       start_date,
                       # database_dir,
                       logger_info_file,
                       record_check,
                       output_dir_raw,
                       use_alert_station_flag,
                       mail_file_alert,
                       use_realtime_station_flag,
                       header_check,
                       sep){

  # ..... Define flags ..................................................................................................................................
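  # Flag convention used throughout this function: NA = check not reached, 0 = check passed, 1 = problem/condition flagged;
  # flag_missing_records and flag_missing_records_new are set to 50 when the record check is disabled (record_check != 1)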

  flag_empty = NA
  flag_logger_number = NA
  flag_error_df = NA
  flag_date = NA
  flag_duplicates_rows = NA
  flag_overlap = NA
  flag_missing_records = NA
  flag_missing_dates = NA
  flag_range_variable_to_set = NA
  flag_range_variable_new = NA
  flag_out_of_range = NA

  flag_new_duplicates_rows = NA
  flag_new_overlap = NA
  flag_new_missing_dates = NA

  flag_missing_records_new = NA

  flag_out_of_range_ALERT = NA

  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # PART 1 --> ANALYZE AND WRITE DATA
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  # ..... Body ..........................................................................................................................................

  if(check_empty_file(INPUT_DATA_DIR = input_dir, FILE_NAME = file_name) == TRUE){

    flag_empty = 1

  }else{

    flag_empty = 0

    data_import <- read_data(INPUT_DATA_DIR = input_dir, FILE_NAME = file_name, SEP = sep,                           # read and import data well formatted
                             DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format, DATETIME_SAMPLING = datetime_sampling,
                             DATA_FROM_ROW = data_from_row, HEADER_ROW_NUMBER = header_row_number)
    header = data_import [[1]]
    header_colnames = data_import [[2]]
    data = data_import [[3]]
    flag_error_df = data_import [[4]]
    df_out = data_import [[5]]
    # data_star = data_import [[5]]
    # max_col  = data_import [[6]]
    rm(data_import)

    # logger_number = header[1,4]                                                                   # check logger numbers
    # software_version = header[1,6]
    if(header_check == TRUE){
      logger_info_csv = read.csv(file = logger_info_file, stringsAsFactors = F)
      w_logger = which(logger_info_csv[,1] == station_name)
      header_info = header[1,1:8]

      if(length(w_logger) == 0){
        # logger_info_csv = rbind(logger_info_csv, c(station_name, logger_number, software_version))
        new_logger_info = cbind(station_name,header_info)
        colnames(new_logger_info) = colnames(logger_info_csv)
        logger_info_csv = rbind(logger_info_csv, new_logger_info)
        write.csv(logger_info_csv,logger_info_file,row.names = F, na = "")
        flag_logger_number = 0
      }else{
        logger_info = logger_info_csv[w_logger,]
        if(any(header_info[,-c(1,2)] != logger_info[,-c(1,2,3)])){
          flag_logger_number = 1

          header_info = header_info[,-c(1,2)]     # differences in TOA and Station_Name are allowed
          logger_info = logger_info[,-c(2,3)]     # differences in TOA and Station_Name are allowed

          w_diff = which(header_info != logger_info[,-1])
          cc = colnames(logger_info[,-1])[w_diff]
          new_h = as.character(header_info[w_diff])
          old_h = as.character(logger_info[,-1][w_diff])
          logger_difference = data.frame(cc,old_h,new_h)
          colnames(logger_difference) = c("Column", "Old", "New")

          # logger_difference = gsub("_", " ", logger_difference$Column)   # remove underescore

        }else{
          flag_logger_number = 0
        }
      }
    }

    # if(flag_logger_number == 0){
    if(flag_error_df == 0){
      time_data = data[,which(colnames(data)==datetime_header)]
      time_data = time_data[order(time_data)]
      ###################################################################################################

      # check overlap by comparing the whole new file with the old files already saved

      years = as.numeric(unique(format(time_data, format = "%Y", tz = "Etc/GMT-1")))
      old_data = data.frame(matrix(ncol = ncol(data), nrow = 0))
      colnames(old_data) = colnames(data)

      y = 1
      for(y in 1: length(years)){
        file_name_old <- paste(station_name, "_", years[y],".dat",sep = "")
        if(file.exists(paste(output_dir_raw,file_name_old,sep = ""))){
          old_import <- read_data(INPUT_DATA_DIR = output_dir_raw, FILE_NAME = file_name_old,  SEP = sep,                           # read and import data well formatted
                                  DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format, DATETIME_SAMPLING = datetime_sampling,
                                  DATA_FROM_ROW = data_from_row, HEADER_ROW_NUMBER = header_row_number)
          old_import_header = old_import [[1]]
          old_import_data = old_import [[3]]
          if(identical(old_import_header[-1,], header[-1,])){
            old_data = rbind(old_data,old_import_data)
          }
        }else{
          old_import_header = NULL # <- tmp solution to fix bug due to new station!
        }
        gc(reset = T)
      }

      if(identical(old_import_header[-1,], header[-1,])){
        overlap_data = rbind(old_data, data)
        overlap_data = overlap_data[order(overlap_data[,which(colnames(overlap_data)==datetime_header)]),]

        deletes_duplcated <- deletes_duplcated_data(DATA = overlap_data,DATETIME_HEADER = datetime_header)         # <- Deletes identical rows if found
        deletes_duplcated_mydata <- deletes_duplcated [[1]]
        deletes_duplcated_data <- deletes_duplcated [[2]]
        deletes_duplcated_data = time_to_char(DATA = deletes_duplcated_data, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)

        overlap <- detect_overlap(DATA = deletes_duplcated_mydata,DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header)          # <- Detect overlap
      }else{
        overlap = NULL
      }
      ###################################################################################################
      data = data[order(data[,which(colnames(data)==datetime_header)]),]

      if(length(overlap) != 0){

        flag_overlap = 1
        overlap[,1]<- overlap[,1] + data_from_row - 1
        colnames(overlap)[1]= "File Row"

      }else{
        flag_overlap = 0

        if(is.na(start_date)){

          original = data
          mydata = data
          flag_date = 0

          rm(data)

        }else{
          recent_date = format(time_data[length(time_data)], format = datetime_format)
          if(as.POSIXct(start_date,tz = 'Etc/GMT-1') < time_data[length(time_data)]){
            w_date = which(time_data == as.POSIXct(start_date,tz = 'Etc/GMT-1'))

            if(length(w_date) != 0){
              original = data[(w_date[1] + 1):nrow(data),]      # possible issues in data subset!!! to check
              mydata = data[(w_date[1] + 1):nrow(data),]

              flag_date = 0

              rm(data)
            }else{
              original = data
              mydata = data

              flag_date = 0

              rm(data)
            }
          } else {

            flag_date = 1
          }
        }


        if(flag_date == 0){
          deletes_duplcated <- deletes_duplcated_data(DATA = mydata,DATETIME_HEADER = datetime_header)         # <- Deletes identical rows if found
          mydata = deletes_duplcated [[1]]
          duplicated_data = deletes_duplcated [[2]]
          duplicated_data = time_to_char(DATA = duplicated_data, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)

          rm(deletes_duplcated)

          if(unique(as.character(duplicated_data[1,])) == "---"){
            flag_duplicates_rows = 0
          } else{
            flag_duplicates_rows = 1
          }

          data_in_old_files <- deletes_old_datetime(DATA = mydata,DATETIME_HEADER = datetime_header)
          mydata = data_in_old_files [[1]]
          old_data = data_in_old_files[[2]]

          rm(data_in_old_files)

          orig_wihtout_dupli = mydata

          overlap <- detect_overlap(DATA = mydata,DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header)          # <- Detect overlap


          if(length(overlap) != 0){

            flag_overlap = 1
            overlap[,1]<- overlap[,1] + data_from_row - 1
            colnames(overlap)[1]= "File Row"

          }else{

            flag_overlap = 0

            # add here the check on the number of records. Remember to remove the inserted dates (missing dates --> record = -1)

            rec_miss  <- missing_record(DATA = mydata, DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling, DATETIME_FORMAT = datetime_format)  # <- check for missing/restarted records
            records_missing = rec_miss[[2]]
            records_restart = rec_miss[[3]]

            if(record_check == 1){
              flag_missing_records = rec_miss[[1]]
            }else{
              flag_missing_records = 50
            }

            if(flag_missing_records != 1){

              missing  <- missing_dates(DATA = mydata, DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling)  # <- fill missing dates with NA
              mydata = missing[[1]]
              missing_index_date = missing[[2]]

              rm(missing)

              # ALERT OUT OF RANGE --> does not modify mydata
              alert_range <- alert_range_notify(DATA = mydata, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format, RECORD_HEADER = record_header,
                                                RANGE_DIR = range_dir, RANGE_FILE = range_file,
                                                MAIL_DIR = mail_dir, MAIL_FILE_ALERT = mail_file_alert,
                                                STATION = station_name,
                                                USE_FLAG = use_alert_station_flag,USE_RT_FLAG = use_realtime_station_flag) # <- notify values outside the physical range (range_dir and mail_dir are expected in the calling environment)
              alert_out_of_range_table = alert_range[[1]]
              alert_variable_new = alert_range[[2]]
              alert_variable_to_set = alert_range[[3]]


              # OUT OF RANGE --> DELETE DATA OUT OF RANGE
              range <- exclude_out_of_range_v3(DATA = mydata,DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header,
                                               RANGE_DIR = range_dir, RANGE_FILE = range_file) # <- substitute data outside the physical range with NA
              mydata_out_of_range = range[[1]]               # don't substitute out-of-range data with NaN
              out_of_range_table = range[[2]]
              # check_out_of_range = range[[2]]
              variable_new = range[[3]]
              variable_to_set = range[[4]]

              rm(range)


              # ..... Flags .....................................................................................................................................

              if(length(variable_to_set) != 0 | length(alert_variable_to_set) != 0){
                flag_range_variable_to_set = 1
              }else{
                flag_range_variable_to_set = 0
              }

              if(length(variable_new) != 0 | length(alert_variable_new) != 0 ){
                flag_range_variable_new = 1
              }else{
                flag_range_variable_new = 0
              }

              if(nrow(alert_out_of_range_table) == 0){
                flag_out_of_range_ALERT = 0
              }else{
                flag_out_of_range_ALERT = 1
              }


              if(nrow(out_of_range_table) == 0){
                flag_out_of_range = 0
              }else{
                flag_out_of_range = 1
              }

              # if(1 %in% unique(unlist(apply(X = check_out_of_range[,-which(colnames(check_out_of_range) == datetime_header)],MARGIN = 2, unique)))){
              #   flag_out_of_range = 1
              # }else{
              #   if(-1 %in% unique(unlist(apply(X = check_out_of_range[,-which(colnames(check_out_of_range) == datetime_header)],MARGIN = 2, unique)))){
              #     flag_out_of_range = 1
              #   }else{
              #     flag_out_of_range = 0
              #   }
              # }


              # time_tot = as.POSIXct(mydata[,which(colnames(mydata) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1')
              time_tot = as.POSIXct(mydata_out_of_range[,which(colnames(mydata_out_of_range) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1')
              time_missing = as.POSIXct(missing_index_date[,2], format = datetime_format, tz = 'Etc/GMT-1')

              if(length(which(time_tot %in% time_missing )) == 0){
                flag_missing_dates = 0      # No missing dates
              }else{
                flag_missing_dates = 1      # YES missing dates
              }

              mydata <- time_to_char(DATA = mydata_out_of_range, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
              # mydata_1 = mydata
            }
          }
        }
      }
      # }
    }
  }


  # missing_index_date

  # ..... Output ..........................................................................................................................................
  file_names = NULL


  if(flag_empty == 0){
    # if(flag_logger_number == 0){
    if(flag_error_df == 0){
      if(flag_overlap == 0){
        if(flag_date == 0){
          if(flag_missing_records != 1){
            # if(write_output_files == TRUE){    # here????

            time_mydata = as.POSIXct(mydata[,which(colnames(mydata)== datetime_header)],format = datetime_format, tz = 'Etc/GMT-1')
            time_orig = as.POSIXct(orig_wihtout_dupli[,which(colnames(orig_wihtout_dupli)== datetime_header)],format = datetime_format, tz = 'Etc/GMT-1')
            years_mydata = as.numeric(unique(format(time_mydata, format = "%Y")))

            # start_date_new = as.POSIXct(start_date,format = datetime_format, tz = 'Etc/GMT-1')
            # years_start = as.numeric(format(start_date_new, format = "%Y"))
            #
            # years = as.numeric(unique(c(years_start,years_mydata)))
            years = as.numeric(unique(format(time_mydata, format = "%Y")))

            if(format(as.POSIXct(time_mydata[1], format =  datetime_format, tz = "Etc/GMT-1"), format =  "%m-%d %H:%M")  == "01-01 00:00"){
              year_old = as.numeric(format(as.POSIXct(time_mydata[1], format =  datetime_format, tz = "Etc/GMT-1"), format =  "%Y")) - 1
              years = c(year_old, years)
            }

            file_names = paste(station_name,"_", years,".dat",sep = "")
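            # (output is split into one file per calendar year: <station_name>_<year>.dat)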

            flag_new_duplicates_rows_tmp = c()
            flag_new_overlap_tmp = c()
            flag_new_missing_dates_tmp = c()
            flag_missing_records_new_tmp = c()
            df_difference = as.data.frame(matrix(ncol = 4, nrow = 0))
            colnames(df_difference) = c("Column", "Row", "Old", "New")

            new_missing_index_date_tot = c()
            new_overlap_tot = c()

            mydata_total = mydata
            original_total = orig_wihtout_dupli

            mydata_total[, which(colnames(mydata_total)== datetime_header)] = as.POSIXct( mydata_total[, which(colnames(mydata_total)== datetime_header)], format = datetime_format, tz ="Etc/GMT-1")
            original_total[, which(colnames(original_total)== datetime_header)] = as.POSIXct( original_total[, which(colnames(original_total)== datetime_header)], format = datetime_format, tz ="Etc/GMT-1")


            k=1

            for(k in 1: length(years)){

              # extract from mydata_total and original_total only the data from year-01-01 00:15 to (year+1)-01-01 00:00
              date_min = as.POSIXct(paste(years[k], "-01-01 00:15",sep = ""), format = datetime_format, tz = "Etc/GMT-1")
              date_max = as.POSIXct(paste(years[k]+1, "-01-01 00:00",sep = ""), format = datetime_format, tz = "Etc/GMT-1")
              mydata = mydata_total[which(time_mydata >= date_min & time_mydata <= date_max),]
              orig_wihtout_dupli = original_total[which(time_orig >= date_min & time_orig <= date_max),]

              # mydata[, which(colnames(mydata)== datetime_header)] = as.POSIXct( mydata[, which(colnames(mydata)== datetime_header)], format = datetime_format, tz ="Etc/GMT-1")
              # orig_wihtout_dupli[, which(colnames(orig_wihtout_dupli)== datetime_header)] = as.POSIXct( orig_wihtout_dupli[, which(colnames(orig_wihtout_dupli)== datetime_header)], format = datetime_format, tz ="Etc/GMT-1")

              if(k > 1){
                if( flag_missing_records_new_tmp[k-1] == 1){
                  check_record_flag = 1
                }else{
                  check_record_flag = 0
                }
              }else{
                check_record_flag = 0
              }

              if(check_record_flag == 0){
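                # if a total .dat file for this year already exists, append the new data to it;
                # otherwise manage the date/record gap with the previous year's file (else branch below)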

                if(file.exists(paste(output_dir_data,file_names[k],sep = ""))){

                  # import old data
                  old_data_list = read_data(INPUT_DATA_DIR = output_dir_data,
                                            FILE_NAME = file_names[k],
                                            SEP = sep,
                                            DATETIME_HEADER = datetime_header,
                                            DATETIME_FORMAT = datetime_format,
                                            DATA_FROM_ROW = data_from_row,
                                            HEADER_ROW_NUMBER = header_row_number)

                  old_original_list = read_data(INPUT_DATA_DIR = output_dir_raw,
                                                FILE_NAME = paste(substring(file_names[k],1, nchar(file_names[k])-4),".dat",sep = ""),
                                                SEP = sep,
                                                DATETIME_HEADER = datetime_header,
                                                DATETIME_FORMAT = datetime_format,
                                                DATA_FROM_ROW = data_from_row,
                                                HEADER_ROW_NUMBER = header_row_number)


                  old_header = old_data_list [[1]]
                  old_header_colnames = old_data_list [[2]]
                  old_data = old_data_list [[3]]

                  last_old_datetime = old_data[nrow(old_data),which(colnames(old_data) == datetime_header)]
                  last_old_record = old_data[nrow(old_data),which(colnames(old_data) == record_header)]

                  rm(old_data_list)

                  old_orig_header = old_original_list [[1]]
                  old_orig_header_colnames = old_original_list [[2]]
                  old_orig_data = old_original_list [[3]]

                  last_old_orig_datetime = old_orig_data[nrow(old_orig_data),which(colnames(old_orig_data) == datetime_header)]
                  last_old_orig_record = old_orig_data[nrow(old_orig_data),which(colnames(old_orig_data) == record_header)]

                  rm(old_original_list)

                  if(identical(old_header[-1,], header[-1,])){   # <-- delete [-1,] when all stations are updated. Substitute the new header in the old data table.

                    # -------------------------------------------------------------------------------
                    # append new data to the old data if the new and old headers are the same

                    # ~ ~ ~ done to prevent wrong overlaps ~ ~ ~
                    mydata_start = mydata[1,which(colnames(old_data) == datetime_header)]
                    time_old = old_data[,which(colnames(old_data) == datetime_header)]
                    old_data_tmp = old_data[which(time_old < mydata_start),]
                    new = rbind(old_data_tmp,mydata)
                    # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

                    # new = rbind(old_data,mydata)
                    new = new[order(new[,which(colnames(new) == datetime_header)]),]

                    # append new raw data to the old data if the new and old headers are the same

                    # df_toadd_raw = orig_wihtout_dupli[which(format(time_orig, format = "%Y") == years[k]),]
                    # df_toadd_raw[,which(colnames(df_toadd_raw)== datetime_header)] = as.POSIXct(format(df_toadd_raw[,which(colnames(df_toadd_raw)== datetime_header)],format = datetime_format),tz = "Etc/GMT-1")

                    # new_raw = rbind(old_orig_data,df_toadd_raw)
                    new_raw = rbind(old_orig_data,orig_wihtout_dupli)
                    new_raw = new_raw[order(new_raw[,which(colnames(new_raw) == datetime_header)]),]

                    new_deletes_duplcated <- deletes_duplcated_data(DATA = new,DATETIME_HEADER = datetime_header)
                    new_mydata = new_deletes_duplcated [[1]]
                    new_duplicated_data = new_deletes_duplcated [[2]]

                    raw_new_deletes_duplcated <- deletes_duplcated_data(DATA = new_raw,DATETIME_HEADER = datetime_header)
                    raw_new_mydata = raw_new_deletes_duplcated [[1]]

                    orig_data_new = raw_new_mydata

                    raw_new_duplicated_data = raw_new_deletes_duplcated [[2]]

                    # no need to check for duplicated rows between the downloaded data and the old files --> already processed!
                    if(unique(as.character(new_duplicated_data[1,])) == "---"){
                      if(write_output_files == FALSE){
                        flag_new_duplicates_rows_tmp = c(flag_new_duplicates_rows_tmp,0)

                      }else{
                        flag_new_duplicates_rows_tmp = c(flag_new_duplicates_rows_tmp,0)
                      }
                    } else{
                      if(write_output_files == FALSE){
                        flag_new_duplicates_rows_tmp = c(flag_new_duplicates_rows_tmp,0)

                      }else{
                        flag_new_duplicates_rows_tmp = c(flag_new_duplicates_rows_tmp,1)
                      }
                    }



                    new_duplicated_data = time_to_char(DATA = new_duplicated_data, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                    raw_new_duplicated_data = time_to_char(DATA = raw_new_duplicated_data, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)

                    # new_overlap <- detect_overlap(DATA = new_mydata,DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header)
                    new_overlap <- detect_overlap(DATA = raw_new_mydata,DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header)
                    new_overlap_tot = rbind(new_overlap_tot, new_overlap)
                    if(length(new_overlap) == 0){

                      flag_new_overlap_tmp = c(flag_new_overlap_tmp,0)

                      # ----- old!!!! ------
                      # if(record_check == 1){
                      #   w_last = which(new_mydata[,which(colnames(new_mydata) == datetime_header)] == last_old_datetime)
                      #   if(length(w_last) == 0){
                      #     rec_miss  <- missing_record(DATA = new_mydata[w_last:nrow(new_mydata),], DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling, DATETIME_FORMAT = datetime_format)  # <- fill missing dates with NA
                      #
                      #   }else{
                      #     rec_miss  <- missing_record(DATA = new_mydata[w_last:nrow(new_mydata),], DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling, DATETIME_FORMAT = datetime_format)  # <- fill missing dates with NA
                      #   }
                      #   flag_missing_records_new_tmp = rec_miss[[1]]
                      #   records_missing_new = rec_miss[[2]]
                      #   records_restart_new = rec_miss[[3]]
                      # }else{
                      #   flag_missing_records_new_tmp = 50
                      # }
                      # ----- old!!!! ------

                      # ----- new!!!! ------
                      w_last = which(new_mydata[,which(colnames(new_mydata) == datetime_header)] == last_old_datetime)
                      if(length(w_last) == 0){
                        # last old datetime not found in the merged data: check records over the whole dataset (w_last:nrow() would fail on a zero-length index)
                        rec_miss  <- missing_record(DATA = new_mydata, DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling, DATETIME_FORMAT = datetime_format)  # <- check for missing/restarted records

                      }else{
                        rec_miss  <- missing_record(DATA = new_mydata[w_last:nrow(new_mydata),], DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling, DATETIME_FORMAT = datetime_format)  # <- check for missing/restarted records
                      }
                      records_missing_new = rec_miss[[2]]
                      records_restart_new = rec_miss[[3]]

                      if(record_check == 1){
                        flag_missing_records_new_tmp = c(flag_missing_records_new_tmp, rec_miss[[1]])
                      }else{
                        flag_missing_records_new_tmp = c(flag_missing_records_new_tmp,50)
                      }
                      # ----- new!!!! ------

                      if(flag_missing_records_new_tmp[length(flag_missing_records_new_tmp)] != 1){
                        new_missing  <- missing_dates(DATA = new_mydata,
                                                      DATETIME_HEADER = datetime_header,
                                                      RECORD_HEADER = record_header,
                                                      DATETIME_SAMPLING = datetime_sampling)  # <- fill missing dates with NA
                        new_mydata = new_missing[[1]]
                        new_missing_index_date = new_missing[[2]]

                        new_missing_index_date_tot = rbind(new_missing_index_date_tot,new_missing_index_date)



                        if(record_check != 1 | flag_missing_records_new_tmp[length(flag_missing_records_new_tmp)] != 1){
                          # We avoid writing output if the record check is active (record_check = 1) and the records have issues (indicated by flag_append_new = -1)

                          # prepare data for output

                          new_mydata <- time_to_char(DATA = new_mydata, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                          orig_data_new <- time_to_char(DATA = orig_data_new, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)


                          new_time_tot = as.POSIXct(new_mydata[,which(colnames(new_mydata) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1')
                          new_time_orig = as.POSIXct(orig_data_new[,which(colnames(orig_data_new) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1')

                          new_time_missing = new_missing_index_date[,2]

                          if(length(which(new_time_tot %in% new_time_missing )) == 0){
                            flag_new_missing_dates_tmp = c(flag_new_missing_dates_tmp,0)      # No missing dates
                          }else{
                            flag_new_missing_dates_tmp = c(flag_new_missing_dates_tmp,1)      # YES missing dates
                          }

                          rm(new_missing)

                          colnames(header) = header[1,]
                          out_my = new_mydata
                          colnames(out_my) = colnames(header)
                          out_mydata=rbind(header[-1,],out_my)
                          file_name_output = file_names[k]
                          # flag_missing_records_new_tmp = c(flag_missing_records_new_tmp, 0)


                          # out_orig = orig_data_new[which(format(new_time_orig, format = "%Y") == years[k]),]
                          out_orig = orig_data_new
                          out_orig[,which(colnames(out_orig)== datetime_header)] = format(out_orig[,which(colnames(out_orig)== datetime_header)], format = datetime_format)
                          colnames(out_orig) = colnames(header)
                          out_original=rbind(header[-1,],out_orig)
                          # file_name_original = paste(substring(file_names[k], 1, nchar(file_names[k])-4), ".dat",sep = "")
                          file_name_original = file_names[k]

                          # create a dataframe database formatted

                          # db_mydata = new_mydata
                          # db_mydata[, which(colnames(db_mydata) == datetime_header)] = as.POSIXct(db_mydata[, which(colnames(db_mydata) == datetime_header)],tz ='Etc/GMT-1',format = datetime_format)
                          # first_row_selected = which(db_mydata[, which(colnames(db_mydata) == datetime_header)] == last_old_datetime)+1
                          # db_mydata = db_mydata[first_row_selected: nrow(db_mydata),]
                          # from_date = db_mydata[1,which(colnames(db_mydata) == datetime_header)]
                          # to_date = db_mydata[nrow(db_mydata),which(colnames(db_mydata) == datetime_header)]
                          # db_mydata[, which(colnames(db_mydata) == datetime_header)] = format(db_mydata[, which(colnames(db_mydata) == datetime_header)],format = datetime_format ) #change here if you want change datetime output format (for example  in database "%Y-%m-%dT%H:%M")
                          # colnames(db_mydata) = colnames(header)
                          # db_mydata=rbind(header[-1,],db_mydata)
                          # date_to_print_filename = paste(paste(format(from_date,format = "%Y"),format(from_date,format = "%m"),format(from_date,format = "%d"),
                          #                                      format(from_date,format = "%H"),format(from_date,format = "%M"),sep = ""),
                          #                                paste(format(to_date,format = "%Y"),format(to_date,format = "%m"),format(to_date,format = "%d"),
                          #                                      format(to_date,format = "%H"),format(to_date,format = "%M"),sep = "" ), sep = "_")

                          if(write_output_files == TRUE){    # here????

                            if(header_check == TRUE){
                              # keep logger_info_file up to date!
                              w_logger = which(logger_info_csv[,1] == station_name)
                              new_logger_info = cbind(station_name,header[1,1:8])
                              colnames(new_logger_info) = colnames(logger_info_csv)
                              logger_info_csv[w_logger,] = new_logger_info
                              write.csv(logger_info_csv,logger_info_file,row.names = F, na = "")
                            }

                            # write total .dat
                            write.csv(out_mydata,paste(output_dir_data,file_name_output,sep = ""),quote = F,row.names = F, na = "NaN")
                            write.csv(out_original,paste(output_dir_raw,file_name_original,sep = ""),quote = F,row.names = F, na = "NaN")

                            # write total .csv
                            file_name_output_csv = paste(substring(file_name_output, 1, nchar(file_name_output)-4),".csv",sep="")
                            output_dir_data_csv = substring(output_dir_data, 1, nchar(output_dir_data)-10)  ### NOTE: folder one level up (remove the number of characters of Files_dat)
                            file.copy(from = paste(output_dir_data,file_name_output,sep = ""), to = paste(output_dir_data_csv,file_name_output_csv,sep = ""), overwrite = T)
                            # write.csv(db_mydata, paste(database_dir ,substring(file_name_output,1, nchar(file_name_output)-8),date_to_print_filename, ".csv",sep = ""),quote = F,row.names = F, na = "NaN")
                          }

                        }
                      }
                    }else{
                      flag_new_overlap_tmp = c(flag_new_overlap_tmp,1)
                      new_overlap[,1]<- new_overlap[,1] + data_from_row - 1
                      colnames(new_overlap)[1]= "File Row"
                    }

                  }else{
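                    # headers differ from the stored file: archive the existing total/raw files as *_oldN.dat
                    # (when output writing is enabled), start new files with the new structure,
                    # and record the header differences in df_difference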

                    # fill the missing records/dates between the end of the old file and the start of the new-structure file
                    first_new_datetime = as.POSIXct(mydata[1,which(colnames(mydata) == datetime_header)],tz = "Etc/GMT-1")

                    all_dates = seq(from = last_old_datetime, to = first_new_datetime, by = datetime_sampling)
                    all_dates = all_dates[-c(1,length(all_dates))]

                    mydata_tmp = mydata[,which(colnames(mydata) %in% c(datetime_header, record_header))]
                    mydata_tmp[,which(colnames(mydata_tmp) == datetime_header)] = as.POSIXct(mydata_tmp[,which(colnames(mydata_tmp) == datetime_header)], format = datetime_format, tz = "Etc/GMT-1")

                    olddata_tmp = old_data[,which(colnames(old_data) %in% c(datetime_header, record_header))]
                    olddata_tmp[,which(colnames(olddata_tmp) == datetime_header)] = as.POSIXct(olddata_tmp[,which(colnames(olddata_tmp) == datetime_header)], format = datetime_format, tz = "Etc/GMT-1")

                    mydata_rec_miss = rbind(olddata_tmp, mydata_tmp)
                    rec_miss  <- missing_record(DATA = mydata_rec_miss, DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling, DATETIME_FORMAT = datetime_format)  # <- check for missing/restarted records
                    records_missing = rec_miss[[2]]
                    records_restart = rec_miss[[3]]

                    if(record_check == 1){
                      flag_missing_records_new_tmp = c(flag_missing_records_new_tmp,rec_miss[[1]])
                    }else{
                      flag_missing_records_new_tmp = c(flag_missing_records_new_tmp,50)
                    }

                    if(flag_missing_records_new_tmp[length(flag_missing_records_new_tmp)] != 1){
                      new_missing  <- missing_dates(DATA = mydata_rec_miss,
                                                    DATETIME_HEADER = datetime_header,
                                                    RECORD_HEADER = record_header,
                                                    DATETIME_SAMPLING = datetime_sampling)  # <- fill missing dates with NA
                      new_mydata = new_missing[[1]]
                      new_missing_index_date = new_missing[[2]]

                      new_missing_index_date_tot = rbind(new_missing_index_date_tot,new_missing_index_date)

                      if(record_check != 1 | flag_missing_records_new_tmp[length(flag_missing_records_new_tmp)] != 1){

                        new_mydata <- time_to_char(DATA = new_mydata, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                        new_time_tot = as.POSIXct(new_mydata[,which(colnames(new_mydata) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1')
                        new_time_missing = as.POSIXct(new_missing_index_date[,2], format = datetime_format, tz = "Etc/GMT-1")

                        if(length(which(new_time_tot %in% new_time_missing )) == 0){
                          all_dates_df =  data.frame(matrix(nrow =length(all_dates), ncol = ncol(old_data)))
                          colnames(all_dates_df) = colnames(old_data)
                          all_dates_df[,which(colnames(all_dates_df) == datetime_header)] = format(all_dates,format = datetime_format,tz = "Etc/GMT-1")

                          flag_new_missing_dates_tmp = c(flag_new_missing_dates_tmp,0)      # No missing dates
                        }else{
                          all_dates_df =  data.frame(matrix(nrow =length(all_dates), ncol = ncol(old_data)))
                          colnames(all_dates_df) = colnames(old_data)
                          all_dates_df[,which(colnames(all_dates_df) == datetime_header)] = format(all_dates,format = datetime_format,tz = "Etc/GMT-1")
                          all_dates_df[,which(colnames(all_dates_df) == record_header)] = -1        # records in the gap-filled dates are flagged with RECORD = -1

                          flag_new_missing_dates_tmp = c(flag_new_missing_dates_tmp,1)      # YES missing dates
                        }


                        new_mydata_old = time_to_char(DATA = old_data,DATETIME_HEADER = datetime_header,DATETIME_FORMAT = datetime_format)
                        new_mydata_old = rbind(new_mydata_old, all_dates_df)

                        colnames(header) = header[1,]
                        colnames(old_header) = old_header[1,]

                        #--- old file ---
                        new_mydata_old <- time_to_char(DATA = new_mydata_old, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                        out_my_old = new_mydata_old
                        colnames(out_my_old) = colnames(old_header)
                        out_mydata_old=rbind(old_header[-1,],out_my_old)
                        file_name_output_old = file_names[k]

                        if(write_output_files == TRUE){    # here????
                          write.csv(out_mydata_old,paste(output_dir_data,file_name_output_old,sep = ""),quote = F,row.names = F, na = "NaN")
                          file_name_output_csv = paste(substring(file_name_output_old, 1, nchar(file_name_output_old)-4),".csv",sep="")
                          output_dir_data_csv = substring(output_dir_data, 1, nchar(output_dir_data)-10)  ### NOTE: folder one level up (remove the number of characters of Files_dat)
                          file.copy(from = paste(output_dir_data,file_name_output_old,sep = ""), to = paste(output_dir_data_csv,file_name_output_csv,sep = ""), overwrite = T)

                          # rename total data
                          j=0
                          repeat{
                            j=j+1
                            file_names_old = paste(substring(file_names[k],1, nchar(file_names[k])-4),"_old",j,".dat",sep = "")
                            if(!file.exists(paste(output_dir_data,file_names_old,sep = ""))){
                              break
                            }
                          }
                          file_names_total_data = file_names[k]

                          file.rename(from = paste(output_dir_data,file_names_total_data,sep = ""),to = paste(output_dir_data,file_names_old,sep = ""))

                          output_dir_data_csv = substring(output_dir_data, 1, nchar(output_dir_data)-10)
                          file_names_old_csv = paste(substring(file_names_old, 1, nchar(file_names_old)-4),".csv",sep = "")
                          file.copy(from = paste(output_dir_data,file_names_old,sep = ""),to = paste(output_dir_data_csv,file_names_old_csv,sep = ""))

                          # rename raw data
                          j=0
                          repeat{
                            j=j+1
                            file_names_original_old = paste(substring(file_names[k],1, nchar(file_names[k])-4),"_old",j,".dat",sep = "")
                            if(!file.exists(paste(output_dir_raw,file_names_original_old,sep = ""))){
                              break
                            }
                          }
                          file_names_raw_data = paste(substring(file_names[k],1,nchar(file_names[k])-4),".dat", sep = "")
                          file.rename(from = paste(output_dir_raw,file_names_raw_data,sep = ""),to = paste(output_dir_raw,file_names_original_old,sep = ""))

                          #--- new file ---
                          mydata = time_to_char(DATA = mydata,DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                          out_my = mydata
                          colnames(out_my) = colnames(header)
                          out_mydata=rbind(header[-1,],out_my)
                          file_name_output = file_names[k]

                          orig_wihtout_dupli = time_to_char(DATA = orig_wihtout_dupli,DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                          out_orig = orig_wihtout_dupli
                          out_orig[,which(colnames(out_orig)== datetime_header)] = format(out_orig[,which(colnames(out_orig)== datetime_header)], format = datetime_format)
                          colnames(out_orig) = colnames(header)
                          out_original=rbind(header[-1,],out_orig)
                          file_name_original = paste(substring(file_names[k], 1, nchar(file_names[k])-4), ".dat",sep = "")

                          if(header_check == TRUE){
                            # keep logger_info_file up to date!
                            w_logger = which(logger_info_csv[,1] == station_name)
                            new_logger_info = cbind(station_name,header[1,1:8])
                            colnames(new_logger_info) = colnames(logger_info_csv)
                            logger_info_csv[w_logger,] = new_logger_info
                            write.csv(logger_info_csv,logger_info_file,row.names = F, na = "")
                          }

                          # write total .dat
                          write.csv(out_mydata,paste(output_dir_data,file_name_output,sep = ""),quote = F,row.names = F, na = "NaN")
                          write.csv(out_original,paste(output_dir_raw,file_name_original,sep = ""),quote = F,row.names = F, na = "NaN")

                          # write total .csv
                          file_name_output_csv = paste(substring(file_name_output, 1, nchar(file_name_output)-4),".csv",sep="")
                          output_dir_data_csv = substring(output_dir_data, 1, nchar(output_dir_data)-10)  ### NOTE: folder one level up (remove the number of characters of Files_dat)
                          file.copy(from = paste(output_dir_data,file_name_output,sep = ""), to = paste(output_dir_data_csv,file_name_output_csv,sep = ""), overwrite = T)

                        }
                      }
                    }
                    ######### end new section ##########

                    # flag_missing_records_new_tmp = c(flag_missing_records_new_tmp, 1) # ????

                    header_t = as.data.frame(t(header))
                    old_header_t = as.data.frame(t(old_header))

                    if( nrow(header_t) < nrow(old_header_t)){
                      diff_df = matrix(data = "", nrow = nrow(old_header_t) - nrow(header_t), ncol = ncol(header_t))
                      header_t = rbind(header_t,diff_df)
                    }else{
                      if( nrow(header_t) > nrow(old_header_t)){
                        diff_df = matrix(data = "", nrow = nrow(header_t) - nrow(old_header_t), ncol = ncol(header_t))
                        old_header_t = rbind(old_header_t,diff_df)
                      }
                    }

                    header_t = cbind(rep(NA, times = nrow(header_t)),header_t )
                    colnames(header_t) = c("NA","Station_info", "Header","Units", "Sampling_method")
                    # colnames(header_t) = paste("row_",seq(1:ncol(header_t))-1,sep = "")
                    rownames(header_t) = paste("col_",seq(1:nrow(header_t))-1,sep = "")
                    header_t = header_t[,-c(1:2)]

                    # old_header_t = as.data.frame(t(old_header))
                    old_header_t = cbind(rep(NA, times = nrow(old_header_t)),old_header_t )
                    colnames(old_header_t) = c("NA","Station_info", "Header","Units", "Sampling_method")
                    # colnames(old_header_t) = paste("row_",seq(1:ncol(old_header_t))-1,sep = "")
                    rownames(old_header_t) = paste("col_",seq(1:nrow(old_header_t))-1,sep = "")
                    old_header_t = old_header_t[,-c(1:2)]

                    # header_t[old_header_t != header_t]
                    # old_header_t[old_header_t != header_t]
                    nrow(old_header_t)
                    nrow(header_t)

                    # ------ NEW -------

                    # intersect(old_header_t$Header,header_t$Header)

                    old_eq= old_header_t[which(old_header_t$Header %in% intersect(old_header_t$Header,header_t$Header)),]
                    new_eq=     header_t[which(    header_t$Header %in% intersect(old_header_t$Header,header_t$Header)),]

                    new_eq = new_eq[ match(old_eq$Header,new_eq$Header), ]
                    w_df = as.data.frame(which(old_eq != new_eq,arr.ind = T))

                    o = old_eq[w_df$row,]
                    n = new_eq[w_df$row,]

                    m_o = melt(o,id.vars = "Header")
                    m_n = melt(n,id.vars = "Header")

                    colnames(m_o)[2:3] = c("Row" ,"Old")
                    colnames(m_n)[2:3] = c("Row" ,"New")

                    mer = merge(m_o, m_n)

                    mer$Header = factor(mer$Header)
                    mer$Row = factor(mer$Row)

                    level_header = intersect(old_header_t$Header,header_t$Header)
                    level_row = c("Units","Sampling_method")

                    mer = mer[order(match(mer$Header, level_header),match(mer$Row, level_row)),]
                    mer = data.frame(mer, stringsAsFactors = F)
                    colnames(mer) = c("Column", "Row", "Old", "New")
                    mer$Column = as.character(mer$Column)
                    mer$Row = as.character(mer$Row)

                    old_h = setdiff(old_header_t$Header,header_t$Header)
                    new_h = setdiff(header_t$Header,old_header_t$Header)


                    old_df = data.frame(match(old_h, old_header_t$Header),rep("Header", times = length(old_h)), old_h,rep("", times = length(old_h)))
                    colnames(old_df) = colnames(mer)
                    old_df$Column = as.character(old_df$Column)


                    new_df = data.frame(match(new_h, header_t$Header),rep("Header", times = length(new_h)),rep("", times = length(new_h)), new_h)
                    colnames(new_df) = colnames(mer)
                    new_df$Column = as.character(new_df$Column)


                    add_remove = rbind(old_df, new_df)

                    if(nrow(mer) == 0  & nrow(add_remove) == 0){
                      df_difference_tmp = data.frame("","Headers reordered", "", "")
                    }else{
                      df_difference_tmp = rbind(mer, add_remove)
                    }

                    # ------------------

                    # w_df = as.data.frame(which(old_header_t != header_t,arr.ind = T))
                    #
                    # df_difference_tmp = data.frame(rownames(header_t)[w_df$row],
                    #                                colnames(header_t)[w_df$col],
                    #                                old_header_t[old_header_t != header_t],
                    #                                header_t[old_header_t != header_t])

                    colnames(df_difference_tmp) = c("Column", "Row", "Old", "New")
                    df_difference = rbind(df_difference,df_difference_tmp)
                  }

                }else{

                  # NEW!!!!
                  # Manage date gaps and record gaps between 2 years
                  file_names_old = paste(station_name,"_", years[k]-1,".dat",sep = "")

                  if(file.exists(paste(output_dir_data,file_names_old,sep = ""))){
                    old_data_list = read_data(INPUT_DATA_DIR = output_dir_data,
                                              FILE_NAME = file_names_old,
                                              SEP = sep,
                                              DATETIME_HEADER = datetime_header,
                                              DATETIME_FORMAT = datetime_format,
                                              DATA_FROM_ROW = data_from_row,
                                              HEADER_ROW_NUMBER = header_row_number)

                    old_original_list = read_data(INPUT_DATA_DIR = output_dir_raw,
                                                  FILE_NAME = paste(substring(file_names_old,1, nchar(file_names_old)-4),".dat",sep = ""),
                                                  SEP = sep,
                                                  DATETIME_HEADER = datetime_header,
                                                  DATETIME_FORMAT = datetime_format,
                                                  DATA_FROM_ROW = data_from_row,
                                                  HEADER_ROW_NUMBER = header_row_number)


                    old_header = old_data_list [[1]]
                    old_header_colnames = old_data_list [[2]]
                    old_data = old_data_list [[3]]
                    colnames(old_header) = old_header[1,]  #new

                    last_old_datetime = old_data[nrow(old_data),which(colnames(old_data) == datetime_header)]
                    last_old_record = old_data[nrow(old_data),which(colnames(old_data) == record_header)]

                    rm(old_data_list)

                    old_orig_header = old_original_list [[1]]
                    old_orig_header_colnames = old_original_list [[2]]
                    old_orig_data = old_original_list [[3]]

                    last_old_orig_datetime = old_orig_data[nrow(old_orig_data),which(colnames(old_orig_data) == datetime_header)]
                    last_old_orig_record = old_orig_data[nrow(old_orig_data),which(colnames(old_orig_data) == record_header)]

                    rm(old_original_list)

                    colnames(header) = header[1,]   #new

                    if(identical(old_header[-1,], header[-1,])){

                      # -------------------------------------------------------------------------------
                      # append new data to the old data if the new and old headers are the same
                      new = rbind(old_data,mydata)
                      new = new[order(new[,which(colnames(new) == datetime_header)]),]

                      # -------------------------------------------------------------------------------
                      # append new raw data to the old data if the new and old headers are the same
                      new_raw = rbind(old_orig_data,orig_wihtout_dupli)
                      new_raw = new_raw[order(new_raw[,which(colnames(new_raw) == datetime_header)]),]

                      new_deletes_duplcated <- deletes_duplcated_data(DATA = new,DATETIME_HEADER = datetime_header)
                      new_mydata = new_deletes_duplcated [[1]]
                      new_duplicated_data = new_deletes_duplcated [[2]]

                      raw_new_deletes_duplcated <- deletes_duplcated_data(DATA = new_raw,DATETIME_HEADER = datetime_header)
                      raw_new_mydata = raw_new_deletes_duplcated [[1]]

                      orig_data_new = raw_new_mydata

                      raw_new_duplicated_data = raw_new_deletes_duplcated [[2]]


                      # no need to check for duplicated rows between the downloaded data and the old files --> already processed!
                      if(unique(as.character(new_duplicated_data[1,])) == "---"){
                        if(write_output_files == FALSE){
                          flag_new_duplicates_rows_tmp = c(flag_new_duplicates_rows_tmp,0)

                        }else{
                          flag_new_duplicates_rows_tmp = c(flag_new_duplicates_rows_tmp,0)
                        }
                      } else{
                        if(write_output_files == FALSE){
                          flag_new_duplicates_rows_tmp = c(flag_new_duplicates_rows_tmp,0)

                        }else{
                          flag_new_duplicates_rows_tmp = c(flag_new_duplicates_rows_tmp,1)
                        }
                      }
                      new_duplicated_data = time_to_char(DATA = new_duplicated_data, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                      raw_new_duplicated_data = time_to_char(DATA = raw_new_duplicated_data, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)

                      # new_overlap <- detect_overlap(DATA = new_mydata,DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header)
                      new_overlap <- detect_overlap(DATA = raw_new_mydata,DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header)
                      new_overlap_tot = rbind(new_overlap_tot, new_overlap)

                      if(length(new_overlap) == 0){

                        flag_new_overlap_tmp = c(flag_new_overlap_tmp,0)
                        w_last = which(new_mydata[,which(colnames(new_mydata) == datetime_header)] == last_old_datetime)
                        if(length(w_last) == 0){
                          # NOTE: last_old_datetime was not found in new_mydata --> this case is not handled separately yet (w_last is empty and the subsetting below would fail)
                          rec_miss  <- missing_record(DATA = new_mydata[w_last:nrow(new_mydata),], DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling, DATETIME_FORMAT = datetime_format)  # <- check record/datetime continuity

                        }else{
                          rec_miss  <- missing_record(DATA = new_mydata[w_last:nrow(new_mydata),], DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling, DATETIME_FORMAT = datetime_format)  # <- check record/datetime continuity
                        }
                        records_missing_new = rec_miss[[2]]
                        records_restart_new = rec_miss[[3]]

                        if(record_check == 1){
                          flag_missing_records_new_tmp =c(flag_missing_records_new_tmp,rec_miss[[1]])
                        }else{
                          flag_missing_records_new_tmp = c(flag_missing_records_new_tmp, 50)
                        }
                        if(flag_missing_records_new_tmp[length(flag_missing_records_new_tmp)] != 1){
                          new_missing  <- missing_dates(DATA = new_mydata,
                                                        DATETIME_HEADER = datetime_header,
                                                        RECORD_HEADER = record_header,
                                                        DATETIME_SAMPLING = datetime_sampling)  # <- fill missing dates with NA
                          new_mydata = new_missing[[1]]
                          new_missing_index_date = new_missing[[2]]

                          new_missing_index_date_tot = rbind(new_missing_index_date_tot,new_missing_index_date)

                          if(record_check != 1 | flag_missing_records_new_tmp[length(flag_missing_records_new_tmp)] != 1){
                            # We avoid writing output if record checking is active (record_check = 1) and the records show issues (last value of flag_missing_records_new_tmp == 1)
                            # prepare data for output

                            new_mydata <- time_to_char(DATA = new_mydata, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                            orig_data_new <- time_to_char(DATA = orig_data_new, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)

                            new_time_tot = as.POSIXct(new_mydata[,which(colnames(new_mydata) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1')
                            new_time_orig = as.POSIXct(orig_data_new[,which(colnames(orig_data_new) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1')

                            new_time_missing = as.POSIXct(new_missing_index_date[,2], format = datetime_format, tz = "Etc/GMT-1")

                            if(length(which(new_time_tot %in% new_time_missing )) == 0){
                              flag_new_missing_dates_tmp = c(flag_new_missing_dates_tmp,0)      # No missing dates
                            }else{
                              flag_new_missing_dates_tmp = c(flag_new_missing_dates_tmp,1)      # YES missing dates
                            }

                            rm(new_missing)

                            # -- split data in old and new files --
                            min_new = as.POSIXct(paste(years[k], "-01-01 00:15",sep = ""), format = datetime_format, tz = "Etc/GMT-1")
                            max_old = as.POSIXct(paste(years[k], "-01-01 00:00",sep = ""), format = datetime_format, tz = "Etc/GMT-1")
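                            # NOTE: the hard-coded "00:15" boundary assumes a 15-minute sampling interval
                            # and a datetime_format of the form "%Y-%m-%d %H:%M"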
                            # dates_old = new_time_tot[new_time_tot<= max_old ]
                            # dates_new = all_dates[all_dates>= min_new ]

                            mmm_datetime = as.POSIXct(new_mydata$TIMESTAMP,format = datetime_format, tz = "Etc/GMT-1")
                            # w_old = which(format(mmm_datetime,format = "%m-%d %H:%M",tz = "Etc/GMT-1") ==  "01-01 00:00")
                            w_old = which(mmm_datetime <= max_old)
                            w_new = which(mmm_datetime >= min_new)
                            new_mydata_old = new_mydata[w_old,]         # <-- test how this behaves when the data span more than one year! something may need fixing upstream!
                            new_mydata_new = new_mydata[w_new,]

                            ooo_datetime = as.POSIXct(orig_data_new$TIMESTAMP,format = datetime_format, tz = "Etc/GMT-1")
                            # w_new_year = which(format(ooo_datetime,format = "%Y",tz = "Etc/GMT-1") == years[k])[1]
                            w_old_orig = which(ooo_datetime <= max_old)
                            w_new_orig = which(ooo_datetime >= min_new)


                            # if(format(ooo_datetime[w_new_year],format = "%m-%d %H:%M",tz = "Etc/GMT-1") == "01-01 00:00"){
                            #   w_orig_old =  w_new_year - 2
                            # }else{
                            #   w_orig_old = w_new_year - 1
                            # }
                            orig_data_new_old = orig_data_new[w_old_orig,]         # <-- test how this behaves when the data span more than one year! something may need fixing upstream!
                            orig_data_new_new = orig_data_new[w_new_orig,]

                            colnames(header) = header[1,]

                            #--- old file ---
                            new_mydata_old = time_to_char(DATA = new_mydata_old,DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                            out_my_old = new_mydata_old
                            colnames(out_my_old) = colnames(header)
                            out_mydata_old=rbind(header[-1,],out_my_old)
                            file_name_output_old = file_names_old

                            #--- new file ---
                            new_mydata_new = time_to_char(DATA = new_mydata_new,DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                            out_my = new_mydata_new
                            colnames(out_my) = colnames(header)
                            out_mydata=rbind(header[-1,],out_my)
                            file_name_output = file_names[k]

                            # flag_missing_records_new_tmp = c(flag_missing_records_new_tmp, 0)

                            #--- old file ---
                            orig_data_new_old = time_to_char(DATA = orig_data_new_old,DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                            out_orig_old = orig_data_new_old
                            out_orig_old[,which(colnames(out_orig_old)== datetime_header)] = format(out_orig_old[,which(colnames(out_orig_old)== datetime_header)], format = datetime_format)
                            colnames(out_orig_old) = colnames(header)
                            out_original_old=rbind(header[-1,],out_orig_old)
                            file_name_original_old = file_names_old

                            #--- new file ---
                            orig_data_new_new = time_to_char(DATA = orig_data_new_new,DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                            out_orig = orig_data_new_new
                            out_orig[,which(colnames(out_orig)== datetime_header)] = format(out_orig[,which(colnames(out_orig)== datetime_header)], format = datetime_format)
                            colnames(out_orig) = colnames(header)
                            out_original=rbind(header[-1,],out_orig)
                            file_name_original = file_names[k]

                            if(write_output_files == TRUE){    # here????

                              if(header_check == TRUE){
                                # keep logger_info_file up to date!
                                w_logger = which(logger_info_csv[,1] == station_name)
                                new_logger_info = cbind(station_name,header[1,1:8])
                                colnames(new_logger_info) = colnames(logger_info_csv)
                                logger_info_csv[w_logger,] = new_logger_info
                                write.csv(logger_info_csv,logger_info_file,row.names = F, na = "")
                              }
                              # write total .dat
                              # year - 1
                              write.csv(out_mydata_old,paste(output_dir_data,file_name_output_old,sep = ""),quote = F,row.names = F, na = "NaN")
                              write.csv(out_original_old,paste(output_dir_raw,file_name_original_old,sep = ""),quote = F,row.names = F, na = "NaN")
                              # year
                              write.csv(out_mydata,paste(output_dir_data,file_name_output,sep = ""),quote = F,row.names = F, na = "NaN")
                              write.csv(out_original,paste(output_dir_raw,file_name_original,sep = ""),quote = F,row.names = F, na = "NaN")

                              # write total .csv
                              output_dir_data_csv = substring(output_dir_data, 1, nchar(output_dir_data)-10)  ### NOTE: one directory level up (strip the 10 characters of "Files_dat/")
                              # year - 1
                              file_name_output_csv = paste(substring(file_name_output_old, 1, nchar(file_name_output_old)-4),".csv",sep="")
                              file.copy(from = paste(output_dir_data,file_name_output_old,sep = ""), to = paste(output_dir_data_csv,file_name_output_csv,sep = ""), overwrite = T)
                              # year
                              file_name_output_csv = paste(substring(file_name_output, 1, nchar(file_name_output)-4),".csv",sep="")
                              file.copy(from = paste(output_dir_data,file_name_output,sep = ""), to = paste(output_dir_data_csv,file_name_output_csv,sep = ""), overwrite = T)
                            }

                          }


                        }
                      }else{
                        flag_new_overlap_tmp = c(flag_new_overlap_tmp,1)
                        new_overlap[,1]<- new_overlap[,1] + data_from_row - 1
                        colnames(new_overlap)[1]= "File Row"
                      }

                    }else{
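                      # Headers differ from the stored file: old and new data are not merged.
                      # The period between the last stored timestamp and the first new timestamp
                      # is padded with NA rows (RECORD = -1); the old file keeps the old header,
                      # the new file the new one.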

                      first_new_datetime = as.POSIXct(mydata[1,which(colnames(mydata) == datetime_header)],tz = "Etc/GMT-1")

                      all_dates = seq(from = last_old_datetime, to = first_new_datetime, by = datetime_sampling)
                      all_dates = all_dates[-c(1,length(all_dates))]

                      mydata_tmp = mydata[,which(colnames(mydata) %in% c(datetime_header, record_header))]
                      mydata_tmp[,which(colnames(mydata_tmp) == datetime_header)] = as.POSIXct(mydata_tmp[,which(colnames(mydata_tmp) == datetime_header)], format = datetime_format, tz = "Etc/GMT-1")

                      olddata_tmp = old_data[,which(colnames(old_data) %in% c(datetime_header, record_header))]
                      olddata_tmp[,which(colnames(olddata_tmp) == datetime_header)] = as.POSIXct(olddata_tmp[,which(colnames(olddata_tmp) == datetime_header)], format = datetime_format, tz = "Etc/GMT-1")

                      mydata_rec_miss = rbind(olddata_tmp, mydata_tmp)
                      rec_miss  <- missing_record(DATA = mydata_rec_miss, DATETIME_HEADER = datetime_header, RECORD_HEADER = record_header, DATETIME_SAMPLING = datetime_sampling, DATETIME_FORMAT = datetime_format)  # <- check record/datetime continuity
                      records_missing = rec_miss[[2]]
                      records_restart = rec_miss[[3]]


                      if(record_check == 1){
                        flag_missing_records_new_tmp = c(flag_missing_records_new_tmp,rec_miss[[1]])
                      }else{
                        flag_missing_records_new_tmp = c(flag_missing_records_new_tmp,50)
                      }

                      if(flag_missing_records_new_tmp[length(flag_missing_records_new_tmp)] != 1){
                        new_missing  <- missing_dates(DATA = mydata_rec_miss,
                                                      DATETIME_HEADER = datetime_header,
                                                      RECORD_HEADER = record_header,
                                                      DATETIME_SAMPLING = datetime_sampling)  # <- fill missing dates with NA
                        new_mydata = new_missing[[1]]
                        new_missing_index_date = new_missing[[2]]

                        new_missing_index_date_tot = rbind(new_missing_index_date_tot,new_missing_index_date)

                        if(record_check != 1 | flag_missing_records_new_tmp[length(flag_missing_records_new_tmp)] != 1){

                          new_mydata <- time_to_char(DATA = new_mydata, DATETIME_HEADER = datetime_header, DATETIME_FORMAT = datetime_format)
                          new_time_tot = as.POSIXct(new_mydata[,which(colnames(new_mydata) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1')

                          new_time_missing = as.POSIXct(new_missing_index_date[,2],format = datetime_format, tz = "Etc/GMT-1")

                          if(length(which(new_time_tot %in% new_time_missing )) == 0){
                            flag_new_missing_dates_tmp = c(flag_new_missing_dates_tmp,0)      # No missing dates
                          }else{
                            flag_new_missing_dates_tmp = c(flag_new_missing_dates_tmp,1)      # YES missing dates
                          }

                          rm(new_missing)
                          #####



                          min_new = as.POSIXct(paste(years[k], "-01-01 00:15",sep = ""), format = datetime_format, tz = "Etc/GMT-1")
                          max_old = as.POSIXct(paste(years[k], "-01-01 00:00",sep = ""), format = datetime_format, tz = "Etc/GMT-1")
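                          # NOTE: same hard-coded year-boundary assumption as above ("00:15" --> 15-minute sampling interval)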
                          dates_old = all_dates[all_dates<= max_old ]
                          dates_new = all_dates[all_dates>= min_new ]

                          all_dates_df_old =  data.frame(matrix(nrow =length(dates_old), ncol = ncol(old_data)))
                          colnames(all_dates_df_old) = colnames(old_data)
                          all_dates_df_old[,which(colnames(all_dates_df_old) == datetime_header)] = format(dates_old,format = datetime_format,tz = "Etc/GMT-1")
                          all_dates_df_old[,which(colnames(all_dates_df_old) == record_header)] = -1        # rows added to fill the record gap are flagged with RECORD = -1

                          new_mydata_old = time_to_char(DATA = old_data,DATETIME_HEADER = datetime_header,DATETIME_FORMAT = datetime_format)
                          new_mydata_old = rbind(new_mydata_old, all_dates_df_old)

                          all_dates_df_new =  data.frame(matrix(nrow =length(dates_new), ncol = ncol(mydata)))
                          colnames(all_dates_df_new) = colnames(mydata)
                          all_dates_df_new[,which(colnames(all_dates_df_new) == datetime_header)] = format(dates_new,format = datetime_format,tz = "Etc/GMT-1")
                          all_dates_df_new[,which(colnames(all_dates_df_new) == record_header)] = -1        # rows added to fill the record gap are flagged with RECORD = -1

                          new_mydata_new = time_to_char(DATA = mydata,DATETIME_HEADER = datetime_header,DATETIME_FORMAT = datetime_format)
                          new_mydata_new = rbind(all_dates_df_new,new_mydata_new )

                          colnames(header) = header[1,]
                          colnames(old_header) = old_header[1,]

                          new_mydata_old = time_to_char(DATA = new_mydata_old,DATETIME_HEADER = datetime_header,DATETIME_FORMAT = datetime_format)
                          out_my_old = new_mydata_old
                          colnames(out_my_old) = colnames(old_header)
                          out_mydata_old=rbind(old_header[-1,],out_my_old)
                          file_name_output_old = file_names_old

                          new_mydata_new = time_to_char(DATA = new_mydata_new,DATETIME_HEADER = datetime_header,DATETIME_FORMAT = datetime_format)
                          out_my_new = new_mydata_new
                          colnames(out_my_new) = colnames(header)
                          out_mydata_new=rbind(header[-1,],out_my_new)
                          file_name_output_new = file_names[k]

                          orig_wihtout_dupli = time_to_char(DATA = orig_wihtout_dupli,DATETIME_HEADER = datetime_header,DATETIME_FORMAT = datetime_format)
                          out_orig = orig_wihtout_dupli
                          out_orig[,which(colnames(out_orig)== datetime_header)] = format(out_orig[,which(colnames(out_orig)== datetime_header)], format = datetime_format)
                          colnames(out_orig) = colnames(header)
                          out_original=rbind(header[-1,],out_orig)
                          file_name_original = paste(substring(file_names[k], 1, nchar(file_names[k])-4), ".dat",sep = "")

                          if(write_output_files == TRUE){
                            # --- write old ---
                            write.csv(out_mydata_old,paste(output_dir_data,file_name_output_old,sep = ""),quote = F,row.names = F, na = "NaN")
                            file_name_output_csv = paste(substring(file_name_output_old, 1, nchar(file_name_output_old)-4),".csv",sep="")
                            output_dir_data_csv = substring(output_dir_data, 1, nchar(output_dir_data)-10)  ### NOTE: one directory level up (strip the 10 characters of "Files_dat/")
                            file.copy(from = paste(output_dir_data,file_name_output_old,sep = ""), to = paste(output_dir_data_csv,file_name_output_csv,sep = ""), overwrite = T)

                            # --- write new ---
                            write.csv(out_mydata_new,paste(output_dir_data,file_name_output_new,sep = ""),quote = F,row.names = F, na = "NaN")
                            file_name_output_csv = paste(substring(file_name_output_new, 1, nchar(file_name_output_new)-4),".csv",sep="")
                            output_dir_data_csv = substring(output_dir_data, 1, nchar(output_dir_data)-10)  ### NOTE: one directory level up (strip the 10 characters of "Files_dat/")
                            file.copy(from = paste(output_dir_data,file_name_output_new,sep = ""), to = paste(output_dir_data_csv,file_name_output_csv,sep = ""), overwrite = T)

                            # --- write original ---
                            write.csv(out_original,paste(output_dir_raw,file_name_original,sep = ""),quote = F,row.names = F, na = "NaN")

                          }
                        }
                        # flag_missing_records_new_tmp = c(flag_missing_records_new_tmp, 1)   # to be verified!
                      }
                    }

                  }else{
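                    # In this branch the checked data and the raw data are written directly
                    # to the output files (no merge with previously stored data).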

                    colnames(header) = header[1,]
                    mydata = time_to_char(DATA = mydata,DATETIME_HEADER = datetime_header,DATETIME_FORMAT = datetime_format)
                    out_my = mydata
                    colnames(out_my) = colnames(header)
                    out_mydata=rbind(header[-1,],out_my)
                    file_name_output = file_names[k]

                    orig_wihtout_dupli = time_to_char(DATA = orig_wihtout_dupli,DATETIME_HEADER = datetime_header,DATETIME_FORMAT = datetime_format)
                    out_orig = orig_wihtout_dupli
                    out_orig[,which(colnames(out_orig)== datetime_header)] = format(out_orig[,which(colnames(out_orig)== datetime_header)], format = datetime_format)
                    colnames(out_orig) = colnames(header)
                    out_original=rbind(header[-1,],out_orig)
                    file_name_original = paste(substring(file_names[k], 1, nchar(file_names[k])-4), ".dat",sep = "")
                    flag_missing_records_new_tmp = c(flag_missing_records_new_tmp, 0)
                    if(write_output_files == TRUE){

                      if(header_check == TRUE){
                        # keep logger_info_file up to date!
                        w_logger = which(logger_info_csv[,1] == station_name)
                        new_logger_info = cbind(station_name,header[1,1:8])
                        colnames(new_logger_info) = colnames(logger_info_csv)
                        logger_info_csv[w_logger,] = new_logger_info
                        write.csv(logger_info_csv,logger_info_file,row.names = F, na = "")
                      }

                      # write total .dat
                      write.csv(out_mydata,paste(output_dir_data,file_name_output,sep = ""),quote = F,row.names = F, na = "NaN")
                      write.csv(out_original,paste(output_dir_raw,file_name_original,sep = ""),quote = F,row.names = F, na = "NaN")

                      # write total .csv
                      file_name_output_csv = paste(substring(file_name_output, 1, nchar(file_name_output)-4),".csv",sep="")
                      output_dir_data_csv = substring(output_dir_data, 1, nchar(output_dir_data)-10)  ### NOTE: one directory level up (strip the 10 characters of "Files_dat/")
                      file.copy(from = paste(output_dir_data,file_name_output,sep = ""), to = paste(output_dir_data_csv,file_name_output_csv,sep = ""), overwrite = T)
                      # write.csv(db_mydata, paste(database_dir ,substring(file_name_output,1, nchar(file_name_output)-8),date_to_print_filename, ".csv",sep = ""),quote = F,row.names = F, na = "NaN")
                    }


                  }
                }
              }
            }

            # collapse the per-file temporary flags into single station-level flags (0 = all processed files clean, 1 = at least one file flagged, 50 = record check disabled)
            if(all(flag_new_duplicates_rows_tmp == 0)){
              flag_new_duplicates_rows = 0
            } else{
              flag_new_duplicates_rows = 1
            }

            if(all(flag_new_overlap_tmp == 0)){
              flag_new_overlap = 0
            } else{
              flag_new_overlap = 1
            }

            if(all(flag_new_missing_dates_tmp == 0)){
              flag_new_missing_dates = 0
            } else{
              flag_new_missing_dates = 1
            }

            if(record_check == 1){
              if(all(flag_missing_records_new_tmp == 0)){
                flag_missing_records_new = 0
              } else{
                flag_missing_records_new = 1
              }
            }else{
              flag_missing_records_new = 50
            }
          }

        }

      }
    }
  }

  # }
  # }

  # new_missing_index_date_tot

  flags_names = c("flag_empty","flag_logger_number","flag_error_df","flag_date","flag_duplicates_rows","flag_overlap","flag_missing_records","flag_missing_dates","flag_range_variable_to_set","flag_range_variable_new","flag_out_of_range", "flag_new_duplicates_rows", "flag_new_overlap", "flag_new_missing_dates" ,"flag_missing_records_new")
  flags_df = data.frame(flags_names, rep(NA,times = length(flags_names)))
  colnames(flags_df) = c("flag_names", "value")
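  # collect the value of each flag variable that was set during processing;
  # flags belonging to checks that were never reached keep the value NA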

  for(i in 1: nrow(flags_df)){
    if(exists(flags_names[i])){
      flags_df$value[i] = get(flags_names[i])
    }
  }

  if(!exists("mydata")){
    mydata= NULL

  }

  if(!exists("mydata_out_of_range")){
    mydata_out_of_range= NULL

  }

  # output1 = list(mydata, flags_df,file_names)

  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # PART 2 --> PREPARE STATISTICS AND REPORT INFORMATION
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  # - - - -  Provide information of empty file - - - - - - - - - - - - -

  if(!is.na(flag_empty) & flag_empty == 1){

    output_empty = list("Y", NA)
    names(output_empty) =c("Status", "Values")
  }else{
    output_empty = list("N",NA)
    names(output_empty) =c("Status", "Values")
  }

  # - - - -  Provide difference on logger numbers - - - - - - - - - - - - -



  if(!is.na(flag_logger_number) & flag_logger_number == 1){
    # logger_difference

    # file_logger_numb = logger_number
    # old_logger_numb = logger_info[,2]
    # logger_numbers=c(old_logger_numb,file_logger_numb)
    # names(logger_numbers) = c("old", "new")

    output_logger_number = list("Y",logger_difference)
    names(output_logger_number) = c("Status", "Values")

  }else{
    output_logger_number =list("N", NA)
    names(output_logger_number) = c("Status", "Values")
  }

  # - - - -  Provide difference on data structure - - - - - - - - - - - - -

  if(!is.na(flag_error_df) & (flag_error_df == 1)){
    output_structure = list("Y",df_out)
    names(output_structure) = c("Status", "Values")
  }else{
    if(!is.na(flag_error_df) & (flag_error_df == -1)){
      output_structure = list("Y",df_out)
      names(output_structure) = c("Status", "Values")
    }else{
      if(!is.na(flag_error_df) & (flag_error_df == 2)){
        output_structure = list("Y", df_out)
        names(output_structure) = c("Status", "Values")
      }else{
        output_structure = list("N",NA)
        names(output_structure) = c("Status", "Values")
      }

    }
  }

  # - - - -  Structure change: warning - - - - - - - - - - - - -

  if(exists("df_difference")){
    if(!is.na(flag_error_df) & (flag_error_df == 0  &  nrow(df_difference) != 0 )){
      # structure_message = df_difference
      output_structure_change = list("Y",df_difference)
      names(output_structure_change) = c("Status", "Values")
    }else{
      output_structure_change = list("N",NA)
      names(output_structure_change) = c("Status", "Values")
    }
  }else{
    output_structure_change = list("N",NA)
    names(output_structure_change) = c("Status", "Values")
  }

  # - - - -  Provide date issue - - - - - - - - - - - - -

  if(!is.na(flag_date) & flag_date == 1){
    dates_flag_date = c(start_date, recent_date)
    names(dates_flag_date) = c("Download_table_date", "Last_file_date")
    output_no_new_data = list("Y", dates_flag_date)
    names(output_no_new_data) =c("Status", "Values")
  }else{
    output_no_new_data = list("N",NA)
    names(output_no_new_data) =c("Status", "Values")
  }


  # - - - -  Provide overlaps - - - - - - - - - - - - -   # to modify! --> report periods (start/end) instead of a list of dates


  if(!is.na(flag_overlap) & flag_overlap == 1){
    # overlap_date = as.character(as.POSIXct(unique(overlap[,which(colnames(overlap) == datetime_header)]), tz = "Etc/GMT-1"))
    overlap_date = as.POSIXct(unique(overlap[,which(colnames(overlap) == datetime_header)]), tz = "Etc/GMT-1")

    min_date = min(overlap_date)
    max_date = max(overlap_date)

    seq_date = seq(min_date, max_date, by = datetime_sampling)
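    # Run-length trick: mark overlapping timestamps with 1 along the regular time
    # sequence, pad with a leading/trailing 0 and apply diff(): +1 marks the first
    # timestamp of a contiguous overlap period, -1 the position just after its last
    # timestamp (hence the "- 1" when computing end_overlap).
    # e.g. stat = c(0, 0,1,1,1,0, 0)  -->  diff(stat) = c(0, 1, 0, 0, -1, 0)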


    stat = rep(0, times = length(seq_date))
    stat[which(seq_date %in% overlap_date )] = 1
    stat = c(0,stat,0)
    diff = diff(stat)

    start_overlap = seq_date[which(diff == 1)]
    end_overlap = seq_date[which(diff == -1)-1]

    n_record = which(diff == -1)-which(diff == 1)

    # end_tmp = seq(end_overlap,by = datetime_sampling,length.out = 2)[2]  # fix error in difference!
    # hour_overlap =  difftime(time1 = end_tmp,time2 = start_overlap,units = "hours")
    hour_overlap =  difftime(time1 = end_overlap,time2 = start_overlap,units = "hours")

    overl_table = data.frame(start_overlap, end_overlap, n_record, hour_overlap)
    colnames(overl_table) = c("From", "To", "Number of Record", "Hours")

    output_overlap = list("Y", overl_table)
    names(output_overlap) =c("Status", "Values")
  }else{
    if(!is.na(flag_new_overlap) & flag_new_overlap == 1){                                   #  possible error on new overlap --> check whether the correct overlaps are picked up!
      overlap_date = as.POSIXct(unique(new_overlap_tot[,which(colnames(new_overlap_tot) == datetime_header)]), tz = "Etc/GMT-1")
      min_date = min(overlap_date)
      max_date = max(overlap_date)

      seq_date = seq(min_date, max_date, by = datetime_sampling)
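      # same run-length trick as above: collapse the overlapping timestamps into contiguous periods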


      stat = rep(0, times = length(seq_date))
      stat[which(seq_date %in% overlap_date )] = 1
      stat = c(0,stat,0)
      diff = diff(stat)

      start_overlap = seq_date[which(diff == 1)]
      end_overlap = seq_date[which(diff == -1)-1]

      n_record = which(diff == -1)-which(diff == 1)

      hour_overlap =  difftime(time1 = end_overlap,time2 = start_overlap,units = "hours")

      overl_table = data.frame(start_overlap, end_overlap, n_record, hour_overlap)
      colnames(overl_table) = c("From", "To", "Number of Record", "Hours")

      output_overlap = list("Y", overl_table)
      names(output_overlap) =c("Status", "Values")
    }else{
      output_overlap = list("N", NA)
      names(output_overlap) =c("Status", "Values")
    }
  }

  # - - - -  Provide duplicated rows - - - - - - - - - - - - -   # to modify! --> report periods (start/end) instead of a list of dates


  if(!is.na(flag_duplicates_rows) & flag_duplicates_rows == 1){
    output_duplicates_rows = list("Y", NA)
    names(output_duplicates_rows) =c("Status", "Values")
  }else{
    if(!is.na(flag_new_duplicates_rows) & flag_new_duplicates_rows == 1){
      output_duplicates_rows = list("Y", NA)
      names(output_duplicates_rows) =c("Status", "Values")
    }else{
      output_duplicates_rows = list("N", NA)
      names(output_duplicates_rows) =c("Status", "Values")
    }
  }
  # - - - -  Provide table of missing records - - - - - - - - - - - - -

  if(!exists("records_missing")){
    records_missing = as.data.frame(matrix(ncol = 6, nrow = 0))
    colnames(records_missing) = c("Datetime_From","Datetime_To", "Datetime_Missing"," Record_From", "Record_To","Record_Missing")
  }
  if(!exists("records_missing_new")){
    records_missing_new = as.data.frame(matrix(ncol = 6, nrow = 0))
    colnames(records_missing_new) = c("Datetime_From","Datetime_To", "Datetime_Missing"," Record_From", "Record_To","Record_Missing")
  }

  if(!exists("records_restart")){
    records_restart = as.data.frame(matrix(ncol = 6, nrow = 0))
    colnames(records_restart) = c("Datetime_From","Datetime_To", "Datetime_Missing"," Record_From", "Record_To","Record_Missing")
  }

  if(!exists("records_restart_new")){
    records_restart_new = as.data.frame(matrix(ncol = 6, nrow = 0))
    colnames(records_restart_new) = c("Datetime_From","Datetime_To", "Datetime_Missing"," Record_From", "Record_To","Record_Missing")
  }

  if(nrow(records_missing) != 0 | nrow(records_missing_new) != 0){
    table_missing_record = rbind(records_missing[,c(1:5)],records_missing_new[,c(1:5)])
    colnames(table_missing_record) = c("Last.date.Before", "First.date.After", "Date.Gap","Last.record.Before","First.record.After")
  }else{
    table_missing_record = data.frame()
  }

  if(nrow(records_restart) != 0 | nrow(records_restart_new) != 0){
    table_restart_record = rbind(records_restart[,c(1:5)],records_restart_new[,c(1:5)])
    colnames(table_restart_record) =  c("Last.date.Before", "First.date.After", "Date.Gap","Last.record.Before","First.record.After")
    table_restart_record = table_restart_record[table_restart_record$Date.Gap != 0,]
  }else{
    table_restart_record = data.frame()
  }

  if(nrow(table_missing_record) != 0){
    output_missing_record = list("Y", table_missing_record)
    names(output_missing_record) =c("Status", "Values")
  }else{
    output_missing_record = list("N", NA)
    names(output_missing_record) =c("Status", "Values")
  }

  if(nrow(table_restart_record) != 0){
    output_restart_record = list("Y", table_restart_record)
    names(output_restart_record) =c("Status", "Values")
  }else{
    output_restart_record = list("N", NA)
    names(output_restart_record) =c("Status", "Values")
  }



  # - - - -  Provide missing dates - - - - - - - - - - - - -
  # missing_index_date

  # NOTE: the block below (marked with ************) was developed to avoid missing/restart records being misclassified as missing dates

  if((!is.na(flag_missing_dates) & flag_missing_dates == 1)|(!is.na(flag_new_missing_dates) & flag_new_missing_dates == 1)){

    date_missing = rbind(missing_index_date,new_missing_index_date_tot)

    # ************
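    # Expand each missing/restart record interval into the individual timestamps it
    # covers; these timestamps are removed from date_missing below, so that gaps
    # already reported as record gaps or logger restarts are not reported again as
    # missing dates.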
    if(nrow(table_missing_record) >0){
      record_missing = as.POSIXct(rep(NA, 0))
      for( k in 1:nrow(table_missing_record)){
        record_missing_tmp = seq(from = as.POSIXct(table_missing_record$Last.date.Before[k], format = datetime_format, tz ="Etc/GMT-1"),
                                 to   = as.POSIXct(table_missing_record$First.date.After[k], format = datetime_format, tz ="Etc/GMT-1"), by = datetime_sampling)
        record_missing_tmp = record_missing_tmp[-c(1,length(record_missing_tmp))]
        record_missing = c(record_missing, record_missing_tmp)

      }
    }else{record_missing = as.POSIXct(rep(NA, 0))}

    if(nrow(table_restart_record) >0){
      k=1
      record_restart = as.POSIXct(rep(NA, 0))
      for( k in 1:nrow(table_restart_record)){
        record_restart_tmp = seq(from = as.POSIXct(table_restart_record$Last.date.Before[k], format = datetime_format, tz ="Etc/GMT-1"),
                                 to   = as.POSIXct(table_restart_record$First.date.After[k], format = datetime_format, tz ="Etc/GMT-1"), by = datetime_sampling)
        record_restart_tmp = record_restart_tmp[-c(1,length(record_restart_tmp))]
        record_restart = c(record_restart, record_restart_tmp)
      }
    }else{record_restart = as.POSIXct(rep(NA, 0))}

    record_tot = c(record_missing, record_restart)

    if(length(record_tot) != 0){
      w_rec = which(date_missing$Date %in% record_tot)
      if(length(w_rec) != 0){   # guard: subsetting with -integer(0) would drop every row
        date_missing = date_missing[-w_rec,]
      }
    }

    if(nrow(date_missing) == 0){
      output_date_missing = list("N", NA)
      names(output_date_missing) =c("Status", "Values")
    }else{

      time_tot <- as.POSIXct(mydata[,which(colnames(mydata) == datetime_header)], format = datetime_format, tz = 'Etc/GMT-1' )
      # time_tot <- c(new_missing_index_date$Date, time_tot)
      # time_tot <- unique(c(date_missing$Date,time_tot)) # doesn't work! datetime issues --> unexpected timezone conversion! Why?
      time_tot <- as.POSIXct(unique(c(as.character(date_missing$Date), as.character(time_tot))),tz = "Etc/GMT-1")

      time_tot = time_tot[order(time_tot)]

      # time_missing <- missing_index_date[,2]
      time_missing <- as.POSIXct(date_missing[,2], format = datetime_format, tz = "Etc/GMT-1")


      df_missing <- data.frame(time_tot,rep("Dates in original file",times = length(time_tot)))
      colnames(df_missing) = c("time","Status")
      df_missing[which(time_tot %in% time_missing ),2] = "Missing dates filled"
      y = rep(1, times = length(time_tot))

      Status_num = rep(1,times = length(time_tot))
      Status_num[which(time_tot %in% time_missing )] = 0

      df_missing = cbind(df_missing, y,Status_num)
      Status_num_NA=df_missing

      Status_num_NA = Status_num_NA[,-c(2,3)]
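      # Status_num is 1 for timestamps present in the original data and 0 for filled
      # (missing) ones: in differ, -1 marks the first missing timestamp of a gap and
      # +1 the first available timestamp after it, so end = which(differ == 1) - 1
      # points at the last missing timestamp.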

      differ = c(0,diff(Status_num_NA$Status_num))

      if(Status_num_NA$Status_num[1] == 0){
        differ[1] = -1
      }

      start = which(differ == -1)
      end  = which(differ == 1) - 1
      # gap_lenght = end - start + 1
      gap_lenght = end - start

      date_start = Status_num_NA$time[start]
      date_end = Status_num_NA$time[end]

      if(length(date_end) != 0){
        date_end_tmp = as.POSIXct("1990-01-01 00:00")    # this loop shifts each gap end forward by one sampling step (workaround for a time-difference issue)
        for(k in 1:length(date_end)){
          date_end_tmp[k] =  seq.POSIXt(date_end[k], by = datetime_sampling, length.out = 2)[2]
        }
        gap_hour = difftime(time1 = date_end_tmp,time2 = date_start,units = "hours")
      }else{
        gap_hour = numeric(0)
      }

      statistic_missing = data.frame(date_start,date_end,gap_lenght,gap_hour)
      colnames(statistic_missing) = c("From", "To", "Number of Record", "Hours")
      statistic_missing[,1:2] = format(statistic_missing[,1:2], format = datetime_format)

      date_missing_period = statistic_missing

      output_date_missing = list("Y",date_missing_period )
      # output_restart_record = list("N", NA)
      names(output_date_missing) =c("Status", "Values")
    }
  }else{
    # date_missing = NULL
    output_date_missing = list("N", NA)
    names(output_date_missing) =c("Status", "Values")
  }

  # - - - -  Provide ALERT out of range - - - - - - - - - - - - -


  if((!is.na(flag_out_of_range_ALERT) & flag_out_of_range_ALERT == 1)){
    output_out_of_range_ALERT = list("Y", alert_out_of_range_table)
    names(output_out_of_range_ALERT) =c("Status", "Values")
  }else{
    output_out_of_range_ALERT = list("N", NA)
    names(output_out_of_range_ALERT) =c("Status", "Values")
  }

  # - - - -  Provide out of range - - - - - - - - - - - - -


  if((!is.na(flag_out_of_range) & flag_out_of_range == 1)){
    # out_of_range_table[,3:4] = format(out_of_range_table[,3:4], format = datetime_format)
    output_out_of_range = list("Y", out_of_range_table)
    names(output_out_of_range) =c("Status", "Values")
  }else{
    output_out_of_range = list("N", NA)
    names(output_out_of_range) =c("Status", "Values")
  }




  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # output_empty
  # output_logger_number
  # output_structure
  # output_no_new_data
  # output_overlap
  # output_missing_record
  # output_restart_record
  # output_date_missing
  # output_out_of_range

  errors_output = list(output_empty,
                       output_logger_number,
                       output_structure,
                       output_structure_change,
                       output_no_new_data,
                       output_overlap,
                       output_duplicates_rows,
                       output_missing_record,
                       output_restart_record,
                       output_date_missing,
                       output_out_of_range_ALERT,
                       output_out_of_range )

  names(errors_output) = c("err_empty",
                           "err_logger_number",
                           "err_structure",
                           "err_structure_change",
                           "err_no_new_data",    # "err_date_issue",
                           "err_overlap",
                           "err_duplicates_rows",
                           "err_missing_record",
                           "err_restart_record",
                           "err_date_missing",
                           "err_range_alert",
                           "err_out_of_range")


  # status = lapply(errors_output, function(x) x[[1]])



  # output2 = list(mydata, flags_df,file_names, logger_numbers, structure_message, overlap_date, table_missing_record, table_restart_record,date_missing)
  output2 = list(mydata, flags_df, file_names, errors_output, mydata_out_of_range)




  return(output2)
}
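# ---------------------------------------------------------------------------
# Example call (illustrative sketch only): the directory names, file names and
# parameter values below are hypothetical and must be adapted to the actual
# project layout; see the parameter documentation above. The block is wrapped
# in if(FALSE) so it is never executed when the package is loaded.
if(FALSE){
  res = DQC_function(input_dir           = "Data/Incoming/",
                     output_dir_data     = "Data/Total/Files_dat/",
                     output_dir_raw      = "Data/Raw/",
                     output_dir_report   = "Data/Reports/",      # currently not used
                     project_dir         = "MyProject/",         # currently not used
                     data_from_row       = 5,
                     header_row_number   = 2,
                     datetime_header     = "TIMESTAMP",
                     datetime_format     = "%Y-%m-%d %H:%M",
                     datetime_sampling   = "15 min",
                     record_header       = "RECORD",
                     sep                 = ",",
                     range_file          = "range_limits.csv",
                     write_output_files  = TRUE,
                     write_output_report = FALSE,                # currently not used
                     file_name           = "StationA_Meteo.dat",
                     station_name        = "StationA",
                     start_date          = as.POSIXct("2019-01-01 00:00", tz = "Etc/GMT-1"),
                     logger_info_file    = "logger_info.csv",
                     record_check        = 1,
                     mail_file_alert     = "mail_alert.csv",
                     use_alert_station_flag    = TRUE,
                     use_realtime_station_flag = TRUE,
                     header_check        = TRUE)

  # elements of the returned list (see output2 above):
  res[[1]]   # mydata
  res[[2]]   # flags_df
  res[[3]]   # file_names
  res[[4]]   # errors_output ("Status"/"Values" pairs, one per check)
  res[[5]]   # mydata_out_of_range
}
# ---------------------------------------------------------------------------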