percolator: ForTraCC Post-Processing Tool

Documented in get_prec_under_tb_mm

#' Calculate Precipitation related to the ForTraCC Tb objects
#'
#' @description This function calculates the precipitation area and volume under
#' the cloud shield. It overlap the brightness temperature (Tb) objects (clusters) 
#' generated by ForTracc with the corresponding precipitation field. Requires monthly
#' input files with hourly precipitation.
#'
#' @param day Integer. Day of the tracking or NULL (default).
#' @param month Integer. Month of the tracking or NULL (default).
#' @param ncols Integer. Number of columns in the native data.
#' @param nlins Integer. Number of lines in the native data.
#' @param type Character. Type of data used for tracking the MCSs. It will be 
#' added at the dataset for identification purposes.
#' @param year_start Integer. Starting year of the data.
#' @param year_end Integer. Ending year of the data.
#' @param cluster_prefix Character. 
#' @param basename Character. It's the basename for the output file. Default is "fam".
#' @param pcp_file Character. Precipitation monthly file name. 
#' Ex: "pathi_to_prec/200101_PREC_ACC_NC-on_IMERG_grid.nc"
#' @param pcp_varname Character. Precipitation variable name. Ex: "PREC_ACC_NC"
#' @param pcp_time_name Character. Time variable name. Default: "time"
#' @param area_file Character. Path to the area file.
#' @param family_file Character. Path to the family file.
#' @param path_to_fortracc_clusters Character. Path to the ForTraCC cluster files.
#' @param path_to_masks_files Character. Path to the masks files.
#' @param plot Logical. If clusters over precipitation should be plotted at the
#' path_to_masks_files. See figs directory inside that path. Default: FALSE.
#'
#' @return A csv file.
#' @importFrom data.table fread fwrite rbindlist :=
#' @importFrom ncdf4 nc_open nc_close ncvar_get
#' @importFrom ggplot2 ggplot geom_tile geom_contour scale_fill_gradient theme_minimal labs aes
#' @importFrom reshape2 melt
#' @importFrom grDevices dev.off png
#' @export
#' @examples \dontrun{
#' get_prec_under_tb(day=NULL,
#'             month = 1,
#'             ncols = 480,
#'             nlins = 690,
#'             type = "WRF",
#'             year_start = 2000,
#'             year_end = 2000,
#'             cluster_prefix = "WRF_20y_p_",
#'             area_file = system.file("extdata/grid_area_saag/area_wrf_km2.nc", package = "percolator"),
#'             family_file = "diag.txt/fam_SAAG_WRF_20yr_p_0112_s2.txt",
#'             path_to_prec_file = paste0("DATA/WRF/PREC_ACC_NC/"),
#'             path_to_fortracc_clusters = "clusters/",
#'             path_to_masks_files = "MCSs_MASKs/")
#' } 
get_prec_under_tb_mm <- function(day=NULL,
                              month=NULL,
                              year_start,
                              year_end,
                              ncols,
                              nlins,
                              type,
                              family_file,
                              cluster_prefix,
                              pcp_file,
                              pcp_varname,
                              basename = "fam",
                              pcp_time_name = "time", 
                              path_to_fortracc_clusters,
                              path_to_masks_files,
                              plot = FALSE,
                              area_file = system.file("extdata/grid_area_saag/area_wrf_km2.nc", 
                                                      package = "percolator")
) {
  # Get precipitation file
  read_netcdf <- function(file_path, 
                          var_name, 
                          time_name = "time") {
    nc <- ncdf4::nc_open(file_path)
    var <- ncdf4::ncvar_get(nc, var_name)
    time_var <- ncdf4::ncvar_get(nc, time_name)
    time_units <- ncdf4::ncatt_get(nc, time_name, "units")$value
    start_date <- sub("hours since ", "", time_units)
    start_date <- as.POSIXct(start_date, tz="UTC")
    ncdf4::nc_close(nc)
    pcp_date_seq <- seq(from = start_date, 
                        by = "hour", 
                        length.out = length(time_var))
    l_all <- list(var, pcp_date_seq)
    return(l_all)
  }
  if (file.exists(pcp_file)) {
    l_pcp <- read_netcdf(file_path = pcp_file, 
                         var_name = pcp_varname,
                         time_name = pcp_time_name)
  } else {
    stop("File not found: ", pcp_file)
  }
  pcp_timesteps_df <- data.table::data.table(index = seq_along(l_pcp[[2]]),
                                             date = l_pcp[[2]],
                                             yyyymmddhh = format(l_pcp[[2]], 
                                                                 "%Y%m%d%H"))
  # For plots if plot = TRUE
  figs_dir <- file.path(path_to_masks_files, "figs")
  if (!dir.exists(figs_dir)) dir.create(figs_dir, recursive = TRUE)
  
  # Open area file
  nc_area <- ncdf4::nc_open(area_file)
  area <- ncdf4::ncvar_get(nc_area, "Band1")
  ncdf4::nc_close(nc_area)
  
  # Read family data
  dt_fam <- data.table::fread(family_file, fill = TRUE)
  dt_fam$HOUR_INI <- dt_fam$HOUR
  dt_fam$HOUR <- dt_fam$HOUR2
  dt_fam$HOUR2 <- NULL
  dt_fam$MONTH_INI <- ifelse(trunc(dt_fam$TIME) == 0, dt_fam$MONTH, NA)
  dt_fam$DAY_INI <- ifelse(trunc(dt_fam$TIME) == 0, dt_fam$DAY, NA)
  dt_fam[, MONTH_INI := MONTH_INI[which(!is.na(MONTH_INI))[1]], by = FAMILY]
  dt_fam[, DAY_INI := DAY_INI[which(!is.na(DAY_INI))[1]], by = FAMILY]
  
  # Define intermediate file path
  base_intermedio_path <- paste0(path_to_masks_files, "/", basename,
                                 "_Tb_pcp_info_intermedio_",
                                 year_start, "-", year_end)
  if (is.null(month)) {
    # Se o mês é NULL, não adiciona mês ou dia no nome do arquivo
    intermedio_file <- paste0(base_intermedio_path, ".csv")
  } else {
    # Se mês não é NULL, converte para inteiro e formata o mês
    month_int <- as.integer(month)
    month_str <- sprintf("%02d", month_int)
    if (is.null(day)) {
      # Se o dia é NULL, adiciona apenas o mês
      intermedio_file <- paste0(base_intermedio_path, "_", month_str, ".csv")
    } else {
      day_int <- as.integer(day)
      day_str <- sprintf("%02d", day_int)
      intermedio_file <- paste0(base_intermedio_path, "_", month_str, "_", day_str, ".csv")
    }
  }
  
  # intermedio_file <- paste0(path_to_masks_files, 
  #                           "fam_SAAG_Tb_pcp_info_intermedio_", 
  #                           year_start, "-", year_end, ".csv")
  
  # Remove intermediate file if exists
  if (file.exists(intermedio_file)) {
    file.remove(intermedio_file)
  }
  
  # Obtain information about precipitation under the cloud shield
  message("Obtaining information about precipitation under the cloud shield.\nThis process may take some time. Please be patient.")
  message("(Please remember to use iterative job.)")
  
  # Iterate over unique families
  if (is.null(month)) {
    fams1 <- dt_fam
  } else {
    fams1 <- dt_fam[MONTH_INI == month]
  }
  
  if (is.null(day)) {
    fams <- fams1
  } else {
    fams <- fams1[DAY_INI == day]
  }
  
  
  uf <- unique(fams$FAMILY)

  ldf <- list()
  for(i in seq_along(uf)) {
    df <- fams[FAMILY == uf[i],]
    
    if (nrow(df) < 4) {
      next
    }
    
    print(paste0("MONTH = ", month, " DAY = ", day, " FAMILY = ", uf[i] , " / ",
                 length(uf), "  Max fam:", max(uf)))
    
    # Initialize lists to store results
    l_Tb_SIZE_km2 <- list()
    lsum <- list()
    lmax <- list()
    larea_pcp <- list()
    lvol <- list()
    
    # Iterate over rows in the dataframe
    for(j in seq_along(df$FAMILY)) {
      SYS <- df$`SYS#`[j]
      SYS_date <- sprintf("%04d%02d%02d%02d", 
                          df$YEAR[j], 
                          df$MONTH[j],
                          df$DAY[j], 
                          df$HOUR[j])
      
      # Precipitation file
      pcp_index <- pcp_timesteps_df[yyyymmddhh == SYS_date]$index
      m_pcp <- l_pcp[[1]][,,pcp_index]
      
      # Open Tb clusters
      cluster_files <- list.files(path = path_to_fortracc_clusters,
                                  pattern = paste0(cluster_prefix, SYS_date), 
                                  # sprintf("%s%04d%02d%02d%02d", 
                                  #                 cluster_prefix, 
                                  #                 df$YEAR[j], 
                                  #                 df$MONTH[j], 
                                  #                 df$DAY[j], 
                                  #                 df$HOUR[j]),
                                  full.names = TRUE)
      
      if (length(cluster_files) > 0) {
        cluster <- file(cluster_files[1], "rb")
        v_bin <- tryCatch(
          readBin(cluster, what = "integer", n = ncols * nlins * 2, size = 2),
          error = function(e) {
            message("Error reading binary data: ", e$message)
            NA
          }
        )
        close(cluster)
      } else {
        v_bin <- NA
      }
      
      # Transfor the Tb clusters vector data into matrix to make operations
      m_bin <- matrix(v_bin,
                      nrow = ncols, # precisa ser invertido!!
                      ncol = nlins) # precisa ser invertido!!
      m_bin <- m_bin[,nlins:1]
      
      # Calculate the area of the system 
      rtb <- m_bin
      rtb <- ifelse(rtb == SYS, area, NA) # without the merging systems
      Tb_SIZE_km2 <- sum(rtb, na.rm = T)
      l_Tb_SIZE_km2[[j]] <- Tb_SIZE_km2
      
      # Create a raster for precipitation identical to the Tb cluster raster
      r3 <- m_bin # creating raster for precipitation identical to Tb cluster raster
      
      # Selects precipitation below the Tb cluster numbered as SYS in the
      # remaining FAMILY in fam.txt after SAAG Tb Criteria.
      # Therefore, r3 is a raster with only one (or more in the merging cases) 
      # precipitation "cluster"
      r3 <- ifelse(r3 == SYS, m_pcp, NA) # without the merging systems
      #r3[] <- ifelse(r3[] == SYS | r3[] %in% SYS_ANT, r_pcp[], NA)
      
      # Save statistics of each precipitation clusters.
      # They will be used for filtering according to SAAG PCP Criteria.
      lsum[[j]] <- sum(r3, na.rm = TRUE)
      lmax[[j]] <- max(r3, na.rm = TRUE)
      # obtaining precipitation volume --> integrate the multiplication of the
      # pcp value in each pixel by the area of each pixel
      vol_in_the_px <- r3 * area
      lvol[[j]] <- sum(vol_in_the_px, na.rm = TRUE)
      # calculate the total area occupied by precipitation under the cloud shield
      r4 <- r3
      r4 <- ifelse(!is.na(r4), area, NA)
      larea_pcp[[j]] <- sum(r4, na.rm = TRUE)
      
      if (plot) {
        # Prepare data
        mat1_df <- reshape2::melt(m_bin)
        names(mat1_df) <- c("X", "Y", "value")
        mat2_df <- reshape2::melt(m_pcp)
        names(mat2_df) <- c("X", "Y", "value")
        
        # Plot
        p <- ggplot2::ggplot() +
          ggplot2::geom_tile(data = mat1_df, 
                             ggplot2::aes(x = X, y = Y, fill = value), alpha = 0.8) +
          ggplot2::geom_contour(data = mat2_df, 
                                ggplot2::aes(x = X, y = Y, z = value), color = "black") +
          ggplot2::scale_fill_gradient(low = "blue", high = "red") +
          ggplot2::theme_minimal() +
          ggplot2::labs(x = "", y = "",
                        title = paste0("Tb clusters (shaded) & Prec (contours)", SYS_date))
        
        grDevices::png(file.path(figs_dir, paste0(SYS_date, ".png")))
        print(p)
        grDevices::dev.off()
      }
      
    }
    
    # Unlist the individual and store them for the FINAL file
    df$Tb_SIZE_km2 <- unlist(l_Tb_SIZE_km2)
    df$pcp_sum <- unlist(lsum)
    df$pcp_max <- unlist(lmax) # Usar este para: Minimum peak precipitation - 10 mm/h at least one pixel (0.1 x 0.1 deg for 1 h) inside the whole MCS for 4 continuous hours
    df$pcp_total_vol <- unlist(lvol)
    df$pcp_total_area <- unlist(larea_pcp)
    
    ldf[[i]] <- df
    
    # Concatenate the calculations until this family as a bkp in case of any failure
    data.table::fwrite(df,
                       file = intermedio_file, row.names = FALSE, append = TRUE)
    gc()
  }
  
  # Get the final data.table with the calculations
  dt <- data.table::rbindlist(ldf)
  dt$MONTH_INI <- NULL
  dt$DAY_INI <- NULL
  
  # Define FINAL file path
  base_path <- paste0(path_to_masks_files, "/", basename,
                      "_Tb_pcp_info_intermedio_FINAL_",
                      year_start, "-", year_end)
  
  if (is.null(month)) {
    # Se o mês é NULL, não adiciona mês ou dia no nome do arquivo
    ofile <- paste0(base_path, ".csv")
  } else {
    month_int <- as.integer(month)
    month_str <- sprintf("%02d", month_int)
    if (is.null(day)) {
      # Se o dia é NULL, adiciona apenas o mês
      ofile <- paste0(base_path, "_", month_str, ".csv")
    } else {
      day_int <- as.integer(day)
      day_str <- sprintf("%02d", day_int)
      ofile <- paste0(base_path, "_", month_str, "_", day_str, ".csv")
    }
  }
  
  
  # ofile <- paste0(path_to_masks_files,
  #                 "/fam_SAAG_Tb_pcp_info_intermedio_FINAL_",
  #                 year_start, "-", year_end,
  #                 "_", sprintf(as.integer(month), fmt = "%02d"), ".csv")
  
  # Remove the FINAL file path if it already exists
  if (file.exists(ofile)) {
    file.remove(ofile)
  } else {
    print("no files to remove")
  }
  
  print(paste0("Saving ", ofile))
  # Save the FINAL Monthly file
  data.table::fwrite(dt,
                     file = ofile,
                     row.names = FALSE)
  
  print(paste0("Precipitation under Tb contiguous area for ", ofile,
               " were just obtained."))
  print("FIM !!!!! ")
  # Need to run step_02 and write_nc for generating the Mask files."
}