R/bkp/bkp_functions.R

Defines functions get_data_station_MECC2

# functions to process MECC for drought analysis


# Function to extract data from file.
# Returns list with two elements (1) data series; (2) data frame with info about station
#' Function to extract data from file.
#'
#' @param station_id DWD station number (can be obtained with `rdwd::findID("StationName")`)
#' @param var_name character with the name of the variable, corresponding to the default names from DWD (e.g. kl, more_precip, solar)
#' @param date_bounds vector with two dates in the format ('yyyy-mm-dd'), indicating beggining and end of study period.
#'
#' @return list with three elements (1) data series; (2) data frame with info about station, (3) vector with `NA` count.
#'
#' @details To avoid errors and NA values in the SPEI and drought computation, preferably keep `date_bounds = NA`
#'
#'@export
#'
get_data_station_MECC2 <- function(station_id, var_name='kl', date_bounds = NA){
    # station_id <- 880
    # var_name <- 'kl'
    # date_bounds <- c("1921-1-1", "2020-12-31")
    # rain_threshold <- 1

    # adding `current = T` it takes longer but makes the function more resilient to updates
    link <- selectDWD(id=station_id, res='daily', var = var_name, per='rh', current = T)

    meta_data <- metaInfo(station_id) #metadata

    # check if variable is available
    if (var_name %in% meta_data$var){
        data <- data.frame()

        if (link[1] == link[2]){
            data <-  readDWD(dataDWD(link[1], read=FALSE), varnames=TRUE, fread = F)
        } else {
            # join data from both recent and historical data sets
            if (grepl('historical', link[2])){
                # without fread it is slower, but more reliable
                data_2 <- readDWD(dataDWD(link[2], read=FALSE), varnames=TRUE, fread = F)
                data <- rbind(data, data_2)
            }
            if (grepl('recent', link[1])){
                # without fread it is slower, but more reliable
                data_1 <- readDWD(dataDWD(link[1], read=FALSE), varnames=TRUE, fread = F)
                data <- rbind(data, data_1)
            }

        }
        # filter data by dates, if bounds are provided
        if (!is.na(date_bounds)){
            data %<>% dplyr::filter(., MESS_DATUM >= date_bounds[1], MESS_DATUM <= date_bounds[2]) %>%
                dplyr::mutate(Date= .$MESS_DATUM) %>% complete(Date= seq(from = as.Date(date_bounds[1]),
                                                                         to = as.Date(date_bounds[2]),
                                                                         by = "day"))
        } else {
            date_bounds[1] <- as.Date(min(data$MESS_DATUM, na.rm = T))
            date_bounds[2] <- as.Date(max(data$MESS_DATUM, na.rm = T))

            date_bounds <- zoo::as.Date(date_bounds)

            data %<>% dplyr::filter(., MESS_DATUM >= date_bounds[1], MESS_DATUM <= date_bounds[2]) %>%
                dplyr::mutate(Date= .$MESS_DATUM) %>% complete(Date= seq(from = as.Date(date_bounds[1]),
                                                                         to = as.Date(date_bounds[2]),
                                                                         by = "day"))
        }

        data <- data[!duplicated(data$Date),] # remove duplicates

        data %<>% setNames(sub("\\..*", "", colnames(.))) %>%#rename columns to more simple name
            filter(!is.na(STATIONS_ID))

        # View(data)

        #Check how many NAs
        na_count <- c(station_id,summarise_all(data,funs(sum(is.na(.)))))|> unlist()

        my_data <- list(data = data,
                        meta = data.frame(name = meta_data$Stationsname[1],
                                          id = station_id,
                                          latitude = meta_data$geoBreite[1],
                                          longitude = meta_data$geoLaenge[1],
                                          height = meta_data$Stationshoehe[1],
                                          county = meta_data$Bundesland[1],
                                          var = var_name),
                        na_count = na_count)

    } else {
        data <- data.frame(val = -999) # error value
        na_count <- c(station_id, rep(-888, 30)) # error value

        my_data <-list(data = data,
                       meta = data.frame(name = meta_data$Stationsname[1],
                                         id = station_id,
                                         latitude = meta_data$geoBreite[1],
                                         longitude = meta_data$geoLaenge[1],
                                         height = meta_data$Stationshoehe[1],
                                         county = meta_data$Bundesland[1],
                                         var = '**VARIABLE NOT AVAILABLE'), # WARNING
                       na_count = na_count)
    }

    return(my_data)
}


#' Function to  calculate ETP
#'
#' @param input_type character string. Either "DWD" (to use the output of function `get_data_station()`) or "MANUAL" to give each variable manually
#' @param list_data if `input_type == "DWD"`, the output of `get_data_station()`, else it is `NA`
#' @param date series of dates (daily) to be of the time series.
#' @param t_max series of maximum daily temperature (Celcius)
#' @param t_min series of minimum daily temperature (Celcius)
#' @param rh series of average daily relative humidity (-)
#' @param rh_max series of maximum daily relative humidity (-)
#' @param rh_min series of minimum daily relative humidity (-)
#' @param n_hours series of daily total number of sunlight hours (hours)
#' @param cloud_cover series of daily degree of cloud coverage (Okta scale)
#' @param mean_wind series of daily average wind speed (m.s-1)
#' @param station_metadata data frame with metadata, preferably the output from `get_data_station()`. Should contain 6 columns (name, id, latitude, longitude, height, county and var; in this exact order and names).
#' @param method carachter string with what equation should be used to estimate Evapotranspiration. Values are *HargreavesSamani* or *PenmanMonteith*.
#'
#' @details
#' All inputs (expect `station_metadata`) are available in the first element of the output of function `get_data_station()`.
#' For clarity, they can be incerted separatly
#'
#' @export
#'
calculate_ETP_daily_MECC2 <- function(input_type = 'dwd', list_data = NA, date = NA, t_max = NA,
                                     t_min = NA, rh = NA, rh_max = NA, rh_min = NA, n_hours = NA,
                                     cloud_cover = NA, mean_wind = NA, station_metadata = NA,
                                     method = 'HargreavesSamani'){
    #####
    ## Uncomment to test function
    # data_id <- get_data_station_MECC(3988, var_name = 'kl',
    #                             date_bounds = c("1900-1-1", "2020-12-31"))
    # data_sol <- get_data_station_MECC(3987, var_name = 'solar',
    #                             date_bounds = c("1900-1-1", "2020-12-31"))
    #
    #
    # data <- cbind(data_id[[1]], FG_STRAHL = data_sol[[1]]$FG_STRAHL)
    # station_metadata <- data_id[[2]]
    #
    # date = data$Date
    # t_max = data$TXK
    # t_min = data$TNK
    # rh = data$UPM
    # n_hours = data$SDK
    # cloud_cover = data$NM
    # mean_wind = data$FM
    # solar_rad = data$FG_STRAHL
    #
    # input_type = 'dwd'
    # list_data = list_data
    #
    # method = 'hargreavessamani'
    #####

    ## 1. Input
    if (tolower(method) == 'hargreavessamani'){
        if (tolower(input_type) == 'dwd'){ #load from output `get_data_station`
            date <- list_data[[1]]$Date
            t_max <- list_data[[1]]$TXK
            t_min <- list_data[[1]]$TNK

            station_metadata <- list_data[[2]]
        } else if(tolower(input_type) == 'manual'){ # load manually each vector
            cat("You selected manual input.\n
        Minimum inputs are the vectors (or columns from dataframe):
          1. date
          2. t_max
          3. tmin")
        } else {
            cat("ERROR: invalid input type!
        Please choose DWD or MANUAL.")
            return(NULL)
        }
    } else if (tolower(method) == 'penmanmonteith') {
        if (tolower(input_type) == 'dwd'){ #load from output `get_data_station`
            date <- list_data[[1]]$Date
            t_max <- list_data[[1]]$TXK
            t_min <- list_data[[1]]$TNK
            rh <- list_data[[1]]$UPM
            n_hours <- list_data[[1]]$SDK
            cloud_cover <- list_data[[1]]$NM
            mean_wind <- list_data[[1]]$FM

            station_metadata <- list_data[[2]]
        } else if(tolower(input_type) == 'manual'){ # load manually each vector
            cat("You selected manual input.\n
        Minimum inputs are the vectors (or columns from dataframe):
          1. date
          2. t_max
          3. tmin
          4. rh (or rh_min and rh_max)
          5. n_hours
          6. cloud_cover
          7. mean_wind
          8. station_metadata")
        } else {
            cat("ERROR: invalid input type!
        Please choose DWD or MANUAL.")
            return(NULL)
        }
    } else {
        cat("ERROR: invalid mehtod!
        Please choose HargreavesSamani or PenmanMonteith.")
        return(NULL)
    }


    ## 2. Calculation

    require(Evapotranspiration)
    data("constants")
    if (tolower(method) == 'penmanmonteith'){

        # if only average daily RH is provided
        if (!is.na(rh[1])){
            rh_min <- rh
            rh_max <- rh
        }

        series <- data.frame(Year = lubridate::year(date), Month = lubridate::month(date),
                             Day = lubridate::day(date),
                             Tmax = t_max, Tmin = t_min, RHmax = rh_max, RHmin = rh_min,
                             n = n_hours, Cd = cloud_cover, uz = mean_wind)

        # load in pre-set constants and variables for the ETP Calculations

        # require(Evapotranspiration)
        # data("constants")
        # load('data/constants.rda')

        constants$lat <- station_metadata$latitude
        constants$lat_rad <- station_metadata$latitude * pi/180
        constants$Elev <- station_metadata$height
        constants$z <- 2 # ?? is it alright?

        var_names <- c('Tmax', 'Tmin', 'RHmax', 'RHmin', 'n', 'Cd', 'uz')

        input <- Evapotranspiration::ReadInputs(varnames = var_names,
                                                climatedata = series, constants = constants,
                                                stopmissing=c(20,20,3), timestep="daily",
                                                message = 'no')

        results1 <- ET.PenmanMonteith(input, constants, ts="daily", solar="sunshine hours",
                                      wind="yes", crop = "short", message="no", AdditionalStats="no", save.csv="no")
        results2 <- ET.PenmanMonteith(input, constants, ts="daily", solar="cloud",
                                      wind="yes", crop = "short", message="no", AdditionalStats="no", save.csv="no")

        etp_series <- data.frame(Date = date, Julian_day = yday(date),
                                 ETP_sunshine = results1$ET.Daily, ETP_cloud = results2$ET.Daily)

    } else  {

        series <- data.frame(Year = lubridate::year(date), Month = lubridate::month(date),
                             Day = lubridate::day(date), Tmax = t_max, Tmin = t_min)

        # data("constants")

        constants$lat <- station_metadata$latitude
        constants$lat_rad <- station_metadata$latitude * pi/180
        constants$Elev <- station_metadata$height
        constants$z <- 2 # ?? is it alright?

        var_names <- c('Tmax', 'Tmin')

        input <- Evapotranspiration::ReadInputs(varnames = var_names,
                                                climatedata = series, constants = constants,
                                                stopmissing=c(20,20,3), timestep="daily",
                                                message = 'no')

        results1 <- ET.HargreavesSamani (input, constants, ts="daily", crop = "short",
                                         message="no", AdditionalStats="no", save.csv="no")

        etp_series <- data.frame(Date = date, Julian_day = yday(date),
                                 ETP = results1$ET.Daily, row.names = NULL)

    }

    ## 3. Export results
    output <- data.frame(etp_series, dplyr::select(series, !c('Year', 'Month', 'Day')), row.names = NULL) |>
        select(Date, Julian_day, Tmax, Tmin, ETP)

    return(output)
}


#' Function to calculate SPEI
#'
#' @param Date series of dates (daily) to be of the time series.
#' @param Precipitation series of daily precipitation (mm)
#' @param ETP series of minimum daily potential evapotranspiration (mm), preferably obtained with `calculate_ETP_daily_MECC()`
#' @param scale integer, the number of months for accumulation computation
#'
#'@export
#'
calculate_SPEI_MECC2 <- function(Date, Precipitation, ETP, scale = 3){

    # uncomment to test function
    # Date = station_etp$Date
    # Precipitation = data_station$data$RSK
    # ETP = station_etp$ETP

    # get monthly accumulation
    series <- tibble::tibble(date = as.POSIXct(Date),
                             prec = Precipitation,
                             etp = ETP) |>
        tibbletime::as_tbl_time(date) |>
        collapse_by('month') |>
        group_by(date) |>
        summarise(prec = sum(.data[["prec"]]),
                  etp = sum(.data[['etp']])) |>
        mutate(deficit = prec - etp) |>
        mutate(spei = as.vector(ts(SPEI::spei(deficit, scale = 3)$fitted)))|>
        stats::setNames(c('Date', 'Precipitation', 'ETP', 'Deficit', 'SPEI'))|>
        as.tibble()

    return(series)

}


#' Function to select drought events
#'
#' @param Date series of dates (monthly) to be of the time series.
#' @param SPEI series of month SPEI (-), preferably obtained with `calculate_SPEI_MECC()`
#' @param threshold scalar, maximum value of SPEI for a period to be classified as droguht
#' @param year_bounds vector with two integer values, the years of beggining and end of analysis
#'
#'@export
#'
get_drought_series2 <- function(Date, SPEI, threshold = 0, year_bounds = c(1900,2019)){

    ## uncomment to test function
    # Date = station_spei$Date
    # SPEI = station_spei$SPEI
    # threshold = 0
    # year_bounds = c(1900,2019)


    series <- data.frame(date = Date, spei = SPEI) |>
        filter(lubridate::year(Date) >= year_bounds[1] & lubridate::year(Date) <= year_bounds[2]) |>
        mutate(drought = (spei <= threshold)*1) |>
        mutate(year = lubridate::year(date), spei_aux = spei*drought,
               drought_aux = with(rle(drought), rep(cumsum(values) * values, lengths)))

    droughts <- series[,c(2:6)] |>
        group_by(drought_aux) |>
        summarise(spei_acc = -sum(.data[["spei_aux"]]),
                  year_acc = max(.data[['year']]),
                  duration = sum(.data[["drought"]])) |>
        dplyr::slice_tail(n = -1) # remove first line filled with zeros by default

    max_drought <- data.frame(year = NULL, sev = NULL, dur = NULL)
    for (d_year in unique(droughts$year_acc)){
        # d_year <- 1928
        droughts_year <- dplyr::filter(droughts, year_acc == d_year) |>
            summarise_all(max) |> dplyr::select(year_acc,spei_acc, duration) |>
            setNames(c('year', 'sev','dur'))
        max_drought <- rbind(max_drought,droughts_year)
    }

    max_drought %<>% tidyr::complete(year = year_bounds[1]:year_bounds[2],
                                     fill = list(sev = 0, dur = 0))

    return(list(drought_series = droughts,
                drought_max = max_drought))

}





######################## ENTROPY FUNCTIONS

#' Function to download MATLAB files from github
#'
#' @details no input is require. The files are saved in the WD.
#'
#' @export
#'
get_matlab_files_MECC2 <- function(){

    download.file(url = "https://github.com/pedroalencar1/DroughtSDF/tree/master/Matlab/R_Copula.m"
                  , destfile = "R_Copula.m")

    download.file(url = "https://github.com/pedroalencar1/DroughtSDF/tree/master/Matlab/get_comparison_tr.m"
                  , destfile = "get_comparison_tr.m")

    download.file(url = "https://github.com/pedroalencar1/DroughtSDF/tree/master/Matlab/get_copula.m"
                  , destfile = "get_copula.m")

    download.file(url = "https://github.com/pedroalencar1/DroughtSDF/tree/master/Matlab/get_copula_distribution.m"
                  , destfile = "get_copula_distribution.m")

    download.file(url = "https://github.com/pedroalencar1/DroughtSDF/tree/master/Matlab/get_copula_multipliers.m"
                  , destfile = "get_copula_multipliers.m")

    download.file(url = "https://github.com/pedroalencar1/DroughtSDF/tree/master/Matlab/get_distribution_comparison.m"
                  , destfile = "get_distribution_comparison.m")

    download.file(url = "https://github.com/pedroalencar1/DroughtSDF/tree/master/Matlab/get_entropy_marginals.m"
                  , destfile = "get_entropy_marginals.m")

    download.file(url = "https://github.com/pedroalencar1/DroughtSDF/tree/master/Matlab/get_return_periods.m"
                  , destfile = "get_return_periods.m")
}

#' function to run MATLAB stript of MECC
#'
#' @param input_data vector with elements in the following order:
#'   d = threshold ratio for transformation (default is 0.01, after Singh and Zhang(2018)
#'   year_separator = indicates the point of breaking between reference na study datasets
#'   export_report = logical - option to export or not a report with informaiton about the copulas
#'   export_copula = logical - option to export or not the cumulative and survival copulas
#'   values_p = a sequence of numbers (until the end of the file) with values in (0,1) of
#'   probabilities of interest to compute the return periods.
#' @param input_series dataframe with 3 columns. Columns are YEARS, X1 (SEVERITY), X2(DURATION)
#'
#' @export
#'
run_matlab_MECC2 <- function(input_data, input_series){

    write.table(input_data, 'input_data.csv', row.names = F, col.names = F)
    write.csv(input_series, 'input_series.csv', row.names = F)


    if (matlabr::have_matlab()){
        matlabr::get_matlab()
        matlabr::run_matlab_script(fname = 'R_Copula.m')
    }

}


#' function to transform marginals into (0,1) interval
#'
#' @param data data frame with three columns containing years, severity and duration. Preferably the output of `get_drought_series()`
#' @param d a small value between zero and one. It is used bind the distribution to a open interval between zero and one, i.e. ]0,1[
#'
#' @details the default value of `d` is 0.01 (Zhang and Singh, 2019)
#'
#' @return a vector with three values, the Lagrangian multipliers *Lambda_0*, *Lambda_1*, and *Lambda_2*
#'
#' @export
#'
marginal_interval2 <- function(data, d = 0.01){

    data_raw <- data %>%
        dplyr::mutate(sev_t = unlist((data[,2] - (1-d)*min(data[,2]))/((1+d)*max(data[,2]) - (1-d)*min(data[,2]))),
                      dur_t = unlist(data[,3] - (1-d)*min(data[,3]))/((1+d)*max(data[,3]) - (1-d)*min(data[,3])))%>%
        stats::setNames(c('year', 'sev', 'dur', 'sev_t', 'dur_t'))

    return(data_raw)
}


#' function to obtain the constrains to compute marginal distribution
#'
#' @param data_column column containing the data of the analysed variable. Preferably the output of `get_drought_series()`
#'
#' @return a vector with 2 values, *c1* and *c2*
#'
#' @export
#'
get_1d_constraints2 <- function(data_column){
    # data_column <- data$sev

    c[1] <- mean(data_column)
    c[2] <- mean(data_column^2)

    return(c)
}


#' function to solve single variable entropy equation (marginal)
#'
#' @param data_series data frame with the columns of the variables to be analysed. Preferably a subset of the output of function `marginal_interval()`
#'
#' @return a data frame with three columns, the Lagrangian multipliers *Lambda_0*, *Lambda_1*, and *Lambda_2*
#'
#' @export
#'
get_marginal_multipliers2 <- function(data_series){

    # data_series <- data[,4:5]

    all_multipliers <- data.frame()

    c <- c(0,0)

    for (i in 1:ncol(data_series)){

        data_column <- pull(data_series, i)

        c[1] <- sum(data_column)/length(data_column)
        c[2] <- sum(data_column^2)/length(data_column)

        # define objective function
        fun_obj <- function(a) pracma::integral(function(x) exp(-a[1]*(x - c[1]) - a[2]*(x^2 - c[2])), 0,1)

        #solve integral
        solution <- pracma::fminsearch(fun_obj, c(-1, 1), method="Nelder-Mead", minimize = T)

        multipliers <- solution$xmin #vector with multipliers

        a0 <- log(pracma::integral(function(x) exp(-multipliers[1]*x - multipliers[2]*x^2),0,1))

        all_multipliers <- rbind(all_multipliers, c(a0, multipliers))
    }

    colnames(all_multipliers) <- c('Lambda_0', 'Lambda_1', 'Lambda_2')
    rownames(all_multipliers) <- c('sev', 'dur')

    return(all_multipliers)
}

#' function to transform marginals into (0,1) interval
#'
#' @param data data frame with five columns containing years, severity and duration, and its transformations. Preferably the output of `marginal_interval()`
#' @param mult data frame with three columns (three multipliers) and two rows (severity and duration). Preferably the output of `get_marginal_multipliers()`
#'
#' @return a vector with three values, the Lagrangian multipliers *Lambda_0*, *Lambda_1*, and *Lambda_2*
#'
#' @export
#'
get_marginal_distribution2 <- function(data, mult){
    ## uncomment to test function
    # data <- series1
    # mult <- mult1

    # set name of new columns
    data$F_sev <- NA
    data$F_dur <- NA

    for (i in 1:nrow(mult)){

        a <-  unlist(mult[i,]) # subset multipliers
        F_var <- function(x) pracma::integral(function(x) exp(-a[1]-a[2]*x - a[3]*x^2),0,x) #cummulative distribution
        data[, i+5] <- as.vector(unlist(lapply(pull(data, i+3), FUN = F_var)))
    }

    return(data)
}


#' function to calculate the copula's Lagrange multipliers
#'
#' @param data data frame with seven columns containing years, severity and duration, its
#' transformations and the entropy-based marginals. Preferably the output of `get_marginal_distribution()`
#'
#' @return a numeric vector with names and six elements, the Lagrangian multipliers *Lambda_0*,
#' *Lambda_1*, and *Lambda_2*, *Gamma_1*, *Gamma_2*, and *Lambda_3*
#'
#' @export
#'
get_copula_multipliers2 <- function(data){

    #calculate copula constraint based on spearman correlation
    Euv <- cor(x = data[,6], y = data[,7], method = 'spearman')
    Euv <- as.numeric((Euv + 3)/12)

    # define objective function
    fun_obj <- function(a) pracma::integral2(function(x,y)
        exp(-a[1]*(x - 0.5) - a[2]*(x^2 - 0.333333) - a[3]*(y - 0.5) - a[4]*(y^2 - 0.333333) - a[5]*(x*y - Euv)),
        xmin = 0,xmax = 1, ymin = 0, ymax = 1)$Q

    #solve integral
    multipliers <- pracma::fminsearch(fun_obj, c(1,1,1,1,-1), method="Nelder-Mead",
                                      maxiter = 10000)$xmin

    a0 <- pracma::integral2(function(x,y)
        exp(-multipliers[1]*(x) - multipliers[2]*(x^2) - multipliers[3]*(y) -
                multipliers[4]*(y^2) - multipliers[5]*x*y),
        xmin = 0,xmax = 1, ymin = 0, ymax = 1)$Q

    a0 <- log(1/a0)

    copula_multipliers <- c(a0, multipliers) |>
        setNames(c('l0','l1','l2','g1','g2','l3'))

    return(copula_multipliers)
}


#' function to obtain Copula and Survival Copula distributions
#'
#' @param copula_mult vector with the six copula lagrange multipliers (in this orther: *Lambda_0*,
#' *Lambda_1*, and *Lambda_2*, *Gamma_1*, *Gamma_2*, and *Lambda_3*). Preferably the output from `get_copula_multipliers()`
#'
#' @return a 4 column data frame with x and y values for mapping, and the values of Copula and Survival Copula distributions.
#'
#' @export
#'
get_copula_distribution2 <- function(copula_mult){

    copula <- function(x,y) exp(copula_mult[1] -copula_mult[2]*(x) - copula_mult[3]*(x^2) - copula_mult[4]*(y) -
                                    copula_mult[5]*(y^2) - copula_mult[6]*x*y)

    Copula <- function(xm,ym) pracma::integral2(fun = copula, xmin = 0,xmax = xm, ymin = 0, ymax = ym)$Q

    # MATLAB handels well xia nd yi values equal to 1 and 0, but R doesn't. This new interval fixes it.
    xi <- (seq(1,99, length.out = 99)/100)^0.5

    C_values <- crossing(xi,xi) |>
        setNames(c('yi', 'xi'))|>
        select(xi,yi) |>
        mutate(dist = 0, surv = 0)


    for (i in 1:nrow(C_values)){
        tryCatch({ # do to the numerical compuation of the integral, some values are not solvable. They are replaced by NA!
            C_values$dist[i] <- Copula(C_values$xi[i], C_values$yi[i])
            C_values$surv[i] <- -1 + C_values$xi[i] + C_values$yi[i] + Copula(1-C_values$xi[i], 1-C_values$yi[i])

        }, error=function(e){})

        if (C_values$dist[i] == 0){C_values$dist[i] <- NA}
        if (C_values$surv[i] == 0){C_values$surv[i] <- NA}

    }

    return(C_values)
}


# #' function to obtain Copula and Survival Copula distributions
# #'
# #' @param copula_mult vector with the six copula lagrange multipliers (in this orther: *Lambda_0*,
# #' *Lambda_1*, and *Lambda_2*, *Gamma_1*, *Gamma_2*, and *Lambda_3*). Preferably the output from `get_copula_multipliers()`
# #'
# #' @return a 4 column data frame with x and y values for mapping, and the values of Copula and Survival Copula distributions.
# get_distribution_comparison <- function(){}


#' function to obtain the return period of the copula to particular event features (based on the probability of the marginals)
#'
#' @data data frame with seven columns containing years, severity and duration, its
#' transformations and the entropy-based marginals. Preferably the output of `get_marginal_distribution()`
#' @param d a small value between zero and one. It is used bind the distribution to a open interval between zero and one, i.e. ]0,1[
#' @param marginal_mult data frame with three columns (three multipliers) and two rows (severity and duration).
#' Preferably the output of `get_marginal_multipliers()`
#' @param copula_mult vector with the six copula lagrange multipliers (in this orther: *Lambda_0*,
#' *Lambda_1*, and *Lambda_2*, *Gamma_1*, *Gamma_2*, and *Lambda_3*). Preferably the output from `get_copula_multipliers()`
#' @param p_values vector with values between 0 and 1, they are probabilities of interest. Default = `c(0.8,0.9,0.95,0.98,0.99)`
#'
#'@export
#'
get_return_periods2 <- function(data, d, marginal_mult, copula_mult, p_values){

    # data = series1
    # d = 0.01
    # marginal_mult = mult1
    # copula_mult = copula_mult_1
    # p_values = c(0.8,0.9,0.95,0.98,.99)

    #define marginal distributions
    f_x <- function(x) exp(-marginal_mult[1,1] - marginal_mult[1,2]*x - marginal_mult[1,3]*x^2)
    F_x <- function(xm) pracma::integral(f_x, 0, xm)

    f_y <- function(y) exp(-marginal_mult[2,1] - marginal_mult[2,2]*y - marginal_mult[2,3]*y^2)
    F_y <- function(ym) pracma::integral(f_y, 0, ym)


    # get real values of event features
    limits <- data.frame(sev = c(max(data[,2]), min(data[,2])),
                         dur = c(max(data[,3]), min(data[,3])),
                         row.names = c('max', 'min')) |> t() |>
        as.tibble() |>
        mutate(delta = (1+d)*max - (1-d)*min)


    # get normalized and absolute event values based on probability
    prob_marginals <- data.frame()
    for (i in 1:length(p_values)){

        obj_fun_x <- function(xm) abs(F_x(xm) - p_values[i])
        a_sol_x <- fminbnd(obj_fun_x, a = 0, b = 1)$xmin

        x_raw <- a_sol_x*limits$delta[1] + (1-d)*limits$min[1]

        obj_fun_y <- function(ym) abs(F_y(ym) - p_values[i])
        a_sol_y <- fminbnd(obj_fun_y, a = 0, b = 1)$xmin

        y_raw <- a_sol_y*limits$delta[2] + (1-d)*limits$min[2]

        prob_marginals <- rbind(prob_marginals,
                                c(p_values[i], a_sol_x, a_sol_y, x_raw, y_raw))

    }
    colnames(prob_marginals) <- c('Probability', 'Sev_p', 'Dur_p', 'Sev_abs', 'Dur_abs')

    # define Copula (primitive)

    copula <- function(x,y) exp(copula_mult[1] -copula_mult[2]*(x) - copula_mult[3]*(x^2) - copula_mult[4]*(y) -
                                    copula_mult[5]*(y^2) - copula_mult[6]*x*y)

    Copula <- function(xm,ym) pracma::integral2(fun = copula, xmin = 0,xmax = xm, ymin = 0, ymax = ym)$Q


    p_values_comb <- crossing(p_values,p_values) %>% .[, 1:2] |>
        setNames(c('x_prob', 'y_prob'))

    p_marginals_comb <- crossing(prob_marginals$Sev_abs,prob_marginals$Dur_abs) %>% .[, 1:2] |>
        setNames(c('x_abs', 'y_abs'))

    p_values_comb <- cbind(p_values_comb, p_marginals_comb)
    p_values_comb$TR <- 0

    for (i in 1:nrow(p_values_comb)){
        tryCatch({ # do to the numerical compuation of the integral, some values are not solvable. They are replaced by NA!
            x_i <- p_values_comb$x_prob[i]
            y_i <- p_values_comb$y_prob[i]
            p_values_comb$TR[i] <- (1 - x_i - y_i + Copula(x_i, y_i))^-1

        }, error=function(e){})

        if (p_values_comb$TR[i] == 0){p_values_comb$TR[i] <- NA}
    }

    colnames(p_values_comb) <- c('u = F(x)', 'v = F(y)', 'x', 'y', 'Tr(x,y)')

    return(p_values_comb[order(p_values_comb$`v = F(y)`),])
}

#' function to compare changes in the return period over time
#'
#' @param data_study data frame with seven columns containing years, severity and duration, its
#' transformations and the entropy-based marginals for the study period of interest. Preferably
#' the output of `get_marginal_distribution()`.
#' @param return_periods_reference data frame with marginal probabilities, absolute values and
#' copula-based return period for the reference period. Preferably the output from get_return_periods
#' @param d a small value between zero and one. It is used bind the distribution to a open interval between zero and one, i.e. ]0,1[
#'
#'@export
#'
get_comparison_tr2 <- function(data_study, return_periods_reference, d){

    data_study <- series2
    d = 0.01
    return_periods_reference <- return_periods_1

    # get marginal distribution of raw data of study area
    marginal_mult = get_marginal_multipliers(data_series = data_study[,4:5])

    #define marginal distributions
    f_x <- function(x) exp(-marginal_mult[1,1] - marginal_mult[1,2]*x - marginal_mult[1,3]*x^2)
    F_x <- function(xm) pracma::integral(f_x, 0, xm)

    f_y <- function(y) exp(-marginal_mult[2,1] - marginal_mult[2,2]*y - marginal_mult[2,3]*y^2)
    F_y <- function(ym) pracma::integral(f_y, 0, ym)

    # limits study
    limits <- data.frame(sev = c(max(data_study[,2]), min(data_study[,2])),
                         dur = c(max(data_study[,3]), min(data_study[,3])),
                         row.names = c('max', 'min')) |> t() |>
        as.tibble() |>
        mutate(delta = (1+d)*max - (1-d)*min)

    #values of interest (raw)
    marginal_values <- data.frame(x_values = return_periods_reference[,3],
                                  y_values = return_periods_reference[,4])

    # values of interest (transformed to between 0 and 1 using scale of study data)
    marginal_values$x_values_transformed <- (marginal_values$x_values - (1-d)*limits$min[1])/limits$delta[1]
    marginal_values$y_values_transformed <- (marginal_values$y_values - (1-d)*limits$min[2])/limits$delta[2]

    for (i in 1:nrow(marginal_values)){
        marginal_values$x_values_transformed_p[i] <- F_x(as.numeric(marginal_values$x_values_transformed[i]))
        marginal_values$y_values_transformed_p[i] <- F_y(as.numeric(marginal_values$y_values_transformed[i]))
    }

    #get copula multipliers *study*
    copula_multipliers <- get_copula_multipliers(data_study)

    # define copula (primitive) *study*
    copula <- function(x,y) exp(copula_multipliers[1] -copula_multipliers[2]*(x) -
                                    copula_multipliers[3]*(x^2) - copula_multipliers[4]*(y) -
                                    copula_multipliers[5]*(y^2) - copula_multipliers[6]*x*y)

    Copula <- function(xm,ym) pracma::integral2(fun = copula, xmin = 0,xmax = xm, ymin = 0, ymax = ym)$Q

    # calculate new probability of reference events

    marginal_values$new_tr <- 0
    for (i in 1:nrow(marginal_values)){
        tryCatch({ # do to the numerical compuation of the integral, some values are not solvable. They are replaced by NA!
            x_i <- F_x(marginal_values$x_values_transformed[i])
            y_i <- F_y(marginal_values$y_values_transformed[i])

            marginal_values$new_tr[i] <- (1 - x_i - y_i + Copula(x_i, y_i))^-1
        }, error=function(e){})

        if (marginal_values$new_tr[i] == 0){marginal_values$new_tr[i] <- NA}
    }

    output <- cbind(return_periods_reference, marginal_values$new_tr) |>
        setNames(c('u = F(x)', 'v = F(y)', 'x', 'y', 'Tr_ref', 'Tr_new'))

    return(output)
}


#' function to compare changes in the returnt period distribution over time
#'
#' @param data_reference data frame with seven columns containing years, severity and duration, its
#' transformations and the entropy-based marginals for the REFERENCE period of interest. Preferably
#' the output of `get_marginal_distribution()`.
#' @param data_study data frame with seven columns containing years, severity and duration, its
#' transformations and the entropy-based marginals for the STUDY period of interest. Preferably
#' the output of `get_marginal_distribution()`.
#' @param d a small value between zero and one. It is used bind the distribution to a open interval between zero and one, i.e. ]0,1[
#'
#'@export
#'
get_distribution_comparison2 <- function(data_study, data_reference, d){

    # get marginal multipliers
    marg_mult_1 <- get_marginal_multipliers(data_reference[,4:5])
    marg_mult_2 <- get_marginal_multipliers(data_study[,4:5])

    # get copula multipliers and distribution (reference)
    copula_mult_1 <- get_copula_multipliers(data_reference)
    copula_mult_2 <- get_copula_multipliers(data_study)

    copula_dist_1 <- get_copula_distribution(copula_mult_1)

    return_periods_all <- get_return_periods(data = data_reference, d = d, copula_mult = copula_mult_1,
                                             marginal_mult = marg_mult_1, p_values = copula_dist_1$xi[1:99]) |>
        setNames(c('u', 'v', 'x', 'y', 'TR_ref'))

    # return_periods_all$TR_ref[return_periods_all$TR_ref<=0] <- NA #remove numerical errors

    # get max and min of data sets
    # limits_1 <- data.frame(sev = c(max(data_reference[,2]), min(data_reference[,2])),
    #                                dur = c(max(data_reference[,3]), min(data_reference[,3])),
    #                                row.names = c('max', 'min')) |> t() |>
    #   as.tibble() |>
    #   mutate(delta = (1+d)*max - (1-d)*min)

    limits_2 <- data.frame(sev = c(max(data_study[,2]), min(data_study[,2])),
                           dur = c(max(data_study[,3]), min(data_study[,3])),
                           row.names = c('max', 'min')) |> t() |>
        as.tibble() |>
        mutate(delta = (1+d)*max - (1-d)*min)

    return_periods_all$xt_2 <- (return_periods_all$x - limits_2$min[1])/limits_2$delta[1]
    return_periods_all$yt_2 <- (return_periods_all$y - limits_2$min[2])/limits_2$delta[2]

    # define marginals and copula *study*
    F_x_2 = function(xm) integral(function(x) exp(-marg_mult_2[1,1] - marg_mult_2[1,2]*x - marg_mult_2[1,3]*x^2),0,xm);
    F_y_2 = function(ym) integral(function(y) exp(-marg_mult_2[2,1] - marg_mult_2[2,2]*y - marg_mult_2[2,3]*y^2),0,ym);

    Copula_2 = function(xm,ym) integral2(function(x,y) exp(copula_mult_2[1] - copula_mult_2[2]*x -
                                                               copula_mult_2[3]*x^2 - copula_mult_2[4]*y -
                                                               copula_mult_2[5]*y^2 - copula_mult_2[6]*x*y),
                                         0,xm, 0, ym)$Q

    # calculate marginals
    return_periods_all$u_2 <- sapply(return_periods_all$xt_2, F_x_2)
    return_periods_all$v_2 <- sapply(return_periods_all$yt_2, F_y_2)

    # compute new copula distribution
    for (i in 1:nrow(return_periods_all)){
        tryCatch({ # do to the numerical computation of the integral, some values are not solvable. They are replaced by NA!
            x_i <- return_periods_all$u_2[i]
            y_i <- return_periods_all$v_2[i]

            return_periods_all$C_2[i] <- Copula_2(x_i, y_i)
        }, error=function(e){})

        if(return_periods_all$C_2[i] == 0){return_periods_all$C_2[i] <- NA}
    }

    # compute new TR distribution
    for (i in 1:nrow(return_periods_all)){
        tryCatch({ # do to the numerical computation of the integral, some values are not solvable. They are replaced by NA!
            x_i <- return_periods_all$u_2[i]
            y_i <- return_periods_all$v_2[i]
            C_i <- return_periods_all$C_2[i]

            return_periods_all$Tr_2[i] <- 1/(1 - x_i - y_i + C_i)
        }, error=function(e){})

        if(return_periods_all$Tr_2[i] <= 0){return_periods_all$Tr_2[i] <- NA}
    }

    output <- return_periods_all |>
        setNames(c('u = F(x)', 'v = F(y)', 'x', 'y', 'Tr_ref', 'xt_2', 'yt_2', 'u_2', 'v_2', 'C_2', 'Tr_2'))

    return(output)
}


#' function to plot heatmap of changes in return period (Tr) over time for selected evetns
#'
#' @param tr_comparison data frame with columns containing the absolute value of the drought features (x and y),
#' the return period of reference (Tr_ref) and the return period of the study (Tr_new). Preferably the output
#' of function `get_comparison_tr()`.
#'
#'@export
#'
plot_tr_comparison2 <- function(tr_comparison){

    plot <- tr_comparison |> mutate(diff = Tr_ref - Tr_new,
                                    change = diff/Tr_new) |>
        ggplot(aes(x = as.factor(round(x, 1)), y = as.factor(round(y, 1)), fill = change)) +
        geom_tile() +
        scale_fill_gradient(low = "#ffcccc",high = "#ff2222") +
        ggtitle('Changes in the return period of reference events')+
        labs(x = 'Severity',
             y = 'Duration (months)')

    return(plot)
}


#' function to plot the  changes in the distribution of return period period (Tr) for the domain of events in the reference period.
#'
#' @param tr_dist_comparison The output of function `get_distribution_comparison()`. Contains
#' reference event feature values and the two distributions of TR.
#' @param data_reference data frame with seven columns containing years, severity and duration, its
#' transformations and the entropy-based marginals for the REFERENCE period of interest. Preferably
#' the output of `get_marginal_distribution()`.
#' @param data_study data frame with seven columns containing years, severity and duration, its
#' transformations and the entropy-based marginals for the STUDY period of interest. Preferably
#' the output of `get_marginal_distribution()`.
#' @param na.rm logical. Default is True. NA values are filled with `tidyr::fill(direction = 'down')`
#'
#'@export
#'
plot_tr_distribution_comparison2 <- function(tr_dist_comparison, data_reference, data_study,
                                            na.rm = F){

    if(na.rm){
        dist_comparison %<>% fill(Tr_ref, Tr_2)
    }

    theme_set(theme_bw())
    theme_update(aspect.ratio=1)

    plot <- ggplot2::ggplot(dist_comparison1, aes(x = x, y = y))+
        geom_contour_filled(aes(z = Tr_ref), breaks = c(1,2,5,10,20,50,100,200),polygon_outline = FALSE)+
        geom_contour(aes(z = Tr_2), breaks = c(2,5,10,20,50,100), colour = 'darkgrey')+
        geom_text_contour(aes(z = Tr_2), skip = 0,
                          breaks = c(2,5,10,20,50),
                          label.placer = label_placer_n(1),
                          colour = 'darkred')+
        ggtitle('MECC cumulative probability function')+
        scale_x_continuous(expand = c(0.0,0.1), limits = c(0,max(series1$sev)))+
        scale_y_continuous(expand = c(0.0,0.1), limits = c(0,max(series1$dur)))+
        labs(x = 'Severity (SPEI3)', y = 'Duration (months)')+
        theme_bw()+
        theme_update(aspect.ratio=1)+
        geom_point(inherit.aes = F, data = series1, aes(x = sev, y = dur), alpha = 0.5)+
        geom_point(inherit.aes = F, data = series2, aes(x = sev, y = dur), alpha = 0.5, col ='red')

    return(plot)
}
pedroalencar1/DroughtSDF documentation built on May 8, 2023, 6:58 p.m.