R/z_deprecated.R

model_predict_index <- function(
  model_estimation,
  knn = 3
) {
  .Deprecated("model_predict_index_v3")
  # Get names of computed distance measures -----
  dist_measures <- names(model_estimation[[1]][[1]])

  model_prediction <- lapply(model_estimation, function(estimation) {
    res <- lapply(dist_measures, function(mea) {
      # Select particular distance measure result for all stations -----
      values <- sapply(estimation, "[[", mea)
      names(values) <- 1:length(values)

      # Sort estimation results -----
      # values <- sort(values[values != 0])
      values <- sort(values)

      # Select best predictions based on knn -----
      as.numeric(names(values[1:knn]))
    })
    names(res) <- dist_measures
    res
  })
}

model_predict_index_v2 <- function(
  model_estimation,
  knn = 3
) {
  .Deprecated("model_predict_index_v3")
  # Get names of computed distance measures -----
  dist_measures <- names(model_estimation[[1]]$distance_stat[[1]])
  # TODO-20180705: make this nicer/more robust

  model_prediction <- lapply(model_estimation, function(estimation) {
    index <- lapply(dist_measures, function(mea) {
      # Select particular distance measure result for all stations -----
      values <- sapply(estimation$distance_stat, "[[", mea)
      names(values) <- 1:length(values)

      # Sort estimation results -----
      # values <- sort(values[values != 0])
      values <- sort(values)

      # Select best predictions based on knn -----
      as.numeric(names(values[1:knn]))
    })
    names(index) <- dist_measures

    list(
      distance_stat_index = index,
      dat_distance_geo = estimation$dat_distance_geo
    )
  })

  model_prediction
}

model_predict <- function(
  model_prediction_index,
  dat_input,
  dat_db,
  dat_station,
  knn
) {
  .Deprecated("model_predict_v3")
  # Ensure matrix -----
  dat_input <- as.matrix(dat_input)

  dat_result <- lapply(1:length(model_prediction_index), function(idx_input) {
    choice <- model_prediction_index[[idx_input]]
    dist_meassure <- unlist(lapply(names(choice), rep, knn))
    index <- unlist(choice)

    # Get station ID ------
    dim_station <- dat_db$dim_station[index]

    # Get ranks -----
    rank <- rep(seq(knn), length(choice))

    # Input data -----
    dat_input_row <- dat_input[idx_input, ]

    # Prediction data -----
    dat_prediction <- dat_db[index, ] %>%
      select(time_month, matches("msr_")) %>%
      mutate(
        dim_rank = rank,
        dim_station = dim_station,
        dim_dist_measure = dist_meassure,
        diff_time_month = time_month - dat_input_row["time_month"],
        diff_msr_temp_min = msr_temp_min - dat_input_row["msr_temp_min"],
        diff_msr_temp_max = msr_temp_max - dat_input_row["msr_temp_max"],
        diff_msr_temp_avg = msr_temp_avg - dat_input_row["msr_temp_avg"],
        diff_msr_precip_min = msr_precip_min - dat_input_row["msr_precip_min"],
        diff_msr_precip_max = msr_precip_max - dat_input_row["msr_precip_max"],
        diff_msr_precip_avg = msr_precip_avg - dat_input_row["msr_precip_avg"],
        diff_msr_sundur_avg = msr_sundur_avg - dat_input_row["msr_sundur_avg"]
      )

    dat_prediction <- dat_prediction %>% mutate_if(is.double, round, 1)

    # Join stations -----
    dat_prediction_2 <- inner_join(
      dat_prediction,
      dat_station,
      by = "dim_station"
    ) %>%
      select(
        dim_rank,
        dim_country,
        dim_station_name,
        time_month,
        diff_time_month,
        msr_temp_min,
        diff_msr_temp_min,
        msr_temp_max,
        diff_msr_temp_max,
        msr_temp_avg,
        diff_msr_temp_avg,
        msr_precip_min,
        diff_msr_precip_min,
        msr_precip_max,
        diff_msr_precip_max,
        msr_precip_avg,
        diff_msr_precip_avg,
        msr_sundur_avg,
        diff_msr_sundur_avg,
        dim_station,
        dim_latitude,
        dim_longitude,
        dim_high,
        dim_dist_measure,
        everything()
      )

    # Return -----
    list(
      input = dat_input_row,
      prediction = dat_prediction_2
    )
  })
}

model_predict_v2 <- function(
  model_prediction_index,
  dat_input,
  dat_db,
  dat_station,
  knn
) {
  .Deprecated("model_predict_v3")
  # Ensure matrix -----
  # dim_station <- default_name("station") %>% quo_prepend("dim")
  dat_input <- dat_input %>%
    # dplyr::select(-dim_station) %>%
    as.matrix()

  dat_result <- lapply(1:length(model_prediction_index), function(idx_input) {
    choice <- model_prediction_index[[idx_input]]$distance_stat_index
    dist_meassure <- unlist(lapply(names(choice), rep, knn))
    index <- unlist(choice)

    # Get station ID ------
    dim_station <- dat_db$dim_station[index]

    # Get ranks -----
    rank <- rep(seq(knn), length(choice))

    # Input data -----
    dat_input_row <- dat_input[idx_input, ]

    # Prediction data -----
    dat_prediction <- dat_db[index, ] %>%
      select(time_month, matches("msr_")) %>%
      mutate(
        dim_rank = rank,
        dim_station = dim_station,
        dim_dist_measure = dist_meassure,
        diff_time_month = time_month - dat_input_row["time_month"],
        diff_msr_temp_min = msr_temp_min - dat_input_row["msr_temp_min"],
        diff_msr_temp_max = msr_temp_max - dat_input_row["msr_temp_max"],
        diff_msr_temp_avg = msr_temp_avg - dat_input_row["msr_temp_avg"],
        diff_msr_precip_min = msr_precip_min - dat_input_row["msr_precip_min"],
        diff_msr_precip_max = msr_precip_max - dat_input_row["msr_precip_max"],
        diff_msr_precip_avg = msr_precip_avg - dat_input_row["msr_precip_avg"],
        diff_msr_sundur_avg = msr_sundur_avg - dat_input_row["msr_sundur_avg"]
      )

    dat_prediction <- dat_prediction %>% mutate_if(is.double, round, 1)

    dat_distance_geo <- model_prediction_index[[idx_input]]$dat_distance_geo

    # Join stations -----
    dat_prediction_2 <- inner_join(
      dat_prediction,
      dat_station,
      by = "dim_station"
    ) %>%
      left_join(
        dat_distance_geo,
        by = "dim_station"
      ) %>%
      select(
        dim_rank,
        dim_country,
        dim_station_name,
        msr_distance,
        time_month,
        diff_time_month,
        msr_temp_min,
        diff_msr_temp_min,
        msr_temp_max,
        diff_msr_temp_max,
        msr_temp_avg,
        diff_msr_temp_avg,
        msr_precip_min,
        diff_msr_precip_min,
        msr_precip_max,
        diff_msr_precip_max,
        msr_precip_avg,
        diff_msr_precip_avg,
        msr_sundur_avg,
        diff_msr_sundur_avg,
        dim_station,
        dim_latitude,
        dim_longitude,
        dim_high,
        dim_dist_measure,
        everything()
      )

    # Return -----
    list(
      input = dat_input_row,
      prediction = dat_prediction_2
    )
  })
}

model_run_v2 <- function(
  dat_input,
  dat_db,
  dat_station,
  dist_measures,
  dist_measure_final,
  knn
) {
  .Deprecated("model_run_v3")
  # Model estimation --------------------------------------------------------

  model_estimation <- model_estimate_v4(
    dat_input = dat_input,
    dat_db = dat_db_msr,
    dat_station = dat_station,
    dist_measures = dist_measures
  )

  # Identify best choices ---------------------------------------------------

  model_prediction_index <- model_predict_index_v4(
    model_estimation = model_estimation,
    knn = knn
  )

  # Model prediction --------------------------------------------------------

  model_prediction <- model_predict_v4(
    model_prediction_index,
    dat_station = dat_station,
    knn = knn
  )

  model_prediction_ensemble <- model_predict_ensemble_v2(
    model_prediction = model_prediction
  )

  # Model output ------------------------------------------------------------

  model_result_gathered <- model_output_gathered(model_prediction_ensemble)

  if (settings$output$show) {
    print("Show:")
    print(settings$output$show)
    model_result_render(
      model_result_gathered,
      knn = knn,
      dist_measures = dist_measures,
      dist_measure_final = dist_measure_final
    )
  }

  model_result_gathered
}

model_estimate_v4 <- function(
  dat_input,
  dat_db,
  dat_station,
  dist_measures = default_dist_measures()
) {
  .Deprecated("model_estimate_v5")
  dat_db_out <- dat_db %>%
    dplyr::mutate(
      fct_scaling = NA
    )
  # Argument handling -----
  dist_measures <- match.arg(dist_measures, several.ok = TRUE)

  # Ensure matrix -----
  # dim_station <- default_name("station") %>% quo_prepend("dim")
  dat_input <- dat_input %>%
    # dplyr::select(-dim_station) %>%
    as.matrix()

  # Compute distance for each input row/vector -----
  dat_dist <- lapply(1:nrow(dat_input), function(row_user) {
    dat_input_row <- dat_input[row_user, , drop = FALSE]

    # Distances -----
    dat_distance_geo <- data.frame(
      dim_station = unique(dat_station$dim_station),
      msr_distance = NA,
      stringsAsFactors = FALSE
    )

    # Handle time input -----
    # Only keep weather records that have time == user input
    dat_list <- handle_input_time(
      dat_input = dat_input_row,
      dat_db = dat_db
    )
    # Update input and DB data:
    dat_input_row <- dat_list$dat_input
    dat_db <- dat_list$dat_db

    cols <- c("dim_latitude", "dim_longitude", "msr_distance")
    if (all(cols %in% colnames(dat_input_row))) {
      # Get scaling factor(s) from settings -----
      # fct_scaling <- settings$scaling$distances
      dat_fct_scaling <- tibble(fct_scaling = unlist(settings$scaling$distances))

      # Compute geo distance -----
      # dat_msr_distance <- compute_geo_distance_v2(p_1 = dat_input_row, p_2 = dat_station)
      dat_msr_distance <- dat_fct_scaling %>%
        dplyr::group_by(fct_scaling) %>%
        dplyr::do(
          pipe_compute_geo_distance_v2(
            .,
            dat_input_row = dat_input_row,
            dat_station = dat_station
          )
        ) %>%
        ungroup()

      # Handle distance input -----
      # Only keep stations that have distance <= user input
      dat_list <- handle_input_distance(
        dat_input = dat_input_row,
        dat_msr_distance = dat_msr_distance,
        dat_fct_scaling = dat_fct_scaling
      )
      # Update input and distance data:
      dat_input_row <- dat_list$dat_input
      dat_msr_distance <- dat_list$dat_msr_distance

      # Join distances to DB data -----
      col_dim_station <- as.symbol("dim_station")
      value_dim_station <- dat_msr_distance %>% dplyr::pull(!!col_dim_station)
      # Ensure stations in DB data match those in distance data:
      dat_db <- dat_db %>%
        dplyr::filter(!!col_dim_station %in% value_dim_station)
      # Cache a version of DB data before the join for downstream processing:
      dat_db_out <- dat_db
      # Actual join:
      dat_db <- left_join(
        dat_db,
        dat_msr_distance,
        by = "dim_station"
      )

      dat_distance_geo <- dat_db %>%
        select(
          fct_scaling,
          dim_station,
          msr_distance,
          msr_distance_scaled
        ) %>%
        group_by(
          fct_scaling,
          dim_station
        ) %>%
        summarise(
          msr_distance = unique(msr_distance),
          msr_distance_scaled = unique(msr_distance_scaled)
        ) %>%
        ungroup()
      # TODO 20181122: tidy eval for column names
    } else {
      # PATCH 20181122
      # TODO 20181122: remove patch
      dat_input_row <- cbind(
        data.frame(fct_scaling = NA) %>% as.matrix(),
        dat_input_row
      )
      dat_db$fct_scaling <- NA
      dat_distance_geo$fct_scaling <- NA
    }

    # Drop all dims -----
    dat_db <- dat_db %>%
      select(-matches("dim_")) %>%
      as.matrix()
    # TODO-20180610: encapsulate in function
    idx <- colnames(dat_input_row) %>% str_detect("dim")
    dat_input_row <- dat_input_row[ , !idx, drop = FALSE]

    # estimation_result <- lapply(1:nrow(dat_db), function(row) {
    #   dat_db_this <- dat_db[row, colnames(dat_input_row), drop = FALSE]
    #
    #   # List availble distance measures
    #   # philentropy::getDistMethods()
    #
    #   res <- list(
    #     euclidean = if ("euclidean" %in% dist_measures) {
    #       philentropy:::euclidean(
    #         dat_input_row, dat_db_this, testNA = FALSE)
    #     }
    #   )
    #   res[!sapply(res, is.null)]
    # })

    dat_input_row <- dat_input_row %>% as_tibble()
    names_old <- dat_input_row %>%
      dplyr::select(-fct_scaling) %>%
      names() %>%
      rlang::syms()

    names_new <- c(
      list(rlang::sym("fct_scaling")),
      names_old %>%
        paste0("input_", .) %>%
        rlang::syms()
    )

    names_mapping <- c(
      list(rlang::sym("fct_scaling")),
      names_old
    )
    names(names_mapping) <- names_new
    dat_input_row <- dat_input_row %>% dplyr::rename(!!!names_mapping)

    dat_estimation <- left_join(
      dat_db %>% as_tibble(),
      dat_input_row,
      by = "fct_scaling"
    ) %>%
      dplyr::select(
        fct_scaling,
        everything()
      )

    estimation_result <- dat_estimation %>%
      dplyr::group_by(fct_scaling) %>%
      dplyr::do(
        pipe_model_estimate(
          .,
          dist_measures = dist_measures
        )
      ) %>%
      ungroup()

    list(
      estimation_result = estimation_result,
      dat_distance_geo = dat_distance_geo,
      dat_input = dat_input,
      dat_db = dat_db_out
    )
    # TODO-20180705: tidyfy column name

  })
}

pipe_compute_geo_distance_v2 <- function(
  dat_fct_scaling,
  dat_input_row,
  dat_station
) {
  .Deprecated("pipe_compute_geo_distance_v3")
  dat_msr_distance <- compute_geo_distance_v2(
    p_1 = dat_input_row,
    p_2 = dat_station
  )
  dat_msr_distance <- bind_cols(
    dat_msr_distance,
    dat_msr_distance * dat_fct_scaling %>% dplyr::pull(fct_scaling)
  ) %>%
    dplyr::rename(msr_distance_scaled = msr_distance1)
  # TODO 20181122: tidy eval for column names

  dat_msr_distance$dim_station <- dat_station$dim_station
  dat_msr_distance
}

# https://www.kompf.de/gps/distcalc.html
compute_geo_distance <- function(p_1, p_2) {
  dist <- 6378.388 * acos(
    sin(p_1$dim_latitude*pi/180) * sin(p_2$dim_latitude*pi/180) +
      cos(p_1$dim_latitude*pi/180) * cos(p_2$dim_latitude*pi/180) *
      cos(p_1$dim_longitude*pi/180 - p_2$dim_longitude*pi/180))
}

compute_geo_distance_v2 <- function(p_1, p_2) {
  dist <- 6378.388 * acos(
    sin(pi/180*p_1[ , "dim_latitude"]) * sin(pi/180*p_2[ , "dim_latitude"]) +
      cos(pi/180*p_1[ , "dim_latitude"]) * cos(pi/180*p_2[ , "dim_latitude"]) *
      cos(pi/180*p_1[ , "dim_longitude"] - pi/180*p_2[ , "dim_longitude"]))
  names(dist) <- "msr_distance"
  dist

  # TODO-20180610: robustify against dependency on tibble and column names from
  # settings
}

pipe_compute_geo_distance_v3 <- function(
  dat_fct_scaling,
  dat_input,
  dat_station
) {
  dat_fct_scaling
  dat_msr_distance <- compute_geo_distance_v2(
    p_1 = dat_input,
    p_2 = dat_station
  )
  dat_msr_distance <- bind_cols(
    dat_msr_distance,
    dat_msr_distance * dat_fct_scaling %>% dplyr::pull(fct_scaling)
  ) %>%
    dplyr::rename(msr_distance_scaled = msr_distance1)
  # TODO 20181122: tidy eval for column names

  dat_msr_distance$dim_station <- dat_station$dim_station
  dat_msr_distance
}

# handle_input_distance <- function(
#   dat_input,
#   dat_msr_distance,
#   dat_fct_scaling
# ) {
#   .Deprecated("handle_input_distance_v2")
#   # Handle distance
#   col_msr_distance <- settings$name_mapping$msr_distance$key
#   idx <- colnames(dat_input) %>% str_detect(quo_name(col_msr_distance))
#   if (any(idx)) {
#     value_msr_distance <- dat_input[ , quo_name(col_msr_distance)]
#     # dat_msr_distance %>% dplyr::arrange(desc(msr_distance)) %>% head()
#     dat_msr_distance <- dat_msr_distance %>% dplyr::filter(
#       !!col_msr_distance <= value_msr_distance
#     )
#   }
#
#   list(
#     dat_input = if (
#       all(dat_fct_scaling %>% dplyr::pull(fct_scaling) == 0)
#     ) {
#       # If scaling factor 0 then this implies to drop alltogether
#       dat_input[, !idx, drop = FALSE]
#     } else {
#       do_fun <- function(
#         dat_fct_scaling,
#         dat_input,
#         idx
#       ) {
#         # Apply scaling so input and DB data match up downstream
#         dat_input[, idx] <- dat_input[, idx] * dat_fct_scaling %>%
#           dplyr::pull(fct_scaling)
#         colnames(dat_input)[idx] <- "msr_distance_scaled"
#         # TODO 20181122: tidy eval for column names based on settings
#         as_tibble(dat_input)
#       }
#
#       dat_fct_scaling %>%
#         dplyr::group_by(fct_scaling) %>%
#         dplyr::do(
#           do_fun(
#             .,
#             dat_input = dat_input,
#             idx = idx
#           )
#         ) %>%
#         ungroup() %>%
#         as.matrix()
#     },
#     dat_msr_distance = dat_msr_distance
#   )
# }

model_predict_index_v4 <- function(
  model_estimation,
  knn = 3
) {
  .Deprecated("model_predict_index_v5")
  model_prediction <- lapply(model_estimation, function(estimation) {
    index <- estimation$estimation_result %>%
      dplyr::group_by(fct_scaling) %>%
      dplyr::mutate(index = 1:n()) %>%
      dplyr::arrange(estimation) %>%
      dplyr::slice(1:knn) %>%
      dplyr::mutate(rank = 1:n()) %>%
      ungroup()

    list(
      estimation_result_index = index,
      dat_distance_geo = estimation$dat_distance_geo,
      dat_input = estimation$dat_input,
      dat_db = estimation$dat_db
    )
  })

  model_prediction
}

model_predict_v4 <- function(
  model_prediction_index,
  dat_station,
  knn
) {
  .Deprecated("model_apply_v1")
  dat_result <- lapply(1:length(model_prediction_index), function(idx_input) {
    # Ensure matrix -----
    dat_input <- model_prediction_index[[idx_input]]$dat_input
    dat_db <- model_prediction_index[[idx_input]]$dat_db %>%
      as_tibble()

    choice <- model_prediction_index[[idx_input]]$estimation_result_index
    index <- choice %>% dplyr::pull(index)

    # Get station ID ------
    dim_station <- dat_db$dim_station[index]

    # Input data -----
    dat_input_row <- dat_input[idx_input, ]

    # Prediction data -----
    dat_prediction <- dat_db[index, ] %>%
      select(time_month, matches("msr_")) %>%
      mutate(
        dim_station = dim_station,
        diff_time_month = time_month - dat_input_row["time_month"],
        diff_msr_temp_min = msr_temp_min - dat_input_row["msr_temp_min"],
        diff_msr_temp_max = msr_temp_max - dat_input_row["msr_temp_max"],
        diff_msr_temp_avg = msr_temp_avg - dat_input_row["msr_temp_avg"],
        diff_msr_precip_min = msr_precip_min - dat_input_row["msr_precip_min"],
        diff_msr_precip_max = msr_precip_max - dat_input_row["msr_precip_max"],
        diff_msr_precip_avg = msr_precip_avg - dat_input_row["msr_precip_avg"],
        diff_msr_sundur_avg = msr_sundur_avg - dat_input_row["msr_sundur_avg"]
      )

    dat_prediction <- dat_prediction %>% mutate_if(is.double, round, 1)

    dat_prediction <- bind_cols(
      choice %>% dplyr::select(fct_scaling, dim_rank = rank),
      dat_prediction
    )

    dat_distance_geo <- model_prediction_index[[idx_input]]$dat_distance_geo

    # Join stations -----
    dat_prediction_2 <- inner_join(
      dat_prediction,
      dat_station,
      by = "dim_station"
    ) %>%
      left_join(
        dat_distance_geo,
        by = c("fct_scaling", "dim_station")
      ) %>%
      select(
        fct_scaling,
        dim_rank,
        dim_country,
        dim_station_name,
        msr_distance,
        time_month,
        diff_time_month,
        msr_temp_min,
        diff_msr_temp_min,
        msr_temp_max,
        diff_msr_temp_max,
        msr_temp_avg,
        diff_msr_temp_avg,
        msr_precip_min,
        diff_msr_precip_min,
        msr_precip_max,
        diff_msr_precip_max,
        msr_precip_avg,
        diff_msr_precip_avg,
        msr_sundur_avg,
        diff_msr_sundur_avg,
        dim_station,
        dim_latitude,
        dim_longitude,
        dim_high,
        everything()
      )

    # Return -----
    list(
      input = dat_input_row,
      prediction = dat_prediction_2
    )
  })
}

# model_estimate_inner <- function(.x) {
#   .Deprecated("model_estimate_inner_v2")
#   dat_input <- .x$dat_input
#   dat_db <- .x$dat_db
#   dat_station <- .x$dat_station
#
#   # Handle time input -----
#   # Only keep weather records that have time == user input
#   dat_list <- handle_input_time(
#     dat_input = dat_input,
#     dat_db = dat_db
#   )
#   # Update input and DB data:
#   dat_input <- dat_list$dat_input
#   dat_db <- dat_list$dat_db
#
#   # Get scaling factor(s) from settings -----
#   # fct_scaling <- settings$scaling$distances
#   dat_fct_scaling <- tibble(fct_scaling = unlist(settings$scaling$distances))
#
#   # Compute geo distance -----
#   # dat_msr_distance <- compute_geo_distance_v2(p_1 = dat_input_row, p_2 = dat_station)
#   dat_msr_distance <- compute_geo_distance_v3(
#     p_1 = dat_input %>%
#       dplyr::select(matches("latitude|longitude")) %>%
#       dplyr::summarise_all(unique),
#     p_2 = dat_station
#   )
#
#   # Handle distance input -----
#   # Only keep stations that have distance <= user input
#   dat_list <- handle_input_distance_v2(
#     dat_input = dat_input,
#     dat_msr_distance = dat_msr_distance
#   )
#   # Update input and distance data:
#   dat_input <- dat_list$dat_input
#   dat_input_out <- dat_input
#   dat_input <- purrr::map_df(
#     seq_len(nrow(dat_fct_scaling)), ~dat_input) %>%
#     dplyr::mutate(
#       fct_scaling = rep(dat_fct_scaling$fct_scaling, nrow(dat_input)) %>%
#         sort()
#     ) %>%
#     dplyr::select(
#       fct_scaling,
#       everything()
#     )
#   dat_msr_distance <- dat_list$dat_msr_distance
#
#   # Scale -----
#   dat_msr_distance <- dat_fct_scaling %>%
#     dplyr::group_by(fct_scaling) %>%
#     dplyr::do(
#       pipe_compute_geo_distance_v4(
#         .,
#         dat_msr_distance
#       )
#     ) %>%
#     ungroup()
#
#   # Handle DB -----
#   dat_list <- handle_input_db(
#     dat_db = dat_db,
#     dat_msr_distance = dat_msr_distance
#   )
#   dat_db <- dat_list$dat_db
#   dat_db_out <- dat_list$dat_db_out
#
#   # Create geo distance data frame ----
#   dat_distance_geo <- handle_dat_dist_geo(dat_db)
#
#   # Early exit -----
#   if (!nrow(dat_db)) {
#     model_result <- list(
#       estimation_result= tibble(
#         fct_scaling = NA,
#         dim_station = NA,
#         index = NA,
#         rank = NA,
#         estimation_result = NA
#       ),
#       dat_distance_geo = dat_distance_geo,
#       dat_input = dat_input_out,
#       dat_db = dat_db_out
#     )
#     return(model_result)
#   }
#
#   # Drop all dims -----
#   dat_list <- handle_drop_dims(dat_db, dat_input)
#   dat_input <- dat_list$dat_input
#   dat_db <- dat_list$dat_db
#
#   # Align dimensions of input -----
#   alignment_factor <- dat_db %>% nrow() / nrow(dat_fct_scaling)
#
#   dat_input <- purrr::map_df(
#     seq_len(alignment_factor), ~dat_input
#   ) %>%
#     dplyr::arrange(fct_scaling)
#   # dat_input <- dat_input %>%
#   #   dplyr::group_by(fct_scaling) %>%
#   #   purrr::map_df(seq_len(nrow(dat_db)), ~dat_input)
#   #
#   # Compute euclidean distance -----
#   # purrr::map2(dat_db, dat_input, pmap_inner_2)
#   fct_scaling <- dat_input %>%
#     dplyr::pull(fct_scaling)
#   dat_input_mat <- dat_input %>%
#     dplyr::select(-fct_scaling) %>%
#     as.matrix()
#   dat_db_mat <- dat_db %>%
#     dplyr::arrange(fct_scaling) %>%
#     dplyr::select(-fct_scaling) %>%
#     as.matrix()
#   vec_estimation_result <- sapply(1:nrow(dat_db), function(row) {
#     dat_db_this <- dat_db_mat[row, , drop = FALSE]
#     dat_input_this <- dat_input_mat[row, , drop = FALSE]
#     res <- philentropy:::euclidean(
#       dat_input_this, dat_db_this, testNA = FALSE)
#   })
#
#   estimation_result <- tibble(
#     fct_scaling = fct_scaling,
#     dim_station = rep(dat_db_out$dim_station, nrow(dat_fct_scaling)),
#     estimation_result = vec_estimation_result
#   ) %>%
#     dplyr::group_by(fct_scaling) %>%
#     # Add distances for arranging:
#     dplyr::left_join(dat_distance_geo,
#       by = c("fct_scaling", "dim_station")) %>%
#     # Add position index:
#     dplyr::mutate(index = row_number()) %>%
#     dplyr::arrange(fct_scaling, estimation_result, msr_distance) %>%
#     dplyr::mutate(rank = row_number()) %>%
#     dplyr::select(
#       fct_scaling,
#       dim_station,
#       index,
#       rank,
#       estimation_result
#     ) %>%
#     ungroup()
#
#   list(
#     estimation_result= estimation_result,
#     dat_distance_geo = dat_distance_geo,
#     dat_input = dat_input_out,
#     dat_db = dat_db_out
#   )
# }

handle_input_time <- function(
  dat_input,
  dat_db
) {
  .Deprecated("model_handle_input_time_v2")
  col_dim_time <- settings$name_mapping$time_month$key
  idx <- colnames(dat_input) %>% str_detect(quo_name(col_dim_time))
  if (any(idx)) {
    value_dim_time <- dat_input %>% dplyr::pull(!!col_dim_time)
    dat_db <- dat_db %>% dplyr::filter(
      !!col_dim_time == value_dim_time
    )
  }

  list(
    dat_input = dat_input,
    dat_db = dat_db
  )
}

handle_dat_dist_geo <- function(
  dat_db
) {
  .Deprecated("model_handle_dat_dist_geo_v2")
  df_dist_geo <- dat_db %>%
    select(
      fct_scaling,
      dim_station,
      msr_distance,
      msr_distance_scaled
    ) %>%
    group_by(
      fct_scaling,
      dim_station
    ) %>%
    summarise(
      msr_distance = unique(msr_distance),
      msr_distance_scaled = unique(msr_distance_scaled)
    ) %>%
    ungroup()
  # TODO 20181122: tidy eval for column names
}

handle_input_db <- function(
  dat_db,
  dat_msr_distance
) {
  .Deprecated("model_handle_input_db_v2")
  # Join distances to DB data -----
  col_dim_station <- default_name("station") %>% quo_prepend("dim")
  value_dim_station <- dat_msr_distance %>% dplyr::pull(!!col_dim_station)

  # Ensure stations in DB data match those in distance data:
  dat_db <- dat_db %>%
    dplyr::filter(!!col_dim_station %in% value_dim_station)

  # Cache a version of DB data before the join for downstream processing:
  dat_db_out <- dat_db

  # Actual join:
  dat_db <- left_join(
    dat_db,
    dat_msr_distance,
    by = "dim_station"
  )

  list(
    dat_db = dat_db,
    dat_db_out = dat_db_out
  )
}

pipe_compute_geo_distance_v4 <- function(
  dat_fct_scaling,
  dat_msr_distance
) {
  dat_msr_distance %>%
    dplyr::mutate(
      msr_distance_scaled = msr_distance * dat_fct_scaling %>%
        pull(fct_scaling)
    )
}

pipe_compute_geo_distance_v5 <- function(
  dat_fct_scaling,
  dat_msr_distance
) {
  .Deprecated("pipe_compute_geo_distance_v6")
  dat_msr_distance %>%
    dplyr::mutate(
      msr_distance_scaled = msr_distance * dat_fct_scaling %>%
        pull(fct_scaling_dist)
    )
}

handle_drop_dims <- function(dat_db, dat_input) {
  .Deprecated("model_handle_drop_dims_v2")
  dat_input <- dat_input %>%
    dplyr::select(-matches("dim"))
  dat_db <- dat_db %>%
    select(-matches("dim_"))
  # TODO-20180610: encapsulate in function

  cols <- names(dat_input) %>%
    rlang::syms()
  dat_db <- dat_db %>%
    dplyr::select(!!!cols)

  list(
    dat_input = dat_input,
    dat_db = dat_db
  )
}

model_run_v4 <- function(
  dat_input,
  dat_db,
  dat_station,
  knn
) {
  .Deprecated("model_run_v5")
  # Model estimation -----
  model_estimation <- model_estimate_v5(
    dat_input = dat_input,
    dat_db = dat_db,
    dat_station = dat_station
  )

  # Model output -----
  model_output <- model_apply_v2(
    model_estimation,
    dat_station = dat_station,
    knn = knn
  )

  model_output
}

model_estimate_v5 <- function(
  dat_input,
  dat_db,
  dat_station
) {
  .Deprecated("model_estimate_v6")
  # Get scaling factor(s) from settings -----
  if (TRUE) {
    dat_fct_scaling = tibble::tibble(
      fct_scaling_dist = unlist(settings$scaling$distances) %>%
        rep(nrow(dat_input)),
      fct_scaling_time = unlist(settings$scaling$time) %>%
        rep(nrow(dat_input))
    )

    # Expand -----
    dat_fct_scaling <- dat_fct_scaling %>%
      tidyr::expand(fct_scaling_dist, fct_scaling_time)

    # Add columns for join -----
    # dat_fct_scaling <- dat_fct_scaling %>%
    #   dplyr::mutate(
    #     time_month = dat_input %>%
    #       dplyr::select_at(if("time_month" %in% names(.))
    #         "time_month" else integer(0)) %>%
    #       dplyr::pull()
    #   )
    # KEEP AS REFERENCE
    dat_fct_scaling <- dat_fct_scaling %>%
      dplyr::mutate(
        time_month = dat_input %>%
          dplyr::pull(time_month),
        msr_distance = dat_input %>%
          dplyr::pull(msr_distance)
      )

    dat_input <- dat_input %>%
      # dplyr::select(time_month) %>%
      dplyr::left_join(
        dat_fct_scaling,
        by = intersect(names(dat_input), names(dat_fct_scaling))
      ) %>%
      dplyr::mutate(
        time_month_scaled = time_month * fct_scaling_time,
        msr_distance_scaled = msr_distance * fct_scaling_dist
      )
  } else {
    stop("Single set of scaling factors for all vars to be scaled not implemented yet")
    dat_fct_scaling = tibble::tibble(
      dat_fct_scaling = unlist(settings$scaling$distances)
    )
  }

  # dat_list <- model_handle_input_time_v2(
  #   dat_input = dat_input,
  #   dat_db = dat_db
  # )
  # dat_db <- dat_list$dat_db

  # Prepare input for subsequent {purrr} pipes -----
  # dat_estimation_input <- model_prepare_estimation_input(
  #   dat_input,
  #   dat_db,
  #   dat_station
  # )
  #
  # # Compute distance for each input vector -----
  # df_estimate <- dat_estimation_input %>%
  #   purrr::map(model_estimate_inner_v2)

  if (FALSE) {
    dat_db <- purrr::map(
      seq_len(nrow(dat_input)), ~dat_db
    )
    dat_station <- purrr::map(
      seq_len(nrow(dat_input)), ~dat_station
    )
    dat_input <- purrrlyr::by_row(dat_input,
      function(v) list(v)[[1L]], .collate = "list")$.out

    dat_estimation_input <- purrr::pmap(list(dat_input, dat_db, dat_station),
      function(a, b, c) {
        list(
          dat_input = a,
          dat_db = b,
          dat_station = c
        )
      })
    df_estimate <- dat_estimation_input %>%
      purrr::map(model_estimate_inner_v2)
  }

  dat_input <- dat_input %>%
    dplyr::mutate(
      id = str_glue("time={fct_scaling_time}_dist={fct_scaling_dist}")
    ) %>%
    dplyr::select(
      id,
      everything()
    )
  dat_input <- purrrlyr::by_row(dat_input,
    function(v) list(v)[[1L]], .collate = "list")$.out

  df_estimate <- dat_input %>%
    purrr::map(model_estimate_inner_v3,
      dat_db = dat_db, dat_station = dat_station)

  df_estimate
}

model_apply_v2 <- function(
  model_estimation,
  dat_station,
  knn
) {
  .Deprecated("model_apply_v3")
  # dat_result <- lapply(model_estimation, function(estimation) {
  model_output_list <- model_estimation %>%
    purrr::map(function(estimation) {
      # Ensure matrix -----
      dat_db <- estimation$dat_db

      choice <- estimation$estimation_result %>%
        dplyr::filter(rank %in% 1:knn)

      dat_input <- estimation$dat_input
      v_id <- dat_input %>%
        dplyr::distinct(id) %>%
        dplyr::pull()
      dat_input <- purrr::map_df(seq_len(knn), ~dat_input %>%
          dplyr::select(-id)) %>%
        as.matrix()

      # Prediction data -----
      foo <- function(choice, dat_db) {
        dat_db %>%
          dplyr::filter(
            dim_station %in% choice$dim_station
          )
      }

      dat_output <- choice %>%
        dplyr::do(foo(., dat_db)) %>%
        dplyr::ungroup() %>%
        select(dim_station, time_month, matches("msr_")) %>%
        mutate(
          # diff_time_month = time_month - dat_input %>%
          #   dplyr::pull(time_month),
          # diff_msr_temp_min = msr_temp_min - dat_input %>%
          #   dplyr::pull(msr_temp_min),
          # diff_msr_temp_max = msr_temp_max - dat_input %>%
          #   dplyr::pull(msr_temp_max),
          # diff_msr_temp_avg = msr_temp_avg - dat_input %>%
          #   dplyr::pull(msr_temp_avg),
          # diff_msr_precip_min = msr_precip_min - dat_input %>%
          #   dplyr::pull(msr_precip_min),
          # diff_msr_precip_max = msr_precip_max - dat_input %>%
          #   dplyr::pull(msr_precip_max),
          # diff_msr_precip_avg = msr_precip_avg - dat_input %>%
          #   dplyr::pull(msr_precip_avg),
          # diff_msr_sundur_avg = msr_sundur_avg - dat_input %>%
          #   dplyr::pull(msr_sundur_avg)
          diff_time_month = time_month - dat_input["time_month"],
          diff_msr_temp_min = msr_temp_min - dat_input["msr_temp_min"],
          diff_msr_temp_max = msr_temp_max - dat_input["msr_temp_max"],
          diff_msr_temp_avg = msr_temp_avg - dat_input["msr_temp_avg"],
          diff_msr_precip_min = msr_precip_min - dat_input["msr_precip_min"],
          diff_msr_precip_max = msr_precip_max - dat_input["msr_precip_max"],
          diff_msr_precip_avg = msr_precip_avg - dat_input["msr_precip_avg"],
          diff_msr_sundur_avg = msr_sundur_avg - dat_input["msr_sundur_avg"]
        )

      # dat_output <- dat_output %>% mutate_if(is.double, round, 1)
      dat_output <- dat_output %>% mutate_at(vars(matches("^diff")), round, 1)

      dat_output <- bind_cols(
        choice %>%
          dplyr::ungroup() %>%
          dplyr::select(dim_rank = rank),
        dat_output
      ) %>%
        dplyr::mutate(
          id = v_id
        ) %>%
        dplyr::select(
          id,
          dim_rank,
          everything()
        )

      dat_distance_geo <- estimation$dat_distance_geo

      # Join stations -----
      dat_output_2 <- inner_join(
        dat_output,
        dat_station,
        by = "dim_station"
      ) %>%
        left_join(
          dat_distance_geo,
          by = "dim_station"
        ) %>%
        select(
          id,
          dim_rank,
          dim_country,
          dim_station_name,
          msr_distance,
          time_month,
          diff_time_month,
          msr_temp_min,
          diff_msr_temp_min,
          msr_temp_max,
          diff_msr_temp_max,
          msr_temp_avg,
          diff_msr_temp_avg,
          msr_precip_min,
          diff_msr_precip_min,
          msr_precip_max,
          diff_msr_precip_max,
          msr_precip_avg,
          diff_msr_precip_avg,
          msr_sundur_avg,
          diff_msr_sundur_avg,
          dim_station,
          dim_latitude,
          dim_longitude,
          dim_high,
          everything()
        )

      # Return -----
      list(
        model_input = dat_input %>% tibble::as_tibble(),
        model_output = dat_output_2
      )
    })
  # names(model_output_list) <- paste0("input_", 1:length(model_output_list))

  model_output <- list(
    model_input = purrr::map_df(model_output_list, "model_input"),
    model_output = purrr::map_df(model_output_list, "model_output")
  )
}

model_estimate_inner_v3 <- function(
  dat_input,
  dat_db,
  dat_station
) {
  # Compute geo distance -----
  dat_msr_distance <- compute_geo_distance_v3(
    p_1 = dat_input %>%
      dplyr::select(matches("latitude|longitude")) %>%
      dplyr::summarise_all(unique),
    p_2 = dat_station
  )

  # Handle distance input -----
  # Only keep stations that have distance <= user input
  dat_list <- handle_input_distance_v2(
    dat_input = dat_input,
    dat_msr_distance = dat_msr_distance
  )
  # Update input and distance data:
  dat_input <- dat_list$dat_input
  dat_input_out <- dat_input
  dat_msr_distance <- dat_list$dat_msr_distance

  v_fct_scaling_dist <- dat_input %>%
    dplyr::distinct(fct_scaling_dist) %>%
    dplyr::pull(fct_scaling_dist)
  tmp <- tibble::tibble(
    dim_station = dat_msr_distance %>% dplyr::pull(dim_station) %>%
      rep(length(v_fct_scaling_dist)) %>%
      sort(),
    fct_scaling_dist = v_fct_scaling_dist %>%
      rep(nrow(dat_msr_distance))
  ) %>%
    dplyr::arrange(desc(fct_scaling_dist))
  dat_msr_distance <- dat_msr_distance %>%
    left_join(tmp, by = "dim_station")

  # Scale -----
  dat_msr_distance <- dat_msr_distance %>%
    dplyr::group_by(fct_scaling_dist) %>%
    dplyr::do(
      pipe_compute_geo_distance_v6(.)
    ) %>%
    ungroup()

  dat_time <- dat_db %>%
    dplyr::select(
      dim_station,
      time_month
    ) %>%
    dplyr::mutate(
      fct_scaling_time = dat_input %>%
        dplyr::pull(fct_scaling_time) %>%
        unique()
    ) %>%
    dplyr::group_by(fct_scaling_time) %>%
    dplyr::do(
      pipe_compute_time_v6(.)
    ) %>%
    ungroup()

  # Handle DB -----
  dat_list <- model_handle_input_db_v3(
    dat_db = dat_db,
    dat_time = dat_time,
    dat_msr_distance = dat_msr_distance
  )
  dat_db <- dat_list$dat_db
  dat_db_out <- dat_list$dat_db_out

  # Create geo distance data frame ----
  dat_distance_geo <- model_handle_dat_dist_geo_v2(dat_db)

  # Early exit -----
  if (!nrow(dat_db)) {
    model_result <- list(
      estimation_result= tibble::tibble(
        fct_scaling = NA,
        dim_station = NA,
        index = NA,
        rank = NA,
        estimation_result = NA
      ),
      dat_distance_geo = dat_distance_geo,
      dat_input = dat_input_out,
      dat_db = dat_db_out
    )
    return(model_result)
  }

  # Scaling factors -----
  fct_scaling_dist <- dat_input %>%
    dplyr::distinct(fct_scaling_dist) %>%
    dplyr::pull()
  fct_scaling_time <- dat_input %>%
    dplyr::distinct(fct_scaling_time) %>%
    dplyr::pull()

  # Drop all dims -----
  dat_list <- model_handle_column_alignment(dat_db, dat_input)
  dat_input <- dat_list$dat_input
  dat_db <- dat_list$dat_db

  # Align dimensions of input -----
  dat_input <- purrr::map_df(
    seq_len(dat_db %>% nrow()), ~dat_input
  )

  # Compute euclidean distance -----
  # purrr::map2(dat_db, dat_input, pmap_inner_2)
  dat_input_mat <- dat_input %>%
    as.matrix()
  dat_db_mat <- dat_db %>%
    as.matrix()
  vec_estimation_result <- sapply(1:nrow(dat_db), function(row) {
    dat_db_this <- dat_db_mat[row, , drop = FALSE]
    dat_input_this <- dat_input_mat[row, , drop = FALSE]
    res <- philentropy:::euclidean(
      dat_input_this, dat_db_this, testNA = FALSE)
  })

  estimation_result <- tibble::tibble(
    # fct_scaling_time = fct_scaling_time,
    # fct_scaling_dist = fct_scaling_dist,
    dim_station = dat_db_out$dim_station,
    estimation_result = vec_estimation_result
  ) %>%
    # dplyr::mutate(
    #   id_scaling = str_glue("time={fct_scaling_time}_dist={fct_scaling_dist}")
    # ) %>%
    # dplyr::select(
    #   id_scaling,
    #   everything()
    # ) %>%
    # dplyr::select(
    #   -fct_scaling_time,
    #   -fct_scaling_dist,
    # ) %>%
  # Add distances for arranging:
  dplyr::left_join(dat_distance_geo,
    by = "dim_station"
  ) %>%
    dplyr::select(
      -fct_scaling_dist,
    ) %>%
    # Add position index:
    dplyr::mutate(index = row_number()) %>%
    dplyr::arrange(estimation_result, msr_distance) %>%
    dplyr::mutate(rank = row_number()) %>%
    dplyr::select(
      dim_station,
      index,
      rank,
      estimation_result
    )

  list(
    estimation_result= estimation_result,
    dat_distance_geo = dat_distance_geo,
    dat_input = dat_input_out,
    dat_db = dat_db_out
  )
}

model_run_v5 <- function(
  dat_input,
  dat_db,
  dat_station,
  knn
) {
  .Deprecated("model_run_v6")
  withProgress(message = 'Getting recommendations', value = 0, {

    incProgress(1/3, detail = "Estimating...")
    # Add row UID -----
    dat_db <- dat_db %>%
      dplyr::mutate(
        uid = 1:n()
      )
    # TODO 20181206: put in own function and find best place to call it

    # Model estimation -----
    model_estimation <- model_estimate_v6(
      dat_input = dat_input,
      dat_db = dat_db,
      dat_station = dat_station
    )


    incProgress(2/3, detail = "Selecting...")
    # Model output -----
    model_output <- model_apply_v3(
      model_estimation,
      dat_station = dat_station,
      knn = knn
    )

    incProgress(3/3, detail = "Done...")

    model_output
  })

  model_output
}

model_estimate_v6 <- function(
  dat_input,
  dat_db,
  dat_station
) {
  .Deprecated("model_estimate_v7")
  # Get scaling factor(s) from settings -----
  if (TRUE) {
    dat_fct_scaling = tibble::tibble(
      fct_scaling_dist = unlist(settings$scaling$distances) %>%
        rep(nrow(dat_input)),
      fct_scaling_time = unlist(settings$scaling$time) %>%
        rep(nrow(dat_input))
    )

    # Expand -----
    dat_fct_scaling <- dat_fct_scaling %>%
      tidyr::expand(fct_scaling_dist, fct_scaling_time)

    # Add columns for join -----
    # dat_fct_scaling <- dat_fct_scaling %>%
    #   dplyr::mutate(
    #     time_month = dat_input %>%
    #       dplyr::select_at(if("time_month" %in% names(.))
    #         "time_month" else integer(0)) %>%
    #       dplyr::pull()
    #   )
    # KEEP AS REFERENCE
    dat_fct_scaling <- dat_fct_scaling %>%
      dplyr::mutate(
        time_month = dat_input %>%
          dplyr::pull(time_month),
        msr_distance = dat_input %>%
          dplyr::pull(msr_distance)
      )

    dat_input <- dat_input %>%
      # dplyr::select(time_month) %>%
      dplyr::left_join(
        dat_fct_scaling,
        by = intersect(names(dat_input), names(dat_fct_scaling))
      ) %>%
      dplyr::mutate(
        time_month_scaled = time_month * fct_scaling_time,
        msr_distance_scaled = msr_distance * fct_scaling_dist
      )
  } else {
    stop("Single set of scaling factors for all vars to be scaled not implemented yet")
    dat_fct_scaling = tibble::tibble(
      dat_fct_scaling = unlist(settings$scaling$distances)
    )
  }

  # dat_list <- model_handle_input_time_v2(
  #   dat_input = dat_input,
  #   dat_db = dat_db
  # )
  # dat_db <- dat_list$dat_db

  # Prepare input for subsequent {purrr} pipes -----
  # dat_estimation_input <- model_prepare_estimation_input(
  #   dat_input,
  #   dat_db,
  #   dat_station
  # )
  #
  # # Compute distance for each input vector -----
  # df_estimate <- dat_estimation_input %>%
  #   purrr::map(model_estimate_inner_v2)

  if (FALSE) {
    dat_db <- purrr::map(
      seq_len(nrow(dat_input)), ~dat_db
    )
    dat_station <- purrr::map(
      seq_len(nrow(dat_input)), ~dat_station
    )
    dat_input <- purrrlyr::by_row(dat_input,
      function(v) list(v)[[1L]], .collate = "list")$.out

    dat_estimation_input <- purrr::pmap(list(dat_input, dat_db, dat_station),
      function(a, b, c) {
        list(
          dat_input = a,
          dat_db = b,
          dat_station = c
        )
      })
    df_estimate <- dat_estimation_input %>%
      purrr::map(model_estimate_inner_v2)
  }

  dat_input <- dat_input %>%
    dplyr::mutate(
      id = str_glue("time={fct_scaling_time}_dist={fct_scaling_dist}")
    ) %>%
    dplyr::select(
      id,
      everything()
    )
  dat_input <- purrrlyr::by_row(dat_input,
    function(v) list(v)[[1L]], .collate = "list")$.out

  df_estimate <- dat_input %>%
    purrr::map(model_estimate_inner_v4,
      dat_db = dat_db, dat_station = dat_station)

  df_estimate
}

#' @export
model_run_v6 <- function(
  dat_input,
  dat_db,
  dat_station,
  knn
) {
  .Deprecated("model_run_v7")
  withProgress(message = 'Getting recommendations', value = 0, {
    incProgress(1/3, detail = "Estimating...")

    # Add row UID -----
    dat_db <- dat_db %>%
      dplyr::mutate(
        uid = 1:n()
      )
    # TODO 20181206: put in own function and find best place to call it

    # Model estimation -----
    model_estimation <- model_estimate_v7(
      dat_input = dat_input,
      dat_db = dat_db,
      dat_station = dat_station
    )


    incProgress(2/3, detail = "Selecting...")
    # Model output -----
    model_output <- model_apply_v3(
      model_estimation,
      dat_station = dat_station,
      knn = knn
    )

    incProgress(3/3, detail = "Done...")

    model_output
  })

  model_output
}

#' @export
model_estimate_v7 <- function(
  dat_input,
  dat_db,
  dat_station
) {
  .Deprecated("model_estimate_v8")
  # Get scaling factor(s) from settings -----
  if (TRUE) {
    # Expand scaling factor combinations -----
    # dat_fct_scaling = tibble::tibble(
    #   fct_scaling_dist = unlist(settings$scaling$distances) %>%
    #     rep(nrow(dat_input)),
    #   fct_scaling_time = unlist(settings$scaling$time) %>%
    #     rep(nrow(dat_input))
    # )
    # dat_fct_scaling <- dat_fct_scaling %>%
    #   tidyr::expand(fct_scaling_dist, fct_scaling_time)
    # KEEP AS REFERENCE

    v_fct_scaling_dist <- unlist(settings$scaling$distances) %>%
      rep(nrow(dat_input))
    v_fct_scaling_time <- unlist(settings$scaling$time) %>%
      rep(nrow(dat_input))
    dat_fct_scaling <- expand.grid(v_fct_scaling_dist, v_fct_scaling_time) %>%
      tibble::as_tibble()
    names(dat_fct_scaling) <- c("fct_scaling_dist", "fct_scaling_time")
    dat_fct_scaling <- dat_fct_scaling %>%
      dplyr::arrange(fct_scaling_dist, fct_scaling_time)

    # Add columns for join -----
    # dat_fct_scaling <- dat_fct_scaling %>%
    #   dplyr::mutate(
    #     time_month = dat_input %>%
    #       dplyr::select_at(if("time_month" %in% names(.))
    #         "time_month" else integer(0)) %>%
    #       dplyr::pull()
    #   )
    # KEEP AS REFERENCE

    dat_fct_scaling <- dat_fct_scaling %>%
      dplyr::mutate(
        time_month = dat_input %>%
          dplyr::pull(time_month),
        msr_distance = dat_input %>%
          dplyr::pull(msr_distance)
      )

    dat_input <- dat_input %>%
      # dplyr::select(time_month) %>%
      dplyr::left_join(
        dat_fct_scaling,
        by = intersect(names(dat_input), names(dat_fct_scaling))
      ) %>%
      dplyr::mutate(
        time_month_scaled = time_month * fct_scaling_time,
        msr_distance_scaled = msr_distance * fct_scaling_dist
      )
  } else {
    stop("Single set of scaling factors for all vars to be scaled not implemented yet")
    dat_fct_scaling = tibble::tibble(
      dat_fct_scaling = unlist(settings$scaling$distances)
    )
  }

  # dat_list <- model_handle_input_time_v2(
  #   dat_input = dat_input,
  #   dat_db = dat_db
  # )
  # dat_db <- dat_list$dat_db

  # Prepare input for subsequent {purrr} pipes -----
  # dat_estimation_input <- model_prepare_estimation_input(
  #   dat_input,
  #   dat_db,
  #   dat_station
  # )
  #
  # # Compute distance for each input vector -----
  # df_estimate <- dat_estimation_input %>%
  #   purrr::map(model_estimate_inner_v2)

  if (FALSE) {
    dat_db <- purrr::map(
      seq_len(nrow(dat_input)), ~dat_db
    )
    dat_station <- purrr::map(
      seq_len(nrow(dat_input)), ~dat_station
    )
    dat_input <- purrrlyr::by_row(dat_input,
      function(v) list(v)[[1L]], .collate = "list")$.out

    dat_estimation_input <- purrr::pmap(list(dat_input, dat_db, dat_station),
      function(a, b, c) {
        list(
          dat_input = a,
          dat_db = b,
          dat_station = c
        )
      })
    df_estimate <- dat_estimation_input %>%
      purrr::map(model_estimate_inner_v2)
  }

  dat_input <- dat_input %>%
    dplyr::mutate(
      id = str_glue("time={fct_scaling_time}_dist={fct_scaling_dist}")
    ) %>%
    dplyr::select(
      id,
      everything()
    )
  dat_input <- purrrlyr::by_row(dat_input,
    function(v) list(v)[[1L]], .collate = "list")$.out

  df_estimate <- dat_input %>%
    purrr::map(model_estimate_inner_v4,
      dat_db = dat_db, dat_station = dat_station)

  df_estimate
}

#' @export
model_estimate_inner_v4 <- function(
  dat_input,
  dat_db,
  dat_station
) {
  .Deprecated("model_estimate_inner")
  # if (dat_input$id == "time=0_dist=0.0003") {
  # if (dat_input$id == "time=0.25_dist=1") {
  #   message("DEBUG")
  #   message("DEBUG")
  # }

  # Compute geo distance -----
  dat_msr_distance <- compute_geo_distance_v3(
    p_1 = dat_input %>%
      dplyr::select(matches("latitude|longitude")) %>%
      dplyr::summarise_all(unique),
    p_2 = dat_station
  )

  # Handle distance input -----
  # Only keep stations that have distance <= user input
  dat_list <- handle_input_distance_v2(
    dat_input = dat_input,
    dat_msr_distance = dat_msr_distance
  )
  # Update input and distance data:
  dat_input <- dat_list$dat_input
  dat_input_out <- dat_input
  dat_msr_distance <- dat_list$dat_msr_distance

  v_fct_scaling_dist <- dat_input %>%
    dplyr::distinct(fct_scaling_dist) %>%
    dplyr::pull(fct_scaling_dist)
  tmp <- tibble::tibble(
    dim_station = dat_msr_distance %>% dplyr::pull(dim_station) %>%
      rep(length(v_fct_scaling_dist)) %>%
      sort(),
    fct_scaling_dist = v_fct_scaling_dist %>%
      rep(nrow(dat_msr_distance))
  ) %>%
    dplyr::arrange(desc(fct_scaling_dist))
  dat_msr_distance <- dat_msr_distance %>%
    left_join(tmp, by = "dim_station")

  # Scale -----
  dat_msr_distance <- dat_msr_distance %>%
    dplyr::group_by(fct_scaling_dist) %>%
    dplyr::do(
      pipe_compute_geo_distance_v6(.)
    ) %>%
    ungroup()

  dat_time <- dat_db %>%
    dplyr::select(
      dim_station,
      time_month
    ) %>%
    dplyr::mutate(
      fct_scaling_time = dat_input %>%
        dplyr::pull(fct_scaling_time) %>%
        unique()
    ) %>%
    dplyr::group_by(fct_scaling_time) %>%
    dplyr::do(
      pipe_compute_time_v6(.)
    ) %>%
    ungroup()

  # Handle DB -----
  dat_list <- model_handle_input_db_v3(
    dat_db = dat_db,
    dat_time = dat_time,
    dat_msr_distance = dat_msr_distance
  )
  dat_db <- dat_list$dat_db
  dat_db_out <- dat_list$dat_db_out

  # Create geo distance data frame ----
  dat_distance_geo <- model_handle_dat_dist_geo_v2(dat_db)

  # Early exit -----
  if (!nrow(dat_db)) {
    model_result <- list(
      estimation_result= tibble::tibble(
        fct_scaling = NA,
        dim_station = NA,
        index = NA,
        rank = NA,
        estimation_result = NA
      ),
      dat_distance_geo = dat_distance_geo,
      dat_input = dat_input_out,
      dat_db = dat_db_out
    )
    return(model_result)
  }

  # Scaling factors -----
  fct_scaling_dist <- dat_input %>%
    dplyr::distinct(fct_scaling_dist) %>%
    dplyr::pull()
  fct_scaling_time <- dat_input %>%
    dplyr::distinct(fct_scaling_time) %>%
    dplyr::pull()

  # Drop all dims -----
  dat_list <- model_handle_column_alignment(dat_db, dat_input)
  dat_input <- dat_list$dat_input
  dat_db <- dat_list$dat_db

  # Align dimensions of input -----
  dat_input <- purrr::map_df(
    seq_len(dat_db %>% nrow()), ~dat_input
  )

  # Compute euclidean distance -----
  # purrr::map2(dat_db, dat_input, pmap_inner_2)
  dat_input_mat <- dat_input %>%
    as.matrix()
  dat_db_mat <- dat_db %>%
    as.matrix()
  vec_estimation_result <- sapply(1:nrow(dat_db), function(row) {
    dat_db_this <- dat_db_mat[row, , drop = FALSE]
    dat_input_this <- dat_input_mat[row, , drop = FALSE]
    res <- philentropy:::euclidean(
      dat_input_this, dat_db_this, testNA = FALSE)
  })

  estimation_result <- tibble::tibble(
    # fct_scaling_time = fct_scaling_time,
    # fct_scaling_dist = fct_scaling_dist,
    dim_station = dat_db_out$dim_station,
    uid = dat_db_out$uid,
    estimation_result = vec_estimation_result
  ) %>%
    # dplyr::mutate(
    #   id_scaling = str_glue("time={fct_scaling_time}_dist={fct_scaling_dist}")
    # ) %>%
    # dplyr::select(
    #   id_scaling,
    #   everything()
    # ) %>%
    # dplyr::select(
    #   -fct_scaling_time,
    #   -fct_scaling_dist,
    # ) %>%
  # Add distances for arranging:
  dplyr::left_join(dat_distance_geo,
    by = "dim_station"
  ) %>%
    dplyr::select(
      -fct_scaling_dist,
    ) %>%
    # Add position index:
    dplyr::mutate(index = row_number()) %>%
    dplyr::arrange(estimation_result, msr_distance) %>%
    dplyr::mutate(rank = row_number()) %>%
    dplyr::select(
      dim_station,
      uid,
      index,
      rank,
      estimation_result
    )

  list(
    estimation_result= estimation_result,
    dat_distance_geo = dat_distance_geo,
    dat_input = dat_input_out,
    dat_db = dat_db_out
  )
}

pipe_compute_time_v6 <- function(
  dat_time
) {
  .Deprecated("pipe_compute_time_v7")
  dat_time %>%
    dplyr::mutate(
      time_month_scaled = time_month * fct_scaling_time
    )
}

model_estimate_prime_v1 <- function(
  dat_input,
  dat_db,
  dat_station
) {
  .Deprecated("model_estimate_prime_v2")
  dat_input <- dat_input %>%
    dplyr::mutate(
      id = "0"
    ) %>%
    dplyr::select(
      id,
      everything()
    )

  # Filter based on time input -----
  dat_list <- model_handle_input_time_v2(
    dat_input = dat_input,
    dat_db = dat_db
  )
  dat_db <- dat_list$dat_db

  df_estimate <- model_estimate_prime_inner_v1(
    dat_input = dat_input,
    dat_db = dat_db,
    dat_station = dat_station
  )

  list(df_estimate)
}

model_estimate_prime_inner_v1 <- function(
  dat_input,
  dat_db,
  dat_station
) {
  .Deprecated("model_estimate_prime_inner_v2")
  # Compute geo distance -----
  dat_msr_distance <- compute_geo_distance_v3(
    p_1 = dat_input %>%
      dplyr::select(matches("latitude|longitude")) %>%
      dplyr::summarise_all(unique),
    p_2 = dat_station
  )

  # Handle distance input -----
  # Only keep stations that have distance <= user input
  dat_list <- handle_input_distance_v2(
    dat_input = dat_input,
    dat_msr_distance = dat_msr_distance
  )
  # Update input and distance data:
  dat_input <- dat_list$dat_input
  dat_input_out <- dat_input
  dat_msr_distance <- dat_list$dat_msr_distance

  dat_time <- dat_db %>%
    dplyr::select(
      dim_station,
      time_month
    )

  # Handle DB -----
  dat_list <- model_handle_input_db_v3(
    dat_db = dat_db,
    dat_time = dat_time,
    dat_msr_distance = dat_msr_distance
  )
  dat_db <- dat_list$dat_db
  dat_db_out <- dat_list$dat_db_out

  # Create geo distance data frame ----
  dat_distance_geo <- model_prime_handle_dat_dist_geo_v1(dat_db)

  # Early exit -----
  if (!nrow(dat_db)) {
    model_result <- list(
      estimation_result= tibble::tibble(
        dim_station = NA,
        index = NA,
        rank = NA,
        estimation_result = NA
      ),
      dat_distance_geo = dat_distance_geo,
      dat_input = dat_input_out,
      dat_db = dat_db_out
    )
    return(model_result)
  }

  # Drop all dims -----
  dat_list <- model_handle_column_alignment(dat_db, dat_input)
  dat_input <- dat_list$dat_input
  dat_db <- dat_list$dat_db

  # Align dimensions of input -----
  dat_input <- purrr::map_df(
    seq_len(dat_db %>% nrow()), ~dat_input
  )

  # Compute euclidean distance -----
  # purrr::map2(dat_db, dat_input, pmap_inner_2)
  dat_input_mat <- dat_input %>%
    as.matrix()
  dat_db_mat <- dat_db %>%
    as.matrix()
  vec_estimation_result <- sapply(1:nrow(dat_db), function(row) {
    dat_db_this <- dat_db_mat[row, , drop = FALSE]
    dat_input_this <- dat_input_mat[row, , drop = FALSE]
    res <- philentropy:::euclidean(
      dat_input_this, dat_db_this, testNA = FALSE)
  })

  estimation_result <- tibble::tibble(
    dim_station = dat_db_out$dim_station,
    uid = dat_db_out$uid,
    estimation_result = vec_estimation_result
  ) %>%
    # Add distances for arranging:
    dplyr::left_join(dat_distance_geo,
      by = "dim_station"
    ) %>%
    # Add position index:
    dplyr::mutate(index = row_number()) %>%
    dplyr::arrange(estimation_result, msr_distance) %>%
    dplyr::mutate(rank = row_number()) %>%
    dplyr::select(
      dim_station,
      uid,
      index,
      rank,
      # msr_distance,
      estimation_result
    )

  list(
    estimation_result= estimation_result,
    dat_distance_geo = dat_distance_geo,
    dat_input = dat_input_out,
    dat_db = dat_db_out
  )
}

model_run_v7 <- function(
  dat_input,
  dat_db,
  dat_station,
  knn,
  session
) {
  .Deprecated("model_run")
  shiny::withProgress(message = 'Getting recommendations', value = 0,
    session = session,
    {
      shiny::incProgress(1/3, detail = "Estimating...", session = session)

      # Add row UID -----
      dat_db <- dat_db %>%
        dplyr::mutate(
          uid = 1:n()
        )
      # TODO 20181206: put in own function and find best place to call it

      # Model estimation -----
      model_estimation <- model_estimate_v8(
        dat_input = dat_input,
        dat_db = dat_db,
        dat_station = dat_station
      )


      shiny::incProgress(2/3, detail = "Selecting...", session = session)
      # Model output -----
      model_output <- model_apply_v3(
        model_estimation,
        dat_station = dat_station,
        knn = knn
      )

      shiny::incProgress(3/3, detail = "Done...", session = session)

      model_output
    })

  model_output
}

#' @export
# model_run <- function(
#   dat_input,
#   dat_db,
#   dat_station,
#   dist_measures,
#   dist_measure_final,
#   knn
# ) {
#   # Model estimation --------------------------------------------------------
#
#   model_estimation <- model_estimate(
#     dat_input = dat_input,
#     dat_db = dat_db_msr,
#     dat_station = dat_station,
#     dist_measures = dist_measures
#   )
#   # model_estimation[[1]][[1]]
#
#   # NOTE:
#   # * First level: data input vectors
#   # * Second level: station
#   # * Third level: distance measure values
#
#   # Identify best choices ---------------------------------------------------
#
#   model_prediction_index <- model_predict_index_v3(
#     model_estimation = model_estimation,
#     knn = knn
#   )
#   # model_prediction_index
#
#   # Model prediction --------------------------------------------------------
#
#   model_prediction <- model_predict_v3(
#     model_prediction_index,
#     # dat_input = dat_input,
#     # dat_db = dat_db,
#     dat_station = dat_station,
#     knn = knn
#   )
#
#   model_prediction_ensemble <- model_predict_ensemble(
#     model_prediction = model_prediction,
#     dist_measure = dist_measure_final
#   )
#   # model_prediction_ensemble[[1]]$prediction_ensemble$dim_dist_measure
#
#   # Model output ------------------------------------------------------------
#
#   # model_result <- model_output(model_prediction_ensemble)
#
#   model_result_gathered <- model_output_gathered(model_prediction_ensemble)
#
#   if (settings$output$show) {
#     print("Show:")
#     print(settings$output$show)
#     model_result_render(
#       model_result_gathered,
#       knn = knn,
#       dist_measures = dist_measures,
#       dist_measure_final = dist_measure_final
#     )
#   }
#
#   model_result_gathered
# }

#' #' @export
#' model_estimate <- function(
#'   dat_input,
#'   dat_db,
#'   dist_measures = default_dist_measures()
#' ) {
#'   # Argument handling -----
#'   dist_measures <- match.arg(dist_measures, several.ok = TRUE)
#'
#'   # Ensure matrix -----
#'   dat_input <- as.matrix(dat_input)
#'   dat_db <- as.matrix(dat_db)
#'
#'   # Compute distance for each input row/vector -----
#'   dat_dist <- lapply(1:nrow(dat_input), function(row_user) {
#'     dat_input_row <- dat_input[row_user, , drop = FALSE]
#'
#'     lapply(1:nrow(dat_db), function(row) {
#'       dat_db_this <- dat_db[row, colnames(dat_input_row), drop = FALSE]
#'
#'       # List availble distance measures
#'       # philentropy::getDistMethods()
#'
#'       res <- list(
#'         euclidean = if ("euclidean" %in% dist_measures) {
#'           philentropy:::euclidean(
#'             dat_input_row, dat_db_this, testNA = FALSE)
#'         },
#'         manhattan = if ("manhattan" %in% dist_measures) {
#'           philentropy:::manhattan(
#'             dat_input_row, dat_db_this, testNA = FALSE)
#'         },
#'         jaccard = if ("jaccard" %in% dist_measures) {
#'           philentropy:::jaccard(
#'             dat_input_row, dat_db_this, testNA = FALSE)
#'         },
#'         avg = if ("avg" %in% dist_measures) {
#'           philentropy:::avg(
#'             dat_input_row, dat_db_this, testNA = FALSE)
#'         },
#'         squared_euclidean = if ("squared_euclidean" %in% dist_measures) {
#'           philentropy:::squared_euclidean(
#'             dat_input_row, dat_db_this, testNA = FALSE)
#'         },
#'         chebyshev = if ("chebyshev" %in% dist_measures) {
#'           philentropy:::chebyshev(
#'             dat_input_row, dat_db_this, testNA = FALSE)
#'         }
#'       )
#'       res[!sapply(res, is.null)]
#'     })
#'   })
#' }

#' @export
model_apply_v1 <- function(
  model_estimation,
  dat_station,
  knn
) {
  # dat_result <- lapply(model_estimation, function(estimation) {
  model_output_list <- model_estimation %>%
    purrr::map(function(estimation) {
      # Ensure matrix -----
      dat_db <- estimation$dat_db

      choice <- estimation$estimation_result %>%
        dplyr::group_by(fct_scaling) %>%
        dplyr::filter(rank %in% 1:knn)

      dat_input <- estimation$dat_input
      fct_scaling <- choice %>%
        dplyr::ungroup() %>%
        # dplyr::pull(fct_scaling) %>%
        dplyr::distinct(fct_scaling) %>%
        dplyr::pull()
      factor_align_input <- fct_scaling %>%
        dplyr::n_distinct()
      dat_input <- purrr::map_df(seq_len(factor_align_input * knn), ~dat_input) %>%
        dplyr::mutate(
          fct_scaling = rep(fct_scaling, knn) %>% sort()
        ) %>%
        dplyr::group_by(fct_scaling) %>%
        as.matrix()

      # Prediction data -----
      foo <- function(choice, dat_db) {
        dat_db %>%
          dplyr::filter(
            dim_station %in% choice$dim_station
          )
      }
      dat_output <- choice %>%
        dplyr::do(foo(., dat_db)) %>%
        dplyr::ungroup() %>%
        select(dim_station, time_month, matches("msr_")) %>%
        mutate(
          # diff_time_month = time_month - dat_input %>%
          #   dplyr::pull(time_month),
          # diff_msr_temp_min = msr_temp_min - dat_input %>%
          #   dplyr::pull(msr_temp_min),
          # diff_msr_temp_max = msr_temp_max - dat_input %>%
          #   dplyr::pull(msr_temp_max),
          # diff_msr_temp_avg = msr_temp_avg - dat_input %>%
          #   dplyr::pull(msr_temp_avg),
          # diff_msr_precip_min = msr_precip_min - dat_input %>%
          #   dplyr::pull(msr_precip_min),
          # diff_msr_precip_max = msr_precip_max - dat_input %>%
          #   dplyr::pull(msr_precip_max),
          # diff_msr_precip_avg = msr_precip_avg - dat_input %>%
          #   dplyr::pull(msr_precip_avg),
          # diff_msr_sundur_avg = msr_sundur_avg - dat_input %>%
          #   dplyr::pull(msr_sundur_avg)
          diff_time_month = time_month - dat_input["time_month"],
          diff_msr_temp_min = msr_temp_min - dat_input["msr_temp_min"],
          diff_msr_temp_max = msr_temp_max - dat_input["msr_temp_max"],
          diff_msr_temp_avg = msr_temp_avg - dat_input["msr_temp_avg"],
          diff_msr_precip_min = msr_precip_min - dat_input["msr_precip_min"],
          diff_msr_precip_max = msr_precip_max - dat_input["msr_precip_max"],
          diff_msr_precip_avg = msr_precip_avg - dat_input["msr_precip_avg"],
          diff_msr_sundur_avg = msr_sundur_avg - dat_input["msr_sundur_avg"]
        ) %>%
        dplyr::mutate(
          fct_scaling = rep(fct_scaling, knn) %>% sort()
        ) %>%
        dplyr::select(
          fct_scaling,
          everything()
        )

      # dat_output <- dat_output %>% mutate_if(is.double, round, 1)
      dat_output <- dat_output %>% mutate_at(vars(matches("^diff")), round, 1)

      dat_output <- bind_cols(
        choice %>%
          dplyr::ungroup() %>%
          dplyr::select(dim_rank = rank),
        dat_output
      ) %>%
        dplyr::select(
          fct_scaling,
          dim_rank,
          everything()
        )
      # dat_output %>% dplyr::left_join(
      #   choice,
      #   by = c("fct_scaling", "dim_station")
      # ) %>%
      #   dplyr::select(
      #     fct_scaling,
      #     dim_station,
      #     rank,
      #     everything()
      #   ) %>%
      #   dplyr::group_by(fct_scaling) %>%
      #   dplyr::arrange(fct_scaling, rank) %>%
      #   dplyr::ungroup()

      dat_distance_geo <- estimation$dat_distance_geo

      # Join stations -----
      dat_output_2 <- inner_join(
        dat_output,
        dat_station,
        by = "dim_station"
      ) %>%
        left_join(
          dat_distance_geo,
          by = c("fct_scaling", "dim_station")
        ) %>%
        select(
          fct_scaling,
          dim_rank,
          dim_country,
          dim_station_name,
          msr_distance,
          time_month,
          diff_time_month,
          msr_temp_min,
          diff_msr_temp_min,
          msr_temp_max,
          diff_msr_temp_max,
          msr_temp_avg,
          diff_msr_temp_avg,
          msr_precip_min,
          diff_msr_precip_min,
          msr_precip_max,
          diff_msr_precip_max,
          msr_precip_avg,
          diff_msr_precip_avg,
          msr_sundur_avg,
          diff_msr_sundur_avg,
          dim_station,
          dim_latitude,
          dim_longitude,
          dim_high,
          everything()
        )

      # Return -----
      list(
        model_input = dat_input %>% tibble::as_tibble(),
        model_output = dat_output_2
      )
    })
  # names(model_output_list) <- paste0("input_", 1:length(model_output_list))

  model_output <- list(
    model_input = purrr::map_df(model_output_list, "model_input"),
    model_output = purrr::map_df(model_output_list, "model_output")
  )
}
rappster/climater documentation built on July 1, 2019, 12:23 a.m.