R/modelling.R

# Distance measures -------------------------------------------------------

#' @export
model_estimate_v2 <- function(
  dat_input,
  dat_db,
  dat_station,
  dist_measures = default_dist_measures()
) {
  # Argument handling -----
  dist_measures <- match.arg(dist_measures, several.ok = TRUE)

  # Ensure matrix -----
  # dim_station <- default_name("station") %>% quo_prepend("dim")
  dat_input <- dat_input %>%
    # dplyr::select(-dim_station) %>%
    as.matrix()

  # Compute distance for each input row/vector -----
  dat_dist <- lapply(1:nrow(dat_input), function(row_user) {
    dat_input_row <- dat_input[row_user, , drop = FALSE]

    # Distances -----
    dat_distance_geo <- data.frame(
      dim_station = unique(dat_station$dim_station),
      msr_distance = NA
    )

    cols <- c("dim_latitude", "dim_longitude", "msr_distance")
    if (all(cols %in% colnames(dat_input_row))) {
      # dat_input_row <- as.matrix(dat_input)
      msr_distance <- compute_geo_distance_v2(p_1 = dat_input_row, p_2 = dat_station)
      msr_distance$dim_station <- dat_station$dim_station
      dat_db <- left_join(dat_db, msr_distance, by = "dim_station")
      dat_distance_geo <- dat_db %>%
        select(dim_station, msr_distance) %>%
        group_by(dim_station) %>%
        summarise(msr_distance = unique(msr_distance)) %>%
        ungroup()
    }
    dat_db <- dat_db %>%
      select(-matches("dim_")) %>%
      as.matrix()
    # TODO-20180610: encapsulate in function

    # Drop all dims -----
    idx <- colnames(dat_input_row) %>% stringr::str_detect("dim")
    dat_input_row <- dat_input_row[ , !idx, drop = FALSE]

    ret <- lapply(1:nrow(dat_db), function(row) {
      dat_db_this <- dat_db[row, colnames(dat_input_row), drop = FALSE]

      # List availble distance measures
      # philentropy::getDistMethods()

      res <- list(
        euclidean = if ("euclidean" %in% dist_measures) {
          philentropy:::euclidean(
            dat_input_row, dat_db_this, testNA = FALSE)
        },
        manhattan = if ("manhattan" %in% dist_measures) {
          philentropy:::manhattan(
            dat_input_row, dat_db_this, testNA = FALSE)
        },
        jaccard = if ("jaccard" %in% dist_measures) {
          philentropy:::jaccard(
            dat_input_row, dat_db_this, testNA = FALSE)
        },
        avg = if ("avg" %in% dist_measures) {
          philentropy:::avg(
            dat_input_row, dat_db_this, testNA = FALSE)
        },
        squared_euclidean = if ("squared_euclidean" %in% dist_measures) {
          philentropy:::squared_euclidean(
            dat_input_row, dat_db_this, testNA = FALSE)
        },
        chebyshev = if ("chebyshev" %in% dist_measures) {
          philentropy:::chebyshev(
            dat_input_row, dat_db_this, testNA = FALSE)
        }
      )
      res[!sapply(res, is.null)]
    })

    list(
      distance_stat = ret,
      dat_distance_geo = dat_distance_geo
    )
    # TODO-20180705: tidyfy column name

  })
}

#' @export
model_estimate_prime <- function(
  dat_input,
  dat_db,
  dat_station
) {
  dat_input <- dat_input %>%
    dplyr::mutate(
      id = "0"
    ) %>%
    dplyr::select(
      id,
      everything()
    )

  # Filter based on time input -----
  dat_list <- model_handle_input_time(
    dat_input = dat_input,
    dat_db = dat_db
  )
  dat_db <- dat_list$dat_db

  df_estimate <- model_estimate_prime_inner(
    dat_input = dat_input,
    dat_db = dat_db,
    dat_station = dat_station
  )

  list(df_estimate)
}

#' @export
model_estimate <- function(
  dat_input,
  dat_db,
  dat_station,
  expand_weight_grid,
  msr_distance_prime,
  dat_model_output_prime_react
) {
  # Get scaling factor(s) from settings -----
  if (TRUE) {
    # Expand scaling factor combinations -----
    # dat_fct_scaling = tibble::tibble(
    #   fct_scaling_dist = unlist(settings$scaling$distances) %>%
    #     rep(nrow(dat_input)),
    #   fct_scaling_time = unlist(settings$scaling$time) %>%
    #     rep(nrow(dat_input))
    # )
    # dat_fct_scaling <- dat_fct_scaling %>%
    #   tidyr::expand(fct_scaling_dist, fct_scaling_time)
    # KEEP AS REFERENCE

    v_fct_scaling_dist <- unlist(settings$scaling$distances) %>%
      rep(nrow(dat_input))
    v_fct_scaling_time <- unlist(settings$scaling$time) %>%
      rep(nrow(dat_input))
    if (expand_weight_grid) {
      dat_fct_scaling <- expand.grid(v_fct_scaling_dist, v_fct_scaling_time) %>%
        tibble::as_tibble()
    } else {
      dat_fct_scaling <- tibble::tibble(v_fct_scaling_dist, v_fct_scaling_time)
    }
    names(dat_fct_scaling) <- c("fct_scaling_dist", "fct_scaling_time")
    dat_fct_scaling <- dat_fct_scaling %>%
      dplyr::arrange(fct_scaling_dist, fct_scaling_time)

    # Add columns for join -----
    # dat_fct_scaling <- dat_fct_scaling %>%
    #   dplyr::mutate(
    #     time_month = dat_input %>%
    #       dplyr::select_at(if("time_month" %in% names(.))
    #         "time_month" else integer(0)) %>%
    #       dplyr::pull()
    #   )
    # KEEP AS REFERENCE

    dat_fct_scaling <- dat_fct_scaling %>%
      dplyr::mutate(
        time_month = dat_input %>%
          dplyr::pull(time_month),
        msr_distance = dat_input %>%
          dplyr::pull(msr_distance)
      )

    dat_input <- dat_input %>%
      # dplyr::select(time_month) %>%
      dplyr::left_join(
        dat_fct_scaling,
        by = intersect(names(dat_input), names(dat_fct_scaling))
      ) %>%
      dplyr::mutate(
        time_month_scaled = time_month * fct_scaling_time,
        time_month_sin_scaled = time_month_sin * fct_scaling_time,
        time_month_cos_scaled = time_month_cos * fct_scaling_time,
        msr_distance_scaled = msr_distance * fct_scaling_dist
      )
  } else {
    stop("Single set of scaling factors for all vars to be scaled not implemented yet")
    dat_fct_scaling = tibble::tibble(
      dat_fct_scaling = unlist(settings$scaling$distances)
    )
  }

  # dat_list <- model_handle_input_time_v2(
  #   dat_input = dat_input,
  #   dat_db = dat_db
  # )
  # dat_db <- dat_list$dat_db

  # Prepare input for subsequent {purrr} pipes -----
  # dat_estimation_input <- model_prepare_estimation_input(
  #   dat_input,
  #   dat_db,
  #   dat_station
  # )
  #
  # # Compute distance for each input vector -----
  # df_estimate <- dat_estimation_input %>%
  #   purrr::map(model_estimate_inner_v2)

  if (FALSE) {
    dat_db <- purrr::map(
      seq_len(nrow(dat_input)), ~dat_db
    )
    dat_station <- purrr::map(
      seq_len(nrow(dat_input)), ~dat_station
    )
    dat_input <- purrrlyr::by_row(dat_input,
      function(v) list(v)[[1L]], .collate = "list")$.out

    dat_estimation_input <- purrr::pmap(list(dat_input, dat_db, dat_station),
      function(a, b, c) {
        list(
          dat_input = a,
          dat_db = b,
          dat_station = c
        )
      })
    df_estimate <- dat_estimation_input %>%
      purrr::map(model_estimate_inner_v2)
  }

  dat_input <- dat_input %>%
    dplyr::mutate(
      id = stringr::str_glue("time={fct_scaling_time}_dist={fct_scaling_dist}")
    ) %>%
    dplyr::select(
      id,
      everything()
    )
  dat_input <- purrrlyr::by_row(dat_input,
    function(v) list(v)[[1L]], .collate = "list")$.out

  df_estimate <- dat_input %>%
    purrr::map(
      model_estimate_inner,
      dat_db = dat_db,
      dat_station = dat_station,
      msr_distance_prime = msr_distance_prime,
      dat_model_output_prime_react = dat_model_output_prime_react
    )

  df_estimate
}

model_prepare_estimation_input <- function(
  dat_input,
  dat_db,
  dat_station
) {
  # Transform to list for subsequent purrr-style processing -----
  dat_input <- dat_input %>%
    tibble::rownames_to_column() %>%
    dplyr::group_by(rowname) %>%
    list(.) %>%
    purrr::map(~.x %>%
        ungroup() %>%
        dplyr::select(-rowname)
    )
  dat_input <- dat_input %>%
    purrr::map(
      ~list(
        dat_input = .x,
        dat_db = dat_db,
        dat_station = dat_station
      )
    )
}

model_estimate_inner_v2 <- function(.x) {
  dat_input <- .x$dat_input
  dat_db <- .x$dat_db
  dat_station <- .x$dat_station

  # Compute geo distance -----
  # dat_msr_distance <- compute_geo_distance_v2(p_1 = dat_input_row, p_2 = dat_station)
  dat_msr_distance <- compute_geo_distance_v3(
    p_1 = dat_input %>%
      dplyr::select(matches("latitude|longitude")) %>%
      dplyr::summarise_all(unique),
    p_2 = dat_station
  )

  # Handle distance input -----
  # Only keep stations that have distance <= user input
  dat_list <- handle_input_distance(
    dat_input = dat_input,
    dat_msr_distance = dat_msr_distance
  )
  # Update input and distance data:
  dat_input <- dat_list$dat_input
  dat_input_out <- dat_input
  dat_msr_distance <- dat_list$dat_msr_distance

  v_fct_scaling_dist <- dat_input %>%
    dplyr::distinct(fct_scaling_dist) %>%
    dplyr::pull(fct_scaling_dist)
  tmp <- tibble::tibble(
    dim_station = dat_msr_distance %>% dplyr::pull(dim_station) %>%
      rep(length(v_fct_scaling_dist)) %>%
      sort(),
    fct_scaling_dist = v_fct_scaling_dist %>%
      rep(nrow(dat_msr_distance))
  ) %>%
    dplyr::arrange(desc(fct_scaling_dist))
  dat_msr_distance <- dat_msr_distance %>%
    left_join(tmp, by = "dim_station")

  # Scale -----
  dat_msr_distance <- dat_msr_distance %>%
    dplyr::group_by(fct_scaling_dist) %>%
    dplyr::do(
      pipe_compute_geo_distance_v6(.)
    ) %>%
    ungroup()

  # Handle DB -----
  dat_list <- model_handle_input_db_v2(
    dat_db = dat_db,
    dat_msr_distance = dat_msr_distance
  )
  dat_db <- dat_list$dat_db
  dat_db_out <- dat_list$dat_db_out

  # Create geo distance data frame ----
  dat_distance_geo <- model_handle_dat_dist_geo_v2(dat_db)

  # Early exit -----
  if (!nrow(dat_db)) {
    model_result <- list(
      estimation_result= tibble::tibble(
        fct_scaling = NA,
        dim_station = NA,
        index = NA,
        rank = NA,
        estimation_result = NA
      ),
      dat_distance_geo = dat_distance_geo,
      dat_input = dat_input_out,
      dat_db = dat_db_out
    )
    return(model_result)
  }

  # PATCH: remove unwanted cols -----
  # dat_db <- dat_db %>%
  #   dplyr::select(-fct_scaling_dist)

  # Drop all dims -----
  dat_list <- model_handle_drop_dims_v2(dat_db, dat_input)
  dat_input <- dat_list$dat_input
  dat_db <- dat_list$dat_db

  # Align dimensions of input -----
  alignment_factor <- dat_db %>% nrow()

  dat_input <- purrr::map_df(
    seq_len(alignment_factor), ~dat_input
  )

  # Compute euclidean distance -----
  # purrr::map2(dat_db, dat_input, pmap_inner_2)
  dat_input_mat <- dat_input %>%
    as.matrix()
  dat_db_mat <- dat_db %>%
    as.matrix()
  vec_estimation_result <- sapply(1:nrow(dat_db), function(row) {
    dat_db_this <- dat_db_mat[row, , drop = FALSE]
    dat_input_this <- dat_input_mat[row, , drop = FALSE]
    res <- philentropy:::euclidean(
      dat_input_this, dat_db_this, testNA = FALSE)
  })

  estimation_result <- tibble::tibble(
    fct_scaling = fct_scaling,
    dim_station = rep(dat_db_out$dim_station, nrow(dat_fct_scaling)),
    estimation_result = vec_estimation_result
  ) %>%
    dplyr::group_by(fct_scaling) %>%
    # Add distances for arranging:
    dplyr::left_join(dat_distance_geo,
      by = c("fct_scaling", "dim_station")) %>%
    # Add position index:
    dplyr::mutate(index = row_number()) %>%
    dplyr::arrange(fct_scaling, estimation_result, msr_distance) %>%
    dplyr::mutate(rank = row_number()) %>%
    dplyr::select(
      fct_scaling,
      dim_station,
      index,
      rank,
      estimation_result
    ) %>%
    ungroup()

  list(
    estimation_result= estimation_result,
    dat_distance_geo = dat_distance_geo,
    dat_input = dat_input_out,
    dat_db = dat_db_out
  )
}

#' @export
model_estimate_prime_inner <- function(
  dat_input,
  dat_db,
  dat_station
) {
  # Compute geo distance -----
  dat_msr_distance <- compute_geo_distance_v3(
    p_1 = dat_input %>%
      dplyr::select(matches("latitude|longitude")) %>%
      dplyr::summarise_all(unique),
    p_2 = dat_station
  )

  # Handle distance input -----
  # Only keep stations that have distance <= user input
  dat_list <- handle_input_distance(
    dat_input = dat_input,
    dat_msr_distance = dat_msr_distance
  )
  # Update input and distance data:
  dat_input <- dat_list$dat_input
  # Remove distance input:
  dat_input <- dat_input %>%
    select(-matches("^msr_distance"))
  dat_input_out <- dat_input
  dat_msr_distance <- dat_list$dat_msr_distance

  dat_time <- dat_db %>%
    dplyr::select(
      dim_station,
      time_month
    )

  # Handle DB -----
  dat_list <- model_handle_input_db_v3(
    dat_db = dat_db,
    dat_time = dat_time,
    dat_msr_distance = dat_msr_distance
  )
  dat_db <- dat_list$dat_db
  dat_db_out <- dat_list$dat_db_out

  # Create geo distance data frame ----
  dat_distance_geo <- model_prime_handle_dat_dist_geo_v1(dat_db)

  # Early exit -----
  if (!nrow(dat_db)) {
    model_result <- list(
      estimation_result= tibble::tibble(
        dim_station = NA,
        index = NA,
        rank = NA,
        estimation_result = NA
      ),
      dat_distance_geo = dat_distance_geo,
      dat_input = dat_input_out,
      dat_db = dat_db_out
    )
    return(model_result)
  }

  # Drop all dims -----
  dat_list <- model_handle_column_alignment(dat_db, dat_input)
  dat_input <- dat_list$dat_input
  dat_db <- dat_list$dat_db

  # Align dimensions of input -----
  dat_input <- purrr::map_df(
    seq_len(dat_db %>% nrow()), ~dat_input
  )

  # Compute euclidean distance -----
  # purrr::map2(dat_db, dat_input, pmap_inner_2)
  dat_input_mat <- dat_input %>%
    as.matrix()
  dat_db_mat <- dat_db %>%
    as.matrix()
  vec_estimation_result <- sapply(1:nrow(dat_db), function(row) {
    dat_db_this <- dat_db_mat[row, , drop = FALSE]
    dat_input_this <- dat_input_mat[row, , drop = FALSE]
    res <- philentropy:::euclidean(
      dat_input_this, dat_db_this, testNA = FALSE)
  })

  estimation_result <- tibble::tibble(
    dim_station = dat_db_out$dim_station,
    uid = dat_db_out$uid,
    estimation_result = vec_estimation_result
  ) %>%
    # Add distances for arranging:
    dplyr::left_join(dat_distance_geo,
      by = "dim_station"
    ) %>%
    # Add position index:
    dplyr::mutate(index = row_number()) %>%
    dplyr::arrange(estimation_result, msr_distance) %>%
    dplyr::mutate(rank = row_number()) %>%
    dplyr::select(
      dim_station,
      uid,
      index,
      rank,
      # msr_distance,
      estimation_result
    )

  list(
    estimation_result= estimation_result,
    dat_distance_geo = dat_distance_geo,
    dat_input = dat_input_out,
    dat_db = dat_db_out
  )
}

#' @export
model_estimate_inner <- function(
  dat_input,
  dat_db,
  dat_station,
  msr_distance_prime,
  dat_model_output_prime_react
) {
  # if (dat_input$id == "time=0_dist=0.0003") {
  # if (dat_input$id == "time=0.25_dist=1") {
  #   message("DEBUG")
  #   message("DEBUG")
  # }

  # Compute geo distance -----
  dat_msr_distance <- compute_geo_distance_v3(
    p_1 = dat_input %>%
      dplyr::select(matches("latitude|longitude")) %>%
      dplyr::summarise_all(unique),
    p_2 = dat_station
  )

  # Handle distance input -----
  # Only keep stations that have distance <= user input
  dat_list <- handle_input_distance(
    dat_input = dat_input,
    dat_msr_distance = dat_msr_distance,
    msr_distance_max = msr_distance_prime
  )
  # Update input and distance data:
  dat_input <- dat_list$dat_input
  dat_input_out <- dat_input
  dat_msr_distance <- dat_list$dat_msr_distance

  v_fct_scaling_dist <- dat_input %>%
    dplyr::distinct(fct_scaling_dist) %>%
    dplyr::pull(fct_scaling_dist)
  tmp <- tibble::tibble(
    dim_station = dat_msr_distance %>% dplyr::pull(dim_station) %>%
      rep(length(v_fct_scaling_dist)) %>%
      sort(),
    fct_scaling_dist = v_fct_scaling_dist %>%
      rep(nrow(dat_msr_distance))
  ) %>%
    dplyr::arrange(desc(fct_scaling_dist))
  dat_msr_distance <- dat_msr_distance %>%
    left_join(tmp, by = "dim_station")

  # Scale -----
  dat_msr_distance <- dat_msr_distance %>%
    dplyr::group_by(fct_scaling_dist) %>%
    dplyr::do(
      pipe_compute_geo_distance_v6(.)
    ) %>%
    ungroup()

  dat_time <- dat_db %>%
    dplyr::select(
      dim_station,
      time_month,
      time_month_sin,
      time_month_cos
    ) %>%
    dplyr::mutate(
      fct_scaling_time = dat_input %>%
        dplyr::pull(fct_scaling_time) %>%
        unique()
    ) %>%
    dplyr::group_by(fct_scaling_time) %>%
    dplyr::do(
      pipe_compute_time_v7(.)
    ) %>%
    ungroup()

  # Handle DB -----
  dat_list <- model_handle_input_db_v3(
    dat_db = dat_db,
    dat_time = dat_time,
    dat_msr_distance = dat_msr_distance
  )
  dat_db <- dat_list$dat_db
  dat_db_out <- dat_list$dat_db_out

  # Create geo distance data frame ----
  dat_distance_geo <- model_handle_dat_dist_geo_v2(dat_db)

  # Early exit -----
  if (!nrow(dat_db)) {
    model_result <- list(
      estimation_result= tibble::tibble(
        fct_scaling = NA,
        dim_station = NA,
        index = NA,
        rank = NA,
        estimation_result = NA
      ),
      dat_distance_geo = dat_distance_geo,
      dat_input = dat_input_out,
      dat_db = dat_db_out
    )
    return(model_result)
  }

  # Scaling factors -----
  fct_scaling_dist <- dat_input %>%
    dplyr::distinct(fct_scaling_dist) %>%
    dplyr::pull()
  fct_scaling_time <- dat_input %>%
    dplyr::distinct(fct_scaling_time) %>%
    dplyr::pull()

  # Drop all dims -----
  dat_list <- model_handle_column_alignment(dat_db, dat_input)
  dat_input <- dat_list$dat_input
  dat_db <- dat_list$dat_db

  # Align dimensions of input -----
  dat_input <- purrr::map_df(
    seq_len(dat_db %>% nrow()), ~dat_input
  )

  # Compute euclidean distance -----
  # purrr::map2(dat_db, dat_input, pmap_inner_2)
  dat_input_mat <- dat_input %>%
    as.matrix()
  dat_db_mat <- dat_db %>%
    as.matrix()
  vec_estimation_result <- sapply(1:nrow(dat_db), function(row) {
    dat_db_this <- dat_db_mat[row, , drop = FALSE]
    dat_input_this <- dat_input_mat[row, , drop = FALSE]
    res <- philentropy:::euclidean(
      dat_input_this, dat_db_this, testNA = FALSE)
  })

  estimation_result <- tibble::tibble(
    # fct_scaling_time = fct_scaling_time,
    # fct_scaling_dist = fct_scaling_dist,
    dim_station = dat_db_out$dim_station,
    uid = dat_db_out$uid,
    estimation_result = vec_estimation_result
  ) %>%
    # dplyr::mutate(
    #   id_scaling = stringr::str_glue("time={fct_scaling_time}_dist={fct_scaling_dist}")
    # ) %>%
    # dplyr::select(
    #   id_scaling,
    #   everything()
    # ) %>%
    # dplyr::select(
    #   -fct_scaling_time,
    #   -fct_scaling_dist,
    # ) %>%
  # Add distances for arranging:
  dplyr::left_join(dat_distance_geo,
    by = "dim_station"
  ) %>%
    dplyr::select(
      -fct_scaling_dist,
    ) %>%
    # Add position index:
    dplyr::mutate(index = row_number()) %>%
    dplyr::arrange(estimation_result, msr_distance) %>%
    dplyr::mutate(rank = row_number()) %>%
    dplyr::select(
      dim_station,
      uid,
      index,
      rank,
      estimation_result
    )

  # Ensure that prime location(s) aren't part of alternatives -----
  estimation_result <- estimation_result %>%
    dplyr::filter(
      !uid %in% dplyr::pull(dat_model_output_prime_react, uid)
    )
  dat_distance_geo <- dat_distance_geo %>%
    dplyr::filter(
      !dim_station %in% dplyr::pull(dat_model_output_prime_react, dim_station)
    )

  list(
    estimation_result= estimation_result,
    dat_distance_geo = dat_distance_geo,
    dat_input = dat_input_out,
    dat_db = dat_db_out
  )
}

pipe_model_estimate <- function(
  dat_estimation,
  dist_measures
) {
  dat_estimation %>%
    dplyr::rowwise() %>%
    dplyr::do(
      pipe_model_estimate_inner(
        .,
        dist_measures = dist_measures
      )
    )
}

pipe_model_estimate_inner <- function(
  dat_estimation,
  dist_measures
) {
  # cols <- colnames(dat_estimation) %>%
  #   rlang::syms() %>%
  #   purrr::map(function(s) expr(!!s))
  dat_estimation <- dat_estimation %>%
    tibble::as_tibble()
  cols_input <- dat_estimation %>%
    dplyr::select(matches("^input_")) %>%
    names() %>%
    rlang::syms()

  cols_db <- cols_input %>%
    as.character() %>%
    stringr::str_replace("^input_", "") %>%
    rlang::syms()

  dat_input <- dat_estimation %>%
    dplyr::select(!!!cols_input) %>%
    as.matrix()
  dat_db <- dat_estimation %>%
    dplyr::select(!!!cols_db) %>%
    as.matrix()

  estimation <- philentropy:::euclidean(
    dat_input, dat_db, testNA = FALSE)

  tibble::tibble(
    fct_scaling = dat_estimation$fct_scaling %>% unique(),
    estimation = estimation
  )
}

pipe_compute_geo_distance_v6 <- function(
  dat_msr_distance
) {
  dat_msr_distance %>%
    dplyr::mutate(
      msr_distance_scaled = msr_distance * fct_scaling_dist
    )
}

pipe_compute_time_v7 <- function(
  dat_time
) {
  dat_time %>%
    dplyr::mutate(
      time_month_scaled = time_month * fct_scaling_time,
      time_month_sin_scaled = time_month_sin * fct_scaling_time,
      time_month_cos_scaled = time_month_cos * fct_scaling_time
    )
}

#' @export
model_predict_index_v3 <- function(
  model_estimation,
  knn = 3
) {
  # Get names of computed distance measures -----
  dist_measures <- names(model_estimation[[1]]$estimation_result[[1]])
  # TODO-20180705: make this nicer/more robust

  model_prediction <- lapply(model_estimation, function(estimation) {
    index <- lapply(dist_measures, function(mea) {
      # Select particular distance measure result for all stations -----
      values <- sapply(estimation$estimation_result, "[[", mea)
      names(values) <- 1:length(values)

      # Sort estimation results -----
      # values <- sort(values[values != 0])
      values <- sort(values)

      # Select best predictions based on knn -----
      as.numeric(names(values[1:knn]))
    })
    names(index) <- dist_measures

    list(
      estimation_result_index = index,
      dat_distance_geo = estimation$dat_distance_geo,
      dat_input = estimation$dat_input,
      dat_db = estimation$dat_db
    )
  })

  model_prediction
}

#' @export
model_predict_index_v5 <- function(
  model_estimation,
  knn = 3
) {

  model_prediction <- lapply(model_estimation, function(estimation) {
    index <- estimation$estimation_result %>%
      dplyr::group_by(fct_scaling) %>%
      dplyr::mutate(index = 1:n()) %>%
      dplyr::arrange(estimation_result) %>%
      dplyr::slice(1:knn) %>%
      dplyr::mutate(rank = 1:n()) %>%
      ungroup()

    list(
      estimation_result_index = index,
      dat_distance_geo = estimation$dat_distance_geo,
      dat_input = estimation$dat_input,
      dat_db = estimation$dat_db
    )
  })

  model_prediction
}

#' @export
model_predict_v3 <- function(
  model_prediction_index,
  # dat_input,
  # dat_db,
  dat_station,
  knn
) {
  dat_result <- lapply(1:length(model_prediction_index), function(idx_input) {
    # Ensure matrix -----
    dat_input <- model_prediction_index[[idx_input]]$dat_input
    dat_db <- model_prediction_index[[idx_input]]$dat_db %>%
      tibble::as_tibble()

    choice <- model_prediction_index[[idx_input]]$estimation_result_index
    dist_meassure <- unlist(lapply(names(choice), rep, knn))
    index <- unlist(choice)

    # Get station ID ------
    dim_station <- dat_db$dim_station[index]

    # Get ranks -----
    rank <- rep(seq(knn), length(choice))

    # Input data -----
    dat_input_row <- dat_input[idx_input, ]

    # Prediction data -----
    dat_prediction <- dat_db[index, ] %>%
      select(time_month, matches("msr_")) %>%
      mutate(
        dim_rank = rank,
        dim_station = dim_station,
        dim_dist_measure = dist_meassure,
        diff_time_month = time_month - dat_input_row["time_month"],
        diff_msr_temp_min = msr_temp_min - dat_input_row["msr_temp_min"],
        diff_msr_temp_max = msr_temp_max - dat_input_row["msr_temp_max"],
        diff_msr_temp_avg = msr_temp_avg - dat_input_row["msr_temp_avg"],
        diff_msr_precip_min = msr_precip_min - dat_input_row["msr_precip_min"],
        diff_msr_precip_max = msr_precip_max - dat_input_row["msr_precip_max"],
        diff_msr_precip_avg = msr_precip_avg - dat_input_row["msr_precip_avg"],
        diff_msr_sundur_avg = msr_sundur_avg - dat_input_row["msr_sundur_avg"]
      )

    dat_prediction <- dat_prediction %>% mutate_if(is.double, round, 1)

    dat_distance_geo <- model_prediction_index[[idx_input]]$dat_distance_geo

    # Join stations -----
    dat_prediction_2 <- inner_join(
      dat_prediction,
      dat_station,
      by = "dim_station"
    ) %>%
      left_join(
        dat_distance_geo,
        by = "dim_station"
      ) %>%
      select(
        dim_rank,
        dim_country,
        dim_station_name,
        msr_distance,
        time_month,
        diff_time_month,
        msr_temp_min,
        diff_msr_temp_min,
        msr_temp_max,
        diff_msr_temp_max,
        msr_temp_avg,
        diff_msr_temp_avg,
        msr_precip_min,
        diff_msr_precip_min,
        msr_precip_max,
        diff_msr_precip_max,
        msr_precip_avg,
        diff_msr_precip_avg,
        msr_sundur_avg,
        diff_msr_sundur_avg,
        dim_station,
        dim_latitude,
        dim_longitude,
        dim_high,
        dim_dist_measure,
        everything()
      )

    # Return -----
    list(
      input = dat_input_row,
      prediction = dat_prediction_2
    )
  })
}

#' @export
model_apply <- function(
  model_estimation,
  dat_station,
  knn
) {
  # dat_result <- lapply(model_estimation, function(estimation) {
  model_output_list <- model_estimation %>%
    purrr::map(function(estimation) {
      dat_input <- estimation$dat_input

      # Ensure matrix -----
      dat_db <- estimation$dat_db

      # Early exit for incomaptible input settings -----
      if (!nrow(dat_db)) {
        return(
          list(
            model_input = dat_input,
            model_output = tibble::tibble()
          )
        )
      }

      choice <- estimation$estimation_result %>%
        dplyr::filter(rank %in% 1:knn)

      v_id <- dat_input %>%
        dplyr::distinct(id) %>%
        dplyr::pull()
      dat_input <- purrr::map_df(seq_len(knn), ~dat_input %>%
          dplyr::select(-id)) %>%
        as.matrix()

      # dat_input %>%
      #   dplyr::select_if(~sum(!is.na(.)) > 0)

      # Prediction data -----
      foo <- function(choice, dat_db) {
        dat_db %>%
          dplyr::filter(
            dim_station %in% choice$dim_station,
            uid %in% choice$uid
          ) %>%
          dplyr::left_join(
            choice %>% select(uid, rank),
            by = "uid"
          ) %>%
          dplyr::arrange(rank)
      }

      dat_output <- choice %>%
        dplyr::do(foo(., dat_db)) %>%
        dplyr::ungroup() %>%
        select(dim_station, uid, time_month, matches("msr_")) %>%
        mutate(
          # diff_time_month = time_month - dat_input %>%
          #   dplyr::pull(time_month),
          # diff_msr_temp_min = msr_temp_min - dat_input %>%
          #   dplyr::pull(msr_temp_min),
          # diff_msr_temp_max = msr_temp_max - dat_input %>%
          #   dplyr::pull(msr_temp_max),
          # diff_msr_temp_avg = msr_temp_avg - dat_input %>%
          #   dplyr::pull(msr_temp_avg),
          # diff_msr_precip_min = msr_precip_min - dat_input %>%
          #   dplyr::pull(msr_precip_min),
          # diff_msr_precip_max = msr_precip_max - dat_input %>%
          #   dplyr::pull(msr_precip_max),
          # diff_msr_precip_avg = msr_precip_avg - dat_input %>%
          #   dplyr::pull(msr_precip_avg),
          # diff_msr_sundur_avg = msr_sundur_avg - dat_input %>%
          #   dplyr::pull(msr_sundur_avg)
          diff_time_month = time_month - dat_input["time_month"],
          diff_msr_temp_min = msr_temp_min - dat_input["msr_temp_min"],
          diff_msr_temp_max = msr_temp_max - dat_input["msr_temp_max"],
          diff_msr_temp_avg = msr_temp_avg - dat_input["msr_temp_avg"],
          diff_msr_precip_min = msr_precip_min - dat_input["msr_precip_min"],
          diff_msr_precip_max = msr_precip_max - dat_input["msr_precip_max"],
          diff_msr_precip_avg = msr_precip_avg - dat_input["msr_precip_avg"],
          diff_msr_sundur_avg = msr_sundur_avg - dat_input["msr_sundur_avg"]
        )

      # dat_output <- dat_output %>% mutate_if(is.double, round, 1)
      dat_output <- dat_output %>% mutate_at(vars(matches("^diff")), round, 1)

      dat_output <- bind_cols(
        choice %>%
          dplyr::ungroup() %>%
          dplyr::select(dim_rank = rank),
        dat_output
      ) %>%
        dplyr::mutate(
          id = v_id
        ) %>%
        dplyr::select(
          id,
          dim_rank,
          everything()
        )

      dat_distance_geo <- estimation$dat_distance_geo

      # Join stations -----
      dat_output_2 <- inner_join(
        dat_output,
        dat_station,
        by = "dim_station"
      ) %>%
        left_join(
          dat_distance_geo,
          by = "dim_station"
        ) %>%
        select(
          id,
          dim_rank,
          dim_country,
          dim_station_name,
          msr_distance,
          time_month,
          diff_time_month,
          msr_temp_min,
          diff_msr_temp_min,
          msr_temp_max,
          diff_msr_temp_max,
          msr_temp_avg,
          diff_msr_temp_avg,
          msr_precip_min,
          diff_msr_precip_min,
          msr_precip_max,
          diff_msr_precip_max,
          msr_precip_avg,
          diff_msr_precip_avg,
          msr_sundur_avg,
          diff_msr_sundur_avg,
          dim_station,
          dim_latitude,
          dim_longitude,
          dim_high,
          everything()
        )

      # Return -----
      list(
        model_input = dat_input %>% tibble::as_tibble(),
        model_output = dat_output_2
      )
    })
  # names(model_output_list) <- paste0("input_", 1:length(model_output_list))

  model_output <- list(
    model_input = purrr::map_df(model_output_list, "model_input"),
    model_output = purrr::map_df(model_output_list, "model_output")
  )
}

#' @export
model_predict_ensemble <- function(
  model_prediction,
  dist_measure = character()
) {
  dat_result_unique <- lapply(model_prediction, function(prediction) {
    dat_prediction <- prediction$prediction

    prediction$prediction_ensemble <- if (!length(dist_measure)) {
      plyr::ddply(dat_prediction, "dim_rank", function(dat) {
        # names(sort(table(dat$dim_station), decreasing = TRUE))
        tmp <- arrange(count(dat, dim_station), desc(n))

        # Resolve ties -----
        # station_id <- if (!all(c(0, diff(tmp$n)) == 0)) {
        station_id <- if (nrow(tmp) == nrow(dat)) {
          sample(tmp$dim_station, 1)
        } else {
          tmp$dim_station[1]
        }

        # Filter based on station ID -----
        res <- filter(dat, dim_station == station_id)[1, ]

        # Denote that this is an ensemble result -----
        res$dim_dist_measure <- "ensemble"

        # Add trust measure for ensemble result -----
        res$msr_ensemble_trust <- tmp$n[1] / nrow(dat)
        res
      })
    } else {
      filter(dat_prediction, dim_dist_measure == dist_measure)
    }
    prediction
  })
}

#' @export
model_predict_ensemble_v2 <- function(
  model_prediction
) {
  dat_result_unique <- lapply(model_prediction, function(prediction) {
    # dat_prediction <- prediction$prediction
    #
    # prediction$prediction_ensemble <- dat_prediction

    prediction
  })
}

#' @export
model_output <- function(model_prediction) {
  model_prediction %>% lapply(
    function(dat) {
      dat$prediction <- as.data.frame(dat$prediction)
      # dat$prediction_ensemble <- as.data.frame(dat$prediction_ensemble)
      dat$prediction_ensemble <- as.data.frame(dat$prediction)
      dat
    })
}

#' @export
model_output_gathered <- function(model_prediction) {
  # Prepare -----
  model_result <- model_output(model_prediction)
  names(model_result) <- sprintf("input_%s", seq(model_result))
  # model_result[[1]] %>% names()

  # Gather
  .gather <- function(dat, what) {
    model_input <- dat %>%
      lapply("[[", what) %>%
      lapply(function(dat) {
        if (inherits(dat, c("integer", "numeric"))) {
          data.frame(as.list(dat))
        } else {
          dat
        }
      }) %>%
      bind_rows(.id = "input")
  }

  model_input <- .gather(model_result, "input")
  model_prediction <- .gather(model_result, "prediction")
  # model_result$input_1$prediction %>% sapply(class) %>% stringr::str_detect("factor")
  model_prediction_ensemble <- .gather(model_result, "prediction_ensemble")

  # Return -----
  list(
    input = model_input,
    prediction = model_prediction,
    prediction_ensemble = model_prediction_ensemble
  )
}

#' @export
model_result_render <- function(
  model_result_gathered,
  knn,
  dist_measures,
  dist_measure_final
) {
  mdown <- c(
    "# Model result `r Sys.time()`",
    "## Input",
    " * `knn`: `r knn`",
    "",
    " * `dist_measures`: `r dist_measures`",
    "",
    " * `dist_measure_final`: `r dist_measure_final`",
    "",
    " * Actual numeric input:",
    "",
    "```{r echo=FALSE}",
    "DT::datatable(
      model_result_gathered$input,
        options = list(
          scrollX = TRUE
        )
    )",
    "```",
    " ",
    "## Output raw",
    "```{r echo=FALSE}",
    "DT::datatable(
      model_result_gathered$prediction,
        options = list(
          scrollX = TRUE
        )
    )",
    "```",
    " ",
    "## Output ensemble",
    "```{r echo=FALSE}",
    "DT::datatable(
      model_result_gathered$prediction_ensemble,
        options = list(
          scrollX = TRUE
        )
    )",
    "```"
  )
  file_stub <- tempfile()
  file_rmd <- paste0(file_stub, ".Rmd")
  file_html <- paste0(file_stub, ".html")

  writeLines(mdown, file_rmd)
  # knitr::knit2html(file_rmd, file_html)
  # knitr::knit(file_rmd, file_html)
  rmarkdown::render(file_rmd, output_file = file_html)
  # if (interactive())
  browseURL(file_html)
}

#' @export
shiny_run_model <- function(
  dat_input,
  dat_db,
  dat_station,
  dist_measures,
  dist_measure_final,
  knn
) {
  # Model estimation --------------------------------------------------------

  model_estimation <- model_estimate_v2(
    dat_input = dat_input,
    dat_db = dat_db_msr,
    dat_station = dat_station,
    dist_measures = dist_measures
  )
  # model_estimation[[1]][[1]]

  # NOTE:
  # * First level: data input vectors
  # * Second level: station
  # * Third level: distance measure values

  # Identify best choices ---------------------------------------------------

  model_prediction_index <- model_predict_index_v2(
    model_estimation = model_estimation,
    knn = knn
  )
  # model_prediction_index

  # Model prediction --------------------------------------------------------

  model_prediction <- model_predict_v2(
    model_prediction_index,
    dat_input = dat_input,
    dat_db = dat_db,
    dat_station = dat_station,
    knn = knn
  )

  model_prediction_ensemble <- model_predict_ensemble(
    model_prediction = model_prediction,
    dist_measure = dist_measure_final
  )
  # model_prediction_ensemble[[1]]$prediction_ensemble$dim_dist_measure

  # Model output ------------------------------------------------------------

  # model_result <- model_output(model_prediction_ensemble)

  model_result_gathered <- model_output_gathered(model_prediction_ensemble)

  if (settings$output$show) {
    print("Show:")
    print(settings$output$show)
    model_result_render(
      model_result_gathered,
      knn = knn,
      dist_measures = dist_measures,
      dist_measure_final = dist_measure_final
    )
  }

  model_result_gathered
}

#' @export
model_run_v3 <- function(
  dat_input,
  dat_db,
  dat_station,
  knn
) {
  # Model estimation -----
  model_estimation <- model_estimate_v5(
    dat_input = dat_input,
    dat_db = dat_db,
    dat_station = dat_station
  )

  # Model output -----
  model_output <- model_apply_v1(
    model_estimation,
    dat_station = dat_station,
    knn = knn
  )

  model_output
}

#' @export
model_run_prime <- function(
  dat_input,
  dat_db,
  dat_station,
  knn
) {
  # Add row UID -----
  dat_db <- dat_db %>%
    dplyr::mutate(
      uid = 1:n()
    )
  # TODO 20181206: put in own function and find best place to call it

  # Model estimation -----
  model_estimation <- model_estimate_prime(
    dat_input = dat_input,
    dat_db = dat_db,
    dat_station = dat_station
  )

  # Model output -----
  model_output <- model_apply(
    model_estimation,
    dat_station = dat_station,
    knn = knn
  )

  model_output
}

#' @export
model_run <- function(
  dat_input,
  dat_db,
  dat_station,
  knn,
  msr_distance_prime,
  dat_model_output_prime_react,
  session,
  expand_weight_grid
) {
  shiny::withProgress(message = 'Getting recommendations', value = 0,
    session = session,
    {
      shiny::incProgress(1/3, detail = "Estimating...", session = session)

      # Add row UID -----
      dat_db <- dat_db %>%
        dplyr::mutate(
          uid = 1:n()
        )
      # TODO 20181206: put in own function and find best place to call it

      # Model estimation -----
      model_estimation <- model_estimate(
        dat_input = dat_input,
        dat_db = dat_db,
        dat_station = dat_station,
        expand_weight_grid = expand_weight_grid,
        msr_distance_prime = msr_distance_prime,
        dat_model_output_prime_react = dat_model_output_prime_react
      )


      shiny::incProgress(2/3, detail = "Selecting...", session = session)
      # Model output -----
      model_output <- model_apply(
        model_estimation,
        dat_station = dat_station,
        knn = knn
      )

      shiny::incProgress(3/3, detail = "Done...", session = session)

      model_output
    })

  model_output
}

#' @export
handle_input_distance <- function(
  dat_input,
  dat_msr_distance,
  msr_distance_max = numeric()
) {
  # col_msr_distance <- settings$name_mapping$msr_distance$key
  col_msr_distance <- rlang::sym("msr_distance")
  col_msr_distance_scaled <- rlang::sym("msr_distance_scaled")
  # TODO 20181129: refactor column names from settings

  # Check if `msr_distance` was supplied as input -----
  idx <- colnames(dat_input) %>%
    stringr::str_detect(quo_name(col_msr_distance) %>%
        stringr::str_glue("$"))

  if (any(idx)) {
    value_msr_distance <- dat_input %>%
      dplyr::distinct(!!col_msr_distance) %>%
      dplyr::pull()

    # Find min of max. allowed distance from input and from prime location -----
    if (length(msr_distance_max)) {
      value_msr_distance <- min(value_msr_distance, msr_distance_max)
    }
    if (all(!is.na(value_msr_distance))) {
      dat_msr_distance <- dat_msr_distance %>% dplyr::filter(
        !!col_msr_distance <= value_msr_distance
      )
      dat_input <- dat_input %>%
        dplyr::mutate(
          !!col_msr_distance := 0,
          !!col_msr_distance_scaled := 0
        )
    }
  }

  list(
    dat_input = dat_input,
    dat_msr_distance = dat_msr_distance
  )
}

model_handle_input_time <- function(
  dat_input,
  dat_db
) {
  col_dim_time <- settings$name_mapping$time_month$key
  idx <- colnames(dat_input) %>% stringr::str_detect(quo_name(col_dim_time))
  if (any(idx)) {
    value_dim_time <- dat_input %>% dplyr::pull(!!col_dim_time)
    dat_db <- dat_db %>% dplyr::filter(
      !!col_dim_time %in% value_dim_time
    )
  }

  list(
    dat_input = dat_input,
    dat_db = dat_db
  )
}

model_handle_input_db_v2 <- function(
  dat_db,
  dat_msr_distance
) {
  # Join distances to DB data -----
  col_dim_station <- default_name("station") %>% quo_prepend("dim")
  value_dim_station <- dat_msr_distance %>% dplyr::pull(!!col_dim_station)

  # Ensure stations in DB data match those in distance data:
  dat_db <- dat_db %>%
    dplyr::filter(!!col_dim_station %in% value_dim_station)

  # Cache a version of DB data before the join for downstream processing:
  dat_db_out <- dat_db

  # Actual join:
  dat_db <- left_join(
    dat_db,
    dat_msr_distance,
    by = "dim_station"
  )

  list(
    dat_db = dat_db,
    dat_db_out = dat_db_out
  )
}

model_handle_input_db_v3 <- function(
  dat_db,
  dat_time,
  dat_msr_distance
) {
  # Join distances to DB data -----
  col_dim_station <- default_name("station") %>% quo_prepend("dim")
  value_dim_station <- dat_msr_distance %>% dplyr::pull(!!col_dim_station)

  # Ensure stations in DB data match those in distance data:
  dat_db <- dat_db %>%
    dplyr::filter(!!col_dim_station %in% value_dim_station)

  # Cache a version of DB data before the join for downstream processing:
  dat_db_out <- dat_db

  # Actual join:
  dat_db <- left_join(
    dat_db,
    dat_msr_distance,
    by = "dim_station"
  )

  # Join time to DB data -----
  dat_db <- left_join(
    dat_db,
    dat_time,
    by = c("dim_station", "time_month")
  )

  list(
    dat_db = dat_db,
    dat_db_out = dat_db_out
  )
}

#' @export
model_prime_handle_dat_dist_geo_v1 <- function(
  dat_db
) {
  df_dist_geo <- dat_db %>%
    select(
      dim_station,
      msr_distance
    ) %>%
    group_by(
      dim_station
    ) %>%
    summarise(
      msr_distance = unique(msr_distance)
    ) %>%
    ungroup()
  # TODO 20181122: tidy eval for column names
}

model_handle_dat_dist_geo_v2 <- function(
  dat_db
) {
  df_dist_geo <- dat_db %>%
    select(
      fct_scaling_dist,
      dim_station,
      msr_distance,
      msr_distance_scaled
    ) %>%
    group_by(
      fct_scaling_dist,
      dim_station
    ) %>%
    summarise(
      msr_distance = unique(msr_distance),
      msr_distance_scaled = unique(msr_distance_scaled)
    ) %>%
    ungroup()
  # TODO 20181122: tidy eval for column names
}

model_handle_drop_dims_v2 <- function(dat_db, dat_input) {
  select_non_na <- function(x) {
    all(!is.na(x))
  }
  dat_input <- dat_input %>%
    dplyr::select(-matches("dim|fct")) %>%
    dplyr::select_if(select_non_na)

  dat_db <- dat_db %>%
    select(-matches("dim_"))
  # TODO-20180610: encapsulate in function

  cols <- intersect(names(dat_db), names(dat_input)) %>%
    rlang::syms()
  dat_db <- dat_db %>%
    dplyr::select(!!!cols)

  list(
    dat_input = dat_input,
    dat_db = dat_db
  )
}

model_handle_column_alignment <- function(dat_db, dat_input) {
  select_non_na <- function(x) {
    all(!is.na(x))
  }
  dat_input <- dat_input %>%
    dplyr::select(-matches("dim|fct")) %>%
    dplyr::select_if(select_non_na)

  dat_db <- dat_db %>%
    select(-matches("dim_"))
  # TODO-20180610: encapsulate in function

  cols <- intersect(names(dat_db), names(dat_input))
  idx <- cols %>% stringr::str_detect("_scaled$")
  if (any(idx)) {
    cols_scaled <- cols[idx]
    cols_unscaled <- cols_scaled %>%
      sapply(function(x) x %>% stringr::str_replace("_scaled$", ""))
    cols <- cols[!cols %in% cols_unscaled]
  }

  # Ensure `time_month` is dropped in favor of sin and cos version -----
  if (any(idx <- stringr::str_detect(cols, "time_month_scaled$"))) {
    cols <- cols[!idx]
  }

  cols <- cols %>%
    rlang::syms()
  dat_db <- dat_db %>%
    dplyr::select(!!!cols)
  dat_input <- dat_input %>%
    dplyr::select(!!!cols)

  list(
    dat_input = dat_input,
    dat_db = dat_db
  )
}

pmap_inner_2 <- function(dat_db, dat_input) {
  dat_db <- as.matrix(dat_db)
  dat_input <- as.matrix(dat_input)
  philentropy:::euclidean(
    dat_input, dat_db, testNA = FALSE)
}
rappster/climater documentation built on July 1, 2019, 12:23 a.m.