R/counts.R

Defines functions readCounts

Documented in readCounts

#'@import tidyverse
#' @rawNamespace import(scales, except = c(discard, col_factor))
#'@import xml2
#'@import geomtextpath
#'@import readr

##### DEPRECATED ####

#'@title Deprecated Function
#'
#'@description \strong{readCounts} - Loads a MATSim Counts XML-file as tibble into memory
#'
#'@rdname matsimr-deprecated
#'
#'@param file File to load. Must be an .xml file
#'@param output.name test
#'
#'@return \strong{readCounts} - tibble with MATSim link id ("loc_id") as key
#'
#'@export
readCounts <- function(file){

  .Deprecated("read_counts")

  message = paste("Read counts file from", file)
  print(message)

  counts.xml <- read_xml(file)

  children <- xml_children(counts.xml)

  result <- tibble("loc_id" = character(),
                   "cs_id" = character(),
                   "h" = numeric(),
                   "val" = numeric())

  for(c in children){

    station <- xml_attrs(c)

    volume <- xml_find_all(c, "volume") %>%
      xml_attrs() %>%
      purrr::map_df(~as.list(.)) %>%
      type_convert()

    length <- nrow(volume)

    loc_col <- rep(station["loc_id"], length) %>% unname()
    station_col <- rep(station["cs_id"], length) %>% unname()

    counts.frame <- data.frame("cs_id" = station_col,"loc_id" = loc_col) %>% bind_cols(volume)
    result = bind_rows(result, counts.frame)
  }

  result
}


#'@title Deprecated Function
#'
#'@rdname matsimr-deprecated
#'
#'Load a MATSim linkstats file into memory
#'
#' Loads a linkstats tsv file created from the LinkStats class
#' as a dataframe into memory.
#' Counts can be provided in any time bin.
#' Counts can be provided for any qsim mode. The argument networkModes is used to
#' select and filter the columns.
#'
#' @rdname matsimr-deprecated
#'@param file File to load. Must be an .csv or .tsv file with comma separator
#'
#'@param run_id Id to tag columns with DTV
#'
#'@param sampleSize sample size of the MATSim scenario to scale DTV values
#'
#'@return Tibble with link stats for each qsim mode
#'
#'@export
readLinkStats <- function(run_id, file, sampleSize = 0.25){

  if(str_detect(string = run_id, pattern = "_")){
    message <- "run_id cannot contain '_'..."
    warning(message)
    return(NA)
  }

  message <- paste("Read in link stats from run", run_ID, ". Loading data from", file )
  print(message)

  linkstats <- readr::read_csv(file = file)

  volumeModes <- linkstats %>%
    select(starts_with("vol_")) %>%
    colnames()

  linkstats.1 <- linkstats %>%
    mutate_at(volumeModes, function(x){ x * (1 / sampleSize)})

  names = colnames(linkstats.1)
  newNames = character()

  for(name in names){

    if(str_detect(name, pattern = "vol_")){
      name = paste0(name, "_", run_ID)
    }

    newNames[length(newNames) + 1] = name
  }

  colnames(linkstats.1) = newNames

  linkstats.1
}

#'@title Deprecated Function
#'
#'@rdname matsimr-deprecated
#'
#'Join counts and linkstats to the network, creating a tibble into memory
#'
#'Function to join counts, linkstats and network links. Data can be aggregated
#'and filtered by time or mode.
#'
#' @rdname matsimr-deprecated
#'@param counts Tibble with counts data
#'
#'@param network Tibble with network nodes and links
#'
#'@param linkStats List with linkstats tibbles
#'
#'@param networkModes Vector with network modes that will be analyzed, default is "car".
#'
#'@param aggr_to Determines if data should be aggregated into hourly bins or as daily traffic volume, can either be "day" or "hour"
#'
#'@param earliest Integer. Lower limit to filter link stats by time, default = 0.
#'
#'@param latest Integer. Upper limit to filter link stats by time, default = 86400 (midnight).
#'
#'@return Long-format tibble with MATSim link id as key ("loc_id"), traffic volumes from MATSim runs and link type
#'
#'@export

mergeCountsAndLinks <- function(counts, network, linkStats, networkModes = c("car"), aggr_to = c("day", "hour"), earliest = 0, latest = 86400){

  if(!is.list(linkStats)){
    message <- "linkStatsList needs to be a list!"
    warning(message)

    return(NA)
  }

  if(!is.data.frame(counts)){
    message <- "counts needs to be a data frame!"
    warning(message)

    return(NA)
  }

  if(!is.list(network)){
    message <- "network needs to be a list!"
    warning(message)

    return(NA)
  }

  links <- network$links %>%
    select(id, type)

  if(aggr_to == "day"){

    counts = counts %>%
      group_by(loc_id) %>%
      summarise(cs_id = first(cs_id),
                h = first(h),
                val = sum(val)) %>%
      mutate(time = h)
  }

  print("Join counts and network links.")
  join <- left_join(x = counts, y = links, by = c("loc_id" = "id"))  %>%
    mutate(key = paste0(loc_id, "-", h))

  rm(links)

  print("Aggregate link stats.")
  for(i in 1:length(linkStats)){
    frame <- linkStats[[i]]

    time_value <- unique(frame$time)
    hours <- seq(0, 24 * 3600, 3600)

    ids <- counts %>% distinct(loc_id)

    if(aggr_to == "day"){

      sum_cols <- frame %>%
        select(starts_with("vol_")) %>%
        colnames()

      frame = left_join(ids, frame, by = c("loc_id" = "linkId")) %>%
        filter(time <= latest & time >= earliest) %>%
        group_by(loc_id) %>%
        summarise_at(sum_cols, sum)

      join = left_join(join, frame, by = "loc_id")
    }

    if(aggr_to == "hour"){

      hours <- seq(0, 24 * 3600, 3600)
      labels <- seq(1, 24, 1) %>% as.character()

      sum_cols <- frame %>%
        select(starts_with("vol_")) %>%
        colnames()

      frame.1 = left_join(ids, frame, by = c("loc_id" = "linkId")) %>%
        filter(time <= latest & time >= earliest) %>%
        mutate(hour = cut(time, labels = labels, breaks = hours, right = F)) %>%
        group_by(loc_id, hour) %>%
        summarise_at(sum_cols, sum, na.rm = T) %>%
        ungroup() %>%
        mutate(time = as.numeric(hour) * 3600,
               key = paste0(loc_id, "-", hour)) %>%
        select(-c(loc_id, hour))

      join = left_join(join, frame.1, by = "key")
    }
    rm(frame)
  }

  if("time.x" %in% colnames(join)){
    join = mutate(join, time = time.x)
  }

  if("key" %in% colnames(join)){
    join = join %>% select(-key)
  }

  print("Cleaning up ...")
  join.long <- join %>%
    mutate(count = val) %>%
    select(-c(ends_with(".x"), ends_with(".y"), val)) %>%
    pivot_longer(cols = starts_with("vol_"), names_to = "name", names_prefix = "vol_", values_to = "volume") %>%
    separate(col = name, into = c("mode", "src"), sep = "_") %>%
    mutate(type = str_remove(type, pattern = "highway."),
           type = factor(type, levels = c("motorway", "trunk", "primary", "secondary", "tertiary", "residential", "unclassified", "motorway_link", "primary_link", "trunk_link"))) %>%
    filter(mode %in% networkModes)

  print("Done!")
  join.long
}

#'@title Deprecated Function
#'
#'@rdname matsimr-deprecated
#' Categorize daily traffic volume (DTV) and calculate DTV for different link types.
#'
#' Takes a tibble from process_join_counts_and_links. DTV is categorized into bins. Finally
#' data is aggregated to calculate DTV distribution in each link type category,
#' excluding 'residential' and 'unclassified'
#' Data can be used to create multiple geom_col plots to visualize and compare
#' DTV distributions between count data and several MATSim runs
#'
#' @param joinedFrame A tibble from process_join_counts_and_links
#'
#' @param from Integer. Lower limit for count bin, default = 0.
#'
#' @param to Integer. Upper limit for count bins, default = 40000.
#'
#' @param by Integer. Size of each count bin, default = 5000.
#'
#' @return A long-format tibble which contains share of DTV for link types
#'
#' @export
processLinkStatsDtvDistribution <- function(joinedFrame, from = 0, to = 40000, by = 5000){

  if(!is.data.frame(joinedFrame)){

    message <- "joinedFrame needs to be a data frame, created from method process_join_counts_and_links!"
    warning(message)
    return(NA)
  }

  if((to - from) %% by != 0){

    message <- "Parameter 'from', 'to' and 'by' must be valid according to: (to - from) %% by == 0"
    warning(message)
    return(NA)
  }

  #### Counts in each bin
  options(scipen=999)
  breaks = seq(from = from, to = to, by = by)
  labels = character()

  for(i in 1:length(breaks) - 1){

    label = paste0(breaks[i] / 1000, "k < ", breaks[i + 1] / 1000, "k")
    labels[i] = label
    rm(label)
  }

  wider_cols <- unique(joinedFrame$src)

  join.1 = joinedFrame %>%
    filter(!is.na(type)) %>%
    filter(count > 0) %>%
    select(loc_id, src, volume, type, count) %>%
    pivot_wider(names_from = src, values_from = volume) %>%
    pivot_longer(cols = c(wider_cols, "count"), names_to = "src", values_to = "volume") %>%
    mutate(traffic_bin = cut(volume, labels = labels, breaks = breaks, right = T)) %>%
    group_by(type, src, traffic_bin) %>%
    summarise(n = n()) %>%
    group_by(type, src) %>%
    mutate(share = n / sum(n)) %>%
    filter(!str_detect(type, pattern = "_link")) %>%
    filter(!type %in% c("residential", "unclassified"))

  join.1
}

#'@title Deprecated Function
#'
#'@rdname matsimr-deprecated
#' Categorize DTV deviation and aggregate data
#'
#' Takes a tibble from process_join_counts_and_links.
#' Deviation between count volumes and Linkstats is calculated and
#' categorized (i.e. deviation of 1.2 means 20 percent more DTV in MATSim than in counts).
#' If parameter 'aggr' is set to TRUE, data will be aggregated for each run and link type.
#' Can be used to visualize model quality by link type and to compare several runs.
#'
#' Estimation quality is determined by the 'cut' function, limits for the label
#' 'exact' can be adjusted by tuning the parameters 'll' (lower limit) and 'ul' (upper limit)
#'
#' @param joinedFrame A tibble from process_join_counts_and_links
#' @param aggr Boolean, if categorized data should returned aggregated, default is TRUE.
#' @param ll Formula to calculate lower limit of the quality label 'exact', default = 0.8*x - 200
#' @param ul Formula to calculate lower limit of the quality label 'exact', default = 1.2*x + 200
#'
#' @return A long-format tibble, which contains share of estimation quality for each scenario and link type, if aggr is FALSE disaggregated data is returned
#'
#' @export
processDtvEstimationQuality <- function(joinedFrame, aggr = TRUE, ll =  ~ x *0.8 - 200, ul = ~ x * 1.2 + 200){

  ll.call <- ll[[length(ll)]]
  ul.call <- ul[[length(ul)]]

  x <- joinedFrame$count

  joinedFrame$ul <- eval(expr = ul.call)
  joinedFrame$ll <- eval(expr = ll.call)

  join.1 <- joinedFrame %>%
    mutate(ll = ifelse(ll < 0, 0, ll),
           estimation = ifelse(volume < ll, "less",
                               ifelse(volume > ul, "more",
                                      "exact")),
           estimation = factor(estimation, levels = c("less", "exact", "more")))

  if(aggr){
    join.2 <- join.1 %>%
      group_by(src, type, estimation) %>%
      summarise(n = n()) %>%
      mutate(share = n / sum(n)) %>%
      ungroup()

    return(join.2)
  }

  join.1
}

#'@title Deprecated Function
#'
#'@rdname matsimr-deprecated
#' Creates a Via-Style scatterplot for each run
#'
#' Takes a tibble from process_join_counts_and_links.
#' A scatterplot with counts on the x axis and MATSim dtv on the y axis is created and colored
#' by the road type.
#' Lower and upper limits define the section which is considered an 'exact' estimation. Limits
#' are defined by custom formulas.
#'
#' The function calls the matsim-r function processDtvEstimationQuality which is handling the limits.
#'
#' @param joinedFrame A tibble from process_join_counts_and_links
#' @param ll Formula to calculate lower limit of the quality label 'exact', default = 0.8*x - 200
#' @param ul Formula to calculate lower limit of the quality label 'exact', default = 1.2*x + 200
#' @param threshold Threshold from which data is scaled to log10.
#'
#' @return A ggplot Scatterplotplot, which can be adjusted, if needed.
#'
#' @export
createCountScatterPlot <- function(joinedFrame, ll = ~ x * 0.8 - 200, ul = ~ x * 1.2 + 200, threshold = 100){

  line.size <- 0.7

  x <- seq(threshold, round(max(joinedFrame$count), -2), 10)

  middle.line <- data.frame(x = x,
                            y = x)

  ll.call <- ll[[length(ll)]]
  ul.call <- ul[[length(ul)]]

  x <- middle.line$x

  middle.line$ul <- eval(expr = ul.call)
  middle.line$ll <- eval(expr = ll.call)

  middle.line = middle.line %>%
    mutate(ll = ifelse(ll < 0, 0, ll))

  ggplot(joinedFrame, aes(x = count, y = volume, color = type)) +

    geom_point() +

    geom_line(data = middle.line, mapping = aes(x = x, y = ul), color = "black", size = line.size + 0.1) +

    geom_line(data = middle.line, mapping = aes(x = x, y = ll), color = "black", size = line.size + 0.1) +

    geom_line(data = middle.line, mapping = aes(x, y), size = line.size, linetype = "dashed", color = "grey60") +

    geom_vline(xintercept = threshold, linetype = "dashed") +

    geom_textvline(xintercept = threshold, label = "x = 100", linetype = "dashed", vjust = -0.3) +

    scale_x_continuous(trans = symlog_trans(thr = threshold, scale = 1000), breaks = c(0, 300, 1000, 3000, 10000, 30000, 100000)) +

    scale_y_continuous(trans = "log10", breaks = c(3, 10, 30, 100, 300, 1000, 3000, 10000, 30000)) +

    facet_wrap(.~ src) +

    labs(x = "Daily traffic volume from Count Stations", y = "Daily traffic volume from MATSim Data", color = "Road type:") +

    theme_bw() +

    theme(legend.position = "bottom", panel.background = element_rect(fill = "grey90"),
          panel.grid = element_line(colour = "white"))
}

#'@title Deprecated Function
#'
#'@rdname matsimr-deprecated
#' A function to create symlog scaling for a plot
#'
#' Can be used to symlog scale the axis of a ggplot object. Is called in createCountScatterPlot.
#' Note that this function is taken from Stackoverflow!
#' For more informations, see the thread here:
#' https://stackoverflow.com/questions/14613355/how-to-get-something-like-matplotlibs-symlog-scale-in-ggplot-or-lattice
#'  @rdname matsimr-deprecated
#' @param base base for log
#' @param thr threshold from which data is scaled to log
#'
#' @export
symlog_trans <- function(base = 10, thr = 1, scale = 1){
  trans <- function(x)
    ifelse(abs(x) < thr, x, sign(x) *
             (thr + scale * suppressWarnings(log(sign(x) * x / thr, base))))

  inv <- function(x)
    ifelse(abs(x) < thr, x, sign(x) *
             base^((sign(x) * x - thr) / scale) * thr)

  breaks <- function(x){
    sgn <- sign(x[which.max(abs(x))])
    if(all(abs(x) < thr))
      pretty_breaks()(x)
    else if(prod(x) >= 0){
      if(min(abs(x)) < thr)
        sgn * unique(c(pretty_breaks()(c(min(abs(x)), thr)),
                       log_breaks(base)(c(max(abs(x)), thr))))
      else
        sgn * log_breaks(base)(sgn * x)
    } else {
      if(min(abs(x)) < thr)
        unique(c(sgn * log_breaks()(c(max(abs(x)), thr)),
                 pretty_breaks()(c(sgn * thr, x[which.min(abs(x))]))))
      else
        unique(c(-log_breaks(base)(c(thr, -x[1])),
                 pretty_breaks()(c(-thr, thr)),
                 log_breaks(base)(c(thr, x[2]))))
    }
  }
  trans_new(paste("symlog", thr, base, scale, sep = "-"), trans, inv, breaks)
}


#### READING ####

#'@title Load a MATSim Counts file into memory
#'
#'Loads a MATSim Counts XML-file as tibble into memory
#'
#'@param input_path character string, file path to the .xml file
#'
#'@return tibble of the counts file with the link id ("loc_id") as key
#'
#'@export
read_counts <- function(input_path){

  message = paste("Read counts file from", input_path)
  print(message)

  counts.xml <- read_xml(input_path)

  children <- xml_children(counts.xml)

  result <- tibble("loc_id" = character(),
                   "cs_id" = character(),
                   "h" = numeric(),
                   "val" = numeric())

  for(c in children){

    station <- xml_attrs(c)

    volume <- xml_find_all(c, "volume") %>%
      xml_attrs() %>%
      purrr::map_df(~as.list(.)) %>%
      type_convert()

    length <- nrow(volume)

    loc_col <- rep(station["loc_id"], length) %>% unname()
    station_col <- rep(station["cs_id"], length) %>% unname()

    counts.frame <- data.frame("cs_id" = station_col,"loc_id" = loc_col) %>% bind_cols(volume)
    result = bind_rows(result, counts.frame)
  }

  result
}

#' Load a MATSim linkstats file into memory
#'
#' Loads a linkstats tsv file created from the LinkStats class
#' as a dataframe into memory.
#' Counts can be provided in any time bin.
#' Counts can be provided for any qsim mode. The argument network_modes is used to
#' select and filter the columns.
#'
#'
#'@param input_path character string, file path to the .csv or .tsv file. Must be comma separated.
#'
#'@param run_ID character string, ID to tag columns with DTV
#'
#'@param sample_size optional, double, sample size of the MATSim scenario to scale DTV values, standard value is 0.25
#'
#'@return Tibble with link stats for each qsim mode
#'
#'@export
read_linkstats <- function(run_ID, input_path, sample_size = 0.25){

  if(str_detect(string = run_ID, pattern = "_")){
    message <- "run_ID cannot contain '_'..."
    warning(message)
    return(NA)
  }

  message <- paste("Read in link stats from run", run_ID, ". Loading data from", input_path )
  print(message)

  linkstats <- readr::read_csv(file = input_path)

  volumeModes <- linkstats %>%
    select(starts_with("vol_")) %>%
    colnames()

  linkstats.1 <- linkstats %>%
    mutate_at(volumeModes, function(x){ x * (1 / sample_size)})

  names = colnames(linkstats.1)
  newNames = character()

  for(name in names){

    if(str_detect(name, pattern = "vol_")){
      name = paste0(name, "_", run_id)
    }

    newNames[length(newNames) + 1] = name
  }

  colnames(linkstats.1) = newNames

  linkstats.1
}


#'Join counts and linkstats to the network, creating a tibble into memory
#'
#'Function to join counts, linkstats and network links. Data can be aggregated
#'and filtered by time or mode.
#'
#'
#'@param counts tibble of counts from \link{read_counts}
#'
#'@param network list with network nodes and links
#'
#'@param link_stats List with linkstats tibbles
#'
#'@param network_modes Vector with network modes that will be analyzed, default is "car".
#'
#'@param aggr_to Determines if data should be aggregated into hourly bins or as daily traffic volume, can either be "day" or "hour"
#'
#'@param earliest Integer. Lower limit to filter link stats by time, default = 0.
#'
#'@param latest Integer. Upper limit to filter link stats by time, default = 86400 (midnight).
#'
#'@return Long-format tibble with MATSim link id as key ("loc_id"), traffic volumes from MATSim runs and link type
#'
#'@export

process_join_counts_and_links <- function(counts, network, link_stats, network_modes = c("car"), aggr_to = c("day", "hour"), earliest = 0, latest = 86400){

  if(!is.list(link_stats)){
    message <- "link_stats needs to be a list!"
    warning(message)

    return(NA)
  }

  if(!is.data.frame(counts)){
    message <- "counts needs to be a data frame!"
    warning(message)

    return(NA)
  }

  if(!is.list(network)){
    message <- "network needs to be a list!"
    warning(message)

    return(NA)
  }

  links <- network$links %>%
    select(id, type)

  if(aggr_to == "day"){

    counts = counts %>%
      group_by(loc_id) %>%
      summarise(cs_id = first(cs_id),
                h = first(h),
                val = sum(val)) %>%
      mutate(time = h)
  }

  print("Join counts and network links.")
  join <- left_join(x = counts, y = links, by = c("loc_id" = "id"))  %>%
    mutate(key = paste0(loc_id, "-", h))

  rm(links)

  print("Aggregate link stats.")
  for(i in 1:length(link_stats)){
    frame <- link_stats[[i]]

    time_value <- unique(frame$time)
    hours <- seq(0, 24 * 3600, 3600)

    ids <- counts %>% distinct(loc_id)

    if(aggr_to == "day"){

      sum_cols <- frame %>%
        select(starts_with("vol_")) %>%
        colnames()

      frame = left_join(ids, frame, by = c("loc_id" = "linkId")) %>%
        filter(time <= latest & time >= earliest) %>%
        group_by(loc_id) %>%
        summarise_at(sum_cols, sum)

      join = left_join(join, frame, by = "loc_id")
    }

    if(aggr_to == "hour"){

      hours <- seq(0, 24 * 3600, 3600)
      labels <- seq(1, 24, 1) %>% as.character()

      sum_cols <- frame %>%
        select(starts_with("vol_")) %>%
        colnames()

      frame.1 = left_join(ids, frame, by = c("loc_id" = "linkId")) %>%
        filter(time <= latest & time >= earliest) %>%
        mutate(hour = cut(time, labels = labels, breaks = hours, right = F)) %>%
        group_by(loc_id, hour) %>%
        summarise_at(sum_cols, sum, na.rm = T) %>%
        ungroup() %>%
        mutate(time = as.numeric(hour) * 3600,
               key = paste0(loc_id, "-", hour)) %>%
        select(-c(loc_id, hour))

      join = left_join(join, frame.1, by = "key")
    }
    rm(frame)
  }

  if("time.x" %in% colnames(join)){
    join = mutate(join, time = time.x)
  }

  if("key" %in% colnames(join)){
    join = join %>% select(-key)
  }

  print("Cleaning up ...")
  join.long <- join %>%
    mutate(count = val) %>%
    select(-c(ends_with(".x"), ends_with(".y"), val)) %>%
    pivot_longer(cols = starts_with("vol_"), names_to = "name", names_prefix = "vol_", values_to = "volume") %>%
    separate(col = name, into = c("mode", "src"), sep = "_") %>%
    mutate(type = str_remove(type, pattern = "highway."),
           type = factor(type, levels = c("motorway", "trunk", "primary", "secondary", "tertiary", "residential", "unclassified", "motorway_link", "primary_link", "trunk_link"))) %>%
    filter(mode %in% network_modes)

  print("Done!")
  join.long
}

#' Categorize daily traffic volume (DTV) and calculate DTV for different link types.
#'
#' Takes a tibble from process_join_counts_and_links. DTV is categorized into bins. Finally
#' data is aggregated to calculate DTV distribution in each link type category,
#' excluding 'residential' and 'unclassified'
#' Data can be used to create multiple geom_col plots to visualize and compare
#' DTV distributions between count data and several MATSim runs
#'
#' @param joined_frame A tibble from process_join_counts_and_links
#'
#' @param from Integer. Lower limit for count bin, default = 0.
#'
#' @param to Integer. Upper limit for count bins, default = 40000.
#'
#' @param by Integer. Size of each count bin, default = 5000.
#'
#' @return A long-format tibble which contains share of DTV for link types
#'
#' @export
#'
process_get_dtv_distribution_by_linktype <- function(joined_frame, from = 0, to = 40000, by = 5000){

  if(!is.data.frame(joined_frame)){

    message <- "joined_frame needs to be a data frame, created from method process_join_counts_and_links!"
    warning(message)
    return(NA)
  }

  if((to - from) %% by != 0){

    message <- "Parameter 'from', 'to' and 'by' must be valid according to: (to - from) %% by == 0"
    warning(message)
    return(NA)
  }

  #### Counts in each bin
  options(scipen=999)
  breaks = seq(from = from, to = to, by = by)
  labels = character()

  for(i in 1:length(breaks) - 1){

    label = paste0(breaks[i] / 1000, "k < ", breaks[i + 1] / 1000, "k")
    labels[i] = label
    rm(label)
  }

  wider_cols <- unique(joined_frame$src)

  join.1 = joined_frame %>%
    filter(!is.na(type)) %>%
    filter(count > 0) %>%
    select(loc_id, src, volume, type, count) %>%
    pivot_wider(names_from = src, values_from = volume) %>%
    pivot_longer(cols = c(wider_cols, "count"), names_to = "src", values_to = "volume") %>%
    mutate(traffic_bin = cut(volume, labels = labels, breaks = breaks, right = T)) %>%
    group_by(type, src, traffic_bin) %>%
    summarise(n = n()) %>%
    group_by(type, src) %>%
    mutate(share = n / sum(n)) %>%
    filter(!str_detect(type, pattern = "_link")) %>%
    filter(!type %in% c("residential", "unclassified"))

  join.1
}

#' Categorize DTV deviation and aggregate data
#'
#' Takes a tibble from process_join_counts_and_links.
#' Deviation between count volumes and linkstats is calculated and
#' categorized (i.e. deviation of 1.2 means 20 percent more DTV in MATSim than in counts).
#' If parameter 'aggr' is set to TRUE, data will be aggregated for each run and link type.
#' Can be used to visualize model quality by link type and to compare several runs.
#'
#' Estimation quality is determined by the 'cut' function, limits for the label
#' 'exact' can be adjusted by tuning the parameters 'll' (lower limit) and 'ul' (upper limit)
#'
#' @param joined_frame A tibble from process_join_counts_and_links
#' @param aggr Boolean, if categorized data should returned aggregated, default is TRUE.
#' @param ll Formula to calculate lower limit of the quality label 'exact', default = 0.8*x - 200
#' @param ul Formula to calculate lower limit of the quality label 'exact', default = 1.2*x + 200
#'
#' @return A long-format tibble, which contains estimation quality for each scenario and link type, if aggr is FALSE disaggregated data is returned
#'
#' @export
process_get_dtv_estimation_quality <- function(joined_frame, aggr = TRUE, ll =  ~ x *0.8 - 200, ul = ~ x * 1.2 + 200){

  ll.call <- ll[[length(ll)]]
  ul.call <- ul[[length(ul)]]

  x <- joined_frame$count

  joined_frame$ul <- eval(expr = ul.call)
  joined_frame$ll <- eval(expr = ll.call)

  join.1 <- joined_frame %>%
    mutate(ll = ifelse(ll < 0, 0, ll),
           estimation = ifelse(volume < ll, "less",
                               ifelse(volume > ul, "more",
                                      "exact")),
           estimation = factor(estimation, levels = c("less", "exact", "more")))

  if(aggr){
    join.2 <- join.1 %>%
      group_by(src, type, estimation) %>%
      summarise(n = n()) %>%
      mutate(share = n / sum(n)) %>%
      ungroup()

    return(join.2)
  }

  join.1
}

#' Creates a Via-Style scatterplot for each run
#'
#' Takes a tibble from process_join_counts_and_links to create a scatterplot with counts on the x axis and MATSim DTV on the y axis, colored
#' by the road type.
#' Lower and upper limits define the section which is considered an 'exact' estimation. Limits
#' are defined by custom formulas.

#' @param joined_frame A tibble from process_join_counts_and_links
#' @param ll Formula to calculate lower limit of the quality label 'exact', default = 0.8*x - 200
#' @param ul Formula to calculate lower limit of the quality label 'exact', default = 1.2*x + 200
#' @param threshold Threshold from which data is scaled to log10.
#'
#' @return An adjustable ggplot scatter plot
#'
#' @export
plot_count_scatterplot <- function(joined_frame, ll = ~ x * 0.8 - 200, ul = ~ x * 1.2 + 200, threshold = 100){

  line.size <- 0.7

  x <- seq(threshold, round(max(joined_frame$count), -2), 10)

  middle.line <- data.frame(x = x,
                            y = x)

  ll.call <- ll[[length(ll)]]
  ul.call <- ul[[length(ul)]]

  x <- middle.line$x

  middle.line$ul <- eval(expr = ul.call)
  middle.line$ll <- eval(expr = ll.call)

  middle.line = middle.line %>%
    mutate(ll = ifelse(ll < 0, 0, ll))

  ggplot(joined_frame, aes(x = count, y = volume, color = type)) +

    geom_point() +

    geom_line(data = middle.line, mapping = aes(x = x, y = ul), color = "black", size = line.size + 0.1) +

    geom_line(data = middle.line, mapping = aes(x = x, y = ll), color = "black", size = line.size + 0.1) +

    geom_line(data = middle.line, mapping = aes(x, y), size = line.size, linetype = "dashed", color = "grey60") +

    geom_vline(xintercept = threshold, linetype = "dashed") +

    geom_textvline(xintercept = threshold, label = "x = 100", linetype = "dashed", vjust = -0.3) +

    scale_x_continuous(trans = symlog_trans(thr = threshold, scale = 1000), breaks = c(0, 300, 1000, 3000, 10000, 30000, 100000)) +

    scale_y_continuous(trans = "log10", breaks = c(3, 10, 30, 100, 300, 1000, 3000, 10000, 30000)) +

    facet_wrap(.~ src) +

    labs(x = "Daily traffic volume from Count Stations", y = "Daily traffic volume from MATSim Data", color = "Road type:") +

    theme_bw() +

    theme(legend.position = "bottom", panel.background = element_rect(fill = "grey90"),
          panel.grid = element_line(colour = "white"))
}


#' A function to create symlog scaling for a plot
#'
#' Can be used to symlog scale the axis of a ggplot object. Is called in plot_count_scatterplot.
#' Note that this function is taken from Stackoverflow!
#' For more informations, see the thread here:
#' https://stackoverflow.com/questions/14613355/how-to-get-something-like-matplotlibs-symlog-scale-in-ggplot-or-lattice
#'
#' @param base base for log
#' @param thr threshold from which data is scaled to log
#' @param scale scale
#'
#' @export
symlog_trans <- function(base = 10, thr = 1, scale = 1){
  trans <- function(x)
    ifelse(abs(x) < thr, x, sign(x) *
             (thr + scale * suppressWarnings(log(sign(x) * x / thr, base))))

  inv <- function(x)
    ifelse(abs(x) < thr, x, sign(x) *
             base^((sign(x) * x - thr) / scale) * thr)

  breaks <- function(x){
    sgn <- sign(x[which.max(abs(x))])
    if(all(abs(x) < thr))
      pretty_breaks()(x)
    else if(prod(x) >= 0){
      if(min(abs(x)) < thr)
        sgn * unique(c(pretty_breaks()(c(min(abs(x)), thr)),
                       log_breaks(base)(c(max(abs(x)), thr))))
      else
        sgn * log_breaks(base)(sgn * x)
    } else {
      if(min(abs(x)) < thr)
        unique(c(sgn * log_breaks()(c(max(abs(x)), thr)),
                 pretty_breaks()(c(sgn * thr, x[which.min(abs(x))]))))
      else
        unique(c(-log_breaks(base)(c(thr, -x[1])),
                 pretty_breaks()(c(-thr, thr)),
                 log_breaks(base)(c(thr, x[2]))))
    }
  }
  trans_new(paste("symlog", thr, base, scale, sep = "-"), trans, inv, breaks)
}
matsim-vsp/matsim-r documentation built on Feb. 3, 2025, 6:48 p.m.