Recca: R Energy Conversion Chain Analysis

#' #' Layout for a \code{qgraph} network representation of an energy conversion chain
#' #'
#' #' Industries and products are interleaved from left to right,
#' #' starting with the first industry, followed by the first product.
#' #' Industries and products alternate until all products are exhausted.
#' #' Finally, the last industry (which should be final demand) is placed at the far right.
#' #' Storage industries are arranged across the top of the network.
#' #'
#' #' @param Industries a data frame consisting of columns named
#' #'        \code{industry_colname}, \code{stage_colname}, and \code{group_colname}.
#' #'        Entries in \code{industry_colname} must be names of industries.
#' #'        Entries in \code{stage_colname} must be names of left-to-right stages in the ECC.
#' #'        Industries (or sectors) in final demand must be included in the \code{Industries}
#' #'        data frame.
#' #'        Industries that provide "storage" must also be included in the \code{Industries}
#' #'        data frame (such as Stock changes, International aviation and marine bunkers, and
#' #'        Statistical differences), their \code{stage_colname} value being \code{storage_stagename}.
#' #'        All industries with \code{storage_stagename} are laid out
#' #'        along the top of the network.
#' #'        The left-to-right order of stages in the network layout generated by this function
#' #'        is taken from the order of appearance of stages in the \code{stage_colname} column.
#' #'        The left-to-right order of storage industries is given by
#' #'        the appearance order of the industries in \code{industry_colname}.
#' #'        Entries in \code{group_colname} give groups of industries or products to keep together
#' #'        at each stage. The top-to-bottom order of groups is taken from the appearance order
#' #'        in \code{group_colname}.
#' #' @param Products a data frame consisting of same columns as \code{Industries},
#' #'        except that \code{product_colname} takes the place of \code{industry_colname}.
#' #'        Note that the number of stages in \code{Products} must be one less than than
#' #'        number of stages in \code{Industries}, not counting industries in stage
#' #'        \code{storage_stagename}.
#' #' @param industry_colname the name of the column in \code{Industries} containing
#' #'        names of industries (a string).
#' #'        Default is "\code{Industry}".
#' #' @param product_colname the name of the column in \code{Products} containing
#' #'        names of products (a string).
#' #'        Default is "\code{Product}".
#' #' @param stage_colname the name of the columns in \code{Industries} and \code{Products}
#' #'        containing names of stages (a string).
#' #'        The left-to-right order of stages in the network layout
#' #'        is determined by the top-to-bottom
#' #'        order in which stage names appear in the
#' #'        \code{Industries} and \code{Products} data frames.
#' #'        Default is "\code{Stage}".
#' #' @param group_colname the name of the column in \code{Industries} and \code{Products}
#' #'        containing industries and products that should be grouped together vertically
#' #'        at a stage (a string).
#' #'        The top-to-bottom order of groups at a stage is given by the top-to-bottom
#' #'        order of appearance in \code{group_colname}.
#' #'        Default is "\code{Group}".
#' #'        \code{group_colname} is optional in \code{Industries} and \code{Products}.
#' #'        If \code{group_colname} is missing, a column named "\code{group_colname}"
#' #'        will be created and filled with "\code{group_colname}",
#' #'        thereby creating a single group for all industries or products.
#' #' @param storage_stagename the name of the stage in \code{stage_colname} of \code{Industries}
#' #'        that identifies a "storage" industry (a string).
#' #'        Default is "\code{Storage}".
#' #'        Typical storage industries are bunkers, stock changes, and statistical differences.
#' #'        This layout puts storage industries across the top of the ECC graph.
#' #' @param g an optional \code{qgraph} object from which the order of node names is extracted,
#' #'        if present.
#' #'        It is assumed that the layout calculated by this function will be applied to \code{g},
#' #'        so the row order of the outgoing layout is set to the row order expected by \code{g}.
#' #'        Furthermore, additional error checking is performed:
#' #'        if the names of nodes in \code{Industries} and \code{Products}
#' #'        don't exactly match the names of nodes that \code{g} expects,
#' #'        a data frame of mismatching names is printed and execution is halted.
#' #'        An easy way to obtain a \code{qgraph} object for the \code{g} argument is to call
#' #'        \code{g <- qgraph(somedata, DoNotPlot = TRUE)}.
#' #' @param x_colname the name of the output column containing x coordinates for each node
#' #'        (a string).
#' #'        Default is "\code{x}".
#' #' @param y_colname the name of the output column containing y coordinates for each node
#' #'        (a string).
#' #'        Default is "\code{y}".
#' #'
#' #' @return a matrix with two columns.
#' #'         The first column contains x coordinates for the nodes and is named \code{x_colname}.
#' #'         The second column contains y coordinates for the nodes and is named \code{y_colname}.
#' #'         Node names are the row names for the outgoing matrix.
#' #'
#' #' @export
#' ecc_layout <- function(Industries,
#'                        Products,
#'                        industry_colname = "Industry",
#'                        product_colname = "Product",
#'                        stage_colname = "Stage",
#'                        group_colname = "Group",
#'                        storage_stagename = "Storage",
#'                        g,
#'                        # Output columns
#'                        x_colname = "x",
#'                        y_colname = "y"){
#'   # Set a name for the node name columsn that will be used throughout this function.
#'   node_name_colname <- ".Node_name"
#'   # First step is to eliminate factors in the incoming data frames
#'   Industries <- Industries %>% dplyr::mutate_if(is.factor, as.character)
#'   Products <- Products %>% dplyr::mutate_if(is.factor, as.character)
#'   # The group_colname is optional.
#'   # If the column is not present, add and fill with a single group (group_colname).
#'   if (!(group_colname %in% names(Industries))) {
#'     Industries <- Industries %>%
#'       dplyr::mutate(
#'         !!as.name(group_colname) := group_colname
#'       )
#'   }
#'   if (!(group_colname %in% names(Products))) {
#'     Products <- Products %>%
#'       dplyr::mutate(
#'         !!as.name(group_colname) := group_colname
#'       )
#'   }
#'   # Extract storage industries from the Industries data frame.
#'   Storage <- Industries %>%
#'     dplyr::filter((!!as.name(stage_colname)) == storage_stagename)
#'   Industries_less_Storage <- Industries %>%
#'     dplyr::filter((!!as.name(stage_colname)) != storage_stagename)
#'   # Ensure that the number of industry stages (less storage)
#'   # is one more than the number of product stages.
#'   N_industry_stages <- Industries_less_Storage %>% dplyr::select(!!as.name(stage_colname)) %>% unique() %>% nrow()
#'   N_product_stages <- Products %>% dplyr::select(!!as.name(stage_colname)) %>% unique() %>% nrow()
#'   # if (N_industry_stages - N_product_stages != 1) {
#'   #   stop(paste0("N_industry_stages = ", N_industry_stages,
#'   #               ". N_product_stages = ", N_product_stages, ". ",
#'   #               "There should be one more industry stages than product stages. "),
#'   #        call. = FALSE)
#'   # }
#'   assertthat::assert_that(N_industry_stages - N_product_stages == 1,
#'                           msg = paste0("N_industry_stages = ", N_industry_stages,
#'                                        ". N_product_stages = ", N_product_stages, ". ",
#'                                        "There should be one more industry stages than product stages. "))
#'   # Set groups for the Group variable based on order of appearance.
#'   # These groups will be used later for ordering the y coordinates of nodes.
#'   grps <- rbind(Industries_less_Storage %>% dplyr::select(!!as.name(group_colname)),
#'                 Products %>% dplyr::select(!!as.name(group_colname))) %>%
#'     dplyr::filter(!is.na(!!as.name(group_colname))) %>% unique()
#'   grps <- magrittr::set_rownames(grps, 1:nrow(grps))
#'   # Set levels for groups in order of their appearance.
#'   # These levels will be used later for calculating the y coordinates for the nodes.
#'   Industries <- Industries %>%
#'     dplyr::mutate(
#'       !!as.name(group_colname) := factor(!!as.name(group_colname), levels = grps[[group_colname]])
#'     )
#'   Industries_less_Storage <- Industries_less_Storage %>%
#'     dplyr::mutate(
#'       !!as.name(group_colname) := factor(!!as.name(group_colname), levels = grps[[group_colname]])
#'     )
#'   Products <- Products %>%
#'     dplyr::mutate(
#'       !!as.name(group_colname) := factor(!!as.name(group_colname), levels = grps[[group_colname]])
#'     )
#'   Storage <- Storage %>%
#'     dplyr::mutate(
#'       !!as.name(group_colname) := factor(!!as.name(group_colname), levels = grps[[group_colname]])
#'     )
#'   # Make data frames of stage numbers.
#'   # Left-to-right order across the network is taken from the order of appearance
#'   # in the respective data frames.
#'   # This process is simpler for industries and products
#'   # but takes additional calculations for storage industries (later).
#'   i_stage_colname <- paste0("i_", stage_colname)
#'   Industry_stage_order <- data.frame(temp = Industries_less_Storage %>%
#'                                        dplyr::select(!!as.name(stage_colname)) %>%
#'                                        unique()) %>%
#'     tibble::rownames_to_column(var = i_stage_colname) %>%
#'     dplyr::mutate(
#'       !!as.name(i_stage_colname) := as.numeric(!!as.name(i_stage_colname)),
#'       !!as.name(i_stage_colname) := 2 * (!!as.name(i_stage_colname)) - 1
#'     )
#'   Product_stage_order <- data.frame(temp = Products %>%
#'                                       dplyr::select(!!as.name(stage_colname)) %>%
#'                                       unique()) %>%
#'     tibble::rownames_to_column(var = i_stage_colname) %>%
#'     dplyr::mutate(
#'       !!as.name(i_stage_colname) := as.numeric(!!as.name(i_stage_colname)),
#'       !!as.name(i_stage_colname) := 2 * (!!as.name(i_stage_colname))
#'     )
#'   # rbind these *_stage_order data frames
#'   Stage_coords <- rbind(Industry_stage_order %>%
#'                           dplyr::rename(!!as.name(x_colname) := !!as.name(i_stage_colname)),
#'                         Product_stage_order %>%
#'                           dplyr::rename(!!as.name(x_colname) := !!as.name(i_stage_colname))) %>%
#'     dplyr::arrange(!!as.name(x_colname))
#'
#'   # Join Stage_coords to a list of industries and products to create the list of nodes.
#'   Temp <- rbind(
#'     Industries_less_Storage %>% dplyr::rename(!!as.name(node_name_colname) := !!as.name(industry_colname)),
#'     Products %>% dplyr::rename(!!as.name(node_name_colname) := !!as.name(product_colname))
#'   ) %>%
#'     dplyr::left_join(Stage_coords, by = stage_colname)
#'   # Split Temp in half by rows that have NA Group and those that don't.
#'   # Sort the nodes where Groups are specified.
#'   Node_coords <- dplyr::bind_rows(
#'     Temp %>% dplyr::filter(is.na(!!as.name(group_colname))),
#'     Temp %>% dplyr::filter(!is.na(!!as.name(group_colname))) %>%
#'       dplyr::group_by(!!as.name(x_colname), !!as.name(group_colname)) %>%
#'       # Put nodes in correct order.
#'       # First group on stage (x coordinate) followed by
#'       # the Group within each stage.
#'       dplyr::arrange(!!as.name(x_colname), !!as.name(group_colname)))
#'
#' # At this point, all x coordinates have been decided and are in the Node_coords data frame.
#'   # Furthermore, the Node_coords data frame is in the correct order for y coordinates.
#'   # So work on y coordinates.
#'   # Figure out the number of nodes in each stage.
#'   N_nodes <- Node_coords %>%
#'     dplyr::group_by(!!as.name(stage_colname)) %>%
#'     dplyr::count()
#'   # The count function automatically creates a column named "n".
#'   # We need this name later, so declare it here.
#'   n_colname <- "n"
#'   y_max <- max(N_nodes[[n_colname]])
#'   # y_center is the y coordinate that gives a horizontal line
#'   # that serves as the vertical center of the network layout
#'   # (not including storage nodes).
#'   y_center <- (y_max + 1) / 2
#'
#'   # Calculate y coordinates
#'   i_group_colname <- ".i_group"
#'   Node_coords <- Node_coords %>%
#'     # Re-group according to x_colname only,
#'     # thereby ensuring that we apply y coords to each stage independently.
#'     dplyr::group_by(!!as.name(x_colname)) %>%
#'     dplyr::mutate(
#'       # Add a column for the row number (i) within each group (_group).
#'       !!as.name(i_group_colname) := dplyr::row_number()
#'     ) %>%
#'     # Add a column (n) that gives the number of nodes in each stage
#'     dplyr::left_join(N_nodes, by = stage_colname) %>%
#'     dplyr::mutate(
#'       # Calculate the column of y coordinates
#'       # using the index within each group (i_group) and
#'       # the total number of nodes in the group (n)
#'       !!as.name(y_colname) := (y_max + 1) -
#'                               (y_center + (!!as.name(i_group_colname)) - ((!!as.name(n_colname)) + 1)/2)
#'     ) %>%
#'     # select only relevant columns
#'     dplyr::select(!!as.name(node_name_colname), !!as.name(x_colname), !!as.name(y_colname))
#'
#'   # Figure out where to place Storage nodes in x dimension.
#'   # Align storage industries with other industries.
#'   # Doing so will avoid vertical flows from products to storage.
#'   x_max <- max(Industry_stage_order[[i_stage_colname]], Product_stage_order[[i_stage_colname]])
#'   # Calculate the center of the x dimension.
#'   x_center <- (x_max + 1) / 2
#'   # Find out how many storage industries we have.
#'   N_storage <- nrow(Storage)
#'   # Find x coordinate of first Storage industry
#'   x_first_storage <- x_center - (N_storage - 1) / 2
#'   # If we have no Storage industries, we're done and can return Node_coords now.
#'   if (nrow(Storage) == 0) {
#'     out <- Node_coords
#'   } else {
#'     # We have some Storage nodes.
#'     # Calculate coordinates for storage nodes.
#'     Storage_coords <- data.frame(temp = Storage %>%
#'                                    dplyr::select(!!as.name(industry_colname)) %>%
#'                                    unique()) %>%
#'       dplyr::mutate(
#'         !!as.name(x_colname) := x_first_storage:(x_first_storage - 1 + N_storage)
#'       ) %>%
#'       dplyr::left_join(Storage, by = industry_colname) %>%
#'       dplyr::rename(
#'         !!as.name(node_name_colname) := !!as.name(industry_colname)
#'       ) %>%
#'       dplyr::mutate(
#'         # Add the y coordinate
#'         !!as.name(y_colname) := y_max + 1
#'       ) %>%
#'       # Select only columns that we want in output
#'       dplyr::select(!!as.name(node_name_colname), !!as.name(x_colname), !!as.name(y_colname))
#'     # Finally, rbind Node_coords and Storage_coords and return
#'     out <- rbind(as.data.frame(Node_coords), Storage_coords)
#'   }
#'   if (!missing(g)) {
#'     # When using a layout, qgraph respects the order, but not the names,
#'     # of nodes in the layout argument.
#'     # So, we need to re-order the rows of out to match the order of rows in g.
#'     # First, get the node names (in qgraph order) from the qgraph object (g).
#'     qgraph_node_names <- g$graphAttributes$Nodes$names
#'     if (is.null(names(qgraph_node_names))) {
#'       # If the graph is specified by an edge list, g has an unnamed list of full node names
#'       node_names_from_graph <- data.frame(qgraph_node_names, stringsAsFactors = FALSE)
#'     } else {
#'       # If the graph is specified from a matrix, g has a named list of abbreviated node names.
#'       # The list itself comprises abbreviated names.
#'       # But the names of the items in the list comprise the actual node names.
#'       node_names_from_graph <- data.frame(names(qgraph_node_names), stringsAsFactors = FALSE)
#'     }
#'     # Give the only column the correct name
#'     names(node_names_from_graph)[1] <- node_name_colname
#'     # By left_join-ing here, we keep the row order of node_names_from_graph
#'     # while we add the x and y coordinates for the nodes in this layout.
#'     out <- dplyr::left_join(node_names_from_graph, out, by = node_name_colname) %>%
#'       # Select only the columns that we want to return
#'       dplyr::select(!!as.name(x_colname), !!as.name(y_colname), !!as.name(node_name_colname))
#'     # Ensure that we have x and y values for each row.
#'     # If any are missing, we didn't have a matching set of node names
#'     # among g, Industries, and Products.
#'     err <- out %>%
#'       dplyr::filter(
#'         is.na((!!as.name(x_colname))) |
#'           is.na((!!as.name(y_colname)))
#'       )
#'     # Check for errors.
#'     assertthat::assert_that(nrow(err) == 0,
#'                             msg = paste("Error in ecc_layout().",
#'                                         "Check err data frame.",
#'                                         "Where NA appears, mismatched names among g, Industries, and Products in ecc_layout()."))
#'   }
#'   # Return a matrix.
#'   out %>%
#'     # Move the node names to rownames ...
#'     tibble::column_to_rownames(node_name_colname) %>%
#'     # ... which allows the as.matrix function to create numeric columns for x and y.
#'     # This matrix can be used by qgraph for its layout argument.
#'     as.matrix()
#' }
#'


#'
#'
#' #' Identify industry stages
#' #'
#' #' Names for industries in an ECC may be comprised of a general industry name
#' #' and the name of specific product,
#' #' such as "\code{Production - Hydro}".
#' #' Stages of an ECC are most easily identified simply by the general industry name
#' #' (e.g., all "\code{Production}" industries are \code{primary} industries),
#' #' but simple matching of specific industry names with general industry names to identify
#' #' industry stages will fail ("\code{Production - Hydro}" does not match "\code{Production}", e.g.).
#' #' This function assigns ECC stages to specific industry names
#' #' using a table of general industry names and associated ECC stages.
#' #' A general industry name matches a specific industry name if the specific industry name
#' #' starts with or is equal to the general industry name.
#' #'
#' #' @param GeneralIndustries a data frame containing at least two columns,
#' #'        \code{general_industry_colname} and \code{stage_colname}.
#' #' @param SpecificIndustries a data frame containing at least \code{specific_industry_colname}.
#' #' @param general_industry_colname a string for the name of the column in \code{GeneralIndustries}
#' #'        that contains the general names of industries (such as "\code{Production}").
#' #'        Default is "\code{Industry}".
#' #' @param stage_colname a string for the name of the column in \code{GeneralIndustries}
#' #'        that contains the names of ECC stages (such as "\code{Primary industry}",
#' #'        "\code{Primary-to-final industry}", or "\code{Final-to-useful industry}").
#' #'        Default is "\code{Stage}".
#' #' @param specific_industry_colname a string for the name
#' #'        of the column in \code{SpecificIndustries} that contains the specific names of industries
#' #'        (such as "\code{Production - Hydro}").
#' #'        Default is "\code{Industry}".
#' #'
#' #' @return a data frame containing \code{specific_industry_colname} and \code{stage_colname},
#' #'         wherein the stages associated with general industry names in \code{GeneralIndustries}
#' #'         are assigned to the specific industries from \code{SpecificIndustries}
#' #'         according to matches between \code{GeneralIndustries$general_industry_colname}
#' #'         and \code{SpecificIndustries$specific_industry_colname}.
#' #'
#' #' @export
#' #'
#' #' @examples
#' #' GeneralIndustries <- data.frame(Industry = c("Production", "Plants", "Cars"),
#' #'                                 Stage = c("Primary industry", "p-->f industry",
#' #'                                           "f-->u industry"), stringsAsFactors = FALSE)
#' #' SpecificIndustries <- data.frame(Industry = c("Production - Coal", "Production - Hydro",
#' #'                                               "Plants - Electric", "Plants - refinery",
#' #'                                               "Cars - Electric", "Cars - Petrol"),
#' #'                                               stringsAsFactors = FALSE)
#' #' identify_industry_stages(GeneralIndustries = GeneralIndustries,
#' #'                          SpecificIndustries = SpecificIndustries)
#' identify_industry_stages <- function(GeneralIndustries,
#'                                      SpecificIndustries,
#'                                      general_industry_colname = "Industry",
#'                                      stage_colname = "Stage",
#'                                      specific_industry_colname = "Industry"){
#'   SpecificIndustries %>%
#'     dplyr::select(!!as.name(specific_industry_colname)) %>%
#'     fuzzyjoin::fuzzy_left_join(GeneralIndustries %>%
#'                       dplyr::select(!!as.name(general_industry_colname), !!as.name(stage_colname)),
#'                     by = c(general_industry_colname) %>% magrittr::set_names(specific_industry_colname),
#'                     match_fun = function(s.ind, g.ind){
#'                       startsWith(s.ind, g.ind)
#'                     }
#'     ) %>%
#'     # Return only the specific industries (column 1) and the stages (column 3).
#'     # It would be nice to select these by name instead of position,
#'     # but when specific_industry_colname and general_industry_colname are the same,
#'     # fuzzy_left_join renames the columns with a ".x" and a ".y" suffix.
#'     # Because we can't know ahead of time how the columns will be named,
#'     # we select by position.
#'     dplyr::select(c(1,3))
#' }