R/split_flowline.R

Defines functions split_by_event split_lines_fun add_length rf new_line split_lines check_split_data split_flowlines

Documented in split_flowlines

#' @title Split Flowlines
#' @description A wrapper for split_lines that works on nhdplus attributes
#' @param flines data.frame with COMID, toCOMID, LENGTHKM
#' and LINESTRING sf column in "meters" projection
#' @param max_length maximum segment length to return
#' @param events data.frame containing events as generated by nhdplusTools::get_flowline_index() 
#' if an `identifier` attribute is included, it will be passed through in the output table.
#' @param para numeric how many threads to use in parallel computation
#' @param avoid vector of ids to avoid
#' @return All the flowlines with some split apart. COMIDs are returned as strings 
#' with a semantic part number appended. That is .1, .2, ... .10, .11, etc. are 
#' appended and must be treated as one would treat a semantic version. .1 is the 
#' most upstream and the sequence increases in the downstream direction.
#' @importFrom dplyr group_by ungroup filter select mutate lead n right_join
#' @importFrom nhdplusTools rename_geometry
#' @seealso The \code{\link{refactor_nhdplus}} function implements a complete
#' workflow using `split_flowlines()`.
#' @export
#' @examples
#' source(system.file("extdata", "new_hope_data.R", package = "hyRefactor"))
#' 
#' new_hope_flowline <- 
#'   dplyr::right_join(dplyr::select(new_hope_flowline, COMID, REACHCODE, FromMeas, ToMeas),
#'                     suppressWarnings(nhdplusTools::prepare_nhdplus(
#'                       new_hope_flowline, 0, 0, 0, FALSE, warn = FALSE)),
#'                     by = "COMID")
#' 
#' split <- split_flowlines(suppressWarnings(sf::st_cast(sf::st_transform(
#'   new_hope_flowline, 5070), "LINESTRING")),
#'                          max_length = 2000, events = new_hope_events)

split_flowlines <- function(flines, max_length = NULL, 
                            events = NULL, para = 0, avoid = NA) {
  
  check_names(flines, "split_flowlines")
  
  flines <- rename_geometry(flines, "geom")
  
  split <- split_lines(flines, max_length, events, 
                       para = para, 
                       avoid = avoid)
  
  if (nrow(split) > 0) {
    
    split <- left_join(split, 
                       select(sf::st_set_geometry(flines, NULL), COMID, toCOMID, LevelPathI, Hydroseq, TotDASqKM), 
                       by = "COMID")
    
    split$part <- unlist(lapply(strsplit(split$split_fID, "\\."),
                                function(x) x[2]))
    
    split <- group_by(split, COMID)
    
    split <- mutate(split, part = (ifelse(is.na(part), 0, as.integer(part)) + 1))
    
    # Assume flowdir is with digitized for now -- need to check in prep code.
    split <- ungroup(mutate(split,
                            toCOMID = ifelse(part == max(part),
                                             as.character(toCOMID),
                                             paste(lead(COMID),
                                                   lead(part), sep = "."))))
    
    split <- mutate(split, COMID = paste(COMID, part, sep = "."),
                    LENGTHKM = sf::st_length(sf::st_geometry(split)) / 1000)
    
    split <- sf::st_as_sf(select(split, -part, -split_fID))
    
    attr(split$LENGTHKM, "units") <- NULL
    split[["LENGTHKM"]] <- as.numeric(split[["LENGTHKM"]])
    
    remove_comid <- unique(as.integer(split[["COMID"]]))
    
    not_split <- dplyr::filter(select(flines, .data$COMID, .data$toCOMID, 
                                      .data$LENGTHKM, .data$LevelPathI, 
                                      .data$Hydroseq, .data$TotDASqKM), !(.data$COMID %in% remove_comid))
    
    not_split <- dplyr::mutate(not_split, event_REACHCODE = "", event_REACH_meas = NA, event_identifier = NA)
    
    flines <- rbind(not_split, split)
    
    # Rows with COMID like this need to be updated
    redirect_tocomid <- flines$COMID[which(grepl("\\.1$", flines$COMID))]
    
    old_tocomid <- gsub("\\.1$", "", redirect_tocomid)
    
    mutate(flines,
           toCOMID = ifelse(toCOMID %in% old_tocomid,
                            paste0(toCOMID, ".1"),
                            toCOMID))
    
  } else {
    flines %>%
      mutate(COMID = as.character(COMID),
             toCOMID = as.character(toCOMID),
             event_REACHCODE = NA,
             event_REACH_meas = NA,
             event_identifier = NA)
  }
}


check_split_data <- function(input_lines) {
  if(!"COMID" %in% names(input_lines)) stop("input lines must containt 'COMID' column.")
  if(attr(input_lines, "sf_column") != "geom") stop("sf_column must be 'geom'")
  if(!"sfc_LINESTRING" %in% class(st_geometry(input_lines))) stop("lines must be class LINESTRING")
  sf::st_crs(input_lines)
}

#' @title split lines
#' @description Splits lines longer than a given threshold into the
#' minimum number of pieces to all be under the given threshold.
#' @param input_lines data.frame of class sf with LINESTRING sfc column.
#' @param max_length maximum segment length to return
#' @param events data.frame containing events as generated by nhdplusTools::get_flowline_index() 
#' @param id name of ID column in data.frame
#' @param para how many cores to use
#' @param avoid vector of ids to avoid
#' @return only the split lines.
#' @importFrom dplyr group_by ungroup filter select mutate
#' @noRd
#'
split_lines <- function(input_lines, 
                        max_length = NULL, 
                        events = NULL, para = 0, 
                        avoid = NA, status = FALSE) {
  
  check_split_data(input_lines)
  
  # First split by event and number parts in a seperate column
  event_split_points <- split_by_event(input_lines, events)
  
  input_lines <- dplyr::select(input_lines, COMID)
  
  too_long_split_points <- split_by_length(input_lines, max_length, event_split_points, avoid)
  
  if(nrow(event_split_points) > 0) {
    split_points <- too_long_split_points %>%
      dplyr::left_join(select(event_split_points, e_start = .data$start, 
                              e_end = .data$end, .data$event_split_fID), 
                       by = "event_split_fID")  %>%
      dplyr::mutate(start = ifelse(!is.na(.data$e_start), 
                                   .data$start * (.data$e_end - .data$e_start) + .data$e_start, 
                                   .data$start),
                    end = ifelse(!is.na(.data$e_start), 
                                 .data$end * (.data$e_end - .data$e_start) + .data$e_start, 
                                 .data$end)) %>%
      dplyr::mutate(event_REACHCODE = ifelse((!.data$event_REACHCODE == "" & !is.na(.data$event_REACHCODE)) & 
                                               round(.data$end, digits = 5) == round(.data$e_end, digits = 5), 
                                             .data$event_REACHCODE, 
                                             NA)) %>%
      dplyr::mutate(event_REACH_meas = ifelse(is.na(.data$event_REACHCODE), NA, .data$event_REACH_meas)) %>%
      dplyr::mutate(event_identifier = ifelse(is.na(.data$event_identifier), NA, .data$event_identifier))
  } else {
    split_points <- too_long_split_points
    
    if(!"event_identifier" %in% names(split_points)) {
      split_points$event_identifier <- seq_len(nrow(split_points))
    }
    
  }
  
  input_lines <- split_points %>%  
    dplyr::group_by(.data$COMID) %>%
    dplyr::arrange(.data$end) %>%
    dplyr::mutate(split_fID = ifelse(dplyr::row_number() == 1,
                            as.character(.data$COMID),
                            paste0(.data$COMID, ".", dplyr::row_number() - 1))) %>%
    dplyr::ungroup() %>%
    dplyr::arrange(.data$fID, .data$end) %>%
    select(.data$COMID, .data$split_fID, .data$start, .data$end, 
           .data$event_REACHCODE, .data$event_REACH_meas, .data$event_identifier) %>%
    split_lines_fun(input_lines, para)
  
  if(is.logical(input_lines$split_fID)) input_lines$split_fID <- as.character(input_lines$split_fID)
  
  return(input_lines)
}

new_line <- function(i, f, t, l) {
  lwgeom::st_linesubstring(x = sf::st_geometry(l)[i],
                           from = f,
                           to = t)[[1]]
}

# rescale REACH_meas to comid_meas

rf <- function(m, f, t) {
  if(any(!sapply(m, function(m, f, t) (m >= f & m <= t), f = f, t = t))) stop("m must be between f and t")
  
  100 * (m - f) / (t - f)
  
}

add_length <- function(input_lines) {
  
  input_lines$geom_len <- sf::st_length(sf::st_geometry(input_lines))
  attr(input_lines$geom_len, "units") <- NULL
  input_lines$geom_len <- as.numeric(input_lines$geom_len)
  
  input_lines
}

split_lines_fun <- function(split_points, lines, para) {
  cl <- NULL
  if(para > 1) {
    cl <- parallel::makeCluster(para)
  }

  pbapply::pboptions("none")

  crs <- sf::st_crs(lines)
  
  lines <- pbapply::pblapply(seq_len(nrow(split_points)),
                               FUN = function(x, s, lines) {
                                 new_line(i = which(lines$COMID == s$COMID[x]),
                                          f = split_points$start[x],
                                          t = split_points$end[x],
                                          l = lines)
                               }, cl = cl, s = split_points, lines = lines)
  
  if(para > 1) {
    parallel::stopCluster(cl)
  }
  
  split_points <- dplyr::select(split_points, -.data$start, -.data$end)
  
  sf::st_sf(split_points, geom = sf::st_sfc(lines, crs = crs))
}

split_by_event <- function(input_lines, events) {
  if(is.null(events) || nrow(events) == 0) return(data.frame(COMID = integer(0), start = numeric(0), end = numeric(0), 
                                        event_REACHCODE = character(0), event_REACH_meas = numeric(0), event_split_fID = integer(0)))
  
  check_names(input_lines, "split_lines_event")
  
  # identify which flowlines need to be split so we can just iterate over those.
  to_split_ids <- sapply(seq_len(nrow(events)), 
                         function(x, events, input_lines) {
                           e <- events[x, ]
                           
                           # from is downstream -- 0 is the outlet
                           # to is upstream -- 100 is the inlet
                           dplyr::filter(input_lines, .data$REACHCODE == e$REACHCODE &
                                           .data$ToMeas > e$REACH_meas & 
                                           .data$FromMeas < e$REACH_meas)$COMID
                         }, events = events, input_lines = input_lines)
  
  # filter down so we could parallelize this without blowing out memory.
  to_split <- filter(input_lines, COMID %in% to_split_ids)
  
  if(!"identifier" %in% names(events)) {
    events$identifier <- as.character(seq(1, nrow(events)))
  } else {
    events$identifier <- as.character(events$identifier)
  }
  
  # For each catchment to split, find all events that need to be enforced
  # split into parts and add a "parts" numbered from upstream (0 measure)
  # to downstream (100 measure).
  split_l <- lapply(seq_len(nrow(to_split)), 
                    function(x, to_split, events) {
                      
                      l <- to_split[x, ]
                      
                      # only work on events in the interval of the current flowline
                      e <- dplyr::filter(events, 
                                         .data$REACHCODE %in% l$REACHCODE &
                                         .data$REACH_meas < l$ToMeas &
                                           .data$REACH_meas > l$FromMeas)
                      
                      # arrange in upstream downstream order (greater measures are upstream)
                      # from is downstream -- 0 is the outlet
                      # to is upstream -- 100 is the inlet
                      # This is important so things are in the right order for splitting
                      # and available in that order to be joined in back in.
                      e <- dplyr::arrange(e, desc(REACH_meas))
                      
                      # This function is defined elsewhere and tested.
                      comid_meas <- rf(e$REACH_meas, l$FromMeas, l$ToMeas)
                      
                      split_points <- data.frame(COMID = l$COMID,
                                                 start = c(0, 100 - comid_meas) / 100, 
                                                 end = c(100 - comid_meas, 100) / 100,
                                                 event_REACHCODE = c(e$REACHCODE, ""),
                                                 event_REACH_meas = c(e$REACH_meas, NA),
                                                 event_identifier = c(e$identifier, ""))
                      
                    }, to_split = to_split, events = events)
  
  do.call(rbind, split_l) %>%
    dplyr::mutate(event_split_fID = seq_len(nrow(.)))
}

split_by_length <- function(lines, max_length, event_split_points, avoid) {
  
  if(is.null(max_length)) return(data.frame())
  
  lines <- sf::st_drop_geometry(add_length(lines))
  
  if(nrow(event_split_points) > 0) {
    event_split_points <- dplyr::left_join(event_split_points, 
                                           dplyr::select(lines, .data$COMID, .data$geom_len), 
                                           by = "COMID") %>%
      mutate(geom_len = .data$geom_len * (.data$end - .data$start))
    
    lines <-  lines %>%
      dplyr::filter(!COMID %in% event_split_points$COMID) %>%
      dplyr::bind_rows(event_split_points)
  } else {
    lines <-  lines %>% 
      dplyr::mutate(event_REACHCODE = NA, event_REACH_meas = NA, event_split_fID = NA)
  }
  
  if (max_length < 50) warning(paste("short max length detected,",
                                     "do you have your units right?"))
  
  # pieces is the number of pieces we need to break each too long feature into.
  # fID is an identifier for a given geometry which may have already been split!
  lines <- lines %>%
    filter((.data$geom_len >= max_length & !.data$COMID %in% avoid) | !is.na(.data$event_split_fID)) %>%
    mutate(pieces = ceiling(.data$geom_len / max_length), 
           fID = seq_len(nrow(.))) %>%
    select(-.data$geom_len)
  
  if(nrow(lines) == 0) {
    return(dplyr::select(event_split_points, .data$COMID, .data$start, .data$end, 
                         .data$event_REACHCODE, .data$event_REACH_meas, .data$event_split_fID) %>%
             dplyr::mutate(fID = .data$event_split_fID))
  } 
  
  # rep here is done by row of lines so we get a row with the correct fID 
  # for each part we expect in the output.
  
  lines[rep(seq_len(nrow(lines)),lines$pieces), ] %>%
    dplyr::select(-.data$pieces) %>%
    # Assign featureIDs to the output. start and end are decimal % along catchment
    dplyr::group_by(.data$fID) %>%
    dplyr::mutate(piece = 1:n()) %>%
    dplyr::mutate(start = (.data$piece - 1) / dplyr::n(),
                  end = .data$piece / dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::select(-.data$piece)
  
}
dblodgett-usgs/hyRefactor documentation built on Aug. 25, 2023, 9:09 p.m.