R/extract.R
In urlexplorer: Structural Analysis and Pattern Discovery in URL Datasets

Documented in extract_file_extension extract_fragment extract_host extract_param_value extract_path extract_path_segment extract_port extract_query extract_scheme extract_userinfo

# Input is always a character vector. All functions return a vector of the same
# length as the input: a character vector containing the extracted component or
# value, except extract_port(), which returns the port as an integer vector.

# Internal regex patterns for URL components.
#
# Each pattern exposes the wanted component as capture group 1. The authority
# part of a URL (userinfo, host, port) ends at the first "/", "?" or "#", so
# the character classes below exclude all three. Without that, text from the
# query or fragment leaks into the authority components, e.g. the host of
# "http://example.com?x=1" would come back as "example.com?x=1".
#
# Lookaheads are used where extra context is required so that the overall
# match keeps the same shape as before (only the validation is stricter).

# "http" or "https", but only when followed by the "://" delimiter, so that
# non-URL strings such as "httpd.conf" do not yield a scheme.
.url_scheme_pattern <- "^(https?)(?=://)"

# Text between "://" and an "@" that is still inside the authority.
.url_userinfo_pattern <- "://([^@/?#]+)@"

# Host after optional userinfo: either a bracketed IP literal ("[::1]") or a
# run of characters up to the first ":", "/", "?" or "#".
.url_host_pattern <- "://(?:[^@/?#]*@)?(\\[[^\\]/?#]*\\]|[^:/?#]+)"

# Digits after a ":" inside the authority. The lookahead requires the
# authority to end right after the digits, so a numeric password in the
# userinfo ("user:1234@host") is not mistaken for a port.
.url_port_pattern <- "://[^/?#]*:(\\d+)(?=[/?#]|$)"

# Path: starts at the first "/" after the authority, ends before "?" or "#".
.url_path_pattern <- "://[^/?#]+(/[^?#]*)"

# Query: everything after a "?" up to (not including) a "#".
.url_query_pattern <- "\\?([^#]*)"

# Fragment: everything after the first "#".
.url_fragment_pattern <- "#(.*)"

#' Extract the scheme from URL
#'
#' Applies the package's internal scheme pattern to every element of `url` and
#' returns its first capture group.
#'
#' @param url A character vector of URLs.
#'
#' @return A character vector containing the scheme from each URL; elements
#'   where the pattern does not match are `NA`.
#' @export
#'
#' @examples
#' extract_scheme(c("http://example.com", "https://example.com"))
extract_scheme <- function(url) {
  # Capture group 1 of the scheme pattern is the scheme itself
  scheme <- stringr::str_extract(
    string = url,
    pattern = .url_scheme_pattern,
    group = 1
  )
  scheme
}

#' Extract userinfo from URL
#'
#' Applies the package's internal userinfo pattern to every element of `url`
#' and returns its first capture group (the text in front of the `@`).
#'
#' @param url A character vector of URLs.
#'
#' @return A character vector containing the userinfo from each URL, if
#'   present; elements where the pattern does not match are `NA`.
#' @export
#'
#' @examples
#' extract_userinfo(c("http://user:pass@example.com"))
extract_userinfo <- function(url) {
  # Capture group 1 of the userinfo pattern is the part before the "@"
  userinfo <- stringr::str_extract(
    string = url,
    pattern = .url_userinfo_pattern,
    group = 1
  )
  userinfo
}

#' Extract the host from URL
#'
#' Applies the package's internal host pattern to every element of `url` and
#' returns its first capture group.
#'
#' @param url A character vector of URLs.
#' @return A character vector containing the host from each URL; elements
#'   where the pattern does not match are `NA`.
#' @export
#' @examples
#' extract_host(c("https://example.com", "http://www.example.com"))
extract_host <- function(url) {
  # Capture group 1 of the host pattern is the host name
  host <- stringr::str_extract(
    string = url,
    pattern = .url_host_pattern,
    group = 1
  )
  host
}

#' Extract the port number from URL
#'
#' Applies the package's internal port pattern to every element of `url` and
#' converts the captured digits to integers.
#'
#' @param url A character vector of URLs.
#'
#' @return An integer vector containing the port number from each URL, if
#'   specified; elements where the pattern does not match are `NA`.
#' @export
#'
#' @examples
#' extract_port(c("http://example.com:8080"))
extract_port <- function(url) {
  # The pattern captures the digits as text; the result is returned as integer
  as.integer(
    stringr::str_extract(
      string = url,
      pattern = .url_port_pattern,
      group = 1
    )
  )
}

#' Extract the path from URL
#'
#' Applies the package's internal path pattern to every element of `url` and
#' returns its first capture group, which starts with the leading `/`.
#'
#' @param url A character vector of URLs.
#' @return A character vector containing the path from each URL; elements
#'   where the pattern does not match are `NA`.
#' @export
#' @examples
#' extract_path(c("http://example.com/", "http://example.com/path/to/resource"))
extract_path <- function(url) {
  # Capture group 1 of the path pattern is the path including its leading "/"
  path <- stringr::str_extract(
    string = url,
    pattern = .url_path_pattern,
    group = 1
  )
  path
}

#' Extract the query from URL
#'
#' Removes the fragment (everything from the first `#`) from each URL and then
#' applies the package's internal query pattern, returning the text after the
#' `?`.
#'
#' @param url A character vector of URLs.
#' @return A character vector containing the query string from each URL,
#'   without the leading `?`; elements without a query are `NA`.
#' @export
#' @examples
#' extract_query(c(
#'   "http://example.com?query1=value1&query2=value2",
#'   "http://example.com?query1=value3"
#' ))
extract_query <- function(url) {
  # A "?" inside the fragment is ordinary fragment text (for example
  # "http://example.com/#/route?tab=1"), so drop the fragment first to keep it
  # from being reported as a query.
  without_fragment <- stringr::str_remove(url, "#.*$")
  stringr::str_extract(without_fragment, .url_query_pattern, group = 1)
}

#' Extract the fragment from URL
#'
#' Applies the package's internal fragment pattern to every element of `url`
#' and returns its first capture group (the text after the `#`).
#'
#' @param url A character vector of URLs.
#'
#' @return A character vector containing the fragment from each URL, if
#'   present; elements where the pattern does not match are `NA`.
#' @export
#'
#' @examples
#' extract_fragment(c("http://example.com/#sec1", "http://example.com/#sec2"))
extract_fragment <- function(url) {
  # Capture group 1 of the fragment pattern is the text after the "#"
  fragment <- stringr::str_extract(
    string = url,
    pattern = .url_fragment_pattern,
    group = 1
  )
  fragment
}

#' Extract a specific segment from a path
#'
#' Strips one leading `/` from each path, splits the remainder on `/`, and
#' returns the segment at position `segment_index`.
#'
#' @param path A character vector of paths. `NA` elements yield `NA`.
#' @param segment_index The index of the segment to extract. Must be a single
#'   positive number; an error is raised otherwise.
#'
#' @return A character vector, the same length as `path`, containing the
#'   specified segment from each path. Paths with fewer segments, or with an
#'   empty segment at that position, yield `NA`.
#' @export
#'
#' @examples
#' extract_path_segment(c("/path/to/resource", "/another/path/"), 2)
extract_path_segment <- function(path, segment_index) {
  # Fail early with a clear message instead of an obscure error from `if`
  if (!is.numeric(segment_index) || length(segment_index) != 1L ||
    is.na(segment_index) || segment_index < 1) {
    stop("`segment_index` must be a single positive number.", call. = FALSE)
  }

  # Remove leading slash and split by slash
  clean_paths <- stringr::str_remove(path, "^/")
  segments_list <- stringr::str_split(clean_paths, "/", simplify = FALSE)

  # vapply() guarantees a character vector even for zero-length input
  vapply(
    segments_list,
    function(segments) {
      if (length(segments) < segment_index) {
        return(NA_character_)
      }
      segment <- segments[segment_index]
      # An NA path splits into a single NA segment; treat it like "missing"
      if (is.na(segment) || segment == "") {
        NA_character_
      } else {
        segment
      }
    },
    character(1),
    USE.NAMES = FALSE
  )
}

#' Extract the value of a specified parameter from the query string
#'
#' Looks for `param_name=` at the start of the query string or directly after
#' an `&`, and returns the text up to the next `&`. The parameter name is
#' matched literally.
#'
#' @param query A character vector of query strings.
#' @param param_name The name of the parameter to extract values for.
#'
#' @return A character vector containing the value of the specified parameter
#'   from each query string; `NA` where the parameter is not present.
#' @export
#'
#' @examples
#' extract_param_value(c("param1=val1&param2=val2", "param1=val3"), "param1")
extract_param_value <- function(query, param_name) {
  # Escape regex metacharacters so the name is matched literally. Pasting a
  # `stringr::fixed()` object into a larger pattern drops its class, which
  # leaves the name interpreted as a regular expression ("a.b" would match
  # "axb", and a name such as "a[" would be an invalid pattern).
  literal_name <- stringr::str_escape(param_name)
  param_pattern <- paste0("(?:^|&)", literal_name, "=([^&]*)")
  stringr::str_extract(query, param_pattern, group = 1)
}

#' Extract file extension from URLs or paths
#'
#' Inputs that start with `http://` or `https://` are first reduced to their
#' path via [extract_path()]; all other inputs are treated as paths as-is.
#' Anything from the first `?` or `#` onwards is then discarded, and the text
#' after the final dot of the last path segment is returned.
#'
#' @param url A character vector of URLs or paths from which to extract file
#'   extensions.
#'
#' @return A character vector with the file extension for each URL or path.
#'   Extensions are returned without the dot (e.g., "jpg" instead of ".jpg"),
#'   and URLs or paths without extensions will return `NA`.
#' @export
#'
#' @examples
#' extract_file_extension(
#'   c(
#'     "http://example.com/image.jpg",
#'     "https://example.com/archive.zip",
#'     "http://example.com/"
#'   )
#' )
extract_file_extension <- function(url) {
  # Only full http(s) URLs need their path isolated; plain paths pass through
  is_http_url <- stringr::str_detect(url, "^https?://")
  target <- ifelse(is_http_url, extract_path(url), url)

  # Strip any query or fragment that is still attached
  without_suffix <- stringr::str_remove(target, "[?#].*$")

  # The extension is the dot-free, slash-free tail after the last dot
  stringr::str_extract(without_suffix, "\\.([^./]+)$", group = 1)
}

Any scripts or data that you put into this service are public.

urlexplorer documentation built on Aug. 8, 2025, 7:29 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

urlexplorer
Structural Analysis and Pattern Discovery in URL Datasets

R/extract.R
In urlexplorer: Structural Analysis and Pattern Discovery in URL Datasets

Defines functions extract_file_extension extract_param_value extract_path_segment extract_fragment extract_query extract_path extract_port extract_host extract_userinfo extract_scheme

Documented in extract_file_extension extract_fragment extract_host extract_param_value extract_path extract_path_segment extract_port extract_query extract_scheme extract_userinfo

Try the urlexplorer package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

urlexplorer Structural Analysis and Pattern Discovery in URL Datasets

R/extract.R In urlexplorer: Structural Analysis and Pattern Discovery in URL Datasets

Defines functions extract_file_extension extract_param_value extract_path_segment extract_fragment extract_query extract_path extract_port extract_host extract_userinfo extract_scheme

Documented in extract_file_extension extract_fragment extract_host extract_param_value extract_path extract_path_segment extract_port extract_query extract_scheme extract_userinfo

Try the urlexplorer package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

urlexplorer
Structural Analysis and Pattern Discovery in URL Datasets

R/extract.R
In urlexplorer: Structural Analysis and Pattern Discovery in URL Datasets