R/ip.R

Defines functions exclude_ip check_ip mark_ip

Documented in check_ip exclude_ip mark_ip

#' Mark IP addresses from outside of a specified country.
#'
#' @description
#' The `mark_ip()` function creates a column labeling
#' rows of data that have IP addresses from outside the specified country.
#' The function is written to work with data from
#' [Qualtrics](https://www.qualtrics.com/) surveys.
#'
#' @details
#' To record this information in your Qualtrics survey, you must ensure that
#' [Anonymize responses is disabled](https://www.qualtrics.com/support/survey-platform/survey-module/survey-options/survey-protection/#AnonymizingResponses).
#'
#' Default column names are set based on output from the
#' [`qualtRics::fetch_survey()`](
#' https://docs.ropensci.org/qualtRics/reference/fetch_survey.html).
#' The function uses [ipaddress::country_networks()] to assign IP addresses to
#' specific countries using
#' [ISO 3166-1 alpha-2 country codes](
#' https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2).
#'
#' The function outputs to console a message about the number of rows
#' with IP addresses outside of the specified country. If there are `NA`s for IP
#' addresses (likely due to including preview data---see [check_preview()]), it
#' will print a message alerting to the number of rows with `NA`s.
#'
#' @param x Data frame or tibble (preferably imported from Qualtrics using
#' \{qualtRics\}).
#' @param id_col Column name for unique row ID (e.g., participant).
#' @param ip_col Column name for IP addresses.
#' @param rename Logical indicating whether to rename columns (using [rename_columns()])
#' @param country Two-letter abbreviation of country to check (default is "US").
#' @param include_na Logical indicating whether to include rows with NA in
#' IP address column in the output list of potentially excluded data.
#' @param quiet Logical indicating whether to print message to console.
#' @param print Logical indicating whether to print returned tibble to
#' console.
#'
#' @family ip functions
#' @family mark functions
#' @return
#' An object of the same type as `x` that includes a column marking rows
#' with IP addresses outside of the specified country.
#' For a function that checks these rows, use [check_ip()].
#' For a function that excludes these rows, use [exclude_ip()].
#' @export
#'
#' @note
#' This function **requires internet connectivity** as it uses the
#' [ipaddress::country_networks()] function, which pulls daily updated data
#' from \url{https://www.iwik.org/ipcountry/}. It only updates the data once
#' per session, as it caches the results for future work during the session.
#'
#' @examples
#' # Mark IP addresses outside of the US
#' data(qualtrics_text)
#' df <- mark_ip(qualtrics_text)
#'
#' # Remove preview data first
#' df <- qualtrics_text %>%
#'   exclude_preview() %>%
#'   mark_ip()
#'
#' # Mark IP addresses outside of Germany
#' df <- qualtrics_text %>%
#'   exclude_preview() %>%
#'   mark_ip(country = "DE")
mark_ip <- function(x,
                    id_col = "ResponseId",
                    ip_col = "IPAddress",
                    rename = TRUE,
                    country = "US",
                    include_na = FALSE,
                    quiet = FALSE,
                    print = TRUE) {
  # Rename columns
  if (rename) {
    x <- rename_columns(x, alert = FALSE)
  }

  # Check for presence of required column
  validate_columns(x, id_col)
  validate_columns(x, ip_col)

  # Extract IP address, latitude, and longitude vectors
  ip_vector <- x[[ip_col]]

  # Check for valid IP addresses
  classify_ip <- ipaddress::is_ipv4(ipaddress::ip_address(ip_vector))
  if (any(!classify_ip | all(is.na(classify_ip)), na.rm = TRUE)) {
    stop("Invalid IP addresses in 'ip_col'.")
  }

  # Remove rows with NAs for IP addresses
  na_rows <- which(is.na(ip_vector))
  n_na_rows <- length(na_rows)
  if (n_na_rows > 0 && identical(quiet, FALSE)) {
    cli::cli_alert_info(
      "{n_na_rows} out of {nrow(x)} row{?s} had NA values for IP addresses (check for preview data with 'check_preview()')."
    )
  }
  if (n_na_rows > 0) {
    filtered_data <- x[-na_rows, ]
  } else {
    filtered_data <- x
  }

  # Get IP ranges for specified country
  country_ip_ranges <- ipaddress::country_networks(country)$networks[[1]]

  # Check if country_ip_ranges is valid
  if (!curl::has_internet()) {
    cli::cli_alert_warning("There is no internet connection.")
    return(invisible(NULL))
  } else if (identical(country_ip_ranges, NA)) {
    cli::cli_alert_warning("The website for downloading country IP addresses is not available. Please try again later.")
    return(invisible(NULL))
  } else if (identical(country_ip_ranges, NULL)) {
    cli::cli_alert_warning("'{country}' is not recognized as a valid country code, so IP addresses could not be checked for this country.")
    return(invisible(NULL))
  } else {
    # Filter data based on IP ranges
    survey_ips <- ipaddress::ip_address(filtered_data[[ip_col]])
    outside_country <- !ipaddress::is_within_any(survey_ips, country_ip_ranges)
    filtered_data <- dplyr::bind_cols(filtered_data, outside = outside_country)
    filtered_data <- dplyr::filter(filtered_data, .data$outside == TRUE) %>%
      dplyr::select(-.data$outside)
    n_outside_country <- nrow(filtered_data)

    # Filter NAs when requested
    if (identical(include_na, TRUE)) {
      na_data <- x[na_rows, ]
      filtered_data <- dplyr::bind_rows(filtered_data, na_data)
    }

    # Print message and return output
    if (identical(quiet, FALSE)) {
      cli::cli_alert_info(
        "{n_outside_country} out of {nrow(x)} row{?s} had IP address outside of {country}."
      )
    }

    # Mark exclusion rows
    marked_data <- mark_rows(x, filtered_data, id_col, "ip")
    print_data(marked_data, print)
  }
}


#' Check for IP addresses from outside of a specified country.
#'
#' @description
#' The `check_ip()` function subsets rows of data, retaining rows
#' that have IP addresses from outside the specified country.
#' The function is written to work with data from
#' [Qualtrics](https://www.qualtrics.com/) surveys.
#'
#' @details
#' To record this information in your Qualtrics survey, you must ensure that
#' [Anonymize responses is disabled](https://www.qualtrics.com/support/survey-platform/survey-module/survey-options/survey-protection/#AnonymizingResponses).
#'
#' Default column names are set based on output from the
#' [`qualtRics::fetch_survey()`](
#' https://docs.ropensci.org/qualtRics/reference/fetch_survey.html).
#' The function uses [ipaddress::country_networks()] to assign IP addresses to
#' specific countries using
#' [ISO 3166-1 alpha-2 country codes](
#' https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2).
#'
#' The function outputs to console a message about the number of rows
#' with IP addresses outside of the specified country. If there are `NA`s for IP
#' addresses (likely due to including preview data---see [check_preview()]), it
#' will print a message alerting to the number of rows with `NA`s.
#'
#' @inheritParams mark_ip
#' @param keep Logical indicating whether to keep or remove exclusion column.
#'
#' @family ip functions
#' @family check functions
#' @return
#' An object of the same type as `x` that includes the rows with
#' IP addresses outside of the specified country.
#' For a function that marks these rows, use [mark_ip()].
#' For a function that excludes these rows, use [exclude_ip()].
#'
#' @note
#' This function **requires internet connectivity** as it uses the
#' [ipaddress::country_networks()] function, which pulls daily updated data
#' from \url{https://www.iwik.org/ipcountry/}. It only updates the data once
#' per session, as it caches the results for future work during the session.
#'
#' @export
#' @examples
#' # Check for IP addresses outside of the US
#' data(qualtrics_text)
#' check_ip(qualtrics_text)
#'
#' # Remove preview data first
#' qualtrics_text %>%
#'   exclude_preview() %>%
#'   check_ip()
#'
#' # Check for IP addresses outside of Germany
#' qualtrics_text %>%
#'   exclude_preview() %>%
#'   check_ip(country = "DE")
#'
#' # Do not print rows to console
#' qualtrics_text %>%
#'   exclude_preview() %>%
#'   check_ip(print = FALSE)
#'
#' # Do not print message to console
#' qualtrics_text %>%
#'   exclude_preview() %>%
#'   check_ip(quiet = TRUE)
check_ip <- function(x,
                     id_col = "ResponseId",
                     ip_col = "IPAddress",
                     rename = TRUE,
                     country = "US",
                     include_na = FALSE,
                     keep = FALSE,
                     quiet = FALSE,
                     print = TRUE) {
  # Mark and filter ip
  exclusions <- mark_ip(x,
    id_col = id_col,
    ip_col = ip_col,
    rename = rename,
    country = country,
    quiet = quiet
  ) %>%
    dplyr::filter(.data$exclusion_ip == "ip") %>%
    keep_marked_column(.data$exclusion_ip, keep)

  # Determine whether to print results
  print_data(exclusions, print)
}


#' Exclude IP addresses from outside of a specified country.
#'
#' @description
#' The `exclude_ip()` function removes rows of data that have
#' IP addresses from outside the specified country.
#' The function is written to work with data from
#' [Qualtrics](https://www.qualtrics.com/) surveys.
#'
#' @inherit check_ip details
#'
#' @inheritParams mark_ip
#' @param silent Logical indicating whether to print message to console. Note
#' this argument controls the exclude message not the check message.
#'
#' @family ip functions
#' @family exclude functions
#' @return
#' An object of the same type as `x` that excludes rows
#' with IP addresses outside of the specified country.
#' For a function that checks these rows, use [check_ip()].
#' For a function that marks these rows, use [mark_ip()].
#' @note
#' This function **requires internet connectivity** as it uses the
#' [ipaddress::country_networks()] function, which pulls daily updated data
#' from \url{http://www.iwik.org/ipcountry/}. It only updates the data once
#' per session, as it caches the results for future work during the session.
#'
#' @export
#'
#' @examples
#' # Exclude IP addresses outside of the US
#' data(qualtrics_text)
#' df <- exclude_ip(qualtrics_text)
#'
#' # Remove preview data first
#' df <- qualtrics_text %>%
#'   exclude_preview() %>%
#'   exclude_ip()
#'
#' # Exclude IP addresses outside of Germany
#' df <- qualtrics_text %>%
#'   exclude_preview() %>%
#'   exclude_ip(country = "DE")
exclude_ip <- function(x,
                       id_col = "ResponseId",
                       ip_col = "IPAddress",
                       rename = TRUE,
                       country = "US",
                       include_na = FALSE,
                       quiet = TRUE,
                       print = TRUE,
                       silent = FALSE) {
  # Mark and filter ip
  remaining_data <- mark_ip(x,
    id_col = id_col,
    ip_col = ip_col,
    rename = rename,
    country = country,
    include_na = include_na,
    quiet = quiet
  ) %>%
    dplyr::filter(.data$exclusion_ip != "ip") %>%
    dplyr::select(-.data$exclusion_ip)

  # Print exclusion statement
  if (identical(silent, FALSE)) {
    print_exclusion(
      remaining_data, x,
      paste0("rows with IP addresses outside of ", country)
    )
  }

  # Determine whether to print results
  print_data(remaining_data, print)
}

Try the excluder package in your browser

Any scripts or data that you put into this service are public.

excluder documentation built on Feb. 16, 2023, 7:09 p.m.