R/get_pdb_file.R

Defines functions get_pdb_file

Documented in get_pdb_file

#' Download and Process PDB Files from the RCSB Database
#'
#' The `get_pdb_file` function is a versatile tool designed to download Protein Data Bank (PDB) files from the RCSB database.
#' It supports various file formats such as 'pdb', 'cif', 'xml', and 'structfact', with options for file compression and handling
#' alternate locations (ALT) and insertion codes (INSERT) in PDB files. This function also provides the flexibility to save
#' the downloaded files to a specified directory or to a temporary directory for immediate use.
#'
#' @param pdb_id A 4-character string specifying the PDB entry of interest (e.g., "1XYZ"). This identifier uniquely represents
#'   a macromolecular structure within the PDB database.
#' @param filetype A string specifying the format of the file to be downloaded. The default is 'cif'. Supported file types include:
#'   \describe{
#'     \item{'pdb'}{The older PDB file format, which provides atomic coordinates and metadata.}
#'     \item{'cif'}{The Crystallographic Information File (CIF) format, which is a newer standard replacing PDB files.}
#'     \item{'xml'}{An XML format file, providing structured data that can be easily parsed for various applications.}
#'     \item{'structfact'}{Structure factor files in CIF format, available for certain PDB entries, containing experimental
#'     data used to determine the structure.}
#'   }
#' @param rm.insert Logical flag indicating whether to ignore PDB insertion codes. Default is FALSE. If TRUE, records with insertion
#'   codes will be removed from the final data.
#' @param rm.alt Logical flag indicating whether to ignore alternate location indicators (ALT) in PDB files. Default is TRUE. If
#'   TRUE, only the first alternate location is kept, and others are removed.
#' @param compression Logical flag indicating whether to download the file in a compressed format (e.g., .gz). Default is TRUE,
#'   which is recommended for faster downloads, especially for CIF files.
#' @param save Logical flag indicating whether to save the downloaded file to a specified directory. Default is FALSE, which means
#'   the file is processed and optionally saved, but not retained after processing unless specified.
#' @param path A string specifying the directory where the downloaded file should be saved. If NULL, the file is saved in a
#'   temporary directory. If `save` is TRUE, this path is required.
#' @param verbosity A boolean flag indicating whether to print status messages during the function execution.
#' @param download_base_url A string representing the base URL for the PDB file retrieval. By default, this is set to the global constant \code{DOWNLOAD_BASE_URL}, but users can specify a different URL if needed.
#'
#' @return A list of class \code{"pdb"} containing the following components:
#'   \describe{
#'     \item{\code{atom}}{A data frame containing atomic coordinate data (ATOM and HETATM records). Each row corresponds to an
#'     atom, and each column to a specific record type (e.g., element, residue, chain).}
#'     \item{\code{xyz}}{A numeric matrix of class \code{"xyz"} containing the atomic coordinates from the ATOM and HETATM records.}
#'     \item{\code{calpha}}{A logical vector indicating whether each atom is a C-alpha atom (TRUE) or not (FALSE).}
#'     \item{\code{call}}{The matched call, storing the function call for reference.}
#'     \item{\code{path}}{The file path where the file was saved, if `save` was TRUE.}
#'   }
#'   The function handles errors and warnings for various edge cases, such as unsupported file types, failed downloads, or issues
#'   with reading the file.
#' @importFrom httr GET http_status
#' @importFrom utils download.file
#' @importFrom stats na.omit
#' @importFrom xml2 read_xml as_list
#' @importFrom bio3d read.pdb read.cif as.xyz trim.xyz atom2xyz atom.select.pdb
#' @examples
#' \donttest{
#'   # Download a CIF file and process it without saving
#'   pdb_file <- get_pdb_file(pdb_id = "4HHB", filetype = "cif")
#'
#'   # Download a PDB file, save it, and remove alternate location records
#'   pdb_file <- get_pdb_file(pdb_id = "4HHB", filetype = "pdb", save = TRUE, path = tempdir())
#'
#'   # Understanding the tertiary structure of proteins is
#'   # crucial for elucidating their functional mechanisms,
#'   # especially in the context of ligand binding, enzyme catalysis,
#'   # and protein-protein interactions.
#'   # The tertiary structure refers to the three-dimensional arrangement
#'   # of all atoms within a protein,
#'   # including its secondary structure elements like alpha helices
#'   # and beta sheets, and how these elements
#'   # are organized in space. Using the get_pdb_file function
#'   # to retrieve the PDB file and the r3dmol
#'   # package for visualization, researchers can gain insights
#'   # into the overall 3D structure of a protein.
#'   # The following example demonstrates how to visualize the
#'   # ltertiary structure of a protein using the
#'   # PDB entry 4HHB:
#'
#'   if (any(rownames(installed.packages()) == "r3dmol")) {
#'     r3d <- asNamespace("r3dmol")
#'
#'     # Retrieve and parse a PDB structure
#'     pdb_path <- get_pdb_file("4HHB", filetype = "pdb", save = TRUE, verbosity = FALSE)
#'
#'     # Visualize the tertiary structure using r3dmol
#'     viewer <- r3d$r3dmol()
#'     viewer <- r3d$m_add_model(viewer, pdb_path$path, format = "pdb")
#'     viewer <- r3d$m_set_style(viewer, style = r3d$m_style_cartoon())
#'     viewer <- r3d$m_zoom_to(viewer)
#'
#'     # Display the molecular viewer
#'     viewer
#'   }

#'   # In this example, the protein structure is represented
#'   # in a cartoon style, which is particularly
#'   # effective for visualizing the overall fold of the protein,
#'   # including the orientation and interaction
#'   # of its secondary structure elements.
#'   #. To further enhance the analysis,
#'   # it is often important to
#'   # highlight specific regions of interest,
#'   # such as potential ligand-binding sites.
#'   # These sites can be identified based on prior knowledge,
#'   # experimental data, or computational predictions.
#'   # The following code snippet demonstrates
#'   # how to highlight potential ligand-binding sites in the
#'   # protein structure:
#'
#'   # Highlight potential ligand-binding sites
#'   # Note: Manually define residues of interest based
#'   # on prior knowledge or external analysis
#'   binding_sites <- c(45, 60, 85)  # Example residue numbers
#'
#'   if (exists("viewer")) {
#'     viewer <- r3d$m_set_style(
#'       viewer,
#'       sel = r3d$m_sel(resi = binding_sites),
#'       style = r3d$m_style_sphere(color = "red", radius = 1.5)
#'     )
#'   }
#'
#'   # Display the updated viewer with highlighted binding sites
#'   if (exists("viewer")) {
#'     viewer
#'   }
#'
#'   # In this step, specific residues that are
#'   # hypothesized to participate in ligand binding are
#'   #highlighted using a spherical representation.
#'   # The residues are selected manually based on either
#'   # experimental data or computational predictions.
#'   # By highlighting these sites, researchers can
#'   # visually inspect the spatial relationship between
#'   # these residues and other parts of the protein,
#'   # which may provide insights into the
#'   # protein's functional mechanisms.
#'
#'   # This visualization approach offers a powerful
#'   # way to explore and communicate the 3D structure
#'   # of proteins, making it easier to hypothesize about their function and
#'   # interaction with other molecules.
#'
#' }
#' @details
#' The `get_pdb_file` function is an essential tool for structural biologists and bioinformaticians who need to download and
#' process PDB files for further analysis. By providing options to handle alternate locations and insertion codes, this function
#' ensures that the data is clean and ready for downstream applications. Additionally, the ability to save files locally or
#' work with them in a temporary directory provides flexibility for various workflows. Error handling and informative messages
#' are included to guide the user in case of issues with file retrieval or processing.
#' @export

get_pdb_file <- function(pdb_id, filetype = 'cif', rm.insert = FALSE, rm.alt = TRUE, compression = TRUE,
                         save = FALSE, path = NULL, verbosity = TRUE, download_base_url = DOWNLOAD_BASE_URL) {

  cl <- match.call()
  is_structfact <- identical(filetype, "structfact")

  # Validate the input filetype
  if (!filetype %in% c('cif', 'pdb', 'xml', 'structfact')) {
    rpdbapi_abort(
      paste0("Unsupported filetype: '", filetype, "'. Please use one of the following: 'cif', 'pdb', 'xml', 'structfact'."),
      class = "rPDBapi_error_invalid_input",
      function_name = "get_pdb_file",
      pdb_id = pdb_id,
      filetype = filetype
    )
  }

  if (filetype == 'cif' && !compression) {
    warning("Consider using `get_pdb_file` with compression=TRUE for CIF files (it makes the file download faster!)")
  }

  # Construct the PDB URL based on input parameters
  pdb_url <- if (filetype == 'structfact') {
    paste0(download_base_url, pdb_id, "-sf.cif")
  } else {
    paste0(download_base_url, pdb_id, ".", filetype, ifelse(compression, ".gz", ""))
  }

  if (verbosity) {
    message(paste("Sending GET request to", pdb_url, "to fetch", pdb_id, "'s", filetype, "file."))
  }

  # Send API request using the core function
  response <- tryCatch(
    {
      send_api_request(url = pdb_url)
    },
    error = function(e) {
      rpdbapi_rethrow(
        e,
        message_prefix = paste0("Network Error: Failed to retrieve data for PDB ID '", pdb_id, "'. Error: "),
        class = "rPDBapi_error_network",
        wrap_typed = TRUE,
        function_name = "get_pdb_file",
        pdb_id = pdb_id
      )
    }
  )

  # Handle any potential HTTP errors using the core function
  tryCatch(
    {
      handle_api_errors(response, pdb_url)
    },
    error = function(e) {
      rpdbapi_rethrow(
        e,
        message_prefix = paste0("API Error: Failed to retrieve data for PDB ID '", pdb_id, "'. Error: "),
        class = "rPDBapi_error_http",
        wrap_typed = TRUE,
        function_name = "get_pdb_file",
        pdb_id = pdb_id
      )
    }
  )

  # Set the download path
  filename <- if (is_structfact) {
    paste0(pdb_id, "-sf.cif")
  } else {
    paste0(pdb_id, ".", filetype, ifelse(compression, ".gz", ""))
  }

  if (is.null(path)) {
    path <- file.path(tempdir(), filename)
  } else {
    if (!dir.exists(path)) {
      rpdbapi_abort(
        paste0("Specified path does not exist: ", path),
        class = "rPDBapi_error_invalid_input",
        function_name = "get_pdb_file",
        pdb_id = pdb_id,
        path = path
      )
    }
    path <- file.path(path, filename)
  }

  # Save the downloaded file
  tryCatch(
    {
      download.file(response$url, path, quiet = TRUE)
    },
    error = function(e) {
      rpdbapi_rethrow(
        e,
        message_prefix = "Failed to download the PDB file: ",
        class = "rPDBapi_error_network",
        wrap_typed = TRUE,
        function_name = "get_pdb_file",
        pdb_id = pdb_id
      )
    }
  )

  # Attempt to read the file
  result <- tryCatch(
    {
      if (filetype == 'pdb') {
        read.pdb(path)
      } else if (filetype %in% c('cif', 'structfact')) {
        read.cif(path)
      } else if (filetype == 'xml') {
        xml_file <- read_xml(path)
        as_list(xml_file)
      } else {
        rpdbapi_abort(
          paste0("Unsupported filetype encountered during file reading: '", filetype, "'."),
          class = "rPDBapi_error_invalid_input",
          function_name = "get_pdb_file",
          pdb_id = pdb_id,
          filetype = filetype
        )
      }
    },
    error = function(e) {
      rpdbapi_rethrow(
        e,
        message_prefix = "Failed to read the downloaded PDB file: ",
        class = "rPDBapi_error_malformed_response",
        wrap_typed = TRUE,
        function_name = "get_pdb_file",
        pdb_id = pdb_id
      )
    }
  )

  if (!save) {
    file.remove(path)
  } else {
    message("The file has been saved to ", path)
  }

  if (is.null(result)) {
    return(NULL)
  }

  # Process only coordinate-bearing formats.
  if (filetype %in% c("pdb", "cif")) {
    if (!is.null(result$error)) {
      rpdbapi_abort(
        paste0("Error encountered in reading the file: ", path),
        class = "rPDBapi_error_malformed_response",
        function_name = "get_pdb_file",
        pdb_id = pdb_id,
        path = path
      )
    } else {
      class(result) <- c("pdb")
    }

    result$xyz <- as.xyz(result$xyz)
    result$atom[result$atom == ""] <- NA
    result$atom[result$atom == "?"] <- NA
    result$atom[result$atom == "."] <- NA
    result$models <- NULL

    if (rm.alt) {
      if (sum(!is.na(result$atom$alt)) > 0) {
        first.alt <- sort(unique(na.omit(result$atom$alt)))[1]
        warning("PDB has ALT records; taking ", first.alt, " only, rm.alt=TRUE")
        alt.inds <- which(result$atom$alt != first.alt)
        if (length(alt.inds) > 0) {
          result$atom <- result$atom[-alt.inds, ]
          result$xyz <- trim.xyz(result$xyz, col.inds = -atom2xyz(alt.inds))
        }
      }
    }

    if (rm.insert) {
      if (sum(!is.na(result$atom$insert)) > 0) {
        warning("PDB has INSERT records; removing them, rm.insert=TRUE")
        insert.inds <- which(!is.na(result$atom$insert))
        result$atom <- result$atom[-insert.inds, ]
        result$xyz <- trim.xyz(result$xyz, col.inds = -atom2xyz(insert.inds))
      }
    }

    if (any(duplicated(result$atom$eleno))) {
      warning("Duplicated element numbers ('eleno') detected")
    }

    ca.inds <- atom.select.pdb(result, string = "calpha", verbose = FALSE)
    result$calpha <- seq(1, nrow(result$atom)) %in% ca.inds$atom
    result$call <- cl
    result$path <- path
  }

  return(result)
}

Try the rPDBapi package in your browser

Any scripts or data that you put into this service are public.

rPDBapi documentation built on March 9, 2026, 5:08 p.m.