R/fetch_pdb.R

Defines functions fetch_pdb

Documented in fetch_pdb

#' Fetch structure information from RCSB
#'
#' Fetches structure metadata from RCSB. If you want to retrieve atom data such as positions, use
#' the function \code{fetch_pdb_structure()}.
#'
#' @param pdb_ids a character vector of PDB identifiers.
#' @param batchsize a numeric value that specifies the number of structures to be processed in a
#' single query. Default is 100.
#' @param show_progress a logical value that indicates if a progress bar will be shown. Default is
#' TRUE.
#'
#' @return A data frame that contains structure metadata for the PDB IDs provided. The data frame
#' contains some columns that might not be self explanatory.
#' \itemize{
#' \item{auth_asym_id: }{Chain identifier provided by the author of the structure in order to
#' match the identification used in the publication that describes the structure.}
#' \item{label_asym_id: }{Chain identifier following the standardised convention for mmCIF files.}
#' \item{entity_beg_seq_id, ref_beg_seq_id, length, pdb_sequence: }{\code{entity_beg_seq_id} is a
#' position in the structure sequence (\code{pdb_sequence}) that matches the position given in
#' \code{ref_beg_seq_id}, which is a position within the protein sequence (not included in the
#' data frame). \code{length} identifies the stretch of sequence for which positions match
#' accordingly between structure and protein sequence. \code{entity_beg_seq_id} is a residue ID
#' based on the standardised convention for mmCIF files.}
#' \item{auth_seq_id: }{Residue identifier provided by the author of the structure in order to
#' match the identification used in the publication that describes the structure. This character
#' vector has the same length as the \code{pdb_sequence} and each position is the identifier for
#' the matching amino acid position in \code{pdb_sequence}. The contained values are not
#' necessarily numbers and the values do not have to be positive.}
#' \item{modified_monomer: }{Is composed of first the composition ID of the modification, followed
#' by the \code{label_seq_id} position. In parenthesis are the parent monomer identifiers as
#' they appear in the sequence.}
#' \item{ligand_*: }{Any column starting with the \code{ligand_*} prefix contains information about
#' the position, identity and donors for ligand binding sites. If there are multiple entities of
#' ligands they are separated by "|". Specific donor level information is separated by ";".}
#' \item{secondar_structure: }{Contains information about helix and sheet secondary structure elements.
#' Individual regions are separated by ";".}
#' \item{unmodeled_structure: }{Contains information about unmodeled or partially modeled regions in
#' the model. Individual regions are separated by ";".}
#' \item{auth_seq_id_original: }{In some cases the sequence positions do not match the number of residues
#' in the sequence either because positions are missing or duplicated. This always coincides with modified
#' residues, however does not always occur when there is a modified residue in the sequence. This column
#' contains the original \code{auth_seq_id} information that does not have these positions corrected.}
#' }
#' @import dplyr
#' @import progress
#' @import purrr
#' @import tidyr
#' @importFrom R.utils insert
#' @importFrom httr modify_url
#' @importFrom stringr str_replace_all
#' @importFrom curl has_internet
#' @importFrom utils URLencode
#' @importFrom magrittr %>%
#' @importFrom stats setNames
#' @export
#'
#' @examples
#' \donttest{
#' pdb <- fetch_pdb(pdb_ids = c("6HG1", "1E9I", "6D3Q", "4JHW"))
#'
#' head(pdb)
#' }
fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) {
  if (!curl::has_internet()) {
    message("No internet connection.")
    return(invisible(NULL))
  }
  . <- NULL
  # query that is used for fetching information
  query <- 'query={
  entries(entry_ids: ["pdb_ids"]) {
    rcsb_id
    struct_keywords {
      pdbx_keywords
    }
    exptl {
      method
    }
    exptl_crystal_grow {
      pH
      temp
      method
    }
    rcsb_binding_affinity {
      comp_id
      value
    }
    rcsb_entry_info {
      experimental_method
      assembly_count
      resolution_combined
      inter_mol_metalic_bond_count
    }
    pdbx_nmr_exptl_sample_conditions {
      ionic_strength
      pH
      temperature
    }
    pdbx_nmr_refine {
      method
    }
    pdbx_nmr_exptl {
      type
    }
    polymer_entities {
      polymer_entity_instances{
        rcsb_ligand_neighbors{
            atom_id
            auth_seq_id
            comp_id
            ligand_asym_id
            ligand_atom_id
            ligand_comp_id
            ligand_entity_id
            ligand_is_bound
            seq_id
        }
        rcsb_polymer_instance_feature{
          name
          type
          feature_positions{
            beg_comp_id
            beg_seq_id
            end_seq_id
          }
        }
      	rcsb_polymer_entity_instance_container_identifiers {
          asym_id
          auth_asym_id
          entry_id
      		auth_to_entity_poly_seq_mapping
    		}
      }
      rcsb_polymer_entity_feature {
      type
      name
        feature_positions {
          beg_comp_id
          beg_seq_id
          end_seq_id
        }
      }
      entity_poly {
        pdbx_seq_one_letter_code
        pdbx_seq_one_letter_code_can
        rcsb_artifact_monomer_count
        rcsb_conflict_count
        rcsb_deletion_count
        rcsb_insertion_count
        rcsb_mutation_count
        rcsb_non_std_monomer_count
        rcsb_non_std_monomers
      }
      rcsb_polymer_entity{
        pdbx_mutation
      }
      rcsb_entity_source_organism{
        ncbi_scientific_name
        ncbi_taxonomy_id
      }
      rcsb_polymer_entity_container_identifiers {
        entry_id
        auth_asym_ids
      }
      rcsb_polymer_entity_align {
      aligned_regions {
        entity_beg_seq_id
        ref_beg_seq_id
        length
      }
      reference_database_accession
      reference_database_isoform
      reference_database_name
    }
      uniprots {
      rcsb_uniprot_container_identifiers {
        uniprot_id
      }
      rcsb_uniprot_protein {
        name {
          value
        }
      }
    }
    }
    nonpolymer_entities {
      rcsb_nonpolymer_entity_container_identifiers{
        auth_asym_ids
        entry_id
      }
      nonpolymer_comp {
        chem_comp {
          id
          type
          formula_weight
          name
          formula
        }
      }
    }
  }
}'

  # remove NA values
  pdb_ids <- pdb_ids[!is.na(pdb_ids)]
  if (length(pdb_ids) == 0) {
    stop("No PDB IDs were provided.")
  }
  # split pdb_ids into batches
  batches <- split(pdb_ids, ceiling(seq_along(pdb_ids) / batchsize))
  if (show_progress == TRUE) {
    pb <- progress::progress_bar$new(
      total = length(batches),
      format = "[1/6] Fetching PDB entries [:bar] (:percent) :eta"
    )
  }

  # query information from database

  query_result <- purrr::map(batches, function(x) {
    pdb_ids <- paste0(x, collapse = '", "')
    full_query <- stringr::str_replace_all(query, pattern = "pdb_ids", replacement = pdb_ids)
    url_encode_query <- utils::URLencode(full_query) %>%
      stringr::str_replace_all(pattern = "\\[", replacement = "%5B") %>%
      stringr::str_replace_all(pattern = "\\]", replacement = "%5D")

    query <- try_query(httr::modify_url("https://data.rcsb.org/graphql",
      query = url_encode_query
    ),
    type = "application/json",
    simplifyDataFrame = TRUE
    )

    if (show_progress == TRUE) {
      pb$tick()
    }
    # only proceed with data if it was correctly retrieved
    if ("list" %in% class(query)) {
      query <- query %>%
        purrr::flatten() %>%
        as.data.frame(stringsAsFactors = FALSE)
    }
    query
  })

  # catch any IDs that have not been fetched correctly
  error_list <- query_result %>%
    purrr::keep(.p = ~ is.character(.x))

  if (length(error_list) != 0) {
    error_table <- tibble::tibble(
      id = paste0("IDs: ", as.numeric(names(error_list)) * batchsize - batchsize + 1, " to ", as.numeric(names(error_list)) * batchsize),
      error = unlist(error_list)
    ) %>%
      dplyr::distinct()

    message("The following IDs have not been retrieved correctly.")
    message(paste0(utils::capture.output(error_table), collapse = "\n"))
  }

  # only keep data in output

  query_result <- query_result %>%
    purrr::keep(.p = ~ !is.character(.x))

  if (length(query_result) == 0) {
    message("No valid information could be retrieved!")
    return(invisible(NULL))
  }

  query_result <- purrr::map_dfr(
    .x = query_result,
    .f = ~.x
  ) %>%
    distinct() # make sure entries are unique otherwise there will be problems with unnesting

  if (nrow(query_result) == 0) {
    stop("None of the provided IDs could be retrieved!")
  }

  queried_ids <- unique(query_result$entries.rcsb_id)
  not_retrieved <- setdiff(stringr::str_to_lower(pdb_ids), stringr::str_to_lower(queried_ids))
  if (length(not_retrieved) > 0) {
    message("The following IDs have not been retrieved:")
    message(paste0(utils::capture.output(not_retrieved), collapse = "\n"))
  }

  # process information from database
  if (show_progress == TRUE) {
    message("[2/6] Extract experimental conditions ... ", appendLF = FALSE)
    start_time <- Sys.time()
  }

  query_result_clean <- query_result %>%
    dplyr::bind_cols(
      pdb_ids = .$entries.rcsb_id,
      structure_keywords = .$entries.struct_keywords,
      .$entries.rcsb_entry_info
    ) %>%
    dplyr::select(-c(
      "entries.rcsb_id",
      "entries.struct_keywords",
      "entries.rcsb_entry_info"
    )) %>%
    tidyr::unnest("entries.exptl") %>%
    dplyr::rename(structure_method = .data$method)

  crystal_growth_info <- query_result_clean %>%
    dplyr::select("pdb_ids", "entries.exptl_crystal_grow") %>%
    tidyr::unnest("entries.exptl_crystal_grow")

  # make sure that the data is complete even if there is no crystal structure
  should_not_be_here <- colnames(crystal_growth_info)[!colnames(crystal_growth_info) %in% c(
    "pdb_ids",
    "method",
    "pH",
    "temp"
  )]
  should_be_here <- c(
    "pdb_ids",
    "method",
    "pH",
    "temp"
  )[!c(
    "pdb_ids",
    "method",
    "pH",
    "temp"
  ) %in% colnames(crystal_growth_info)]

  crystal_growth_info <- crystal_growth_info %>%
    dplyr::select(-should_not_be_here) %>%
    dplyr::bind_cols(stats::setNames(
      data.frame(matrix(
        ncol = length(should_be_here),
        nrow = nrow(crystal_growth_info)
      )),
      should_be_here
    )) %>%
    dplyr::rename(
      pH_crystallisation = .data$pH,
      method_crystallisation = .data$method,
      temp_crystallisation = .data$temp
    )

  resolution_info <- query_result_clean %>%
    dplyr::select(.data$pdb_ids, .data$resolution_combined) %>%
    tidyr::unnest(.data$resolution_combined)

  nmr_info <- query_result_clean %>%
    dplyr::select(
      .data$pdb_ids,
      .data$entries.pdbx_nmr_exptl,
      .data$entries.pdbx_nmr_exptl_sample_conditions,
      .data$entries.pdbx_nmr_refine
    ) %>%
    tidyr::unnest(.data$entries.pdbx_nmr_exptl) %>%
    tidyr::unnest(.data$entries.pdbx_nmr_exptl_sample_conditions) %>%
    tidyr::unnest(.data$entries.pdbx_nmr_refine)

  # make sure that the data is complete even if there is no NMR structure
  should_not_be_here <- colnames(nmr_info)[!colnames(nmr_info) %in% c(
    "pdb_ids",
    "type",
    "pH",
    "temperature",
    "method",
    "ionic_strength"
  )]
  should_be_here <- c(
    "pdb_ids",
    "type",
    "pH",
    "temperature",
    "method",
    "ionic_strength"
  )[!c(
    "pdb_ids",
    "type",
    "pH",
    "temperature",
    "method",
    "ionic_strength"
  ) %in% colnames(nmr_info)]

  nmr_info <- nmr_info %>%
    dplyr::select(-should_not_be_here) %>%
    dplyr::bind_cols(stats::setNames(
      data.frame(matrix(
        ncol = length(should_be_here),
        nrow = nrow(nmr_info)
      )),
      should_be_here
    )) %>%
    dplyr::rename(
      type_nmr = .data$type,
      pH_nmr = .data$pH,
      temp_nmr = .data$temperature,
      method_nmr = .data$method,
      ionic_strength_nmr = .data$ionic_strength
    )

  rcsb_binding_affinity <- query_result_clean %>%
    dplyr::select(.data$pdb_ids, .data$entries.rcsb_binding_affinity) %>%
    tidyr::unnest(.data$entries.rcsb_binding_affinity)

  # make sure that the data is complete even if there is no affinity information
  should_not_be_here <- colnames(rcsb_binding_affinity)[!colnames(rcsb_binding_affinity) %in% c(
    "pdb_ids",
    "comp_id",
    "value"
  )]
  should_be_here <- c(
    "pdb_ids",
    "comp_id",
    "value"
  )[!c(
    "pdb_ids",
    "comp_id",
    "value"
  ) %in% colnames(rcsb_binding_affinity)]

  rcsb_binding_affinity <- rcsb_binding_affinity %>%
    dplyr::select(-should_not_be_here) %>%
    dplyr::bind_cols(stats::setNames(
      data.frame(matrix(
        ncol = length(should_be_here),
        nrow = nrow(rcsb_binding_affinity)
      )),
      should_be_here
    )) %>%
    dplyr::rename(
      affinity_comp_id = .data$comp_id,
      affinity_value = .data$value
    )

  if (show_progress == TRUE) {
    message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
    message("[3/6] Extracting polymer information: ")
    message("-> 1/6 UniProt IDs ... ", appendLF = FALSE)
    start_time <- Sys.time()
  }

  polymer_entities <- query_result_clean %>%
    dplyr::select(.data$pdb_ids, .data$entries.polymer_entities) %>%
    tidyr::unnest(.data$entries.polymer_entities) %>%
    dplyr::bind_cols(
      .$entity_poly,
      .$rcsb_polymer_entity_container_identifiers
    ) %>%
    dplyr::select(-c(
      .data$entity_poly,
      .data$rcsb_polymer_entity_container_identifiers,
      .data$rcsb_entity_source_organism
    )) %>%
    tidyr::unnest(.data$rcsb_polymer_entity) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(rcsb_non_std_monomers = ifelse(!is.null(unlist(.data$rcsb_non_std_monomers)),
      paste0(.data$rcsb_non_std_monomers, collapse = ";"),
      NA
    )) %>%
    dplyr::ungroup()

  # Deal with cases in which UniProt information of some entries is available but not for others
  polymer_entities_no_uniprots <- polymer_entities %>%
    dplyr::rowwise() %>%
    dplyr::mutate(no_uniprots = is.null(unlist(.data$uniprots))) %>%
    dplyr::ungroup() %>%
    dplyr::filter(.data$no_uniprots) %>%
    dplyr::select(-c(.data$uniprots, .data$no_uniprots))

  if (nrow(polymer_entities_no_uniprots) > 0) {
    polymer_entities <- polymer_entities %>%
      tidyr::unnest(c(.data$uniprots)) %>%
      dplyr::bind_rows(polymer_entities_no_uniprots)
  } else {
    polymer_entities <- polymer_entities %>%
      tidyr::unnest(c(.data$uniprots))
  }

  if (show_progress == TRUE) {
    message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
    message("-> 2/6 UniProt alignment ... ", appendLF = FALSE)
    start_time <- Sys.time()
  }

  # Deal with cases in which polymer entity alignment information of some entries is available but not for others
  polymer_entities_no_rcsb_polymer_entity_align <- polymer_entities %>%
    dplyr::rowwise() %>%
    dplyr::mutate(no_rcsb_polymer_entity_align = is.null(unlist(.data$rcsb_polymer_entity_align))) %>%
    dplyr::ungroup() %>%
    dplyr::filter(.data$no_rcsb_polymer_entity_align) %>%
    dplyr::select(-c(.data$rcsb_polymer_entity_align, .data$no_rcsb_polymer_entity_align))

  if (nrow(polymer_entities_no_rcsb_polymer_entity_align) > 0) {
    polymer_entities <- polymer_entities %>%
      tidyr::unnest(c(.data$rcsb_polymer_entity_align)) %>%
      dplyr::bind_rows(polymer_entities_no_rcsb_polymer_entity_align)
  } else {
    polymer_entities <- polymer_entities %>%
      tidyr::unnest(c(.data$rcsb_polymer_entity_align))
  }

  # some proteins do not contain UniProt information therefore data needs to be extracted differently
  if ("rcsb_uniprot_container_identifiers" %in% colnames(polymer_entities)) {
    polymer_entities <- polymer_entities %>%
      dplyr::bind_cols(
        uniprot_container_identifiers = .$rcsb_uniprot_container_identifiers,
        uniprot_protein = .$rcsb_uniprot_protein
      ) %>%
      dplyr::select(-c(.data$rcsb_uniprot_container_identifiers, .data$rcsb_uniprot_protein))
  } else {
    polymer_entities <- polymer_entities %>%
      dplyr::select(-c(.data$uniprots)) %>%
      dplyr::mutate(
        uniprot_id = NA,
        name = data.frame(value = NA)
      )
  }

  # The alignment can also be missing independent of the UniProt information
  if ("aligned_regions" %in% colnames(polymer_entities)) {
    # if there are entries that do not have aligned regions these need to be processed separately to not lose them to an unnest
    polymer_entities_no_aligned_regions <- polymer_entities %>%
      dplyr::rowwise() %>%
      dplyr::mutate(no_aligned_regions = is.null(unlist(.data$aligned_regions))) %>%
      dplyr::ungroup() %>%
      dplyr::filter(.data$no_aligned_regions)

    polymer_entities <- polymer_entities %>%
      tidyr::unnest(c(.data$aligned_regions)) %>%
      dplyr::bind_cols(.$name) %>%
      dplyr::select(-c(.data$name, .data$entry_id)) %>%
      dplyr::rename(name_protein = .data$value) %>%
      tidyr::unnest(c(.data$auth_asym_ids, .data$polymer_entity_instances)) %>%
      dplyr::bind_cols(
        rcsb_polymer_entity_instance_container_identifiers = .$rcsb_polymer_entity_instance_container_identifiers
      ) %>%
      dplyr::select(-c(.data$rcsb_polymer_entity_instance_container_identifiers))

    if (nrow(polymer_entities_no_aligned_regions) > 0) {
      polymer_entities_no_aligned_regions <- polymer_entities_no_aligned_regions %>%
        dplyr::select(-c(.data$aligned_regions, .data$no_aligned_regions)) %>%
        dplyr::bind_cols(.$name) %>%
        dplyr::select(-c(.data$name, .data$entry_id)) %>%
        dplyr::rename(name_protein = .data$value) %>%
        tidyr::unnest(c(.data$auth_asym_ids, .data$polymer_entity_instances)) %>%
        dplyr::bind_cols(
          rcsb_polymer_entity_instance_container_identifiers = .$rcsb_polymer_entity_instance_container_identifiers
        ) %>%
        dplyr::select(-c(.data$rcsb_polymer_entity_instance_container_identifiers)) %>%
        dplyr::mutate(
          entity_beg_seq_id = NA,
          ref_beg_seq_id = NA,
          length = NA
        )

      # Join columns back into main data.frame
      polymer_entities <- polymer_entities %>%
        dplyr::bind_rows(polymer_entities_no_aligned_regions)
    }
  } else {
    polymer_entities <- polymer_entities %>%
      dplyr::select(-c(.data$rcsb_polymer_entity_align)) %>%
      dplyr::bind_cols(.$name) %>%
      dplyr::select(-c(.data$name, .data$entry_id)) %>%
      dplyr::rename(name_protein = .data$value) %>%
      tidyr::unnest(c(.data$auth_asym_ids, .data$polymer_entity_instances)) %>%
      dplyr::bind_cols(
        rcsb_polymer_entity_instance_container_identifiers = .$rcsb_polymer_entity_instance_container_identifiers
      ) %>%
      dplyr::select(-c(.data$rcsb_polymer_entity_instance_container_identifiers)) %>%
      dplyr::mutate(
        entity_beg_seq_id = NA,
        ref_beg_seq_id = NA,
        length = NA,
        reference_database_accession = NA,
        reference_database_isoform = NA,
        reference_database_name = NA
      )
  }

  # Extract ligand information
  if (show_progress == TRUE) {
    message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
    message("-> 3/6 Ligand binding sites ... ", appendLF = FALSE)
    start_time <- Sys.time()
  }

  polymer_entities_no_ligands <- polymer_entities %>%
    dplyr::rowwise() %>%
    dplyr::mutate(no_ligands = is.null(unlist(.data$rcsb_ligand_neighbors))) %>%
    dplyr::ungroup() %>%
    dplyr::filter(.data$no_ligands) %>%
    dplyr::select(-c(.data$rcsb_ligand_neighbors, .data$no_ligands)) %>%
    dplyr::mutate(
      atom_id = as.character(NA),
      auth_seq_id = as.integer(NA),
      comp_id = as.character(NA),
      ligand_asym_id = as.character(NA),
      ligand_atom_id = as.character(NA),
      ligand_comp_id = as.character(NA),
      ligand_entity_id = as.character(NA),
      ligand_is_bound = as.character(NA),
      seq_id = as.integer(NA)
    )

  polymer_entities <- polymer_entities %>%
    tidyr::unnest(c(.data$rcsb_ligand_neighbors)) %>%
    dplyr::bind_rows(polymer_entities_no_ligands) %>%
    dplyr::mutate(ligand_is_bound = ifelse(.data$ligand_is_bound == "Y", "TRUE", "FALSE")) %>%
    dplyr::group_by(.data$pdb_ids, .data$auth_asym_id, .data$ligand_entity_id) %>%
    dplyr::mutate(dplyr::across(
      .cols = c(
        .data$atom_id,
        .data$auth_seq_id,
        .data$comp_id,
        .data$ligand_asym_id,
        .data$ligand_atom_id,
        .data$ligand_comp_id,
        .data$ligand_is_bound,
        .data$seq_id
      ),
      .fns = ~ paste0(.x, collapse = ";")
    )) %>%
    dplyr::distinct() %>%
    dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
    dplyr::mutate(dplyr::across(
      .cols = c(
        .data$atom_id,
        .data$auth_seq_id,
        .data$comp_id,
        .data$ligand_asym_id,
        .data$ligand_atom_id,
        .data$ligand_comp_id,
        .data$ligand_entity_id,
        .data$ligand_is_bound,
        .data$seq_id
      ),
      .fns = ~ paste0(.x, collapse = "|")
    )) %>%
    dplyr::distinct() %>%
    dplyr::mutate(dplyr::across(
      .cols = c(
        .data$atom_id,
        .data$auth_seq_id,
        .data$comp_id,
        .data$ligand_asym_id,
        .data$ligand_atom_id,
        .data$ligand_comp_id,
        .data$ligand_entity_id,
        .data$ligand_is_bound,
        .data$seq_id
      ),
      .fns = ~ ifelse(str_detect(.data$atom_id, pattern = "NA"), NA, .x)
    )) %>%
    dplyr::ungroup() %>%
    dplyr::rename(
      ligand_donor_atom_id = .data$atom_id,
      ligand_donor_auth_seq_id = .data$auth_seq_id,
      ligand_donor_id = .data$comp_id,
      ligand_id = .data$ligand_comp_id,
      ligand_label_asym_id = .data$ligand_asym_id,
      ligand_bond_is_covalent_or_coordinating = .data$ligand_is_bound,
      ligand_donor_label_seq_id = .data$seq_id
    )

  if ("rcsb_ligand_neighbors" %in% colnames(polymer_entities)) {
    # if none of the retrieved entries contains any ligands then this column needs to be removed manually
    polymer_entities <- polymer_entities %>%
      select(-.data$rcsb_ligand_neighbors)
  }
  # extract modified monomer information

  if (show_progress == TRUE) {
    message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
    message("-> 4/6 Modified monomers ... ", appendLF = FALSE)
    start_time <- Sys.time()
  }

  rcsb_polymer_entity_feature <- polymer_entities %>%
    dplyr::select(.data$pdb_ids, .data$auth_asym_id, .data$rcsb_polymer_entity_feature) %>%
    tidyr::unnest(.data$rcsb_polymer_entity_feature) %>%
    tidyr::unnest(.data$feature_positions)

  modified_monomer <- rcsb_polymer_entity_feature %>%
    dplyr::filter(.data$type %in% c("modified_monomer")) %>%
    dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
    dplyr::mutate(residue = paste0(
      .data$beg_comp_id,
      ":",
      .data$beg_seq_id,
      "(",
      stringr::str_replace(.data$name, pattern = "Parent monomer ", replacement = ""),
      ")"
    )) %>%
    dplyr::mutate(modified_monomer = paste0(.data$residue, collapse = ";")) %>%
    dplyr::ungroup() %>%
    dplyr::distinct(.data$modified_monomer, .data$pdb_ids, .data$auth_asym_id)

  # extract secondary structure information
  if (show_progress == TRUE) {
    message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
    message("-> 5/6 Secondary structure ... ", appendLF = FALSE)
    start_time <- Sys.time()
  }

  rcsb_polymer_instance_feature_data <- polymer_entities %>%
    dplyr::select(.data$pdb_ids, .data$auth_asym_id, .data$rcsb_polymer_instance_feature) %>%
    tidyr::unnest(.data$rcsb_polymer_instance_feature) %>%
    tidyr::unnest(.data$feature_positions)

  secondary_structures <- rcsb_polymer_instance_feature_data %>%
    dplyr::filter(.data$name %in% c("helix", "sheet")) %>%
    dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
    dplyr::mutate(from_to = paste0(.data$name, ":", .data$beg_seq_id, "-", .data$end_seq_id)) %>%
    dplyr::mutate(secondary_structure = paste0(.data$from_to, collapse = ";")) %>%
    dplyr::ungroup() %>%
    dplyr::distinct(.data$secondary_structure, .data$pdb_ids, .data$auth_asym_id)

  # extract info about unmodeled residues
  if (show_progress == TRUE) {
    message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
    message("-> 6/6 Unmodeled residues ... ", appendLF = FALSE)
    start_time <- Sys.time()
  }

  unmodeled_residues <- rcsb_polymer_instance_feature_data %>%
    dplyr::filter(.data$name %in% c("partially modeled residue", "unmodeled residue")) %>%
    dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
    dplyr::mutate(from_to = paste0(.data$name, ":", .data$beg_seq_id, "-", .data$end_seq_id)) %>%
    dplyr::mutate(unmodeled_structure = paste0(.data$from_to, collapse = ";")) %>%
    dplyr::ungroup() %>%
    dplyr::distinct(.data$unmodeled_structure, .data$pdb_ids, .data$auth_asym_id)

  # join secondary structure, unmodeled info and modified monomer

  polymer_entities <- polymer_entities %>%
    left_join(modified_monomer, by = c("pdb_ids", "auth_asym_id")) %>%
    left_join(secondary_structures, by = c("pdb_ids", "auth_asym_id")) %>%
    left_join(unmodeled_residues, by = c("pdb_ids", "auth_asym_id")) %>%
    select(-c(.data$rcsb_polymer_instance_feature, .data$rcsb_polymer_entity_feature))

  # Modify auth_seq_id positions that are either duplicated or missing.
  # Missing or duplicated entries are identified by comparing the length of auth_seq_id to the length of the sequence.
  if (show_progress == TRUE) {
    message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
    message("[4/6] Correct author sequence positions for some PDB IDs ... ", appendLF = FALSE)
    start_time <- Sys.time()
  }

  fix_auth_seq_positions <- polymer_entities %>%
    dplyr::rowwise() %>%
    dplyr::filter(length(unlist(.data$auth_to_entity_poly_seq_mapping)) != nchar(.data$pdbx_seq_one_letter_code_can))

  # Remove duplicated positions
  polymer_entities_seq_id_duplications <- fix_auth_seq_positions %>%
    dplyr::filter(length(unlist(.data$auth_to_entity_poly_seq_mapping)) > nchar(.data$pdbx_seq_one_letter_code_can)) %>%
    dplyr::mutate(auth_seq_id = list(unique(unlist(.data$auth_to_entity_poly_seq_mapping)))) %>%
    dplyr::ungroup()

  # Add missing positions
  polymer_entities_seq_id_missing <- fix_auth_seq_positions %>%
    dplyr::filter(length(unlist(.data$auth_to_entity_poly_seq_mapping)) < nchar(.data$pdbx_seq_one_letter_code_can))

  if (nrow(polymer_entities_seq_id_missing) > 0) { # do not run the code below if the data frame is empty, this causes error
    polymer_entities_seq_id_missing <- polymer_entities_seq_id_missing %>%
      dplyr::mutate(auth_seq_id_pdb_numeric = list(stringr::str_replace(.data$auth_to_entity_poly_seq_mapping, pattern = "[:alpha:]", replacement = ""))) %>%
      dplyr::mutate(non_consecutive = list(as.numeric(.data$auth_seq_id_pdb_numeric)[c(diff(as.numeric(.data$auth_seq_id_pdb_numeric)) > 1)])) %>%
      dplyr::mutate(n_missing = list(diff(as.numeric(.data$auth_seq_id_pdb_numeric))[which(diff(as.numeric(.data$auth_seq_id_pdb_numeric)) > 1)] - 1)) %>%
      dplyr::mutate(replacement_values_addition = list(purrr::map(
        .x = .data$n_missing,
        .f = ~ 1:.x
      ))) %>%
      dplyr::mutate(replacement_positions = list(rep(which(.data$auth_seq_id_pdb_numeric %in% as.character(.data$non_consecutive)) + 1, .data$n_missing))) %>%
      dplyr::mutate(auth_seq_id = list(R.utils::insert(.data$auth_to_entity_poly_seq_mapping,
        .data$replacement_positions,
        values = as.character(rep(.data$non_consecutive, .data$n_missing) + unlist(.data$replacement_values_addition))
      ))) %>%
      dplyr::ungroup() %>%
      dplyr::select(-c(.data$auth_seq_id_pdb_numeric, .data$non_consecutive, .data$n_missing, .data$replacement_values_addition, .data$replacement_positions))
  }
  # Join corrected entries back
  polymer_entities <- polymer_entities %>%
    dplyr::rowwise() %>%
    dplyr::filter(length(unlist(.data$auth_to_entity_poly_seq_mapping)) == nchar(.data$pdbx_seq_one_letter_code_can) |
      is.null(.data$auth_to_entity_poly_seq_mapping) |
      is.na(.data$pdbx_seq_one_letter_code_can)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(auth_seq_id = .data$auth_to_entity_poly_seq_mapping) %>%
    dplyr::bind_rows(polymer_entities_seq_id_duplications) %>%
    dplyr::bind_rows(polymer_entities_seq_id_missing)

  if (show_progress == TRUE) {
    if (nrow(fix_auth_seq_positions) > 0) {
      message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
      message("Corrected entries: ", paste0(unique(fix_auth_seq_positions$pdb_ids), collapse = ", "))
    } else {
      message("None to correct", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
    }
  }

  entity_instance_info <- polymer_entities %>%
    dplyr::distinct(
      .data$asym_id,
      .data$auth_asym_id,
      .data$entry_id,
      .data$auth_to_entity_poly_seq_mapping
    ) %>%
    dplyr::rename(
      auth_asym_ids = .data$auth_asym_id,
      pdb_ids = .data$entry_id
    )

  uniprot_info <- polymer_entities %>%
    dplyr::distinct(.data$uniprot_id, .data$name_protein) %>%
    dplyr::rename(
      reference_database_accession = .data$uniprot_id,
      protein_name = .data$name_protein
    )

  polymer_entities <- polymer_entities %>%
    select(-c(
      .data$uniprot_id,
      .data$name_protein,
      .data$asym_id,
      .data$auth_asym_id,
      .data$entry_id,
      .data$auth_to_entity_poly_seq_mapping
    )) %>%
    distinct()

  if (show_progress == TRUE) {
    message("[5/6] Extract non-polymer information ... ", appendLF = FALSE)
    start_time <- Sys.time()
  }

  if (!all(is.na(query_result_clean$entries.nonpolymer_entities))) {
    nonpolymer_entities <- query_result_clean %>%
      dplyr::select(.data$pdb_ids, .data$entries.nonpolymer_entities) %>%
      tidyr::unnest(.data$entries.nonpolymer_entities) %>%
      dplyr::bind_cols(
        .$rcsb_nonpolymer_entity_container_identifiers,
        .$nonpolymer_comp
      ) %>%
      dplyr::bind_cols(.$chem_comp) %>%
      dplyr::select(-c(
        .data$nonpolymer_comp,
        .data$rcsb_nonpolymer_entity_container_identifiers,
        .data$chem_comp,
        .data$entry_id
      )) %>%
      tidyr::unnest(.data$auth_asym_ids) %>%
      dplyr::rename(
        name_nonpolymer = .data$name,
        formula_nonpolymer = .data$formula,
        formula_weight_nonpolymer = .data$formula_weight,
        type_nonpolymer = .data$type,
        id_nonpolymer = .data$id
      )
  } else {
    nonpolymer_entities <- polymer_entities %>%
      dplyr::select(.data$pdb_ids, .data$auth_asym_ids) %>%
      dplyr::mutate(
        name_nonpolymer = NA,
        formula_nonpolymer = NA,
        formula_weight_nonpolymer = NA,
        type_nonpolymer = NA,
        id_nonpolymer = NA
      )
  }

  additional_info <- query_result_clean %>%
    dplyr::select(-c(
      .data$entries.nonpolymer_entities,
      .data$entries.polymer_entities,
      .data$entries.rcsb_binding_affinity,
      .data$entries.pdbx_nmr_exptl,
      .data$entries.pdbx_nmr_exptl_sample_conditions,
      .data$entries.pdbx_nmr_refine,
      .data$entries.exptl_crystal_grow,
      .data$resolution_combined
    ))

  if (show_progress == TRUE) {
    message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
    message("[6/6] Combine information ... ", appendLF = FALSE)
    start_time <- Sys.time()
  }

  combined <- polymer_entities %>%
    dplyr::full_join(nonpolymer_entities, by = c("pdb_ids", "auth_asym_ids")) %>%
    dplyr::left_join(rcsb_binding_affinity, by = "pdb_ids") %>%
    dplyr::left_join(additional_info, by = "pdb_ids") %>%
    dplyr::left_join(crystal_growth_info, by = "pdb_ids") %>%
    dplyr::left_join(nmr_info, by = "pdb_ids") %>%
    dplyr::left_join(resolution_info, by = "pdb_ids") %>%
    dplyr::left_join(uniprot_info, by = "reference_database_accession") %>%
    dplyr::left_join(entity_instance_info, by = c("pdb_ids", "auth_asym_ids")) %>%
    dplyr::rename(
      auth_asym_id = .data$auth_asym_ids,
      label_asym_id = .data$asym_id,
      pdb_sequence = .data$pdbx_seq_one_letter_code_can,
      auth_seq_id_original = .data$auth_to_entity_poly_seq_mapping,
      engineered_mutation = .data$pdbx_mutation,
      non_std_monomer = .data$rcsb_non_std_monomers
    ) %>%
    dplyr::rowwise() %>%
    # make character string out of list column
    dplyr::mutate(
      auth_seq_id = paste0(.data$auth_seq_id, collapse = ";"),
      auth_seq_id_original = paste0(.data$auth_seq_id_original, collapse = ";")
    ) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(
      auth_seq_id = ifelse(.data$auth_seq_id == "",
        NA,
        .data$auth_seq_id
      ),
      auth_seq_id_original = ifelse(.data$auth_seq_id_original == "",
        NA,
        .data$auth_seq_id_original
      )
    ) %>%
    dplyr::select(
      .data$pdb_ids,
      .data$auth_asym_id,
      .data$label_asym_id,
      .data$reference_database_accession,
      .data$protein_name,
      .data$reference_database_name,
      .data$entity_beg_seq_id,
      .data$ref_beg_seq_id,
      .data$length,
      .data$pdb_sequence,
      .data$auth_seq_id,
      .data$auth_seq_id_original,
      .data$engineered_mutation,
      .data$modified_monomer,
      .data$ligand_donor_atom_id,
      .data$ligand_donor_auth_seq_id,
      .data$ligand_donor_label_seq_id,
      .data$ligand_donor_id,
      .data$ligand_label_asym_id,
      .data$ligand_atom_id,
      .data$ligand_id,
      .data$ligand_entity_id,
      .data$ligand_bond_is_covalent_or_coordinating,
      .data$secondary_structure,
      .data$unmodeled_structure,
      .data$id_nonpolymer,
      .data$type_nonpolymer,
      .data$formula_weight_nonpolymer,
      .data$name_nonpolymer,
      .data$formula_nonpolymer,
      .data$experimental_method,
      .data$structure_method,
      .data$affinity_comp_id,
      .data$affinity_value,
      .data$pdbx_keywords,
      .data$assembly_count,
      .data$inter_mol_metalic_bond_count,
      .data$pH_crystallisation,
      .data$temp_crystallisation,
      .data$method_crystallisation,
      .data$type_nmr,
      .data$ionic_strength_nmr,
      .data$pH_nmr,
      .data$temp_nmr,
      .data$method_nmr,
      .data$resolution_combined
    )

  if (show_progress == TRUE) {
    message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
  }

  combined
}

Try the protti package in your browser

Any scripts or data that you put into this service are public.

protti documentation built on Jan. 22, 2023, 1:11 a.m.