Nothing
#' Fetch structure information from RCSB
#'
#' Fetches structure metadata from RCSB. If you want to retrieve atom data such as positions, use
#' the function \code{fetch_pdb_structure()}.
#'
#' @param pdb_ids a character vector of PDB identifiers.
#' @param batchsize a numeric value that specifies the number of structures to be processed in a
#' single query. Default is 100.
#' @param show_progress a logical value that indicates if a progress bar will be shown. Default is
#' TRUE.
#'
#' @return A data frame that contains structure metadata for the PDB IDs provided. The data frame
#' contains some columns that might not be self explanatory.
#' \itemize{
#' \item{auth_asym_id: }{Chain identifier provided by the author of the structure in order to
#' match the identification used in the publication that describes the structure.}
#' \item{label_asym_id: }{Chain identifier following the standardised convention for mmCIF files.}
#' \item{entity_beg_seq_id, ref_beg_seq_id, length, pdb_sequence: }{\code{entity_beg_seq_id} is a
#' position in the structure sequence (\code{pdb_sequence}) that matches the position given in
#' \code{ref_beg_seq_id}, which is a position within the protein sequence (not included in the
#' data frame). \code{length} identifies the stretch of sequence for which positions match
#' accordingly between structure and protein sequence. \code{entity_beg_seq_id} is a residue ID
#' based on the standardised convention for mmCIF files.}
#' \item{auth_seq_id: }{Residue identifier provided by the author of the structure in order to
#' match the identification used in the publication that describes the structure. This character
#' vector has the same length as the \code{pdb_sequence} and each position is the identifier for
#' the matching amino acid position in \code{pdb_sequence}. The contained values are not
#' necessarily numbers and the values do not have to be positive.}
#' \item{modified_monomer: }{Is composed of first the composition ID of the modification, followed
#' by the \code{label_seq_id} position. In parenthesis are the parent monomer identifiers as
#' they appear in the sequence.}
#' \item{ligand_*: }{Any column starting with the \code{ligand_*} prefix contains information about
#' the position, identity and donors for ligand binding sites. If there are multiple entities of
#' ligands they are separated by "|". Specific donor level information is separated by ";".}
#' \item{secondar_structure: }{Contains information about helix and sheet secondary structure elements.
#' Individual regions are separated by ";".}
#' \item{unmodeled_structure: }{Contains information about unmodeled or partially modeled regions in
#' the model. Individual regions are separated by ";".}
#' \item{auth_seq_id_original: }{In some cases the sequence positions do not match the number of residues
#' in the sequence either because positions are missing or duplicated. This always coincides with modified
#' residues, however does not always occur when there is a modified residue in the sequence. This column
#' contains the original \code{auth_seq_id} information that does not have these positions corrected.}
#' }
#' @import dplyr
#' @import progress
#' @import purrr
#' @import tidyr
#' @importFrom R.utils insert
#' @importFrom httr modify_url
#' @importFrom stringr str_replace_all
#' @importFrom curl has_internet
#' @importFrom utils URLencode
#' @importFrom magrittr %>%
#' @importFrom stats setNames
#' @export
#'
#' @examples
#' \donttest{
#' pdb <- fetch_pdb(pdb_ids = c("6HG1", "1E9I", "6D3Q", "4JHW"))
#'
#' head(pdb)
#' }
fetch_pdb <- function(pdb_ids, batchsize = 100, show_progress = TRUE) {
if (!curl::has_internet()) {
message("No internet connection.")
return(invisible(NULL))
}
. <- NULL
# query that is used for fetching information
query <- 'query={
entries(entry_ids: ["pdb_ids"]) {
rcsb_id
struct_keywords {
pdbx_keywords
}
exptl {
method
}
exptl_crystal_grow {
pH
temp
method
}
rcsb_binding_affinity {
comp_id
value
}
rcsb_entry_info {
experimental_method
assembly_count
resolution_combined
inter_mol_metalic_bond_count
}
pdbx_nmr_exptl_sample_conditions {
ionic_strength
pH
temperature
}
pdbx_nmr_refine {
method
}
pdbx_nmr_exptl {
type
}
polymer_entities {
polymer_entity_instances{
rcsb_ligand_neighbors{
atom_id
auth_seq_id
comp_id
ligand_asym_id
ligand_atom_id
ligand_comp_id
ligand_entity_id
ligand_is_bound
seq_id
}
rcsb_polymer_instance_feature{
name
type
feature_positions{
beg_comp_id
beg_seq_id
end_seq_id
}
}
rcsb_polymer_entity_instance_container_identifiers {
asym_id
auth_asym_id
entry_id
auth_to_entity_poly_seq_mapping
}
}
rcsb_polymer_entity_feature {
type
name
feature_positions {
beg_comp_id
beg_seq_id
end_seq_id
}
}
entity_poly {
pdbx_seq_one_letter_code
pdbx_seq_one_letter_code_can
rcsb_artifact_monomer_count
rcsb_conflict_count
rcsb_deletion_count
rcsb_insertion_count
rcsb_mutation_count
rcsb_non_std_monomer_count
rcsb_non_std_monomers
}
rcsb_polymer_entity{
pdbx_mutation
}
rcsb_entity_source_organism{
ncbi_scientific_name
ncbi_taxonomy_id
}
rcsb_polymer_entity_container_identifiers {
entry_id
auth_asym_ids
}
rcsb_polymer_entity_align {
aligned_regions {
entity_beg_seq_id
ref_beg_seq_id
length
}
reference_database_accession
reference_database_isoform
reference_database_name
}
uniprots {
rcsb_uniprot_container_identifiers {
uniprot_id
}
rcsb_uniprot_protein {
name {
value
}
}
}
}
nonpolymer_entities {
rcsb_nonpolymer_entity_container_identifiers{
auth_asym_ids
entry_id
}
nonpolymer_comp {
chem_comp {
id
type
formula_weight
name
formula
}
}
}
}
}'
# remove NA values
pdb_ids <- pdb_ids[!is.na(pdb_ids)]
if (length(pdb_ids) == 0) {
stop("No PDB IDs were provided.")
}
# split pdb_ids into batches
batches <- split(pdb_ids, ceiling(seq_along(pdb_ids) / batchsize))
if (show_progress == TRUE) {
pb <- progress::progress_bar$new(
total = length(batches),
format = "[1/6] Fetching PDB entries [:bar] (:percent) :eta"
)
}
# query information from database
query_result <- purrr::map(batches, function(x) {
pdb_ids <- paste0(x, collapse = '", "')
full_query <- stringr::str_replace_all(query, pattern = "pdb_ids", replacement = pdb_ids)
url_encode_query <- utils::URLencode(full_query) %>%
stringr::str_replace_all(pattern = "\\[", replacement = "%5B") %>%
stringr::str_replace_all(pattern = "\\]", replacement = "%5D")
query <- try_query(httr::modify_url("https://data.rcsb.org/graphql",
query = url_encode_query
),
type = "application/json",
simplifyDataFrame = TRUE
)
if (show_progress == TRUE) {
pb$tick()
}
# only proceed with data if it was correctly retrieved
if ("list" %in% class(query)) {
query <- query %>%
purrr::flatten() %>%
as.data.frame(stringsAsFactors = FALSE)
}
query
})
# catch any IDs that have not been fetched correctly
error_list <- query_result %>%
purrr::keep(.p = ~ is.character(.x))
if (length(error_list) != 0) {
error_table <- tibble::tibble(
id = paste0("IDs: ", as.numeric(names(error_list)) * batchsize - batchsize + 1, " to ", as.numeric(names(error_list)) * batchsize),
error = unlist(error_list)
) %>%
dplyr::distinct()
message("The following IDs have not been retrieved correctly.")
message(paste0(utils::capture.output(error_table), collapse = "\n"))
}
# only keep data in output
query_result <- query_result %>%
purrr::keep(.p = ~ !is.character(.x))
if (length(query_result) == 0) {
message("No valid information could be retrieved!")
return(invisible(NULL))
}
query_result <- purrr::map_dfr(
.x = query_result,
.f = ~.x
) %>%
distinct() # make sure entries are unique otherwise there will be problems with unnesting
if (nrow(query_result) == 0) {
stop("None of the provided IDs could be retrieved!")
}
queried_ids <- unique(query_result$entries.rcsb_id)
not_retrieved <- setdiff(stringr::str_to_lower(pdb_ids), stringr::str_to_lower(queried_ids))
if (length(not_retrieved) > 0) {
message("The following IDs have not been retrieved:")
message(paste0(utils::capture.output(not_retrieved), collapse = "\n"))
}
# process information from database
if (show_progress == TRUE) {
message("[2/6] Extract experimental conditions ... ", appendLF = FALSE)
start_time <- Sys.time()
}
query_result_clean <- query_result %>%
dplyr::bind_cols(
pdb_ids = .$entries.rcsb_id,
structure_keywords = .$entries.struct_keywords,
.$entries.rcsb_entry_info
) %>%
dplyr::select(-c(
"entries.rcsb_id",
"entries.struct_keywords",
"entries.rcsb_entry_info"
)) %>%
tidyr::unnest("entries.exptl") %>%
dplyr::rename(structure_method = .data$method)
crystal_growth_info <- query_result_clean %>%
dplyr::select("pdb_ids", "entries.exptl_crystal_grow") %>%
tidyr::unnest("entries.exptl_crystal_grow")
# make sure that the data is complete even if there is no crystal structure
should_not_be_here <- colnames(crystal_growth_info)[!colnames(crystal_growth_info) %in% c(
"pdb_ids",
"method",
"pH",
"temp"
)]
should_be_here <- c(
"pdb_ids",
"method",
"pH",
"temp"
)[!c(
"pdb_ids",
"method",
"pH",
"temp"
) %in% colnames(crystal_growth_info)]
crystal_growth_info <- crystal_growth_info %>%
dplyr::select(-should_not_be_here) %>%
dplyr::bind_cols(stats::setNames(
data.frame(matrix(
ncol = length(should_be_here),
nrow = nrow(crystal_growth_info)
)),
should_be_here
)) %>%
dplyr::rename(
pH_crystallisation = .data$pH,
method_crystallisation = .data$method,
temp_crystallisation = .data$temp
)
resolution_info <- query_result_clean %>%
dplyr::select(.data$pdb_ids, .data$resolution_combined) %>%
tidyr::unnest(.data$resolution_combined)
nmr_info <- query_result_clean %>%
dplyr::select(
.data$pdb_ids,
.data$entries.pdbx_nmr_exptl,
.data$entries.pdbx_nmr_exptl_sample_conditions,
.data$entries.pdbx_nmr_refine
) %>%
tidyr::unnest(.data$entries.pdbx_nmr_exptl) %>%
tidyr::unnest(.data$entries.pdbx_nmr_exptl_sample_conditions) %>%
tidyr::unnest(.data$entries.pdbx_nmr_refine)
# make sure that the data is complete even if there is no NMR structure
should_not_be_here <- colnames(nmr_info)[!colnames(nmr_info) %in% c(
"pdb_ids",
"type",
"pH",
"temperature",
"method",
"ionic_strength"
)]
should_be_here <- c(
"pdb_ids",
"type",
"pH",
"temperature",
"method",
"ionic_strength"
)[!c(
"pdb_ids",
"type",
"pH",
"temperature",
"method",
"ionic_strength"
) %in% colnames(nmr_info)]
nmr_info <- nmr_info %>%
dplyr::select(-should_not_be_here) %>%
dplyr::bind_cols(stats::setNames(
data.frame(matrix(
ncol = length(should_be_here),
nrow = nrow(nmr_info)
)),
should_be_here
)) %>%
dplyr::rename(
type_nmr = .data$type,
pH_nmr = .data$pH,
temp_nmr = .data$temperature,
method_nmr = .data$method,
ionic_strength_nmr = .data$ionic_strength
)
rcsb_binding_affinity <- query_result_clean %>%
dplyr::select(.data$pdb_ids, .data$entries.rcsb_binding_affinity) %>%
tidyr::unnest(.data$entries.rcsb_binding_affinity)
# make sure that the data is complete even if there is no affinity information
should_not_be_here <- colnames(rcsb_binding_affinity)[!colnames(rcsb_binding_affinity) %in% c(
"pdb_ids",
"comp_id",
"value"
)]
should_be_here <- c(
"pdb_ids",
"comp_id",
"value"
)[!c(
"pdb_ids",
"comp_id",
"value"
) %in% colnames(rcsb_binding_affinity)]
rcsb_binding_affinity <- rcsb_binding_affinity %>%
dplyr::select(-should_not_be_here) %>%
dplyr::bind_cols(stats::setNames(
data.frame(matrix(
ncol = length(should_be_here),
nrow = nrow(rcsb_binding_affinity)
)),
should_be_here
)) %>%
dplyr::rename(
affinity_comp_id = .data$comp_id,
affinity_value = .data$value
)
if (show_progress == TRUE) {
message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
message("[3/6] Extracting polymer information: ")
message("-> 1/6 UniProt IDs ... ", appendLF = FALSE)
start_time <- Sys.time()
}
polymer_entities <- query_result_clean %>%
dplyr::select(.data$pdb_ids, .data$entries.polymer_entities) %>%
tidyr::unnest(.data$entries.polymer_entities) %>%
dplyr::bind_cols(
.$entity_poly,
.$rcsb_polymer_entity_container_identifiers
) %>%
dplyr::select(-c(
.data$entity_poly,
.data$rcsb_polymer_entity_container_identifiers,
.data$rcsb_entity_source_organism
)) %>%
tidyr::unnest(.data$rcsb_polymer_entity) %>%
dplyr::rowwise() %>%
dplyr::mutate(rcsb_non_std_monomers = ifelse(!is.null(unlist(.data$rcsb_non_std_monomers)),
paste0(.data$rcsb_non_std_monomers, collapse = ";"),
NA
)) %>%
dplyr::ungroup()
# Deal with cases in which UniProt information of some entries is available but not for others
polymer_entities_no_uniprots <- polymer_entities %>%
dplyr::rowwise() %>%
dplyr::mutate(no_uniprots = is.null(unlist(.data$uniprots))) %>%
dplyr::ungroup() %>%
dplyr::filter(.data$no_uniprots) %>%
dplyr::select(-c(.data$uniprots, .data$no_uniprots))
if (nrow(polymer_entities_no_uniprots) > 0) {
polymer_entities <- polymer_entities %>%
tidyr::unnest(c(.data$uniprots)) %>%
dplyr::bind_rows(polymer_entities_no_uniprots)
} else {
polymer_entities <- polymer_entities %>%
tidyr::unnest(c(.data$uniprots))
}
if (show_progress == TRUE) {
message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
message("-> 2/6 UniProt alignment ... ", appendLF = FALSE)
start_time <- Sys.time()
}
# Deal with cases in which polymer entity alignment information of some entries is available but not for others
polymer_entities_no_rcsb_polymer_entity_align <- polymer_entities %>%
dplyr::rowwise() %>%
dplyr::mutate(no_rcsb_polymer_entity_align = is.null(unlist(.data$rcsb_polymer_entity_align))) %>%
dplyr::ungroup() %>%
dplyr::filter(.data$no_rcsb_polymer_entity_align) %>%
dplyr::select(-c(.data$rcsb_polymer_entity_align, .data$no_rcsb_polymer_entity_align))
if (nrow(polymer_entities_no_rcsb_polymer_entity_align) > 0) {
polymer_entities <- polymer_entities %>%
tidyr::unnest(c(.data$rcsb_polymer_entity_align)) %>%
dplyr::bind_rows(polymer_entities_no_rcsb_polymer_entity_align)
} else {
polymer_entities <- polymer_entities %>%
tidyr::unnest(c(.data$rcsb_polymer_entity_align))
}
# some proteins do not contain UniProt information therefore data needs to be extracted differently
if ("rcsb_uniprot_container_identifiers" %in% colnames(polymer_entities)) {
polymer_entities <- polymer_entities %>%
dplyr::bind_cols(
uniprot_container_identifiers = .$rcsb_uniprot_container_identifiers,
uniprot_protein = .$rcsb_uniprot_protein
) %>%
dplyr::select(-c(.data$rcsb_uniprot_container_identifiers, .data$rcsb_uniprot_protein))
} else {
polymer_entities <- polymer_entities %>%
dplyr::select(-c(.data$uniprots)) %>%
dplyr::mutate(
uniprot_id = NA,
name = data.frame(value = NA)
)
}
# The alignment can also be missing independent of the UniProt information
if ("aligned_regions" %in% colnames(polymer_entities)) {
# if there are entries that do not have aligned regions these need to be processed separately to not lose them to an unnest
polymer_entities_no_aligned_regions <- polymer_entities %>%
dplyr::rowwise() %>%
dplyr::mutate(no_aligned_regions = is.null(unlist(.data$aligned_regions))) %>%
dplyr::ungroup() %>%
dplyr::filter(.data$no_aligned_regions)
polymer_entities <- polymer_entities %>%
tidyr::unnest(c(.data$aligned_regions)) %>%
dplyr::bind_cols(.$name) %>%
dplyr::select(-c(.data$name, .data$entry_id)) %>%
dplyr::rename(name_protein = .data$value) %>%
tidyr::unnest(c(.data$auth_asym_ids, .data$polymer_entity_instances)) %>%
dplyr::bind_cols(
rcsb_polymer_entity_instance_container_identifiers = .$rcsb_polymer_entity_instance_container_identifiers
) %>%
dplyr::select(-c(.data$rcsb_polymer_entity_instance_container_identifiers))
if (nrow(polymer_entities_no_aligned_regions) > 0) {
polymer_entities_no_aligned_regions <- polymer_entities_no_aligned_regions %>%
dplyr::select(-c(.data$aligned_regions, .data$no_aligned_regions)) %>%
dplyr::bind_cols(.$name) %>%
dplyr::select(-c(.data$name, .data$entry_id)) %>%
dplyr::rename(name_protein = .data$value) %>%
tidyr::unnest(c(.data$auth_asym_ids, .data$polymer_entity_instances)) %>%
dplyr::bind_cols(
rcsb_polymer_entity_instance_container_identifiers = .$rcsb_polymer_entity_instance_container_identifiers
) %>%
dplyr::select(-c(.data$rcsb_polymer_entity_instance_container_identifiers)) %>%
dplyr::mutate(
entity_beg_seq_id = NA,
ref_beg_seq_id = NA,
length = NA
)
# Join columns back into main data.frame
polymer_entities <- polymer_entities %>%
dplyr::bind_rows(polymer_entities_no_aligned_regions)
}
} else {
polymer_entities <- polymer_entities %>%
dplyr::select(-c(.data$rcsb_polymer_entity_align)) %>%
dplyr::bind_cols(.$name) %>%
dplyr::select(-c(.data$name, .data$entry_id)) %>%
dplyr::rename(name_protein = .data$value) %>%
tidyr::unnest(c(.data$auth_asym_ids, .data$polymer_entity_instances)) %>%
dplyr::bind_cols(
rcsb_polymer_entity_instance_container_identifiers = .$rcsb_polymer_entity_instance_container_identifiers
) %>%
dplyr::select(-c(.data$rcsb_polymer_entity_instance_container_identifiers)) %>%
dplyr::mutate(
entity_beg_seq_id = NA,
ref_beg_seq_id = NA,
length = NA,
reference_database_accession = NA,
reference_database_isoform = NA,
reference_database_name = NA
)
}
# Extract ligand information
if (show_progress == TRUE) {
message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
message("-> 3/6 Ligand binding sites ... ", appendLF = FALSE)
start_time <- Sys.time()
}
polymer_entities_no_ligands <- polymer_entities %>%
dplyr::rowwise() %>%
dplyr::mutate(no_ligands = is.null(unlist(.data$rcsb_ligand_neighbors))) %>%
dplyr::ungroup() %>%
dplyr::filter(.data$no_ligands) %>%
dplyr::select(-c(.data$rcsb_ligand_neighbors, .data$no_ligands)) %>%
dplyr::mutate(
atom_id = as.character(NA),
auth_seq_id = as.integer(NA),
comp_id = as.character(NA),
ligand_asym_id = as.character(NA),
ligand_atom_id = as.character(NA),
ligand_comp_id = as.character(NA),
ligand_entity_id = as.character(NA),
ligand_is_bound = as.character(NA),
seq_id = as.integer(NA)
)
polymer_entities <- polymer_entities %>%
tidyr::unnest(c(.data$rcsb_ligand_neighbors)) %>%
dplyr::bind_rows(polymer_entities_no_ligands) %>%
dplyr::mutate(ligand_is_bound = ifelse(.data$ligand_is_bound == "Y", "TRUE", "FALSE")) %>%
dplyr::group_by(.data$pdb_ids, .data$auth_asym_id, .data$ligand_entity_id) %>%
dplyr::mutate(dplyr::across(
.cols = c(
.data$atom_id,
.data$auth_seq_id,
.data$comp_id,
.data$ligand_asym_id,
.data$ligand_atom_id,
.data$ligand_comp_id,
.data$ligand_is_bound,
.data$seq_id
),
.fns = ~ paste0(.x, collapse = ";")
)) %>%
dplyr::distinct() %>%
dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
dplyr::mutate(dplyr::across(
.cols = c(
.data$atom_id,
.data$auth_seq_id,
.data$comp_id,
.data$ligand_asym_id,
.data$ligand_atom_id,
.data$ligand_comp_id,
.data$ligand_entity_id,
.data$ligand_is_bound,
.data$seq_id
),
.fns = ~ paste0(.x, collapse = "|")
)) %>%
dplyr::distinct() %>%
dplyr::mutate(dplyr::across(
.cols = c(
.data$atom_id,
.data$auth_seq_id,
.data$comp_id,
.data$ligand_asym_id,
.data$ligand_atom_id,
.data$ligand_comp_id,
.data$ligand_entity_id,
.data$ligand_is_bound,
.data$seq_id
),
.fns = ~ ifelse(str_detect(.data$atom_id, pattern = "NA"), NA, .x)
)) %>%
dplyr::ungroup() %>%
dplyr::rename(
ligand_donor_atom_id = .data$atom_id,
ligand_donor_auth_seq_id = .data$auth_seq_id,
ligand_donor_id = .data$comp_id,
ligand_id = .data$ligand_comp_id,
ligand_label_asym_id = .data$ligand_asym_id,
ligand_bond_is_covalent_or_coordinating = .data$ligand_is_bound,
ligand_donor_label_seq_id = .data$seq_id
)
if ("rcsb_ligand_neighbors" %in% colnames(polymer_entities)) {
# if none of the retrieved entries contains any ligands then this column needs to be removed manually
polymer_entities <- polymer_entities %>%
select(-.data$rcsb_ligand_neighbors)
}
# extract modified monomer information
if (show_progress == TRUE) {
message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
message("-> 4/6 Modified monomers ... ", appendLF = FALSE)
start_time <- Sys.time()
}
rcsb_polymer_entity_feature <- polymer_entities %>%
dplyr::select(.data$pdb_ids, .data$auth_asym_id, .data$rcsb_polymer_entity_feature) %>%
tidyr::unnest(.data$rcsb_polymer_entity_feature) %>%
tidyr::unnest(.data$feature_positions)
modified_monomer <- rcsb_polymer_entity_feature %>%
dplyr::filter(.data$type %in% c("modified_monomer")) %>%
dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
dplyr::mutate(residue = paste0(
.data$beg_comp_id,
":",
.data$beg_seq_id,
"(",
stringr::str_replace(.data$name, pattern = "Parent monomer ", replacement = ""),
")"
)) %>%
dplyr::mutate(modified_monomer = paste0(.data$residue, collapse = ";")) %>%
dplyr::ungroup() %>%
dplyr::distinct(.data$modified_monomer, .data$pdb_ids, .data$auth_asym_id)
# extract secondary structure information
if (show_progress == TRUE) {
message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
message("-> 5/6 Secondary structure ... ", appendLF = FALSE)
start_time <- Sys.time()
}
rcsb_polymer_instance_feature_data <- polymer_entities %>%
dplyr::select(.data$pdb_ids, .data$auth_asym_id, .data$rcsb_polymer_instance_feature) %>%
tidyr::unnest(.data$rcsb_polymer_instance_feature) %>%
tidyr::unnest(.data$feature_positions)
secondary_structures <- rcsb_polymer_instance_feature_data %>%
dplyr::filter(.data$name %in% c("helix", "sheet")) %>%
dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
dplyr::mutate(from_to = paste0(.data$name, ":", .data$beg_seq_id, "-", .data$end_seq_id)) %>%
dplyr::mutate(secondary_structure = paste0(.data$from_to, collapse = ";")) %>%
dplyr::ungroup() %>%
dplyr::distinct(.data$secondary_structure, .data$pdb_ids, .data$auth_asym_id)
# extract info about unmodeled residues
if (show_progress == TRUE) {
message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
message("-> 6/6 Unmodeled residues ... ", appendLF = FALSE)
start_time <- Sys.time()
}
unmodeled_residues <- rcsb_polymer_instance_feature_data %>%
dplyr::filter(.data$name %in% c("partially modeled residue", "unmodeled residue")) %>%
dplyr::group_by(.data$pdb_ids, .data$auth_asym_id) %>%
dplyr::mutate(from_to = paste0(.data$name, ":", .data$beg_seq_id, "-", .data$end_seq_id)) %>%
dplyr::mutate(unmodeled_structure = paste0(.data$from_to, collapse = ";")) %>%
dplyr::ungroup() %>%
dplyr::distinct(.data$unmodeled_structure, .data$pdb_ids, .data$auth_asym_id)
# join secondary structure, unmodeled info and modified monomer
polymer_entities <- polymer_entities %>%
left_join(modified_monomer, by = c("pdb_ids", "auth_asym_id")) %>%
left_join(secondary_structures, by = c("pdb_ids", "auth_asym_id")) %>%
left_join(unmodeled_residues, by = c("pdb_ids", "auth_asym_id")) %>%
select(-c(.data$rcsb_polymer_instance_feature, .data$rcsb_polymer_entity_feature))
# Modify auth_seq_id positions that are either duplicated or missing.
# Missing or duplicated entries are identified by comparing the length of auth_seq_id to the length of the sequence.
if (show_progress == TRUE) {
message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
message("[4/6] Correct author sequence positions for some PDB IDs ... ", appendLF = FALSE)
start_time <- Sys.time()
}
fix_auth_seq_positions <- polymer_entities %>%
dplyr::rowwise() %>%
dplyr::filter(length(unlist(.data$auth_to_entity_poly_seq_mapping)) != nchar(.data$pdbx_seq_one_letter_code_can))
# Remove duplicated positions
polymer_entities_seq_id_duplications <- fix_auth_seq_positions %>%
dplyr::filter(length(unlist(.data$auth_to_entity_poly_seq_mapping)) > nchar(.data$pdbx_seq_one_letter_code_can)) %>%
dplyr::mutate(auth_seq_id = list(unique(unlist(.data$auth_to_entity_poly_seq_mapping)))) %>%
dplyr::ungroup()
# Add missing positions
polymer_entities_seq_id_missing <- fix_auth_seq_positions %>%
dplyr::filter(length(unlist(.data$auth_to_entity_poly_seq_mapping)) < nchar(.data$pdbx_seq_one_letter_code_can))
if (nrow(polymer_entities_seq_id_missing) > 0) { # do not run the code below if the data frame is empty, this causes error
polymer_entities_seq_id_missing <- polymer_entities_seq_id_missing %>%
dplyr::mutate(auth_seq_id_pdb_numeric = list(stringr::str_replace(.data$auth_to_entity_poly_seq_mapping, pattern = "[:alpha:]", replacement = ""))) %>%
dplyr::mutate(non_consecutive = list(as.numeric(.data$auth_seq_id_pdb_numeric)[c(diff(as.numeric(.data$auth_seq_id_pdb_numeric)) > 1)])) %>%
dplyr::mutate(n_missing = list(diff(as.numeric(.data$auth_seq_id_pdb_numeric))[which(diff(as.numeric(.data$auth_seq_id_pdb_numeric)) > 1)] - 1)) %>%
dplyr::mutate(replacement_values_addition = list(purrr::map(
.x = .data$n_missing,
.f = ~ 1:.x
))) %>%
dplyr::mutate(replacement_positions = list(rep(which(.data$auth_seq_id_pdb_numeric %in% as.character(.data$non_consecutive)) + 1, .data$n_missing))) %>%
dplyr::mutate(auth_seq_id = list(R.utils::insert(.data$auth_to_entity_poly_seq_mapping,
.data$replacement_positions,
values = as.character(rep(.data$non_consecutive, .data$n_missing) + unlist(.data$replacement_values_addition))
))) %>%
dplyr::ungroup() %>%
dplyr::select(-c(.data$auth_seq_id_pdb_numeric, .data$non_consecutive, .data$n_missing, .data$replacement_values_addition, .data$replacement_positions))
}
# Join corrected entries back
polymer_entities <- polymer_entities %>%
dplyr::rowwise() %>%
dplyr::filter(length(unlist(.data$auth_to_entity_poly_seq_mapping)) == nchar(.data$pdbx_seq_one_letter_code_can) |
is.null(.data$auth_to_entity_poly_seq_mapping) |
is.na(.data$pdbx_seq_one_letter_code_can)) %>%
dplyr::ungroup() %>%
dplyr::mutate(auth_seq_id = .data$auth_to_entity_poly_seq_mapping) %>%
dplyr::bind_rows(polymer_entities_seq_id_duplications) %>%
dplyr::bind_rows(polymer_entities_seq_id_missing)
if (show_progress == TRUE) {
if (nrow(fix_auth_seq_positions) > 0) {
message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
message("Corrected entries: ", paste0(unique(fix_auth_seq_positions$pdb_ids), collapse = ", "))
} else {
message("None to correct", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
}
}
entity_instance_info <- polymer_entities %>%
dplyr::distinct(
.data$asym_id,
.data$auth_asym_id,
.data$entry_id,
.data$auth_to_entity_poly_seq_mapping
) %>%
dplyr::rename(
auth_asym_ids = .data$auth_asym_id,
pdb_ids = .data$entry_id
)
uniprot_info <- polymer_entities %>%
dplyr::distinct(.data$uniprot_id, .data$name_protein) %>%
dplyr::rename(
reference_database_accession = .data$uniprot_id,
protein_name = .data$name_protein
)
polymer_entities <- polymer_entities %>%
select(-c(
.data$uniprot_id,
.data$name_protein,
.data$asym_id,
.data$auth_asym_id,
.data$entry_id,
.data$auth_to_entity_poly_seq_mapping
)) %>%
distinct()
if (show_progress == TRUE) {
message("[5/6] Extract non-polymer information ... ", appendLF = FALSE)
start_time <- Sys.time()
}
if (!all(is.na(query_result_clean$entries.nonpolymer_entities))) {
nonpolymer_entities <- query_result_clean %>%
dplyr::select(.data$pdb_ids, .data$entries.nonpolymer_entities) %>%
tidyr::unnest(.data$entries.nonpolymer_entities) %>%
dplyr::bind_cols(
.$rcsb_nonpolymer_entity_container_identifiers,
.$nonpolymer_comp
) %>%
dplyr::bind_cols(.$chem_comp) %>%
dplyr::select(-c(
.data$nonpolymer_comp,
.data$rcsb_nonpolymer_entity_container_identifiers,
.data$chem_comp,
.data$entry_id
)) %>%
tidyr::unnest(.data$auth_asym_ids) %>%
dplyr::rename(
name_nonpolymer = .data$name,
formula_nonpolymer = .data$formula,
formula_weight_nonpolymer = .data$formula_weight,
type_nonpolymer = .data$type,
id_nonpolymer = .data$id
)
} else {
nonpolymer_entities <- polymer_entities %>%
dplyr::select(.data$pdb_ids, .data$auth_asym_ids) %>%
dplyr::mutate(
name_nonpolymer = NA,
formula_nonpolymer = NA,
formula_weight_nonpolymer = NA,
type_nonpolymer = NA,
id_nonpolymer = NA
)
}
additional_info <- query_result_clean %>%
dplyr::select(-c(
.data$entries.nonpolymer_entities,
.data$entries.polymer_entities,
.data$entries.rcsb_binding_affinity,
.data$entries.pdbx_nmr_exptl,
.data$entries.pdbx_nmr_exptl_sample_conditions,
.data$entries.pdbx_nmr_refine,
.data$entries.exptl_crystal_grow,
.data$resolution_combined
))
if (show_progress == TRUE) {
message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
message("[6/6] Combine information ... ", appendLF = FALSE)
start_time <- Sys.time()
}
combined <- polymer_entities %>%
dplyr::full_join(nonpolymer_entities, by = c("pdb_ids", "auth_asym_ids")) %>%
dplyr::left_join(rcsb_binding_affinity, by = "pdb_ids") %>%
dplyr::left_join(additional_info, by = "pdb_ids") %>%
dplyr::left_join(crystal_growth_info, by = "pdb_ids") %>%
dplyr::left_join(nmr_info, by = "pdb_ids") %>%
dplyr::left_join(resolution_info, by = "pdb_ids") %>%
dplyr::left_join(uniprot_info, by = "reference_database_accession") %>%
dplyr::left_join(entity_instance_info, by = c("pdb_ids", "auth_asym_ids")) %>%
dplyr::rename(
auth_asym_id = .data$auth_asym_ids,
label_asym_id = .data$asym_id,
pdb_sequence = .data$pdbx_seq_one_letter_code_can,
auth_seq_id_original = .data$auth_to_entity_poly_seq_mapping,
engineered_mutation = .data$pdbx_mutation,
non_std_monomer = .data$rcsb_non_std_monomers
) %>%
dplyr::rowwise() %>%
# make character string out of list column
dplyr::mutate(
auth_seq_id = paste0(.data$auth_seq_id, collapse = ";"),
auth_seq_id_original = paste0(.data$auth_seq_id_original, collapse = ";")
) %>%
dplyr::ungroup() %>%
dplyr::mutate(
auth_seq_id = ifelse(.data$auth_seq_id == "",
NA,
.data$auth_seq_id
),
auth_seq_id_original = ifelse(.data$auth_seq_id_original == "",
NA,
.data$auth_seq_id_original
)
) %>%
dplyr::select(
.data$pdb_ids,
.data$auth_asym_id,
.data$label_asym_id,
.data$reference_database_accession,
.data$protein_name,
.data$reference_database_name,
.data$entity_beg_seq_id,
.data$ref_beg_seq_id,
.data$length,
.data$pdb_sequence,
.data$auth_seq_id,
.data$auth_seq_id_original,
.data$engineered_mutation,
.data$modified_monomer,
.data$ligand_donor_atom_id,
.data$ligand_donor_auth_seq_id,
.data$ligand_donor_label_seq_id,
.data$ligand_donor_id,
.data$ligand_label_asym_id,
.data$ligand_atom_id,
.data$ligand_id,
.data$ligand_entity_id,
.data$ligand_bond_is_covalent_or_coordinating,
.data$secondary_structure,
.data$unmodeled_structure,
.data$id_nonpolymer,
.data$type_nonpolymer,
.data$formula_weight_nonpolymer,
.data$name_nonpolymer,
.data$formula_nonpolymer,
.data$experimental_method,
.data$structure_method,
.data$affinity_comp_id,
.data$affinity_value,
.data$pdbx_keywords,
.data$assembly_count,
.data$inter_mol_metalic_bond_count,
.data$pH_crystallisation,
.data$temp_crystallisation,
.data$method_crystallisation,
.data$type_nmr,
.data$ionic_strength_nmr,
.data$pH_nmr,
.data$temp_nmr,
.data$method_nmr,
.data$resolution_combined
)
if (show_progress == TRUE) {
message("DONE ", paste0("(", round(as.numeric(difftime(Sys.time(), start_time, units = "secs")), digits = 2), "s)"))
}
combined
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.