Nothing
#' Inspect And Clean Up A PDB Object
#'
#' Inspect alternative coordinates, chain breaks, bad residue
#' numbering, non-standard/unknow amino acids, etc. Return
#' a 'clean' pdb object with fixed residue numbering and optionally
#' relabeled chain IDs, corrected amino acid names, removed water,
#' ligand, or hydrogen atoms. All changes are recorded in a log in the
#' returned object.
#'
#' @details call for its effects.
#'
#' @param pdb an object of class \code{pdb} as obtained from
#' function \code{\link{read.pdb}}.
#' @param consecutive logical, if TRUE renumbering will result in
#' consecutive residue numbers spanning all chains. Otherwise new residue
#' numbers will begin at 1 for each chain.
#' @param force.renumber logical, if TRUE atom and residue records are renumbered
#' even if no 'insert' code is found in the \code{pdb} object.
#' @param fix.chain logical, if TRUE chains are relabeled based on chain breaks detected.
#' @param fix.aa logical, if TRUE non-standard amino acid names are converted into
#' equivalent standard names.
#' @param rm.wat logical, if TRUE water atoms are removed.
#' @param rm.lig logical, if TRUE ligand atoms are removed.
#' @param rm.h logical, if TRUE hydrogen atoms are removed.
#' @param verbose logical, if TRUE details of the conversion process are printed.
#'
#' @return a 'pdb' object with an additional \code{$log} component storing
#' all the processing messages.
#'
#' @seealso \code{\link{read.pdb}}
#'
#' @author Xin-Qiu Yao & Barry Grant
#'
#' @examples
#' \donttest{
#' # PDB server connection required - testing excluded
#'
#' pdb <- read.pdb("1a7l")
#' clean.pdb(pdb)
#' }
"clean.pdb" <-
function(pdb, consecutive=TRUE, force.renumber = FALSE, fix.chain = FALSE,
fix.aa = FALSE, rm.wat = FALSE, rm.lig = FALSE, rm.h = FALSE, verbose=FALSE) {
if(!is.pdb(pdb))
stop("Input should be a 'pdb' object")
cl <- match.call()
## processing message
## stored as an N-by-3 matrix with columns:
## FACT, OPERATION, IMPORTANT NOTE
log <- NULL
## a flag to indicate if the pdb is clean
clean <- TRUE
## Recognized amino acid names
prot.aa <- bio3d::aa.table$aa3
## for residues and atoms renumbering
first.eleno = 1
first.resno = 1
## remove water
if(rm.wat) {
wat <- atom.select(pdb, "water", verbose = FALSE)
if(length(wat$atom) > 0) {
pdb$atom <- pdb$atom[-wat$atom, ,drop=FALSE]
pdb$xyz <- pdb$xyz[, -wat$xyz, drop=FALSE]
log <- .update.log(log, paste("Found", length(wat$atom), "water atoms"), "REMOVED")
}
}
## remove ligands
if(rm.lig) {
lig <- atom.select(pdb, "ligand", verbose = FALSE)
if(length(lig$atom) > 0) {
pdb$atom <- pdb$atom[-lig$atom, ,drop=FALSE]
pdb$xyz <- pdb$xyz[, -lig$xyz, drop=FALSE]
log <- .update.log(log, paste("Found", length(lig$atom), "ligand atoms"), "REMOVED")
}
}
## remove hydrogens
if(rm.h) {
h.inds <- atom.select(pdb, "h", verbose = FALSE)
if(length(h.inds$atom) > 0) {
pdb$atom <- pdb$atom[-h.inds$atom, ,drop=FALSE]
pdb$xyz <- pdb$xyz[, -h.inds$xyz, drop=FALSE]
log <- .update.log(log, paste("Found", length(h.inds$atom), "hydrogens"), "REMOVED")
}
}
## check if 'alt' coords exist
if(any(rm.p <- !is.na(pdb$atom$alt) & pdb$atom$alt != "A")) {
pdb$atom <- pdb$atom[!rm.p, , drop=FALSE]
pdb$xyz <- pdb$xyz[, -atom2xyz(which(rm.p)), drop=FALSE]
log <- .update.log(log, paste("Found", sum(rm.p), "ALT records"), "REMOVED")
}
## Some initial check:
## 1. Are all amino acid and/or nucleic acid residues
## distinguished by the combination chainID_resno_insert?
.check.residue.ambiguity(pdb)
## 2. Check and clean up SSE annotation
pdb <- .check.sse(pdb)
log <- .update.log(log, pdb$log)
## 3. Fix pdb$calpha if it is mismatch pdb$atom
ca.inds <- atom.select(pdb, "calpha", verbose = FALSE)
calpha <- seq(1, nrow(pdb$atom)) %in% ca.inds$atom
if(!identical(pdb$calpha, calpha)) {
pdb$calpha <- calpha
log <- .update.log(log, "pdb$calpha", "UPDATED")
}
## 4. Fix object class if it is incorrect
if(!inherits(pdb, "pdb") || !inherits(pdb, "sse") ||
!inherits(pdb$xyz, "xyz")) {
class(pdb) <- c("pdb", "sse")
class(pdb$xyz) <- "xyz"
log <- .update.log(log, "Object class", "UPDATED")
}
###########
## following operations are on an independent object
npdb <- pdb
## check chain breaks and missing chain ids
has.fixed.chain <- FALSE
capture.output( new.chain <- chain.pdb(npdb) )
chain <- npdb$atom[, "chain"]
if(any(is.na(npdb$atom[, "chain"]))) {
log <- .update.log(log, "Found empty chain IDs",
ifelse(fix.chain, "FIXED", "NO CHANGE"),
ifelse(fix.chain, "ALL CHAINS ARE RELABELED", ""))
if(fix.chain) {
npdb$atom[, "chain"] <- new.chain
has.fixed.chain <- TRUE
}
else if(clean)
clean <- FALSE
}
else {
## check if new chain id assignment is consistent to original one
chn.brk <- bounds(chain[ca.inds$atom], dup.inds=TRUE, pre.sort=FALSE)
new.chn.brk <- bounds(new.chain[ca.inds$atom], dup.inds=TRUE, pre.sort=FALSE)
if(!isTRUE(all.equal(chn.brk, new.chn.brk))) {
log <- .update.log(log, "Found inconsistent chain breaks",
ifelse(fix.chain, "FIXED", "NO CHANGE"),
ifelse(fix.chain, "ALL CHAINS ARE RELABELED", ""))
log <- .update.log(log, "Original chain breaks:")
if(nrow(chn.brk) == 1) {
log <- .update.log(log, " No chain break")
}
else {
pre.ca <- ca.inds$atom[chn.brk[-nrow(chn.brk), "end"]]
pre.log <- capture.output( print(npdb$atom[pre.ca, c("resid", "resno", "chain")], row.names = FALSE) )
log <- .update.log(log, pre.log)
}
log <- .update.log(log)
log <- .update.log(log, "New chain breaks:")
if(nrow(new.chn.brk) == 1) {
log <- .update.log(log, " No chain breaks")
}
else {
new.ca <- ca.inds$atom[new.chn.brk[-nrow(new.chn.brk), "end"]]
new.log <- capture.output( print(npdb$atom[new.ca, c("resid", "resno", "chain")], row.names = FALSE) )
log <- .update.log(log, new.log)
}
if(fix.chain) {
npdb$atom[, "chain"] <- new.chain
has.fixed.chain <- TRUE
}
else if(clean)
clean <- FALSE
}
}
## Renumber residues and atoms
renumber <- FALSE
if( any(!is.na(npdb$atom[, "insert"])) ) {
renumber <- TRUE
log <- .update.log(log, "Found INSERT records", "RENUMBERED")
npdb$atom[, "insert"] <- as.character(NA)
}
else if(force.renumber) {
renumber <- TRUE
log <- .update.log(log, "force.renumber = TRUE", "RENUMBERED")
}
else if(has.fixed.chain) {
## check again the ambiguity of residue labeling
chk <- try(.check.residue.ambiguity(npdb))
if(inherits(chk, "try-error")) {
renumber <- TRUE
log <- .update.log(log, "Found ambiguious residues after chain relabeling", "RENUMBERED")
}
}
if(renumber) {
## Assign consecutive atom numbers
npdb$atom[,"eleno"] <- seq(first.eleno, length=nrow(npdb$atom))
## Determine what chain ID we have
chain <- unique(npdb$atom[, "chain"])
##- Assign new (consecutive) residue numbers for each chain
prev.chain.res = 0 ## Number of residues in previous chain
for(i in 1:length(chain)) {
inds <- which(npdb$atom[, "chain"] == chain[i])
## Combination of chain id, resno and insert code uniquely defines a residue (wwpdb.org)
## Here we use original pdb because we assume it should at least
## distinguish different residues by above combination.
## We don't use the modified pdb (npdb) because all non-protein residues
## are assigned a chain ID as "X" after calling chain.pdb();
## These residues could have the same resno (which are still in original
## form) as they may be assigned different chain IDs in the original pdb.
res <- paste(pdb$atom[inds, "chain"], pdb$atom[inds, "resno"],
pdb$atom[inds, "insert"], sep="_")
n.chain.res <- length(unique(res))
new.nums <- (first.resno+prev.chain.res):(first.resno+n.chain.res-1+prev.chain.res)
npdb$atom[inds, "resno"] <- vec2resno(new.nums, res)
if(consecutive) {
## Update prev.chain.res for next iteration
prev.chain.res = prev.chain.res + n.chain.res
}
}
}
## update SSE
if(has.fixed.chain || renumber) {
## Must use original pdb to unfold SSE
sse <- pdb2sse(pdb, verbose = FALSE)
if(!is.null(sse)) {
id <- sub(".*_.*_.*_([^_]*)$", "\\1", names(sse))
names(sse) <- paste(npdb$atom[ca.inds$atom, "resno"],
npdb$atom[ca.inds$atom, "chain"],
npdb$atom[ca.inds$atom, "insert"],
id, sep = "_")
new.sse <- bounds.sse(sse)
if(length(new.sse$helix$start) > 0) {
npdb$helix$start <- new.sse$helix$start
npdb$helix$end <- new.sse$helix$end
npdb$helix$chain <- new.sse$helix$chain
}
if(length(new.sse$sheet$start) > 0) {
npdb$sheet$start <- new.sse$sheet$start
npdb$sheet$end <- new.sse$sheet$end
npdb$sheet$chain <- new.sse$sheet$chain
}
if(!isTRUE(all.equal(npdb$helix, pdb$helix)) ||
!isTRUE(all.equal(npdb$sheet, pdb$sheet)) )
log <- .update.log(log, "SSE annotation", "UPDATED")
}
}
## update seqres
if(has.fixed.chain && !is.null(npdb$seqres)) {
tinds <- combine.select(atom.select(npdb, 'protein'), atom.select(npdb, 'nucleic'),
operator='+', verbose=FALSE)
npdb2 <- trim(npdb, tinds)
id <- paste(npdb2$atom$resno, npdb2$atom$chain, npdb2$atom$insert, sep='_')
chs <- npdb2$atom[!duplicated(id), "chain"]
names(npdb$seqres) <- chs
if(!identical(npdb$seqres, pdb$seqres))
log <- .update.log(log, "SEQRES", "UPDATED")
}
## update amino acid name
naa.atom <- which(npdb$atom[, "resid"] %in% prot.aa[-c(1:20)])
naa.res <- intersect(ca.inds$atom, naa.atom)
unk.atom <- which(!npdb$atom[, "resid"] %in% prot.aa)
unk.res <- intersect(ca.inds$atom, unk.atom)
if(length(naa.res) > 0) {
log <- .update.log(log, paste("Found", length(naa.res), "non-standard amino acids"),
ifelse(fix.aa, "FIXED", "NO CHANGE"),
ifelse(fix.aa, "AMINO ACID NAMES ARE CHANGED", ""))
tbl <- table(npdb$atom[naa.res, "resid"])
tbl <- paste(" ", names(tbl), "(", tbl, ")", collapse = ",")
log <- .update.log(log, tbl)
if(fix.aa) {
npdb$atom[naa.atom, "resid"] <- aa123(aa321(npdb$atom[naa.atom, "resid"]))
log <- .update.log(log, " Converted to")
tbl <- table(npdb$atom[naa.res, "resid"])
tbl <- paste(" ", aa123(aa321(names(tbl))), "(", tbl, ")", collapse = ",")
log <- .update.log(log, tbl)
}
else
if(clean) clean <- FALSE
}
if(length(unk.res) > 0) {
log <- .update.log(log, paste("Found", length(unk.res), "unknow amino acids"),
"NO CHANGE")
tbl <- table(npdb$atom[unk.res, "resid"])
tbl <- paste(" ", names(tbl), "(", tbl, ")", collapse = ",")
log <- .update.log(log, tbl)
# if(clean) clean <- FALSE
}
## update pdb$call
npdb$call <- cl
## is the pdb clean?
# if(clean)
# log <- .update.log(log, "PDB is clean!")
if(!clean) {
msg <- "PDB is still not clean. Try fix.chain=TRUE and/or fix.aa=TRUE"
# log <- .update.log(log, msg)
warning(msg)
}
## format log
# log <- .format.log(log)
if(verbose) print(log)
npdb$log <- log
return(npdb)
}
.update.log <- function(log, fact="", op="", note="") {
if(is.null(fact)) log
else if(is.data.frame(fact)) .update.log(log, fact[,1], fact[,2], fact[,3])
else rbind(log, data.frame(Data = fact, Action = op, Note = note))
}
.format.log <- function(log, format = c("print", "cat"), op.sign = "->", note.sign = "!!") {
format <- match.arg(format)
if(!is.null(log)) {
log <- apply(log, 1, function(x) {
if(nchar(x[2]) == 0)
sprintf("%-40s", x[1])
else if(nchar(x[3]) == 0)
paste(sprintf("%-40s", x[1]), op.sign, sprintf("%-12s", x[2]))
else
paste(paste(sprintf("%-40s", x[1]), op.sign, sprintf("%-12s", x[2]), note.sign, x[3], note.sign) )
} )
}
else {
log <- "No problem found"
}
log <- switch(format,
print = log,
cat = paste(paste(log, collapse="\n"), "\n", sep="")
)
return(log)
}
.check.residue.ambiguity <- function(pdb) {
ca.inds <- atom.select(pdb, "calpha", verbose = FALSE)
c1p.inds <- atom.select(pdb, "nucleic", elety = "C1'", verbose = FALSE)
inds <- combine.select(ca.inds, c1p.inds, operator = "+", verbose = FALSE)
if(length(inds$atom) > 0) {
strings <- paste(pdb$atom[inds$atom, "resno"],
pdb$atom[inds$atom, "chain"],
pdb$atom[inds$atom, "insert"], sep = "_")
if(any(duplicated(strings)))
stop(".check.residue.ambiguity(): Found ambiguous residue labeling")
}
invisible(NULL)
}
.check.sse <- function(pdb) {
log <- NULL
if(!is.null(pdb$helix) | !is.null(pdb$sheet)) {
if(is.null(pdb$helix)) {
pdb$helix <- list(start=NULL, end=NULL, chain=NULL, type=NULL)
log <- .update.log(log, "Helix is null but sheet is not", "UPDATED")
}
if(is.null(pdb$sheet)) {
pdb$sheet <- list(start=NULL, end=NULL, chain=NULL, sense=NULL)
log <- .update.log(log, "Sheet is null but helix is not", "UPDATED")
}
}
# if there is problem to generate sse vector
ss <- try(pdb2sse(pdb, verbose = FALSE))
if(inherits(ss, "try-error"))
stop(".check.sse(): Unable to generate SSE sequence")
pdb$log <- log
return(pdb)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.