R/read.fasta.pdb.R

"read.fasta.pdb" <-
  function(aln, prefix="", pdbext="", fix.ali = FALSE, pdblist=NULL, 
           ncore=1, nseg.scale=1, progress=NULL, ...) {

  ## Log the call
  cl <- match.call()
  
  # Parallelized by parallel package (Fri Apr 26 17:58:26 EDT 2013)
  ncore <- setup.ncore(ncore)
  if(ncore > 1) {
     # Issue of serialization problem
     # Maximal number of cells of a double-precision matrix
     # that each core can serialize: (2^31-1-61)/8
     R_NCELL_LIMIT_CORE = 2.68435448e8
     R_NCELL_LIMIT = ncore * R_NCELL_LIMIT_CORE

     if(nseg.scale < 1) {
        warning("nseg.scale should be 1 or a larger integer\n")
        nseg.scale=1
     }
  }
  
  if(is.null(pdblist)) {
    files  <- paste(prefix, aln$id, pdbext,sep="")
    
    ##cat(files,sep="\n")
    toread <- file.exists(files)

    ## check for online files
    toread[ substr(files,1,4)=="http" ] <- TRUE
    
    if(all(!toread))
      stop("No corresponding PDB files found")

    ## Avoid multi-thread downloading
    if(any(substr(files,1,4) == "http")) {
      ncore = 1
#      options(cores = ncore)
    }
  }
  else {
    toread <- rep(FALSE, length(pdblist))
    files <- rep(NA, length(pdblist))
  }
  

  blank <- rep(NA, ncol(aln$ali))
 
  retval <- mclapply(1:length(aln$id), function(i) {
    coords <- NULL; res.nu <- NULL
    res.bf <- NULL; res.ch <- NULL
    res.id <- NULL; res.ss <- NULL

    ## edit for shiny version
    if(!is.null(progress)) {
      progress$inc(1/length(aln$id)/2)
    }
    ## edit end

    cat(paste("pdb/seq:",i,"  name:", aln$id[i]),"\n")

    if(!toread[i] & is.null(pdblist)) {
      warning(paste("No PDB file found for seq", aln$id[i],
              ": (with filename) ",files[i]), call.=FALSE)
      coords <- rbind(coords, rep(blank,3))
      res.nu <- rbind(res.nu, blank)
      res.bf <- rbind(res.bf, blank)
      res.ch <- rbind(res.ch, blank)
      res.id <- rbind(res.id, blank)
      res.ss <- rbind(res.ss, blank)
      
    } else {
      if(is.null(pdblist))
        pdb <- read.pdb( files[i], verbose=FALSE, ... )
      else
        pdb <- pdblist[[i]]
      ca.inds <- atom.select(pdb, "calpha", verbose=FALSE)
      pdbseq  <- aa321(pdb$atom$resid[ca.inds$atom])
      aliseq  <- toupper(aln$ali[i,])
      tomatch <- gsub("X","[A-Z]",aliseq[!is.gap(aliseq)])
      
      if(length(pdbseq)<1)
        stop(paste(basename(aln$id[i]), ": insufficent Calpha's in PDB"), call.=FALSE)
      
      ##-- Search for ali residues (1:15) in pdb
      start.num <- regexpr(pattern = paste(c(na.omit(tomatch[1:15])),collapse=""),
                           text = paste(pdbseq,collapse=""))[1]
            
      if (start.num == -1) {
        start.num <- 1
        ##stop(paste(basename(aln$id[i]), ": starting residues of sequence does not match starting residues in PDB"), call.=FALSE)
      }

      ##-- Numeric vec, 'nseq', for mapping aln to pdb
      nseq <- rep(NA,length(aliseq))
      ali.res.ind <- which(!is.gap(aliseq))
      if( length(ali.res.ind) > (length(pdbseq) - start.num + 1) ) {
        warning(paste(aln$id[i],
         ": sequence has more residues than PDB has Calpha's"), call.=FALSE)
        ali.res.ind <- ali.res.ind[1:(length(pdbseq)-start.num+1)] ## exclude extra
        tomatch <-  tomatch[1:(length(pdbseq)-start.num+1)]        ## terminal residues
      }
      nseq[ali.res.ind] = start.num:((start.num - 1) + length(tomatch))

      ##-- Check for miss-matches
      match <- aliseq != pdbseq[nseq] 
      if ( sum(match, na.rm=TRUE) >= 1 ) {
        mismatch.ind <- which(match)
        mismatch <- cbind(aliseq, pdbseq[nseq])[mismatch.ind,]
        n.miss <- length(mismatch.ind)

        if(sum(mismatch=="X") != n.miss) { ## ignore masked X res        
          details <- seqbind(aliseq, pdbseq[nseq])
          details$ali[is.na(details$ali)] <- "-"
          rownames(details$ali) <- c("aliseq","pdbseq")
          details$id <- c("aliseq","pdbseq")
          
          resmatch <- which(!apply(details$ali, 2, function(x) x[1]==x[2]))
          
          resid <- paste(pdb$atom$resid[ca.inds$atom][nseq][resmatch][1], "-",
                         pdb$atom$resno[ca.inds$atom][nseq][resmatch][1],
                         " (", pdb$atom$chain[ca.inds$atom][nseq][resmatch][1], ")", sep="")
          
          cat("\n ERROR   Alignment mismatch. See alignment below for further details\n")
          cat("         (row ", i, " of aln and sequence of '", aln$id[i], "').\n", sep="")
          cat("         First mismatch residue in PDB is:", resid, "\n")
          cat("         occurring at alignment position:", which(match)[1], "\n\n")
          .print.fasta.ali(details)
          
          msg <- paste(basename.pdb(aln$id[i]),
                       " alignment and PDB sequence miss-match\n",
                       "       beginning at position ",
                       which(match)[1], " (PDB RESNO ", resid, ")", sep="")
          stop(msg, call.=FALSE)
        }
      }
      
      ##-- Store nseq justified/aligned PDB data
      ca.ali <- pdb$atom[ca.inds$atom,][nseq,]
      coords <- rbind(coords, as.numeric( t(ca.ali[,c("x","y","z")]) ))
      res.nu <- rbind(res.nu, ca.ali[, "resno"])
      res.bf <- rbind(res.bf, as.numeric( ca.ali[,"b"] ))
      res.ch <- rbind(res.ch, ca.ali[, "chain"])
      res.id <- rbind(res.id, ca.ali[, "resid"])

      sse <- pdb2sse(pdb, verbose = FALSE)
      res.ss <- rbind(res.ss, sse[nseq])
    } ## end else for (non)missing PDB file
    return (list(coords=coords, res.nu=res.nu, res.bf=res.bf, res.ch=res.ch, res.id=res.id, res.ss=res.ss))
  } , mc.cores=ncore) ## end mylapply

  retval <- do.call(rbind, retval)
  coords <- matrix(unlist(retval[, "coords"]), nrow=length(aln$id), byrow=TRUE)
  res.nu <- matrix(unlist(retval[, "res.nu"]), nrow=length(aln$id), byrow=TRUE)
  res.bf <- matrix(unlist(retval[, "res.bf"]), nrow=length(aln$id), byrow=TRUE)
  res.ch <- matrix(unlist(retval[, "res.ch"]), nrow=length(aln$id), byrow=TRUE)
  res.id <- matrix(unlist(retval[, "res.id"]), nrow=length(aln$id), byrow=TRUE)
 
  if( any(sapply(retval[, "res.ss"], is.null)) ) {
     res.ss <- NULL
  }
  else {
     res.ss <- matrix(unlist(retval[, "res.ss"]), nrow=length(aln$id), byrow=TRUE)
     rownames(res.ss) <- aln$id
  }

  rownames(aln$ali) <- aln$id
  rownames(coords) <- aln$id
  rownames(res.nu) <- aln$id
  rownames(res.bf) <- aln$id
  rownames(res.ch) <- aln$id
  rownames(res.id) <- aln$id
 
  if(fix.ali) {
     i1 <- which(is.na(res.nu))
     i2 <- which(is.gap(aln$ali))
     if(!identical(i1, i2)) {
        aln$ali[i1] <- aln$ali[i2[1]]
        warning("$ali component is modified to match $resno")
     }
  }

  out<-list(xyz=coords, resno=res.nu, b=res.bf,
            chain = res.ch, id=aln$id, ali=aln$ali, resid=res.id, sse=res.ss,
            call = cl)
  class(out)=c("pdbs","fasta")
  class(out$xyz) = "xyz"

  return(out)
}

Try the bio3d package in your browser

Any scripts or data that you put into this service are public.

bio3d documentation built on Oct. 27, 2022, 1:06 a.m.