R/calcStrucFeats.R

Defines functions calc.strucfeats subcomplex.combos run.dssp run.sasa query.pdb

Documented in calc.strucfeats query.pdb run.dssp run.sasa subcomplex.combos

# Copyright (C) 2018  Jochen Weile, Roth Lab
#
# This file is part of MaveVis.
#
# MaveVis is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MaveVis is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with MaveVis.  If not, see <https://www.gnu.org/licenses/>.


options(stringsAsFactors=FALSE)
# source("lib/libpdb.R")
# source("lib/cliargs.R")
# source("lib/trackdrawer.R")
# source("lib/amasLite.R")



#' Query PDB
#' 
#' Queries the Protein Data Bank (PDB) for protein structure data for
#' a given database accession and writes the result to file
#' 
#' @param pdb.acc the PDB accession to query
#' @param pdb.file the name of the file to which the output will be written.
#' @return the name of the output file.
#' @examples
#' \dontrun{
#' #with a pre-determined output file
#' outfile <- "sumo_conjugase_structure.pdb"
#' query.pdb("3UIP",outfile)
#' 
#' #with an autogenerated output file
#' outfile <- query.pdb("3UIP")
#' }
#' @export
query.pdb <- function(pdb.acc,pdb.file=paste0(pdb.acc,".pdb")) {

	library("httr")

	pdb.url <- paste0("https://files.rcsb.org/download/",pdb.acc,".pdb")
	htr <- GET(pdb.url)
	if (http_status(htr)$category == "Success") {
		writeBin(content(htr, "raw"), pdb.file)
		return(pdb.file)
	} else {
		# logger$fatal("Unable to download PDB file")
		stop("Unable to download PDB file")
	}
}

#' Query Alphafold
#' 
#' Queries the EMBL Alphafold database for predicted protein structure data for
#' a given database accession and writes the result to file
#' 
#' @param uniprot.acc the Uniprot accession to query
#' @param pdb.file the name of the file to which the output will be written.
#' @return the name of the output file.
#' @examples
#' \dontrun{
#' #with a pre-determined output file
#' outfile <- "sumo_conjugase_structure.pdb"
#' query.pdb("3UIP",outfile)
#' 
#' #with an autogenerated output file
#' outfile <- query.pdb("3UIP")
#' }
#' @export
query.alphafold <- function (uniprot.acc,pdb.file=paste0(uniprot.acc,"_af.pdb")) {
	library("httr")
	af.url <- paste0("https://alphafold.ebi.ac.uk/files/AF-",uniprot.acc,"-F1-model_v4.pdb")
	htr <- GET(af.url)
	if (http_status(htr)$category == "Success") {
		writeBin(content(htr, "raw"), pdb.file)
		return(pdb.file)
	} else {
		# logger$fatal("Unable to download PDB file")
		stop("Unable to download PDB file")
	}
}

#' Run FreeSASA on a PDB structure
#' 
#' Calculates solvent accessible surface area on a given structure using the 
#' external software FreeSASA, which must be installed and available via $PATH.
#' 
#' @param pdb.file The PDB file on which to run FreeSASA
#' @return a \code{data.frame} with the output of FreeSASA
#' @examples
#' \dontrun{
#' sasa <- run.sasa("3UIP.pdb")
#' }
#' @export
run.sasa <- function(pdb.file) {
	suppressMessages(library("gdata"))
	widths <- c(
		type=3,res=4,chain=2,pos=4,all.abs=9,all.rel=6,side.abs=7,side.rel=6,
		main.abs=7,main.rel=6,nonpolar.abs=7,nonpolar.rel=6,polar.abs=7,polar.rel=6
	)
	lines <- system(paste("freesasa --format=rsa",pdb.file),intern=TRUE)
	lines <- lines[regexpr("^RES",lines) > 0]
	con <- textConnection(lines)
	sasa.out <- read.fwf(con,widths,strip.white=TRUE)
	close(con)
	colnames(sasa.out) <- names(widths)
	for (i in 4:14) sasa.out[,i] <- as.numeric(sasa.out[,i])
	#fix boolean misclassification bug
	if (inherits(sasa.out$chain,"logical")) {
		sasa.out$chain <- sapply(sasa.out$chain,function(x) ifelse(x,"T","F"))
	}
	return(sasa.out)
}

#' Run DSSP on a PDB structure
#' 
#' Calculates secondary structure information from a PDB file using the
#' external software DSSP, which must be installed and available via $PATH
#' 
#' @param pdb.file The PDB file on which to run DSSP
#' @return a \code{data.frame} with the output of DSSP
#' @examples
#' \dontrun{
#' dssp.out <- run.dssp(pdb.file)
#' }
#' @export
run.dssp <- function(pdb.file) {
	suppressMessages(library("gdata"))
	cols <- c(num=5,residue=5,chain=2,aa=3,struc=3,struc.flags=7,
		bp1=5,bp2=4,acc=4,no1=12,on1=11,no2=11,on2=11,tco=8,
		kappa=6,alpha=6,phi=6,psi=6,x.ca=7,y.ca=7,z.ca=7)
	lines <- system(paste("dssp",pdb.file),intern=TRUE)
	lines <- lines[regexpr("^\\s+\\d+\\s+\\d+ \\w{1} \\w{1}",lines) > 0]
	con <- textConnection(lines)	
	dssp.out <- read.fwf(
		con,
		widths=cols,
		stringsAsFactors=FALSE,
		strip.white=TRUE
	)
	close(con)
	colnames(dssp.out) <- names(cols)
	substr(dssp.out$structure,1,1)
	return(dssp.out)
}


#' Generate PDB files for complex components
#' 
#' This function finds the component chains of a given PDB structure and generates
#' new PDB files for any desired set of combinations of these components
#' 
#' @param pdb.file the input PDB file for the complex
#' @param chain.sets a list of character vectors, detailing the combinations of chains to 
#'     produce. E.g. \code{list("A",c("A","B"),c("A","C"))} produces one file for 
#'     chain A, one for the combination of chains A and B, and one for the combination
#'     of chains A and C.
#' @return a vector of file names corresponding to the elements of \code{chain.sets}
#' @examples
#' \dontrun{
#' subcplx.files <- sub.complex.combos("3UIP.pdb",chain.sets=list("A",c("A","B"),c("A","C")))
#' }
#' @export
subcomplex.combos <- function(pdb.file,chain.sets) {
	lines <- scan(pdb.file,what="character",sep="\n")
	atom.lines <- which(grepl("^ATOM|^TER",lines))
	model.lines <- which(grepl("^MODEL",lines))
	if (length(model.lines) > 0) {
		endmdl.lines <- which(grepl("^ENDMDL",lines))
		header.lines <- 1:model.lines[[1]]
		footer.lines <- tail(endmdl.lines,1):length(lines)
		applicable.atom.lines <- atom.lines[atom.lines > model.lines[[1]] & atom.lines < endmdl.lines[[1]]]		
	} else {
		header.lines <- 1:(atom.lines[[1]]-1)
		footer.lines <- (tail(atom.lines,1)+1):length(lines)
		applicable.atom.lines <- atom.lines
	}
	applicable.chains <- substr(lines[applicable.atom.lines],22,22)
	subfiles <- sapply(chain.sets,function(cset) {
		outfile <- tempfile(fileext=".pdb")
		con <- file(outfile,open="w")
		writeLines(lines[header.lines],con)
		writeLines(lines[applicable.atom.lines[applicable.chains %in% cset]],con)
		writeLines(lines[footer.lines],con)
		close(con)
		return(outfile)
	})
	return(subfiles)
}


#' Calculate structural features
#' 
#' Calculates structural features of a protein from a given PDB entry
#' 
#' @param acc The PDB or Uniprot accession to use, e.g. "3UIP"
#' @param main.chain The chain identifier in the PDB file that corresponds 
#'    to the protein of interest. Should be a single uppercase letter, e.g. "A".
#' @param db either 'pdb' or 'alphafold'
#' @param offset position offset for incorrectly annotated pdb files (integer)
#' @param overrideCache if true, ignore cached files and query all data from scratch
#' @return a \code{data.frame} detailing secondary structure, solvent accessibility,
#'    and burial in interfaces.
#' @examples
#' \dontrun{
#' sfeats <- calc.strucfeats("3UIP","A")
#' }
#' @export
calc.strucfeats <- function(acc,main.chain,db=c("pdb","alphafold"),offset=0L,overrideCache=FALSE) {

	struc.cache.file <- getCacheFile(paste0(acc,":",main.chain,"_features.csv"))

	if (file.exists(struc.cache.file) && !overrideCache) {
		cat("Using cached features...\n")
		burial.all <- read.csv(struc.cache.file)
		return(burial.all)
	}
	
	library("httr")
	set_config(config(ssl_verifypeer = 0L))

	#set up cache file for structure
	pdb.file <- getCacheFile(paste0(acc,".pdb"))
	# pdb.file <- tempfile(fileext=".pdb")
	if (!file.exists(pdb.file)) {
		switch(match.arg(db,c("pdb","alphafold")),
			pdb={
				cat("Querying PDB...\n")
				query.pdb(acc,pdb.file)
			},
			alphafold={
				cat("Querying PDB...\n")
				query.alphafold(acc,pdb.file)
			}
		)
	} else {
		cat("Using cached PDB file...\n")
	}
	cat("Parsing PDB file...\n")

	cplx.struc <- new.structure(pdb.file)
	chains <- cplx.struc$get.chains()

	if (!(main.chain %in% chains)) {
		stop("The requested chain ",main.chain," doesn't exist in ",acc)
	}

	chain.info <- cplx.struc$get.chain.info()
	# rownames(chain.info) <- chain.info$chain

	chain.names <- sapply(chains,function(ch) with(chain.info,as.character(name[db=="UNP" & chain == ch])))
	chain.names <- sub("_HUMAN","",chain.names)


	if (length(chains) > 1) {
		cat("Splitting protein complex...\n")
		
		#list of chain pairs
		other.chains <- setdiff(chains,main.chain)
		chain.sets <- c(main.chain,lapply(other.chains,c,main.chain))
		#Subdivide into chain pairs
		# subfiles <- cplx.struc$subcomplex.combos(main.chain)
		subfiles <- subcomplex.combos(pdb.file,chain.sets)

		cat("Calculating surface area...\n")
		acctables <- mapply(function(cset,subfile) {
			#re-order chain ids to original file order
			cset <- intersect(chains,cset)
			acctable <- run.sasa(subfile)
			acctable[acctable$chain==main.chain,]
		},cset=chain.sets,subfile=subfiles,SIMPLIFY=FALSE)

		cat("Calculating interface areas...\n")
		#calculate burial based on accessibities
		burial <- cbind(acctables[[1]],do.call(cbind,lapply(2:length(acctables),function(i) {
			before <- acctables[[1]][,"all.abs"]
			data.frame(
				abs.burial=before-acctables[[i]][,"all.abs"],
				rel.burial=(before-acctables[[i]][,"all.abs"])/(before+.000001)
			)
		})))
		colnames(burial)[seq(15,ncol(burial),2)] <- paste0("abs.burial.",chain.names[other.chains])
		colnames(burial)[seq(16,ncol(burial),2)] <- paste0("rel.burial.",chain.names[other.chains])

	} else {
		acctable <- run.sasa(pdb.file)
		burial <- acctable
	}

	cat("Calculating secondary structures...\n")
	#calculate secondary structure
	dssp.out <- run.dssp(pdb.file)
	secstruc <- dssp.out[dssp.out$chain==main.chain,c("residue","struc")]


	cat("Compiling results...\n")
	rownames(burial) <- burial$pos+offset
	rownames(secstruc) <- secstruc$residue+offset

	allpos <- 1:max(burial$pos)
	burial.all <- cbind(burial[as.character(allpos),],secstruc=secstruc[as.character(allpos),"struc"])
	rownames(burial.all) <- burial.all$pos <- allpos

	write.table(burial.all,struc.cache.file,sep=",")

	# cat("Deleting temporary files...\n")
	# #clean up temp files
	# file.remove(pdb.file)
	# file.remove(subfiles)

	return(burial.all)
}
jweile/mavevis documentation built on Oct. 30, 2023, 7:16 a.m.