# Nothing
#' Pattern Discovery in PDB Structures of Metalloproteins
#'
#' @author Luca Belmonte, Sheref S. Mansy
#' @usage PdPDB(path, metal, n, perc, interactive, dropsReplicate)
#' @keywords metalloproteins, PDB, coordinating patterns, metal, alignment, ligand alignment, motifs
#' @description Looks for amino acid and/or nucleotide patterns coordinated to a given prosthetic centre. It also accounts for small molecule ligands. Patterns are aligned, clustered and translated to logo-like sequences to infer coordination motifs.
#' @references Belmonte L, Mansy SS Patterns of Ligands Coordinated to Metallocofactors Extracted from the Protein Data Bank, Journal of Chemical Information and Modeling (accepted)
#' @param path A string containing the path to the PDB directory.
#' @param metal A string containing the PDB chemical symbol of the target prosthetic centre; e.g. SF4 for [4Fe-4S] cluster, ZN for zinc. The PDB chemical symbol is case sensitive for macOS.
#' @param n A numerical value giving the number of residues to inspect on each side of the ligated amino acid or nucleotide; if n=1 PdPDB searches for x(L)x motif-like chains, if n=2 for xx(L)xx, where (L) is the ligand.
#' @param perc A numerical value about the minimum percent of letters in a column otherwise residues are dropped.
#' @param interactive A numerical value. 0 interactive, 1 automated (will not cut dendrogram), 2 user decided cut. In mode 1 and 2 ExPASy amino acid frequencies are used as reference.
#' @param dropsReplicate A numerical value. 0 keeps replicated patterns, 1 drops replicated patterns entry by entry, 2 keeps only unique patterns.
#' @return PdPDB generates a list of ".csv" and ".svg" files that will be stored in the same folder of the analyzed pdb/cif files (see "path"), its output is as follows:
#' \item{frequencies.csv}{PDB-like patterns (i.e. with PDB chem Ids). "-" and "+" are used for residues out of the n inspecting window or from different monomers, respectively. Patterns come along with their frequency.}
#' \item{alignment.csv}{Ligand-aligned patterns with dashes, plus signs and gaps ("*"). See 'frequencies.csv'.}
#' \item{following_X_enrichment.csv}{n files. Each file contains enrichment score, z-score and statistics at up to n following positions. X is the +position from ligated residue.}
#' \item{ligands_enrichment.csv}{Enrichment scores and statistics for ligands.}
#' \item{notLigands_enrichment.csv}{Enrichment statistics for the whole specimen but ligands.}
#' \item{preceeding_X_enrichment.csv}{As for "following" but this is meant for residues preceeding ligands. See "following_X_enrichment.csv."}
#' \item{root_enrichment.csv}{Overall enrichment score.}
#' \item{logo_Y.csv}{Y files. Each file contains the logo and consensus sequence for a cluster. Y is the cluster number.}
#' \item{dendrogram.svg}{The dendrogram along with the user decided cutoff and clusters.}
#' \item{following_X_proportions.svg}{Plot of the enrichment score per each amino acid in following positions.}
#' \item{ligands_proportions.svg}{Plot of the enrichment score per each amino acid in ligated position.}
#' \item{notLigands_proportions.svg}{Plot of the enrichment score per each amino acid in non ligated position.}
#' \item{preceeding_X_proportions.svg}{Plot of the enrichment score per each amino acid in preceeding positions.}
#' \item{root_proportions.svg}{Plot of the root enrichment score.}
#' \item{logo_Y.svg}{Plot of the logo and consensus sequence of the Yth cluster. The complete aligned cluster is given as homonym '.csv' file. Sequences come along with percentages. If the dendrogram is not cut the root logo is given.}
#' \item{following_X_standardized.svg}{Plot of the z-score per each amino acid in following positions.}
#' \item{ligands_standardized.svg}{Plot of the z-score per each amino acid in ligated position.}
#' \item{notLigands_standardized.svg}{Plot of the z-score per each amino acid in non ligated position.}
#' \item{preceeding_X_standardized.svg}{Plot of the z-score per each amino acid in preceeding positions.}
#' \item{root_standardized.svg}{Plot of the root z-score.}
#' \item{patterns.csv}{PDB like extracted patterns along with the PDB ID and metal IDs. Useful for debugging. Needed for restore.}
#' \item{PdPDB.log}{PdPDB log file. Useful for debugging. Needed for restore.}
#' @note Files have to be in the local file system and contain the ".pdb" or ".cif" extension. Output files use brackets to highlight ligands and/or 'L' in heading line.
#' @importFrom utils count.fields head read.csv read.table write.table
#' @importFrom plyr count ldply rbind.fill
#' @importFrom graphics axis legend plot points text mtext par abline arrows barplot
#' @importFrom stats as.dist cutree hclust rect.hclust as.dendrogram order.dendrogram pnorm p.adjust prop.test wilcox.test
#' @importFrom utils adist write.csv
#' @importFrom grDevices dev.copy dev.off svg
#' @importFrom dendextend labels_colors<- labels_cex<-
#' @importFrom tseries jarque.bera.test
#' @examples
#' ################ Defining path to PDBs
#' path_to_PDB="inst/extdata/PDB" # this is where pdb/cif files are stored
#'
#' ################ Research Parameters
#' metal="SF4" # searches for [4fe-4s] coordinating patterns
#' n=1 # searches for x(L)x patterns, (L) coordinates to SF4
#' perc=20 # drops residues with less than the 20% of frequency
#' interactive= 0 # interactive. User decided references and dendrogram cut
#' dropsReplicate=0 # do not remove replicated patterns
#'
#' ################ Launch PdPDB
#' PdPDB(path_to_PDB,metal,n, perc, interactive, dropsReplicate)
#' @export
PdPDB <- function(path, metal, n, perc, interactive, dropsReplicate){
  # Pipeline driver: runs the bundled shell helpers to extract the patterns
  # coordinated to `metal` from the pdb/cif files found in `path` (or reuses an
  # existing patterns.csv), then aligns, trims, clusters and writes logos,
  # frequency tables and enrichment statistics into `path`.
  #
  # path           directory holding the pdb/cif files; also receives the output
  # metal          chemical symbol handed to the findPatterns helper
  # n              number of positions inspected on each side of a ligand
  # perc           trimming threshold handed (as absolute value) to trimAlignment
  # interactive    0 = ask the user, 1/2 = use the built-in ExPASy reference
  # dropsReplicate 0 keep replicates, 1 drop per entry, 2 keep unique patterns

  # Directory of the shell helpers shipped with the package.
  path_to_commands <- file.path(find.package("PdPDB"), "exec/")

  # Clamp the replicate-handling mode to the supported range 0..2.
  if (dropsReplicate > 2) dropsReplicate <- 2
  if (dropsReplicate < 0) dropsReplicate <- 0

  # Shell prefix that moves into the working directory. The path is tilde
  # expanded and quoted so that blanks or shell metacharacters cannot split or
  # alter the command; `&&` keeps the helper from running in the wrong
  # directory when `cd` fails.
  path_to_out <- paste0("cd ", shQuote(path.expand(path)), " &&")

  # Log file written by this function's epilogue.
  logFile <- file.path(path, "PdPDB.log")

  print("########################################################## ")
  print(" PdPDB: Pattern discovery in PDB files")
  print("########################################################## ")

  ################
  saveLogoText <- 1 # 1 = also save every cluster logo as a csv table
  verbose <- 0 # turn to 1 for debug

  ################ Long tests
  # Out-of-range modes fall back to mode 2.
  if ((interactive > 2) || (interactive < 0)) {
    interactive <- 2
  }
  debug <- interactive
  if ((debug == 1) || (debug == 2)) {
    print(paste("Interactive mode: ", debug))
    # Built-in reference composition (percent per amino acid), expasy.
    reference <- as.data.frame(c(8.26,1.37,5.46,6.74,3.86,7.08,2.27,5.93,5.82,9.65,2.41,4.06,4.72,3.93,5.53,6.60,5.35,6.86,1.09,2.92)) # expasy
    colnames(reference) <- "referenceVec"
    rownames(reference) <- c("A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y")
    numberOfSamples <- 554515 # expasy
    averageLength <- 357 # expasy
    defineSeq <- 1
    defineDictionary <- 0
  }

  # for debug comment the following if section and setup recovery however you prefer
  # An existing patterns.csv means a previous run can be resumed.
  if (file.exists(file.path(path, "patterns.csv"))) {
    print("")
    print(paste("PdPDB has found a job to recover!!! ", file.path(path, "patterns.csv")))
    print("If you want to start from scratch remove it and restart!")
    print("WARNING: check n before to proceed. It must be compliant with previous run for PdPDB to work properly!!!")
    print("")
    recovery <- 1
  } else {
    recovery <- 0
  }

  ################
  if ((n > 5) && (perc < 40)) {
    print("For optimal results consider to increase Trim frequency or reduce n...")
    print(" ")
  }
  if (verbose == 1) { print("VERBOSE MODE ON...") }

  # check if bash scripts are installed correctly - if not escape!
  noScripts <- 0
  if ((!file.exists(file.path(path_to_commands, "fileManager"))) ||
      (!file.exists(file.path(path_to_commands, "findPatterns")))) {
    # paste(), not file.path(): the message must not be joined with "/".
    print(paste("Some of the core functions of PdPDB are missing, please check paths:", path_to_commands))
    noScripts <- 1
  }

  # Ask permission for file system access, if denied abort execution!
  writeOnFS <- readline(paste("Do you want PdPDB writes results in ", path, "? [1= Yes, 0=No] ", sep = " "))
  # writeOnFS <- 1
  if (dropsReplicate == 2) print("PdPDB now keeps only unique entries...")
  if (dropsReplicate == 1) print("PdPDB now drops replicated patterns in PDB entries...")
  if (dropsReplicate == 0) print("PdPDB now keeps replicated patterns...")

  # Only an explicit numeric 0 aborts; any other answer proceeds.
  writeAnswer <- suppressWarnings(as.numeric(writeOnFS))
  writeAllowed <- TRUE
  if (!is.na(writeAnswer) && (writeAnswer == 0)) {
    print("PdPDB ABORTED!!!")
    noScripts <- 1
    writeAllowed <- FALSE
  }

  # Safe mode - Backup directories are created
  if (noScripts == 0) {
    command1 <- paste(path_to_out, shQuote(paste0(path_to_commands, "fileManager")))
    system(command1)
    # Verbosity
    if (verbose == 1) {
      print(command1)
    }

    # launch findPatterns
    filesFound <- TRUE
    if (recovery == 0) {
      print("PdPDB runs from scratch!")
      pdbNames <- list.files(path = path, pattern = "\\.pdb$")
      cifNames <- list.files(path = path, pattern = "\\.cif$")
      filesFound <- FALSE
      if ((length(pdbNames) > 0) || (length(cifNames) > 0)) {
        # The pdb helper is used only when pdb files outnumber cif files.
        if (length(pdbNames) > length(cifNames)) {
          print("PDB entries correctly located")
          finder <- "findPatterns"
        } else {
          print("CIF entries correctly located")
          finder <- "findPatternsX"
        }
        command2 <- paste(path_to_out,
                          shQuote(paste0(path_to_commands, finder)),
                          shQuote(metal),
                          shQuote(n),
                          sep = " ")
        if (verbose == 1) { print(command2) }
        system(command2)
        filesFound <- TRUE
      } else {
        print("Invalid PATH! No PDB entries in this directory!")
      }
    } # eo if recover

    ############### Pattern analysis
    if (isTRUE(filesFound)) {
      print ("Patterns analysis START!!!!")
      patternsFile <- file.path(path, "patterns.csv")
      # check if file exist before recovery
      if (file.exists(patternsFile)) {
        # check if the generated files are suitable for analysis before to launch the pipeline, is it > 10b?
        datFileSize <- file.info(patternsFile)$size
        # check if file is empty
        if (datFileSize > 10) {
          if (mean(count.fields(patternsFile)) > 1) {
            block <- read.csv(patternsFile, header = FALSE, sep = " ", dec = ".", stringsAsFactors = FALSE)

            if (dropsReplicate != 1) {
              patterns <- arrangePatterns(block, n)
              if (dropsReplicate == 2) {
                print("---> Drop option 2 <---")
                patterns <- patterns[!duplicated(patterns), ]
              }
            } else {
              # this will iterate on all the pdb entries
              # get starting positions
              print("---> Drop option 1 <---")
              print("Please be patient this option takes time...")
              sbsp <- grep("pdb", block[, 1]) # subBlockStartPoints = sbsp
              patterns <- subPatternsExtract(block, sbsp, n, verbose)
            }

            # Optional extra symbols are only asked for in interactive mode 0.
            defineDictionary <- 0
            if (debug == 0) {
              defineDictionary <- as.integer(readline("Type in the number of additional symbols to be used: [ 0 = no symbols ] "))
            }
            if (!is.na(defineDictionary) && (defineDictionary >= 1)) {
              dictionary <- defineADictionary(defineDictionary)
            } else {
              dictionary <- NA
            }

            fastaPatternsAligned <- alignLigands(toFasta(patterns, dictionary, verbose), n, verbose) # translate to FASTA and align ligands

            # calculate frequencies
            print("Calculating frequencies...")
            df <- as.data.frame(count(patterns))
            df.new <- df[order(df$freq, decreasing = TRUE), ]
            numberOfPDB <- sum(df.new$freq)
            df.new[["freq_%"]] <- df.new$freq / numberOfPDB * 100
            write.table(df.new, file = file.path(path, "frequencies.csv"), row.names = FALSE, na = "", sep = " ", quote = FALSE)

            fastaPatterns <- checkChains(fastaPatternsAligned, which(colnames(fastaPatternsAligned) == "L"), dictionary, verbose)

            # trim and plot logo
            percAbs <- abs(perc)
            trimmed <- trimAlignment(fastaPatterns, percAbs, colnames(fastaPatterns), verbose) # removing rows that contains perc% of "*"; ie missing positions
            lastCol <- ncol(trimmed)
            for (i in seq_len(nrow(trimmed))) {
              # Cells starting with "(" are ligands.
              ligandCells <- which(substr(trimmed[i, ], 1, 1) == "(")
              if (length(ligandCells) > 0) {
                lastLigand <- max(ligandCells) + 1 # it is the position next to the last L
                if (lastLigand < lastCol) {
                  trimmed[i, lastLigand:lastCol] <- moveCharLeft(trimmed[i, lastLigand:lastCol], verbose)
                }
              } else {
                print(paste("WARNING at line of the alignment!!! ", i))
              }
            }
            write.table(trimmed, file = file.path(path, "alignment.csv"), row.names = FALSE, na = "", sep = " ", quote = FALSE)

            # clustering
            df.clusters <- clustering(trimmed, path, "dendrogram.svg", debug)
            # order clusters
            df.clusters.new <- df.clusters[order(df.clusters$points, decreasing = FALSE), ]
            numberOfClusters <- max(df.clusters.new$points)
            print(paste("This action results in", numberOfClusters, "clusters!", sep = " "))

            # make a logo for each cluster
            for (i in seq_len(numberOfClusters)) {
              # get sequences clustered together - i.e. with the same score
              cluster <- df.clusters.new[which(df.clusters.new$points == i), ]
              # One character per column, brackets stripped: the layout the
              # logo functions are called with.
              residues <- strsplit(gsub("\\(|\\)", "", cluster$seq), split = "")
              df.cluster <- as.data.frame(matrix(unlist(residues), nrow = nrow(cluster), byrow = TRUE))
              maxFreqSeq <- maxFreq(df.cluster, colnames(trimmed), verbose)
              colnames(df.cluster) <- colnames(trimmed)
              makeTheLogo(df.cluster, colnames(df.cluster), maxFreqSeq, dictionary, path, paste("logo_", i, ".svg", sep = ""))
              if (saveLogoText == 1) {
                if (verbose == 1) print("PdPDB is going to save logos in text format")
                df.logo <- as.data.frame(count(df.cluster))
                df.logo.new <- df.logo[order(df.logo$freq, decreasing = TRUE), ]
                # Kept separate from numberOfPDB: that total is still needed
                # by the enrichment analysis below and must not be replaced
                # by the size of the last cluster.
                clusterTotal <- sum(df.logo.new$freq)
                df.logo.new <- as.data.frame(cbind(df.logo.new, df.logo.new$freq / clusterTotal * 100))
                names(df.logo.new) <- c(names(df.cluster), "freq", "freq_%")
                # write.csv fixes the separator and header itself, so no
                # `sep`/`col.names` are passed (they were ignored with a warning).
                write.csv(df.logo.new, file = file.path(path, paste("logo_", i, ".csv", sep = "")), row.names = FALSE, na = "", quote = FALSE)
              }
            }
            # EO Clustering

            # enrichement analysis
            # bonferroniCorrection <- NA
            if (debug == 0) { # remove after debug
              defineSeq <- as.numeric(readline("Do you want to define a reference sequence for enrichment analysis? [ Yes = 1 ] ")) # turn on after debug
            }
            if (!is.na(defineSeq) && (defineSeq == 1)) {
              print("This will save a CSV file containing standardized distribution and Chi-squared p values")
              if (debug == 0) {
                # definition of the reference
                averageLength <- as.numeric(readline("Type in the average sequence length of the reference: "))
                numberOfSamplesP <- as.numeric(readline("PROTEINS: How many samples have been used to define this reference? "))
                reference <- defineSeqRef(1, 0)
                if (!is.na(numberOfSamplesP) && (numberOfSamplesP > 0)) {
                  numberOfSamples <- numberOfSamplesP
                } else {
                  numberOfSamples <- 0
                  print("###### ERROR: a void reference has been defined!!!")
                }
              }

              # root enrichment analysis
              enrichmentAnalysis(trimmed, numberOfPDB, reference, numberOfSamples, averageLength, path, "root", verbose)
              # enrichment for ligands
              ligands <- trimmed[which(colnames(trimmed) %in% "L")]
              enrichmentAnalysis(ligands, numberOfPDB, reference, numberOfSamples, averageLength, path, "ligands", verbose)
              notligands <- trimmed[which(colnames(trimmed) %in% "x")]
              enrichmentAnalysis(notligands, numberOfPDB, reference, numberOfSamples, averageLength, path, "notLigands", verbose)

              # enrichment analysis position-by-position
              ligandCols <- which(substr(colnames(trimmed), 1, 1) %in% "L")
              validCols <- seq_len(ncol(trimmed))
              for (i in seq_len(n)) {
                colnumbersP <- ligandCols - i # select -i positions
                colnumbersP.filt <- colnumbersP[colnumbersP %in% validCols] # take the possible ones
                preceed.old <- trimmed[colnumbersP.filt]
                preceed <- preceed.old[substr(colnames(preceed.old), 1, 1) != "L"] # discard ligands
                enrichmentAnalysis(preceed, numberOfPDB, reference, numberOfSamples, averageLength, path, paste("preceeding", i, sep = "_"), verbose)

                colnumbersF <- ligandCols + i # select +i positions
                colnumbersF.filt <- colnumbersF[colnumbersF %in% validCols]
                follow.old <- trimmed[colnumbersF.filt]
                follow <- follow.old[substr(colnames(follow.old), 1, 1) != "L"]
                enrichmentAnalysis(follow, numberOfPDB, reference, numberOfSamples, averageLength, path, paste("following", i, sep = "_"), verbose)
              }
            } # EO Enrichment
          } else {
            print("PdPDB is not able to get suitable PATTERNS... check PdPDB.log file!")
          }
        } else {
          print("Empty patterns.csv file!")
        }
      } else {
        print ("Missing patterns.csv: No file to recovery!")
      }
    } else {
      # No patterns analysis
      print ("Patterns won't be analyzed!!!")
    }
  }

  # Appends one line to PdPDB.log inside `path`. A failure is reported as a
  # warning so that a log problem never turns the epilogue into an error.
  appendLog <- function(msg) {
    tryCatch(
      cat(msg, "\n", file = logFile, sep = "", append = TRUE),
      error = function(e) {
        warning("PdPDB could not write to ", logFile, ": ", conditionMessage(e), call. = FALSE)
      }
    )
  }

  # The file system is left untouched when the user denied write permission.
  if (writeAllowed) {
    # file.path() keeps the glob and the log inside `path` even when `path`
    # has no trailing slash.
    unlink(file.path(path, "*.tmp"))
    appendLog(paste0("Alignment has been trimmed to ", perc, "%"))
    appendLog(paste0("Drop option was ", dropsReplicate, "!"))
    if (recovery == 1) {
      appendLog("You recovered an old job")
      appendLog(paste0("n is now tuned to ", n))
    }
  }

  print(" ")
  print("---------------------------------------------------------- ")
  print(paste("Logos and output files (.csv, .svg, .log) are in ", path, sep = " "))
  print("---------------------------------------------------------- ")
  print(" ")
  print("Legend for 'frequencies.csv' file: () = ligand, - = conect residues interleaved by of more than a position in the same chain, ... = conect residues from different chains")
  print("Legend for logos and 'alignment.csv' file: * = gap introduced by ligands alignment")
  print(" ")
  print("########################################################## ")
  if (writeAllowed) {
    appendLog(" ")
    appendLog("PdPDB exited gracefully...!!!")
  }
  print(" PdPDB exited gracefully...!!!")
  print("########################################################## ")
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.