Nothing
#'SMILES generator
#' @description SMILES generator thanks to a sequential Monte-Carlo sampler
#'
#' @param smis is an initial vector of SMILES from which the generation of novel SMILES begins.
#' @param v_engram is an Engram object, \emph{a priori} created, encapsulating the SMILES grammar
#' (see \code{\link{ENgram}} for more details).
#' @param v_m is a positive scalar representing the order of a used ENgram model for generation.
#' @param v_qsprpred is a QSPRpred object in which a regression model, initially trained, is accessible for properties
#' predictions (i.e. physico-chemical properties here) of compounds from newly created SMILES.
#' @param v_temp is a vector of numerical values, with a length equals to the number of properties, which represents
#' the annealing parametrization in the sequential Monte-Carlo sampler.
#' @param v_decay is a positive scalar corresponding to the decay rate of temp above (\eqn{temp_{i+1}=temp_{i}^decay}).
#' @param v_ESSth is a positive scalar representing the threshold from which a re-sampling over the set of newly created SMILES is
#' done (0.5 by default). This threshold limits the degeneracy in the set of newly created SMILES. A lower (higher) value allows
#' more (less) degeneracy.
#' @param gentype is the type of the procedure used by the SMILES strings generator. For a Back-off procedure, use "ML"
#' (by default), and for a Neaser-Nay smoothing procedure, use "KN".
#' @param v_maxstock is the maximum of newly created SMILES kept in stock (2000 by default).
#' @param keeptrack is set to TRUE by default. It allows the tracking of the mean of predicted properties, and thus the plotting
#' and/or listing of the latest newly created SMILES during the generation process. It is extremely useful in order to tune the
#' annealing parameters, as to visualize the convergence speed to a targeted physico-chemical properties space.
#' @param smidatabase is a vector of known SMILES to which the generated SMILES should not match. This is useful to avoid the creation
#' of SMILES with great similarity with existing and/or un-wanted ones.
#' @examples
#' \dontrun{#sample data
#' data(qspr.data)
#' idx <- sample(nrow(qspr.data), 5000)
#' smis <- paste(qspr.data[idx,1])
#' y <- qspr.data[idx,c(2,5)]
#'
#' #learning a pattern of chemical strings
#' data(trainedSMI)
#' data(engram_5k) #same as run => engram <- ENgram$new(trainedSMI, order=10)
#'
#' #learning QSPR model
#' data(qsprpred_EG_5k)
#' #same as run => qsprpred <- QSPRpred$new(smis=smis, y=as.matrix(y), v_fpnames="graph")
#'
#' #set target range
#' targ.min <- c(200,1.5)
#' targ.max <- c(350,2.5)
#' qsprpred_EG_5k$set_target(targ.min,targ.max)
#'
#' #getting chemical strings from the Inverse-QSPR model
#' smchem <- SmcChem$new(smis = rep("c1ccccc1O", 25), v_qsprpred=qsprpred_EG_5k,
#' v_engram=engram_5k,temp=3)
#'
#' smchem$smcexec(niter=5, preorder=0, nview=4)
#' #if OpenBabel (>= 2.3.1) is installed, you can use reordering for better mixing as
#' #smchem$smcexec(niter=100, preorder=0.2, nview=4)
#' #see http://openbabel.org
#'
#' #check
#' gensmis <- smchem$get_hiscores(nsmi=5, exsim=0.9)
#' pred <- qsprpred_EG_5k$qspr_predx(gensmis[,1])}
#'
#' @import methods
#' @importFrom graphics abline plot rasterImage
#' @importFrom utils packageVersion
#' @importFrom methods new
#' @export SmcChem
#' @exportClass SmcChem
SmcChem <- setRefClass("SmcChem",
fields = list(
esmi = "list",
oldesmi = "list",
m = "numeric",
engram = "ENgram",
qsprpred = "QSPRpred",
weights = "numeric",
oldscore = "numeric",
fkernel = "numeric",
bkernel = "numeric",
isvalid = "logical",
temp = "numeric",
ESSth = "numeric",
decay = "numeric",
maxstock = "numeric",
smistock = "character",
scorestock = "numeric",
#fpstock = "matrix",
pytrack = "logical",
predystock = "matrix",
predytrack = "list",
smidb = "character"
),
methods = list(
initialize = function(smis=NULL, v_engram=NULL, v_m=NULL, v_qsprpred=NULL, v_temp=c(1,1),
v_decay=0.95, v_ESSth=0.5, gentype="ML", v_maxstock=2000,
keeptrack=TRUE, smidatabase=NULL){
"Initialize the SMC chemical generator with initial SMILES strings smis, ENgram class object v_engram and QSPRpred class object v_qsprpred"
if(is.null(v_engram)){
cat("Need ENgram object\n")
return(NULL)
}else{
engram <<- v_engram
}
if(is.null(v_qsprpred)){
cat("Need QSPRpred object\n")
return(NULL)
}else{
qsprpred <<- v_qsprpred
}
if(is.null(v_m)){
m <<- length(engram$mat)
}else{
m <<- v_m
}
ESSth <<- v_ESSth
if(is.null(smis)){
esmi <<- lapply(rep("c1ccccc1O", 100), function(x) Esmi$new(x, m=m, engram, type=gentype))
}else{
esmi <<- lapply(smis, function(x) Esmi$new(x, m=m, engram, type=gentype))
}
oldesmi <<- sapply(esmi, function(x) x$copy())
csmi <- sapply(esmi, function(x) x$get_validsmi())
temp <<- v_temp
num_prop <- qsprpred$propndim+length(qsprpred$func)
# if(dim(temp)[1]==num_prop){
# if(dim(temp)[2]!=length(csmi)){
# if(dim(temp)[2]==1){
# temp <<- matrix(rep(temp,length(csmi)),nrow=num_prop,ncol=length(csmi))
# cat("The defined numerical vector of initial temperatures will be applied to the",length(csmi),"annealing processes.\n\n")
# } else {
# cat("Caution: The provided temperature matrix has a number of columns: ",dim(temp)[2]," that doesn't match the number of initial SMILES: ",length(csmi),".\n\n",sep="")
# return(NULL)
# }
# }
# } else {
# cat("Caution: The provided temperature vector/matrix has a number of elements/lines: ",dim(temp)[1]," that doesn't match the number of properties: ",qsprpred$propndim+length(qsprpred$func),".\n",sep="")
# cat("Pay attention that if you have requested an empirical calculation of supplementary properties, you shoud take them into account and apply an individual temperature to each of them.\n\n")
# return(NULL)
# }
invpredtmp <- qsprpred$iqspr_predict(csmi, temp)
oldscore <<- invpredtmp[[1]]
weights <<- rep(1, length(esmi))
fkernel <<- rep(1, length(esmi))
bkernel <<- rep(1, length(esmi))
isvalid <<- rep(TRUE, length(esmi))
decay <<- v_decay
maxstock <<- v_maxstock
smistock <<- character(0)
scorestock <<- numeric(0)
#smistock <<- c("")
#scorestock <<- c(0)
#fpstock <<- matrix(0, nrow = 1, ncol=ncol(invpredtmp[[4]]))
pytrack <<- keeptrack
predystock <<- matrix(0, nrow = 1, ncol=nrow(invpredtmp[[2]])) # ncol = nrow(...) to take the transpose
predytrack <<- list()
smidb <<- as.character(smidatabase)
},
smcexec = function(niter, nsteps=5, preorder=0, nview=0){
"modify chemical structures with niter SMC updates"
for(i in 1:niter){
localmove(nsteps)
update_particles()
reordering(preorder)
if(nview>0){
idx <- which(isvalid==T)
viewstr(idx[1:nview])
}
}
},
localmove = function(nstep){
oldesmi <<- sapply(esmi, function(x) x$copy())
for(i in 1:length(esmi)){
if(i<length(esmi)){
cat("\rlocal move for ", i, "th molecules")
}else{
cat("\rlocal move for ", i, "th molecules\n")
}
u <- rbinom(1, nstep, 0.5)
rep.times <- min(u,length(esmi[[i]]$vstr)-2)
if(rep.times<0){
rep.times <- 0 # rep.times <- u
}
steps <- c(rep(1, rep.times), rep(0, nstep-u))
suppressWarnings(tryres <- try(tempk <- esmi[[i]]$chem_local(engram, steps, 1), silent=T))
if(class(tryres)=="try-error"){
fkernel[i] <<- 1
bkernel[i] <<- 1
isvalid[i] <<- F
}else{
fkernel[i] <<- 1
bkernel[i] <<- 1
isvalid[i] <<- TRUE
}
}
},
reordering = function(prob){
idxs <- which(sapply(esmi, function(x) nchar(x$get_validsmi()))>1)
idxs <- intersect(idxs, which(sapply(esmi, function(x) x$numn+x$bcount)==0))
idxs <- intersect(idxs, which(isvalid==T))
idxs <- sample(idxs, round(length(idxs)*prob, 0))
for(idx in idxs){
natoms <- esmi[[idx]]$get_natoms()
prevsmi <- esmi[[idx]]$get_validsmi()
if(.Platform$OS=="windows"){
suppressWarnings(newsmi <- system(paste('obabel -:"', prevsmi ,'" -osmi -xf ', sample(natoms, 1) , sep=""), intern=T, show.output.on.console = F, ignore.stderr=T))
}else{
suppressWarnings(newsmi <- system(paste('obabel -:"', prevsmi ,'" -osmi -xf ', sample(natoms, 1) , sep=""), intern=T, ignore.stderr=T))
}
newsmi <- substr(newsmi[1], 1, nchar(newsmi)-1)
if(idx!=idxs[length(idxs)]){
cat("\rreordering prev:", prevsmi, "=> new:", newsmi, paste(rep(" ", 40), collapse=" "))
flush.console()
}else{
cat("\rreordering prev:", prevsmi, "=> new:", newsmi, paste(rep(" ", 40), collapse=" "), "\n\n")
flush.console()
}
suppressWarnings(reschk <- try(tempesmi<- Esmi$new(smi=newsmi, m=m, engram), silent=T))
if(class(reschk)!="try-error"){ # exception for reordering
temp_vstr <- tempesmi$vstr
idxc <- grep("=\\[S", temp_vstr)
if(length(idxc)>0) temp_vstr[idxc] <- sapply(idxc, function(x) paste(substr(temp_vstr[x], 1, 1), substr(temp_vstr[x], 3, 3), sep=""))
idxc <- grep("=\\[C", temp_vstr)
if(length(idxc)>0) temp_vstr[idxc] <- sapply(idxc, function(x) paste(substr(temp_vstr[x], 1, 1), substr(temp_vstr[x], 3, 3), sep=""))
idxc <- grep("=\\[N", temp_vstr)
if(length(idxc)>0) temp_vstr[idxc] <- sapply(idxc, function(x) paste(substr(temp_vstr[x], 1, 1), substr(temp_vstr[x], 3, 3), sep=""))
idxc <- grep("=\\[O", temp_vstr)
if(length(idxc)>0) temp_vstr[idxc] <- sapply(idxc, function(x) paste(substr(temp_vstr[x], 1, 1), substr(temp_vstr[x], 3, 3), sep=""))
idxc <- grep("\\[S", temp_vstr)
if(length(idxc)>0) temp_vstr[idxc] <- sapply(idxc, function(x) substr(temp_vstr[x], 2, 2))
idxc <- grep("\\[C", temp_vstr)
if(length(idxc)>0) temp_vstr[idxc] <- sapply(idxc, function(x) substr(temp_vstr[x], 2, 2))
idxc <- grep("\\[N", temp_vstr)
if(length(idxc)>0) temp_vstr[idxc] <- sapply(idxc, function(x) substr(temp_vstr[x], 2, 2))
idxc <- grep("\\[O", temp_vstr)
if(length(idxc)>0) temp_vstr[idxc] <- sapply(idxc, function(x) substr(temp_vstr[x], 2, 2))
tempesmi$vstr <- temp_vstr
tempesmi$vstr2smi()
suppressWarnings(reschk <- try(tempesmi <- Esmi$new(smi=tempesmi$get_validsmi(), m=m, engram), silent=T))
if(class(reschk)!="try-error"){ # exception for reordering
ar <- 1
if(runif(1, 0, 1) < ar){
esmi[[idx]] <<- tempesmi
weights[idx] <<- weights[idx]*ar
}
}
}
}
weights <<- weights/sum(weights)
},
update_particles = function(){
csmi <- sapply(esmi, function(x) x$get_validsmi())
isvalid[which(sapply(csmi, function(x) class(try(parse.smiles(as.character(x), kekulise=T), silent=T))!="list"))] <<- F
idx <- which(isvalid==T)
mols <- parse.smiles(as.character(csmi[idx]), kekulise=T)
rcdk_version <- as.numeric(gsub("\\.","",packageVersion("rcdk")))
if(rcdk_version > 338){
csmi[idx] <- sapply(mols, function(x) get.smiles(x))
} else {
csmi[idx] <- sapply(mols, function(x) get.smiles(x, aromatic = T, type = 'generic'))
}
invpredtemp <- qsprpred$iqspr_predict(csmi[idx], temp)
newscore <- numeric(length(csmi))
newscore[idx] <- invpredtemp[[1]]
newscore[-idx] <- 0
smistock <<- c(smistock, csmi[idx])
scorestock <<- c(scorestock, newscore[idx])
#fpstock <<- rbind(fpstock, newfpstock)
or <- order(scorestock, decreasing=T)
smistock <<- smistock[or[1:min(length(or),maxstock)]]
scorestock <<- scorestock[or[1:min(length(or),maxstock)]]
#fpstock <<- fpstock[or[1:min(length(or),maxstock)],]
if(pytrack){
newpredystock <- data.matrix(t(invpredtemp[[2]]))
if(dim(predystock)[1]==1 && apply(predystock,1,sum)==0){
predystock <<- newpredystock
} else {
predystock <<- data.matrix(rbind(predystock, newpredystock))
}
predystock <<- data.matrix(predystock[or[1:min(length(or),maxstock)],])
predytrack_size <- length(predytrack)
#predytrack[[predytrack_size+1]] <<- predystock[1:min(length(or),100),] # keep track of 100 predy max. per step
predytrack[[predytrack_size+1]] <<- data.frame(csmi[idx],invpredtemp[[1]],t(invpredtemp[[2]]),t(invpredtemp[[3]]))
}
weights <<- weights*(newscore/oldscore)
weights[which(oldscore==0)] <<- 0
weights <<- weights/sum(weights)
ESS <- 1/sum(weights^2)
if(ESS < ESSth*length(esmi)){
cat("weight update done, ESS=", ESS, "\n")
idx <- sample(1:length(esmi), prob=weights, replace=T)
esmi <<- lapply(esmi[idx], function(x) x$copy())
weights <<- rep(1/length(esmi), length(esmi))
oldscore <<- newscore[idx]
cat("resampling done\n\n")
}else{
cat("weight update done, ESS=", ESS, "\n\n")
oldscore <<- newscore
}
temp <<- temp^decay
},
get_smiles = function(){
"get SMILES strings from the SmcChem object (same as get_smiles function) "
csmi <- sapply(esmi, function(x) x$get_validsmi())
idx <- which(isvalid==T)
mols <- parse.smiles(as.character(csmi[idx]), kekulise=T)
rcdk_version <- as.numeric(gsub("\\.","",packageVersion("rcdk")))
if(rcdk_version > 338){
csmi[idx] <- sapply(mols, function(x) get.smiles(x, flavor = smiles.flavors(c('Generic','UseAromaticSymbols'))))
} else {
csmi[idx] <- sapply(mols, function(x) get.smiles(x, aromatic = T, type = 'generic'))
}
csmi
},
get_hiscores = function(nsmi=100, exsim=0.8){
"get chemical structures with high QSPR score from SmcChem object (same as get_hiscores function) "
SMILES <- character(0)
QSPRScore <- numeric(0)
tsmi <- smistock
tscore <- scorestock
if(!is.null(smidb)){
smired <- tsmi %in% smidb # Check if newly created molecules already exist in the database
tsmi <- tsmi[!smired]
tscore <- tscore[!smired]
}
fp <- get_descriptor(smis = paste(tsmi), desctypes = c("standard","extended","circular"))[[1]]
# To be used if
#fpnames corresponds to fingerprints alone
#fp <- fpstock # Not wanted for a Tanimoto similarity check
cat("\n")
j <- 1
while((j<=nsmi) & (length(tsmi)>2)){
SMILES <- c(SMILES, tsmi[1])
QSPRScore <- c(QSPRScore, tscore[1])
sim <- apply(fp[-1,], 1, function(x) sum(x*fp[1,])/sum((x+fp[1,])>=1)) # Tanimoto similarity
#print(sim)
fp <- fp[-1,]
tsmi <- tsmi[-1]
if(length(tsmi)>1){
if(sum(sim>=exsim)>0){
fp <- fp[-which(sim>=exsim),]
tsmi <- tsmi[-which(sim>=exsim)]
tscore <- tscore[-which(sim>=exsim)]
}
}
cat("\r", j, "th molecules is chosen")
j <- j + 1
}
cat("\n")
return(cbind(SMILES, QSPRScore))
},
viewstr = function(idx){
"view 2D structures from SMILES string vector with index idx (same as viewstr function) "
tsmi <- sapply(smchem$esmi, function(x) x$get_validsmi())[idx]
nrow = ncol = ceiling(sqrt(length(idx)))
par(mar = c(0,0,0,0))
par(mfrow = c(nrow,ncol))
smisl <- length(tsmi)
for(i in 1:smisl){
mol <- parse.smiles(as.character(tsmi[i]), kekulise = T)[[1]] # if kekulise = F, aromatic rings are missed!
dep <- get.depictor(width=500, height=500, zoom=3)
im_temp <- view.image.2d(molecule=mol,depictor=dep)
plot(NA,NA,xlim=c(1,10),ylim=c(1,10),xaxt='n',yaxt='n',xlab='',ylab='')
rasterImage(im_temp,1,1,10,10)
legend("topleft", legend[i], bty = "n", cex = 1.3)
}
}
)
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.