#' Extended N-gram model for learning SMILES strings
#' @description Extended N-gram model for learning SMILES strings
#' @examples data(trainedSMI)
#' data(engram_5k) #same as run => engram <- ENgram$new(trainedSMI, order=10)
#' #-----arguments
#' #smis: character vector, SMILES string set for training an extended N-gram model
#' #order: numeric, value representing the maximum order in modified N-gram model
#' #-----arguments
#' @import methods
#' @importFrom methods new
#' @export ENgram
#' @exportClass ENgram
## Reference class generator for the extended N-gram model.
## Fields (types as passed to setRefClass):
##   mat   - list; assigned in update_mat, one entry per order, each holding
##           20 count matrices (context rows x next-token columns)
##   N1p   - list; per-row counts of cells equal to 1, built at the end of update_mat
##   denom - list; per-row sums of each count matrix, built at the end of update_mat
##   mat_D - matrix; order x 20 matrix of n1/(n1+2*n2) ratios
##   order - numeric; maximum N-gram order
## NOTE(review): no closing parenthesis for the `fields` list is visible before
## `methods = list(` in this excerpt -- confirm against the original file.
ENgram <- setRefClass(
Class = "ENgram",
fields = list(
mat = "list",
N1p = "list",
denom = "list",
mat_D = "matrix",
order = "numeric"
methods = list(
## Constructor. Sets the `order` field and resets the `mat` field to an empty list.
##   smis  - SMILES strings for training (not referenced in the lines visible here)
##   order - maximum N-gram order
## NOTE(review): `order <<- 10` is immediately followed by `order <<- order`;
## the conditional that presumably picks the default vs. the supplied value
## (and any call that trains on `smis`) is not visible in this excerpt -- confirm.
initialize = function(smis=NULL, order=NULL){
"Initialize the extend N-gram model with SMILES strings smis and numeric value order"
order <<- 10
order <<- order
mat <<- list()
## update_mat(data): updates the count matrices stored in the `mat` field from the
## SMILES strings in `data`, then recomputes `mat_D`, `denom` and `N1p`.
## The lines below first define local helper functions used by the main procedure.
update_mat = function(data){
"update the model with additional SMILES strings data"
## Enhance a given matrix of n-grams by one gram tagged by its rowname
## Appends a row of zeros (one per existing column) and names that new row `letter`.
## NOTE(review): the helper's return statement / closing brace are not visible here.
enlarge_row <- function(tmat, letter){
tmat <- rbind(tmat, numeric(dim(tmat)[2]))
rownames(tmat)[dim(tmat)[1]] <- letter
## Enhance a given matrix of n-grams by one gram tagged by its colname
## Appends a column of zeros (one per existing row) and names that new column `letter`.
## NOTE(review): the helper's return statement / closing brace are not visible here.
enlarge_col <- function(tmat, letter){
tmat <- cbind(tmat, numeric(dim(tmat)[1]))
colnames(tmat)[dim(tmat)[2]] <- letter
## Check for characters as numbers and numbers only (molecular rings opening/closing)
## Numbers between [ ] are skipped
## The visible test is TRUE when `x` is exactly one digit character ("0".."9").
## NOTE(review): what is returned in the digit and non-digit cases is not visible
## in this excerpt; the caller compares the result against "(" and against the
## entries of `numstk` -- confirm the return values against the original file.
parsenum <- function(x){
if(length(grep("^[0-9]$", x))==1){
## Collapse the vector vec of characters as one string
## Implemented with paste(..., collapse = ""); the commented-out lines below are
## the earlier element-by-element version kept for reference.
## NOTE(review): the statement returning `res` / the closing brace are not visible here.
comb_les <- function(vec){
# if(length(vec)==1){return(vec)}
# res <- vec[1]
# for(i in 2:length(vec)){
# res <- paste(res, vec[i], sep="")
# }
res <- paste(vec,collapse = "")
#######main proc
## Filtering of SMILES with specific characters chain [s, [c, +, etc.
## `cand` holds the indices of `data` that will be used for counting.
## Active filters (the commented-out lines are disabled filters):
##   - drop strings containing "[s"
##   - drop strings containing "[c"
##   - drop strings containing "%"
##   - keep only strings with at least (order + 4) characters
#cand <- which(sapply(as.character(data), function(x) length(grep("\\[[A-Z]", x)))==0)
cand <- 1:length(data) # data row ids from 1 to length(data)
cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\[s", x)))==0))
cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\[c", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\+", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\-", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\.", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("#", x)))==0))
cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("%", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\/", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\\\", x)))==0))
cand <- intersect(cand, which(sapply(as.character(data), function(x) nchar(x)>=(order+4))))
## Initialization of matrix mat and res
## `res` collects the updated per-order matrix lists produced by the main loop.
## When the `mat` field is not already a list with one entry per order, it is
## rebuilt from scratch: for each order m, 20 seed matrices of size 1x1 holding 0,
## with a row name made of m "C" characters and the column name "C".
res <- c()
if(!((is.list(mat)==T) & (length(mat)==order))){ # ?! why
mat <<- list()
for(m in 1:order){ # loop over orders
p_inmat <- c()
for(k in 1:20){ # loop over k = 20 models (presence or absence of unclosed branches (=2) x {0,...,9} unclosed rings number)
# tmp <- matrix(numeric(1), 1, 1)
tmp <- matrix(0, 1, 1) # 1x1 matrix of 0
rownames(tmp) <- comb_les(rep("C", m)) # rownames(tmp) <- string of mxC carbons
# colnames(tmp) <- rep("C")
colnames(tmp) <- "C"
p_inmat <- c(p_inmat, list(tmp)) # k times duplication of a list of 0 matrix with mxC as rowname and C as colname
mat <<- c(mat, list(p_inmat)) # m (=order) lists of k (=1:20) lists of 0 matrix
# of mx"C" rowname and "C" colname
## Main counting pass: for every order m and every retained SMILES string, the
## string is tokenised and (context, next-token) occurrences are accumulated in
## one of the 20 matrices of `tempmat`, selected by `matidx`.
## NOTE(review): this excerpt appears to have lost its brace-only / else lines
## (several consecutive statements assign the same variable, e.g. `tempa`,
## `short_flag`, `lastc`), so the exact branch structure and the inner token
## loop cannot be read off the lines below -- confirm against the original file.
for(m in 1:order){ # loop over orders
tempmat <- mat[[m]] # for m order, k lists of 0 matrix
for(i in cand){ # loop over filtered SMILES
cat("\rdim=", m, " no of molecule=", i)
#str <- as.character(data[[i]])
smi_str <- data[i] # SMILES to parse
## SMILES splitting character by character
## Elements enclosed in square brackets, [ ], are transformed as a single token
smi_vstr <- unlist(strsplit(smi_str, '\\[[^]]*\\](*SKIP)(*F)|(?=)', perl=T))
## A tab character is appended after the last token of the string.
smi_vstr <- c(smi_vstr, "\t")
# vstr <- sapply(1:nchar(str), function(x) substr(str, x, x))
# vstr <- c(vstr, "\t")
# while(1){
# p1 <- which(vstr=="[")
# p2 <- which(vstr=="]")
# if(length(p1)>0){
# vstr[p1[1]] <- paste(vstr[p1[1]:p2[1]], collapse="")
# vstr <- vstr[-((p1[1]+1):p2[1])]
# }else{
# break
# }
# }
## Merge double bond with the right next character
p <- which(smi_vstr=="=") # double bonds search
smi_vstr[p] <- paste(smi_vstr[p], smi_vstr[p+1], sep="")
smi_vstr <- smi_vstr[-(p+1)]
#numchecker <- numeric(9) # 9 zeros -> 9 potential molecular rings maximum - numstk is enough!
numstk <- c() # keep the order of appearance of molecular rings number in a SMILES
start <- 2
numn <- 1 # number of opened molecular rings
numbr <- 0 # number of opened molecular branches
skip_flag <- F # don't change later ?!
## ini_flag <- F; # Not Used !
## tm <- m; # Not Used !
startbr <- c();
## next_flag <- F;# Not Used !
brstart <- F; # molecular branch start flag
brsteps <- 0
## brpos <- c() # Not Used !
brflag <- F;
short_flag <- F
## `b` is the current token window, seeded with the first token; `n` is the
## index of the next token to read from smi_vstr.
b <- smi_vstr[1]
n <- 2
# if(skip_flag==F){
## `a` keeps the window as it was before the next token is added to `b`.
a <- b
## When the window already holds m tokens its first token is dropped before
## appending the next one; the other assignment to `b` appends without dropping.
## NOTE(review): presumably an if/else whose `else` line is missing here.
if(length(b) >= m){
b <- c(b[-1], smi_vstr[n])
b <- c(b, smi_vstr[n])
short_flag <- T
short_flag <- F
lastc <- parsenum(b[length(b)]) # check for numbers (rings opening/closing)
## Branch opening: a context ending in "#brterm#" is pushed onto `startbr`;
## it is popped again further down (see `b <- startbr[[length(startbr)]]`).
if(lastc=="("){ # check for branch opening
if((brstart==T) & (brsteps==2)){
tempa <- "#brterm#"
tempa <- c(a[2:length(a)], "#brterm#")
#cat(tempa, "\n")
startbr <- c(startbr, list(tempa))
brstart <- T
brsteps <- 0
#cat(lastc, " ", brstart, " ", brsteps, " ", numn, " ", b, "\n")
tempa <- "#brterm#"
tempa <- c(a[2:length(a)], "#brterm#")
startbr <- c(startbr, list(tempa))
brstart <- F
brsteps <- 0
brsteps <- brsteps + 1
## opened/closed molecular rings number storage
## `numdiff` is the distance of the matched ring label from the end of `numstk`.
numdiff <- length(numstk)-which(lastc==numstk)
#numchecker[lastc] <- 0
numstk <- numstk[-which(lastc==numstk)]
lastc <- paste("#numt_", numdiff, sep="") ## keep track if rings close in disorder
#numchecker[lastc] <- 1
numstk <- c(numstk, lastc)
lastc <- "#num#"
lastc <- "#brterm#"
brflag <- T
## matidx picks one of the 20 matrices: `numn` offset by 10 when numbr > 0.
matidx <- 10*(numbr>0)+numn
lastc <- "#term#"
b[length(b)] <- lastc
if(short_flag == F){
tempmat[[matidx]] <- enlarge_row(tempmat[[matidx]], comb_les(a))
tempmat[[matidx]] <- enlarge_col(tempmat[[matidx]], lastc)
tempmat[[matidx]][comb_les(a), lastc] <- tempmat[[matidx]][comb_les(a), lastc]+1
b[length(b)] <- lastc
matidx <- 10*(numbr>0)+numn
if(short_flag == F){
tempmat[[matidx]] <- enlarge_row(tempmat[[matidx]], comb_les(a))
tempmat[[matidx]] <- enlarge_col(tempmat[[matidx]], lastc)
tempmat[[matidx]][comb_les(a), lastc] <- tempmat[[matidx]][comb_les(a), lastc]+1 # occurence counting
matidx <- 10*(numbr>0)+numn
## Restore the most recently saved branch context and remove it from the stack.
b <- startbr[[length(startbr)]]
startbr <- startbr[-length(startbr)]
tempmat[[matidx]] <- enlarge_row(tempmat[[matidx]], comb_les(b))
brflag <- F
# }else{
# skip_flag <- F
# }
## Counter updates driven by the token just processed: "#brterm#" decrements
## numbr, "#num#" increments numn, a "numt_" token decrements numn.
## NOTE(review): the condition guarding `numbr <- numbr + 1` is not visible here.
numbr <- numbr + 1
}else if(lastc=="#brterm#"){
numbr <- numbr - 1
}else if(lastc=="#num#"){
numn <- numn + 1
}else if(length(grep("numt_", lastc))==1){
numn <- numn - 1
n <- n+1
## Reorder the rows of every matrix by row name.
for(k in 1:length(tempmat)){
tempmat[[k]] <- tempmat[[k]][order(rownames(tempmat[[k]])),]
## Append this order's matrices to `res`, then store `res` in the `mat` field.
res <- c(res, list(tempmat))
mat <<- res
## Derived statistics recomputed from the updated count matrices.
## mat_D[i, j] = n1 / (n1 + 2*n2), where n1 and n2 are the numbers of cells equal
## to 1 and to 2 in mat[[i]][[j]].
mat_D <<- matrix(0, order, 20)
for(i in 1:nrow(mat_D)){
for(j in 1:ncol(mat_D)){
n1 <- sum(mat[[i]][[j]]==1)
n2 <- sum(mat[[i]][[j]]==2)
mat_D[i,j] <<- n1/(n1+2*n2)
## denom[[i]][[j]]: row sums of mat[[i]][[j]].
## N1p[[i]][[j]]: per row, the number of cells equal to 1.
denom <<- list()
N1p <<- list()
for(i in 1:length(mat)){
p_denom <- list()
p_N1p <- list()
for(j in 1:length(mat[[i]])){
p_denom <- c(p_denom, list(rowSums(mat[[i]][[j]])))
p_N1p <- c(p_N1p, list(apply(mat[[i]][[j]], 1, function(x) sum(x==1))))
## NOTE(review): an NA entry is appended instead in some case whose condition is
## not visible in this excerpt -- confirm against the original file.
p_denom <- c(p_denom, list(NA))
p_N1p <- c(p_N1p, list(NA))
denom <<- c(denom, list(p_denom))
N1p <<- c(N1p, list(p_N1p))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.