Nothing
#'Extended N-gram model for learning SMILES strings
#' @description Extended N-gram model for learning SMILES strings
#'
#' @examples data(trainedSMI)
#' data(engram_5k) #same as run => engram <- ENgram$new(trainedSMI, order=10)
#' #-----arguments
#' #smis: character vector, SMILES string set for training an extended N-gram model
#' #order: numeric, value representing the maximum order in modified N-gram model
#' #-----arguments
#'
#' @import methods
#' @importFrom methods new
#' @export ENgram
#' @exportClass ENgram
ENgram <- setRefClass(
Class = "ENgram",
fields = list(
mat = "list",
N1p = "list",
denom = "list",
mat_D = "matrix",
order = "numeric"
),
methods = list(
initialize = function(smis=NULL, order=NULL){
"Initialize the extend N-gram model with SMILES strings smis and numeric value order"
if(is.null(order)){
order <<- 10
}else{
order <<- order
}
if(is.null(smis)){
mat <<- list()
}else{
update_mat(smis)
}
},
update_mat = function(data){
"update the model with additional SMILES strings data"
## Enhance a given matrix of n-grams by one gram tagged by its rowname
enlarge_row <- function(tmat, letter){
tmat <- rbind(tmat, numeric(dim(tmat)[2]))
rownames(tmat)[dim(tmat)[1]] <- letter
return(tmat)
}
##
## Enhance a given matrix of n-grams by one gram tagged by its colname
enlarge_col <- function(tmat, letter){
tmat <- cbind(tmat, numeric(dim(tmat)[1]))
colnames(tmat)[dim(tmat)[2]] <- letter
return(tmat)
}
##
## Check for characters as numbers and numbers only (molecular rings opening/closing)
## Numbers between [ ] are skipped
parsenum <- function(x){
if(length(grep("^[0-9]$", x))==1){
return(as.numeric(x))
}else{
return(x)
}
}
##
## Collapse the vector vec of characters as one string
comb_les <- function(vec){
# if(length(vec)==1){return(vec)}
# res <- vec[1]
# for(i in 2:length(vec)){
# res <- paste(res, vec[i], sep="")
# }
res <- paste(vec,collapse = "")
return(res)
}
##
#######main proc
## Filtering of SMILES with specific characters chain [s, [c, +, etc.
#cand <- which(sapply(as.character(data), function(x) length(grep("\\[[A-Z]", x)))==0)
cand <- 1:length(data) # data row ids from 1 to length(data)
cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\[s", x)))==0))
cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\[c", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\+", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\-", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\.", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("#", x)))==0))
cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("%", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\/", x)))==0))
#cand <- intersect(cand, which(sapply(as.character(data), function(x) length(grep("\\\\", x)))==0))
cand <- intersect(cand, which(sapply(as.character(data), function(x) nchar(x)>=(order+4))))
##
## Initialization of matrix mat and res
res <- c()
if(!((is.list(mat)==T) & (length(mat)==order))){ # ?! pourquoi
mat <<- list()
for(m in 1:order){ # loop over orders
p_inmat <- c()
for(k in 1:20){ # loop over k = 20 models (presence or absence of unclosed branches (=2) x {0,...,9} unclosed rings number)
# tmp <- matrix(numeric(1), 1, 1)
tmp <- matrix(0, 1, 1) # 1x1 matrix of 0
rownames(tmp) <- comb_les(rep("C", m)) # rownames(tmp) <- string of mxC carbons
# colnames(tmp) <- rep("C")
colnames(tmp) <- "C"
p_inmat <- c(p_inmat, list(tmp)) # k times duplication of a list of 0 matrix with mxC as rowname and C as colname
}
mat <<- c(mat, list(p_inmat)) # m (=order) lists of k (=1:20) lists of 0 matrix
# of mx"C" rowname and "C" colname
}
}
##
##
for(m in 1:order){ # loop over orders
tempmat <- mat[[m]] # for m order, k lists of 0 matrix
cat("\n")
for(i in cand){ # loop over filtered SMILES
cat("\rdim=", m, " no of molecule=", i)
#str <- as.character(data[[i]])
smi_str <- data[i] # SMILES to parse
## SMILES splitting character by character
## Elements enclosed in square brackets, [ ], are transformed as a single token
smi_vstr <- unlist(strsplit(smi_str, '\\[[^]]*\\](*SKIP)(*F)|(?=)', perl=T))
smi_vstr <- c(smi_vstr, "\t")
# vstr <- sapply(1:nchar(str), function(x) substr(str, x, x))
# vstr <- c(vstr, "\t")
# while(1){
# p1 <- which(vstr=="[")
# p2 <- which(vstr=="]")
# if(length(p1)>0){
# vstr[p1[1]] <- paste(vstr[p1[1]:p2[1]], collapse="")
# vstr <- vstr[-((p1[1]+1):p2[1])]
# }else{
# break
# }
# }
## Merge double bond with the right next character
p <- which(smi_vstr=="=") # double bonds search
if(length(p)>0){
smi_vstr[p] <- paste(smi_vstr[p], smi_vstr[p+1], sep="")
smi_vstr <- smi_vstr[-(p+1)]
}
##
#numchecker <- numeric(9) # 9 zeros -> 9 potential molecular rings maximum - numstk is enough!
numstk <- c() # keep the order of appearance of molecular rings number in a SMILES
start <- 2
numn <- 1 # number of opened molecular rings
numbr <- 0 # number of opened molecular branches
skip_flag <- F # don't change later ?!
## ini_flag <- F; # Not Used !
## tm <- m; # Not Used !
startbr <- c();
## next_flag <- F;# Not Used !
brstart <- F; # molecular branch start flag
brsteps <- 0
## brpos <- c() # Not Used !
brflag <- F;
short_flag <- F
b <- smi_vstr[1]
n <- 2
while(1){
# if(skip_flag==F){
a <- b
if(length(b) >= m){
b <- c(b[-1], smi_vstr[n])
}else{
b <- c(b, smi_vstr[n])
}
if(length(a)<m){
short_flag <- T
}else{
short_flag <- F
}
lastc <- parsenum(b[length(b)]) # check for numbers (rings opening/closing)
if(lastc=="("){ # check for branch opening
if((brstart==T) & (brsteps==2)){
if(m==1){
tempa <- "#brterm#"
}else{
tempa <- c(a[2:length(a)], "#brterm#")
#cat(tempa, "\n")
}
startbr <- c(startbr, list(tempa))
}
brstart <- T
brsteps <- 0
}
#cat(lastc, " ", brstart, " ", brsteps, " ", numn, " ", b, "\n")
if(brstart==T){
if(brsteps==2){
if(m==1){
tempa <- "#brterm#"
}else{
tempa <- c(a[2:length(a)], "#brterm#")
}
startbr <- c(startbr, list(tempa))
brstart <- F
brsteps <- 0
}else{
brsteps <- brsteps + 1
}
}
## opened/closed molecular rings number storage
if(is.numeric(lastc)){
if(sum(lastc==numstk)>0){
numdiff <- length(numstk)-which(lastc==numstk)
#numchecker[lastc] <- 0
numstk <- numstk[-which(lastc==numstk)]
lastc <- paste("#numt_", numdiff, sep="") ## keep track if rings close in disorder
}else{
#numchecker[lastc] <- 1
numstk <- c(numstk, lastc)
lastc <- "#num#"
}
}
##
if(lastc==")"){
lastc <- "#brterm#"
brflag <- T
}
if(lastc=="\t"){
matidx <- 10*(numbr>0)+numn
lastc <- "#term#"
b[length(b)] <- lastc
if(short_flag == F){
if(sum(rownames(tempmat[[matidx]])==comb_les(a))==0){
tempmat[[matidx]] <- enlarge_row(tempmat[[matidx]], comb_les(a))
}
if(sum(colnames(tempmat[[matidx]])==lastc)==0){
tempmat[[matidx]] <- enlarge_col(tempmat[[matidx]], lastc)
}
tempmat[[matidx]][comb_les(a), lastc] <- tempmat[[matidx]][comb_les(a), lastc]+1
}
break
}
b[length(b)] <- lastc
matidx <- 10*(numbr>0)+numn
if(short_flag == F){
if(sum(rownames(tempmat[[matidx]])==comb_les(a))==0){
tempmat[[matidx]] <- enlarge_row(tempmat[[matidx]], comb_les(a))
}
if(sum(colnames(tempmat[[matidx]])==lastc)==0){
tempmat[[matidx]] <- enlarge_col(tempmat[[matidx]], lastc)
}
tempmat[[matidx]][comb_les(a), lastc] <- tempmat[[matidx]][comb_les(a), lastc]+1 # occurence counting
}
if(brflag==T){
matidx <- 10*(numbr>0)+numn
b <- startbr[[length(startbr)]]
startbr <- startbr[-length(startbr)]
if(short_flag==F){
if(sum(rownames(tempmat[[matidx]])==comb_les(b))==0){
tempmat[[matidx]] <- enlarge_row(tempmat[[matidx]], comb_les(b))
}
}
brflag <- F
}
# }else{
# skip_flag <- F
# }
if(lastc=="("){
numbr <- numbr + 1
}else if(lastc=="#brterm#"){
numbr <- numbr - 1
}else if(lastc=="#num#"){
numn <- numn + 1
}else if(length(grep("numt_", lastc))==1){
numn <- numn - 1
}
n <- n+1
}
}
for(k in 1:length(tempmat)){
tempmat[[k]] <- tempmat[[k]][order(rownames(tempmat[[k]])),]
}
res <- c(res, list(tempmat))
}
mat <<- res
##
mat_D <<- matrix(0, order, 20)
for(i in 1:nrow(mat_D)){
for(j in 1:ncol(mat_D)){
n1 <- sum(mat[[i]][[j]]==1)
n2 <- sum(mat[[i]][[j]]==2)
mat_D[i,j] <<- n1/(n1+2*n2)
}
}
denom <<- list()
N1p <<- list()
for(i in 1:length(mat)){
p_denom <- list()
p_N1p <- list()
for(j in 1:length(mat[[i]])){
if(!is.na(mat_D[i,j])){
p_denom <- c(p_denom, list(rowSums(mat[[i]][[j]])))
p_N1p <- c(p_N1p, list(apply(mat[[i]][[j]], 1, function(x) sum(x==1))))
}else{
p_denom <- c(p_denom, list(NA))
p_N1p <- c(p_N1p, list(NA))
}
}
denom <<- c(denom, list(p_denom))
N1p <<- c(N1p, list(p_N1p))
}
}
)
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.