ChemmineR-release: Cheminformatics Toolkit for R

################################################
## Class and Method Definitions for ChemmineR ##
################################################
## SDF format definition: 
        # http://www.epa.gov/ncct/dsstox/MoreonSDF.html
	# http://www.symyx.com/downloads/public/ctfile/ctfile.jsp

#################################################
## (1) Class and Method Definitions for SDFstr ##
#################################################
## Download PubChem SDF samples for code testing 
## This function is not intended to be used by users.
.sdfDownload <- function(mypath="ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/", myfile="Compound_00650001_00675000.sdf.gz") {
	system(paste("wget ", mypath, myfile, sep=""))
	system(paste("gunzip ", myfile))
}
# .sdfDownload(mypath="ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/", myfile="Compound_00650001_00675000.sdf.gz") 

## Import SD File and Return SDFstr Class (list-like) 
read.SDFstr <- function(sdfstr) {
        if(length(sdfstr) > 1) { # Support for passing on SD File content as character vector
                mysdf <- sdfstr
        } else {
                mysdf <- readLines(sdfstr) # Reads file line-wise into vector
        }
        y <- regexpr("^\\${4,4}", mysdf, perl=TRUE) # identifies all fields that start with a '$$$$' sign
        index <- which(y!=-1)


		  #if the last non-empty line of mysdf is not "$$$$". Only search from the last "$$$$" found
		  # to the end of the file
		  start = if(length(index)==0) 1 else index[length(index)]
		  if("$$$$" != Find(function(line) line !="", mysdf[start:length(mysdf)]
								 ,right=TRUE)){
			  # we have a MOL file, so just insert the $$$$ at the end
			  mysdf <- c(mysdf,"$$$$")
			  index <- c(index,length(mysdf))
		  }

        indexDF <- data.frame(start=c(1, index[-length(index)]+1), end=index)
        mysdf_list <- lapply(seq(along=indexDF[,1]), function(x) mysdf[seq(indexDF[x,1], indexDF[x,2])])
        if(class(mysdf_list) != "list") { mysdf_list <- list(as.vector(mysdf_list)) }
        names(mysdf_list) <- 1:length(mysdf_list)
        mysdf_list <- new("SDFstr", a=mysdf_list)
        return(mysdf_list)
}

## Define SDFstr class
setClass("SDFstr", representation(a = "list"))

## Accessor method for SDFstr
setGeneric(name="sdfstr2list", def=function(x) standardGeneric("sdfstr2list"))
setMethod(f="sdfstr2list", signature="SDFstr", definition=function(x) {return(x@a)}) 

## Replacement method for SDFstr using accessor method
setGeneric(name="sdfstr2list<-", def=function(x, value) standardGeneric("sdfstr2list<-"))
setReplaceMethod(f="sdfstr2list", signature="SDFstr", definition=function(x, value) {
	x@a <- value 
	return(x)
})

## Replacement method for SDFstr using "[" operator 
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="SDFstr", definition=function(x, i, j, value) {
	x@a[i] <- sdfstr2list(value)
	return(x)
})

## Define behavior of "[" operator for SDFstr 
setMethod(f="[", signature="SDFstr", definition=function(x, i, ..., drop) {
		x@a <- x@a[i]                   
		return(x)
})

## Behavior of "[[" operator to convert single SDFstr component to character vector
setMethod(f="[[", signature="SDFstr",
	definition=function(x, i, ..., drop) {
		return(x@a[[i]])                 
})

## Replacement method for SDFstr using "[" operator 
setReplaceMethod(f="[", signature="SDFstr", definition=function(x, i, value) {
	x@a[i] <- value
	return(x)
})

## Replacement method for SDFstr using "[[" operator 
setReplaceMethod(f="[[", signature="SDFstr", definition=function(x, i, value) {
	x@a[[i]] <- value
	return(x)
})

## Define print behavior for SDFstr
setMethod(f="show", signature="SDFstr",
	definition=function(object) { 
		cat("An instance of ", "\"", class(object), "\"", " with ", length(sdfstr2list(object)), " molecules", "\n", sep="")
})

## Length function
setMethod(f="length", signature="SDFstr",
    definition=function(x) {
    	return(length(sdfstr2list(x)))
})

## Write SDF/SDFstr/SDFset Objects to SD File 
write.SDF <- function(sdf, file, cid=FALSE, ...) {
	if(class(sdf)=="SDF") sdfstr <- as(sdf, "SDFstr")
	if(class(sdf)=="SDFstr") sdfstr <- sdf
	if(class(sdf)=="SDFset") {
		if(cid==TRUE) {	sdflist <- lapply(cid(sdf), function(x) sdf2str(sdf=sdf[[x]], cid=x, ...)) }	
		if(cid==FALSE) { sdflist <- lapply(cid(sdf), function(x) sdf2str(sdf=sdf[[x]], ...)) } 
		sdfstr <- as(sdflist, "SDFstr") 
	}
	cat(unlist(sdfstr2list(sdfstr)), sep="\n", file=file)
} 

## Coerce Methods for SDFstr Class
## SDFstr to list
setAs(from="SDFstr", to="list", 
	def=function(from) { 
		sdfstr2list(from)
})

## List to SDFstr
setAs(from="list", to="SDFstr", 
	def=function(from) {
		new("SDFstr", a=from)
})

## List to SDFstr
setAs(from="character", to="SDFstr", 
	def=function(from) {
		new("SDFstr", a=list(from))
})

################################################
## (2) Class and Method Definitions for SDF ##
################################################
setClass("ExternalReference")
setClassUnion("ExternalReferenceOrNULL",members=c("ExternalReference","NULL"))
setClass("SDF", representation(header="character", atomblock="matrix", 
										 bondblock="matrix", datablock="character",
										 obmolRef="ExternalReferenceOrNULL",
										 version="character"),
			prototype=list(obmolRef=NULL,version="V2000"))

## Convert SDFstr to SDF Class
## SDFstr Parser Function

v2kTimes = new.env()
v2kTimes$header = 0
v2kTimes$atom = 0
v2kTimes$bond = 0
v2kTimes$data = 0


.sdfParse <- function(sdf, datablock=TRUE, tail2vec=TRUE,extendedAttributes=FALSE ) {
	t=Sys.time()
	countpos <- grep("V\\d\\d\\d\\d$", sdf, perl=TRUE)
	if(length(countpos)==0)  
		countpos <- grep("V {0,}\\d\\d\\d\\d$", sdf, perl=TRUE) 
	if(length(countpos)==0 || length(countpos)>1) 
		 countpos <- 4 

	countline <- sdf[countpos]

	if(length(grep("V3000$",countline))!=0) # we have a V3000 formatted file
		return(.parseV3000(sdf,datablock,tail2vec,extendedAttributes))

   if(nchar(gsub("\\d| ", "", substring(countline, 1, 6))) != 0) 
		countline <- "  0  0"  # Create dummy countline if it contains non-numeric values 

	Natom <- as.numeric(substring(countline, 1, 3))
	Nbond <- as.numeric(substring(countline, 4, 6))
	start <- c(header=1, atom=countpos+1, bond=countpos+Natom+1, extradata=countpos+Natom+Nbond+1)
	index <- cbind(start=start, end=c(start[2:3], start[4], length(sdf)+1)-1)
	
	## Header block
	header <- sdf[index["header",1]:index["header",2]]
	if(length(header)==4) names(header) <- c("Molecule_Name", "Source", "Comment", "Counts_Line")	
	v2kTimes$header <- v2kTimes$header + (Sys.time() - t)
	
	t=Sys.time()
	## Atom block
	## format: x y z <atom symbol> 
	ab2matrix <- function(ct=sdf[index["atom",1]:index["atom",2]]) {
		if((index["atom","end"] - index["atom","start"]) < 1) {
			ctma <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
		} else {
			ct <- gsub("^ {1,}", "", ct)
			ctlist <- strsplit(ct, " {1,}")
			ctma <- tryCatch(matrix(unlist(ctlist), 
											ncol=length(ctlist[[1]]), 
											nrow=length(ct), 
											byrow=TRUE), 
								  warning=function(w) NULL)
			# If rows in atom block are of variable length, use alternative/slower approach
			if(length(ctma)==0) { 
			  maxcol <- max(sapply(ctlist, length))
			  ctma <- matrix(0, nrow=length(ct), ncol=maxcol)
			  for(i in seq(along=ctma[,1])) 
				  ctma[i, 1:length(ctlist[[i]])] <- ctlist[[i]]
			}
			myrownames <- paste(ctma[,4], 1:length(ctma[,4]), sep="_")
		   # Ncol <- length(ctlist[[1]])
		   Ncol <- length(ctma[1, , drop = FALSE])
			ctma <- matrix(as.numeric(ctma[,-4]), 
								nrow=length(ct), 
								ncol=Ncol-1, 
								dimnames=list(myrownames, paste("C", c(1:3, 5:Ncol), sep="")))	
		}
		return(ctma)
	}
	atomblock <- ab2matrix(ct=sdf[index["atom",1]:index["atom",2]])
	v2kTimes$atom<- v2kTimes$atom+ (Sys.time() - t)
	
	t=Sys.time()
	## Bond block
	bb2matrix <- function(ct=sdf[index["bond",1]:index["bond",2]]) {
		#if((index["bond","end"] - index["bond","start"]) < 1) {
		if(((index["bond","end"] - index["bond","start"])+1) < 1) {
                        ctma <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
                } else {
                    ct <- gsub("^(...)(...)(...)(...)(...)(...)(...)", "\\1 \\2 \\3 \\4 \\5 \\6 \\7", ct)
                    ct <- gsub("(^..\\d)(\\d)", "\\1 \\2", ct) # Splits bond strings where one or both of the atoms have 3 digit numbers
		    ct <- gsub("^ {1,}", "", ct)
                    ctlist <- strsplit(ct, " {1,}")
                    ctma <- matrix(unlist(ctlist), ncol=length(ctlist[[1]]), nrow=length(ct), byrow=TRUE)
                    Ncol <- length(ctlist[[1]])
                    ctma <- matrix(as.numeric(ctma), nrow=length(ct), ncol=Ncol, dimnames=list(1:length(ct), paste("C", 1:Ncol, sep="")))	
		}
                return(ctma)
	}
	bondblock <- bb2matrix(ct=sdf[index["bond",1]:index["bond",2]])
	v2kTimes$bond<- v2kTimes$bond + (Sys.time() - t)
	
	
	t=Sys.time()
	if(tail2vec==TRUE) {
		extradata <- ex2vec(extradata=sdf[index["extradata",1]:index["extradata",2]])
	} else {	
		extradata <- sdf[index["extradata",1]:index["extradata",2]]
	}
	v2kTimes$data<- v2kTimes$data+ (Sys.time() - t)

	## Assemble components in object of class SDF
	if(datablock==TRUE) {
		sdf <- new("SDF", header=header, atomblock=atomblock, bondblock=bondblock, datablock=extradata)
	} else {
		sdf <- new("SDF", header=header, atomblock=atomblock, bondblock=bondblock)
	}
	return(sdf)
}
## SDF name/value block
ex2vec <- function(extradata) {
                exstart <- grep("^>", extradata)
		if(length(exstart)==0) { 
                        exvec <- vector("character", length=0) 
                } else {
                    names(exstart) <- gsub("^>.*<|>", "", extradata[exstart])
                    exindex <- cbind(start=exstart, end=c(exstart[-1], length(extradata))-1) # Changed 'length(extradata)+1' to 'length(extradata)' on Mar 22, 2014
		    exvec <- sapply(rownames(exindex), function(x) paste(extradata[(exindex[x,1]+1):(exindex[x,2]-1)], collapse=" __ "))
		}
                return(exvec)
	}


findPositions = function(sdf){

	patterns = c("BEGIN CTAB","COUNTS","BEGIN ATOM","END ATOM", 
					 "BEGIN BOND", "END BOND","M  END")

	tagPositions = lapply(patterns,function(pattern) grep(pattern,sdf,fixed=TRUE))
	tagPositions[which(lapply(tagPositions,length)==0)] = -1 # ensure unlist does not remove undefined entries
	tagPositions = unlist(tagPositions)

	unfound = which(tagPositions == -1)
	if(length(unfound) != 0){ # some tags are missing
		warning("malformed SDF V3000 file, could not find tags ",
				  paste("\"",patterns[unfound],"\"",sep="",collapse=","))
	}

	tagPositions

}
v3kTimes = new.env()
v3kTimes$header = 0
v3kTimes$atom = 0
v3kTimes$atomCore = 0
v3kTimes$bond = 0
v3kTimes$data = 0

.parseV3000 <- function(sdf, datablock=TRUE, tail2vec=TRUE,extendedAttributes=FALSE) {
	#message("found V3000 formatted compound")
	sdfLength = length(sdf)

	t=Sys.time()
	tagPositions=findPositions(sdf)


	ctabPos = tagPositions[1]
	if(ctabPos == -1){
		header = sdf[1:4]
	}else
		header = sdf[1:(ctabPos-1)]
	
	if(length(header)==4) 
		names(header) <- c("Molecule_Name", "Source", "Comment", "Counts_Line")	

	countLinePos = tagPositions[2]
	if(countLinePos == -1){
		counts=c(0,0)
	}else{
		counts = as.numeric(cstrsplit(sdf[countLinePos])[c(4,5)])
	}
	#message("class of counts: ",class(counts))
	#print(counts)
	v3kTimes$header <- v3kTimes$header + (Sys.time() - t)


	t=Sys.time()
	#message("parsing atomblock")
	atomPos = tagPositions[3]
	if(atomPos == -1){
		warning("No ATOM block found in ",sdf[1]," returning a dummy atom block")
		atomblock <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
	}else{
		#message("atomPos: ",atomPos," class: ",class(atomPos))
		atomEndPos = tagPositions[4]
		if(atomEndPos == -1){
				warning("Could not find the end of the ATOM block in ",sdf[1])
				atomEndPos = atomPos + 1 #assume and emtpy atom block
		}

		
		data = Reduce(rbind,Map(function(line) {
				parts = cstrsplit(line)
				attrs = if(length(parts) > 8)
								paste(parts[9:length(parts)],collapse=" ")
						  else ""
				c(parts[3:7],attrs)
			}, sdf[(atomPos+1):(atomEndPos-1)]))  #TODO: check for empty range
		#if(extendedAttributes)
		#	extAtomAttrs = parseAttributes(data[,6])
		extAtomAttrs = parseAttributes(data[,6])


		# we need to tranlate certain "Standard" attribute values from v3k back
		# into the v2k format to make things consistent
		#
		# <mass difference>  MASS  v2k: offset from pt v3k: actual value
		# <charge>   CHG           v2k: value of 1-7   v3k: actual charge value
		# <atom stereo parity>  CFG
		# <hydrogen count +1>   HCOUNT
		# <stereo care box>  STBOX
		# <valence>   VAL			v2k: 15 indicates 0  v3: -1 indicates 0
		standardAttrs = matrix(0,nrow(data),7)
		for(i in seq(along=extAtomAttrs)){ # for each atom
			mass = extAtomAttrs[[i]]$MASS
			if(!is.null(mass)){
				massDiff = mass - AW[data[i,2]]
				standardAttrs[i,1] = massDiff
			}
			chg= extAtomAttrs[[i]]$CHG
			if(!is.null(chg)){
				chg= 4 - as.numeric(chg)
				if(chg == 4) # undo shift for 0 values
					chg = 0
				if(chg < 1 || chg > 7) 
					chg = 0 # chg not in [1,7] => 0
				standardAttrs[i,2] = chg 
			}
			cfg = extAtomAttrs[[i]]$CFG
			if(!is.null(cfg)){
				standardAttrs[i,3] = cfg
			}
			hcount = extAtomAttrs[[i]]$HCOUNT
			if(!is.null(hcount)){
				standardAttrs[i,4] = hcount
			}
			stbox = extAtomAttrs[[i]]$STBOX
			if(!is.null(stbox)){
				standardAttrs[i,5] = stbox
			}
			val = extAtomAttrs[[i]]$val
			if(!is.null(val)){
				if(val == -1) # means 0 in v3k
					val = 15   # translate to 0 in v2k 
				standardAttrs[i,6] = val
			}
		}

    	#print(extAtomAttrs)
		atomblock =cbind(data[,3:5],standardAttrs)
		mode(atomblock)="numeric"
		#print(atomblock)
		colnames(atomblock) = paste("C",c(1:3,5:11),sep="")
		rownames(atomblock) = paste(data[,2],data[,1],sep="_")
	}
	v3kTimes$atom<- v3kTimes$atom+ (Sys.time() - t)


	t=Sys.time()
	#message("parsing bond block")
	bondPos = tagPositions[5]
	if(bondPos == -1){
		warning("No BOND block found in ",sdf[1]," returning a dummy bond block")
		bondblock <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
	}else{
		bondEndPos = tagPositions[6]
		if(bondEndPos == -1){
	  		warning("Could not find the end of the BOND block in ",sdf[1])
	  		bondEndPos = bondPos + 1 #assume and emtpy atom block
		}

#t2=Sys.time()
		data = Reduce(rbind,Map(function(line) {
				#cstrsplit(line)[3:6]
				parts = cstrsplit(line)
				attrs=  if(length(parts) > 6)
								paste(parts[7:length(parts)],collapse=" ")
						  else ""
				c(parts[3:6],attrs)
			}, sdf[(bondPos+1):(bondEndPos-1)]))  #TODO: check for empty range
#v3kTimes$atomCore<- v3kTimes$atomCore+ (Sys.time() - t2)
		extBondAttrs = parseAttributes(data[,5])  # +2.6s

		# CFG  bond configuration(stereo)  change: 0 -> 0, 1-> 1, 2-> 4, 3->6
		# empty slot
		# TOPO  same
		# RXCTR same
		standardAttrs = matrix(0,nrow(data),4)  # +0.8s
		for(i in seq(along=extBondAttrs)){  # +1.3s
			cfg=extBondAttrs[[i]]$CFG
			if(!is.null(cfg)){
				#cfg = extBondAttrs[[i]]$CFG
				if(cfg == 2) cfg = 4
				else if(cfg == 3) cfg = 6
				standardAttrs[i,1] = cfg
			}
			topo=extBondAttrs[[i]]$TOPO
			if(!is.null(topo)){
				standardAttrs[i,3] = topo
			}
			rxctr=extBondAttrs[[i]]$rxctr
			if(!is.null(rxctr)){
				standardAttrs[i,4] = rxctr
			}
		}

		bondblock = cbind(data[,c(3,4,2)],standardAttrs) # +0.5s
		#bondblock = data[,c(3,4,2)]
		mode(bondblock)="numeric"
		colnames(bondblock) = paste("C",1:7,sep="")
		rownames(bondblock) = data[,1]
	}
	v3kTimes$bond<- v3kTimes$bond+ (Sys.time() - t)


	#message("parsing extra data")
	t=Sys.time()
	endPos = tagPositions[7]
	if(endPos == -1){
		warning("no END tag found in ",sdf[1])
		extradata = vector("character",length=0)
	}else{
		if(endPos+2 < sdfLength){
			extradata = if(tail2vec==TRUE) ex2vec(sdf[(endPos+1):sdfLength])
							else sdf[(endPos+1):sdfLength]
		}
		else
			extradata = vector("character",length=0)
	}
	v3kTimes$data <- v3kTimes$data + (Sys.time() - t)

	## Assemble components in object of class SDF
	className = "SDF"
	args = list( header=header, atomblock=atomblock, bondblock=bondblock,version="V3000")
	if(datablock==TRUE) 
		args = c(args,datablock=list(extradata))
	
	if(extendedAttributes==TRUE) {
		className = "ExtSDF"
		args = c(args,extendedAtomAttributes =list(extAtomAttrs),
					extendedBondAttributes = list(extBondAttrs) )
	}
	#message("className: ",className," args: ")
	#print(str(args))

	sdf = do.call("new",c(className,args))

	return(sdf)

}
trim <- function(str) gsub("^\\s+|\\s+$","",str)
parseAttributes <- function(attributes){
	#message("attributes: ")
	#print(attributes)

	if(all(attributes=="")){
		return(sapply(seq(along=attributes),function(i) list()))
	}


	starts = gregexpr("(^|\\s)[a-zA-Z0-9]+=",attributes)
	sapply(seq(along=attributes),function(i) {
			 if(attributes[i]=="")
				 list()
			 else{
				 starts[[i]][length(starts[[i]])+1] = nchar(attributes[i])
	#			 message("starts[[",i,"]]:")
	#			 print(starts[[i]])
				 sapply(1:(length(starts[[i]])-1),function(j){
						pair = unlist(strsplit(
											  substring(attributes[i],starts[[i]][j],starts[[i]][j+1])
											  ,"=",fixed=TRUE) )
	#					print(pair)
						if(length(pair) != 2)
							warning("bad key value pair found:
									  ",substring(attributes[i],starts[[i]][j],starts[[i]][j+1]))
						l=list()
						l[[trim(pair[1])]] = trim(pair[2])
						l
				
				})
			 }
	 })
}
## Accessor methods for SDF class
setGeneric(name="sdf2list", def=function(x) standardGeneric("sdf2list"))
setMethod(f="sdf2list", signature="SDF", definition=function(x) {return(list(header=header(x), atomblock=atomblock(x), bondblock=bondblock(x), datablock=datablock(x)))}) 
setGeneric(name="header", def=function(x) standardGeneric("header"))
setMethod(f="header", signature="SDF", definition=function(x) {return(x@header)}) 
setGeneric(name="sdfid", def=function(x, tag=1) standardGeneric("sdfid"))
setMethod(f="sdfid", signature="SDF", definition=function(x, tag=1) {return(x@header[tag])}) 
setGeneric(name="atomblock", def=function(x) standardGeneric("atomblock"))
setMethod(f="atomblock", signature="SDF", definition=function(x) {return(x@atomblock)}) 
setGeneric(name="atomcount", def=function(x, addH=FALSE, ...) standardGeneric("atomcount"))
setMethod(f="atomcount", signature="SDF", definition=function(x, addH=FALSE, ...) {
	if(addH==TRUE) { 
		return(table(c(gsub("_.*", "", rownames(x@atomblock)), rep("H", bonds(x, type="addNH")))))
	} else {
		return(table(gsub("_.*", "", rownames(x@atomblock))))
	}
})
setGeneric(name="bondblock", def=function(x) standardGeneric("bondblock"))
setMethod(f="bondblock", signature="SDF", definition=function(x) {return(x@bondblock)}) 
setGeneric(name="datablock", def=function(x) standardGeneric("datablock"))
setMethod(f="datablock", signature="SDF", definition=function(x) {return(x@datablock)}) 
setGeneric(name="datablocktag", def=function(x, tag) standardGeneric("datablocktag"))
setMethod(f="datablocktag", signature="SDF", definition=function(x, tag) {return(x@datablock[tag])}) 

setGeneric(name="obmol",def=function(x) standardGeneric("obmol"))
setMethod(f="obmol",signature="SDF",definition= function(x) {
	.ensureOB()
	if(is.null(x@obmolRef))
		x@obmolRef = sdf2OBMol(x)
	x@obmolRef
})

## Define print behavior for SDF
setMethod(f="show", signature="SDF",                
   definition=function(object) {
         cat("An instance of ", "\"", class(object), "\"", "\n", sep="")
         cat("\n<<header>>", "\n", sep="")
         print(header(object))
         cat("\n<<atomblock>>", "\n", sep="")
         if(length(atomblock(object)[,1])>=5) {
             print(as.data.frame(rbind(atomblock(object)[1:2,], ...=rep("...", length(atomblock(object)[1,])), 
             atomblock(object)[(length(atomblock(object)[,1])-1):length(atomblock(object)[,1]),])))
         } else {
             print(atomblock(object))}
         cat("\n<<bondblock>>", "\n", sep="")
         if(length(bondblock(object)[,1])>=5) {
             print(as.data.frame(rbind(bondblock(object)[1:2,], ...=rep("...", length(bondblock(object)[1,])), 
             bondblock(object)[(length(bondblock(object)[,1])-1):length(bondblock(object)[,1]),])))
         } else {
             print(bondblock(object))}
         cat("\n<<datablock>> (", length(datablock(object)), " data items)", "\n", sep="")
         if(length(datablock(object))>=5) {
         	print(c(datablock(object)[1:4], "..."))
         } else {
         	print(datablock(object))}
})

## Behavior of "[" operator for SDF
setMethod(f="[", signature="SDF",
	definition=function(x, i, ..., drop) {
		return(sdf2list(x)[i]) 
})

## Replacement method for SDF using "[" operator 
setReplaceMethod(f="[", signature="SDF", definition=function(x, i, j, value) {
	if(i==1) x@header <- value
	if(i==2) x@atomblock <- value 
	if(i==3) x@bondblock <- value
	if(i==4) x@datablock <- value
	if(i=="header") x@header <- value
	if(i=="atomblock") x@atomblock <- value 
	if(i=="bondblock") x@bondblock <- value
	if(i=="datablock") x@datablock <- value
	obmolRef=NULL
	return(x)
})

## Behavior of "[[" operator for SDF
setMethod(f="[[", signature="SDF",
	definition=function(x, i, ..., drop) {
		return(sdf2list(x)[[i]]) 
})

## Replacement method for SDF using "[[" operator 
setReplaceMethod(f="[[", signature="SDF", definition=function(x, i, j, value) {
	if(i==1) x@header <- value
	if(i==2) x@atomblock <- value 
	if(i==3) x@bondblock <- value
	if(i==4) x@datablock <- value
	if(i=="header") x@header <- value
	if(i=="atomblock") x@atomblock <- value 
	if(i=="bondblock") x@bondblock <- value
	if(i=="datablock") x@datablock <- value
	obmolRef=NULL
	return(x)
})

## Coerce Methods for SDF Class 
## Character vector to SDF
setAs(from="character", to="SDF", 
	def=function(from) {
		.sdfParse(sdf=from) 
})

## SDF to list
setAs(from="SDF", to="list", 
	def=function(from) {
		list(header=header(from), atomblock=atomblock(from), bondblock=bondblock(from), datablock=datablock(from))
})

## SDF to SDFstr
setAs(from="SDF", to="SDFstr", 
	def=function(from) {
		new("SDFstr", a=list(as(from, "character")))		
})

## list (w. SDF components) to SDF  
setAs(from="list", to="SDF", 
	def=function(from) {
		new("SDF", bondblock=from$bondblock, header=from$header, atomblock=from$atomblock, datablock=from$datablock)
})

################################################# 
##  ExtSDF
################################################# 

setClass("ExtSDF",representation(extendedAtomAttributes="list",
											extendedBondAttributes="list"),contains="SDF")

setGeneric(name="getAtomAttr", def=function(x,atomId,tag) standardGeneric("getAtomAttr"))
setMethod(f="getAtomAttr", signature="ExtSDF",definition = 
			 function(x,atomId,tag) x@extendedAtomAttributes[[atomId]][[tag]])

setGeneric(name="getBondAttr", def=function(x,bondId,tag) standardGeneric("getBondAttr"))
setMethod(f="getBondAttr", signature="ExtSDF",definition = 
			 function(x,bondId,tag) x@extendedBondAttributes[[bondId]][[tag]])
setMethod(f="show", signature="ExtSDF",                
   definition=function(object) {
		callNextMethod(object)
		nonEmptyAtoms = which(sapply(object@extendedAtomAttributes,
											  function(x) length(x)!=0))
		message("<<Extended Atom Attributes>>")
		count = min(5,length(nonEmptyAtoms))
		for(i in nonEmptyAtoms[seq(1,count,length.out=count)]){
			message("Atom ",i)
			print(object@extendedAtomAttributes[[i]])
		}

		nonEmptyBonds = which(sapply(object@extendedBondAttributes,
											  function(x) length(x)!=0))
		message("<<Extended Bond Attributes>>")
		count = min(5,length(nonEmptyBonds))
		for(i in nonEmptyBonds[seq(1,count,length.out=count)]){
			message("Bond ",i)
			print(object@extendedBondAttributes[[i]])
		}
	})
 
################################################# 
## (3) Class and Method Definitions for SDFset ##
#################################################
setClass("SDFset", representation(SDF="list", ID="character"))

## Store many SDFs in SDFset Object
read.SDFset <- function(sdfstr=sdfstr, ...) {
	## If a file name is provided run read.SDFstr function first
	if(is.character(sdfstr)) { 
		sdfstr <- read.SDFstr(sdfstr=sdfstr) 
	}
	## Iterate over SDFstr components	
	sdfset <- lapply(seq(along=sdfstr@a), function(x) .sdfParse(sdfstr2list(sdfstr)[[x]], ...))
	sdfset <- new("SDFset", SDF=sdfset, ID=paste("CMP", seq(along=sdfset), sep=""))
	## Validity check of SDFs based on atom/bond block column numbers
	badsdf <- sum(!validSDF(sdfset))
	if(sum(badsdf)!=0) warning(paste(c(sum(badsdf), " invalid SDFs detected. To fix, run: valid <- validSDF(sdfset); sdfset <- sdfset[valid]")))
	return(sdfset)
}

## Accessor methods for SDFset class
setGeneric(name="SDFset2list", def=function(x) standardGeneric("SDFset2list"))
setMethod(f="SDFset2list", signature="SDFset", definition=function(x) {
	SDFlist <- x@SDF
	charlist <- lapply(seq(along=SDFlist), function(x) as(SDFlist[[x]], "list"))
	names(charlist) <- x@ID
	return(charlist)
}) 
setGeneric(name="SDFset2SDF", def=function(x) standardGeneric("SDFset2SDF"))
setMethod(f="SDFset2SDF", signature="SDFset", definition=function(x) { tmp <- x@SDF; names(tmp) <- x@ID; return(tmp)}) 
setMethod(f="header", signature="SDFset", definition=function(x) { return(lapply(SDFset2SDF(x), header))}) 
setMethod(f="sdfid", signature="SDFset", definition=function(x, tag=1) {return(as.vector(sapply(SDFset2SDF(x), sdfid, tag)))}) 
setGeneric(name="cid", def=function(x) standardGeneric("cid"))
setMethod(f="cid", signature="SDFset", definition=function(x) {return(x@ID)}) 
setMethod(f="atomblock", signature="SDFset", definition=function(x) {return(lapply(SDFset2SDF(x), atomblock))}) 
setMethod(f="atomcount", signature="SDFset", definition=function(x, addH, ...) {
	atomcounts <- lapply(SDFset2SDF(x), function(y) atomcount(y, addH, ...))
	return(atomcounts)
}) 
setMethod(f="bondblock", signature="SDFset", definition=function(x) {return(lapply(SDFset2SDF(x), bondblock))}) 
setMethod(f="datablock", signature="SDFset", definition=function(x) {return(lapply(SDFset2SDF(x), datablock))}) 
setMethod(f="datablocktag", signature="SDFset", definition=function(x, tag) {return(as.vector(sapply(SDFset2SDF(x), datablocktag, tag)))}) 
setMethod(f="obmol",signature="SDFset",definition= function(x) lapply(SDFset2SDF(x),obmol))

## Replacement method for SDF component of SDFset using accessor methods
setGeneric(name="SDFset2SDF<-", def=function(x, value) standardGeneric("SDFset2SDF<-"))
setReplaceMethod(f="SDFset2SDF", signature="SDFset", definition=function(x, value) {
	x@SDF <- value 
	return(x)
})

## Replacement method for ID component of SDFset using accessor methods
setGeneric(name="cid<-", def=function(x, value) standardGeneric("cid<-"))
setReplaceMethod(f="cid", signature="SDFset", definition=function(x, value) {
	x@ID <- value 
	if(any(duplicated(x@ID))) { 
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(sdfset) <- makeUnique(cid(sdfset))")
	}
	return(x)
})

## Replacement method for SDFset using "[" operator 
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="SDFset", definition=function(x, i, j, value) {
	x@SDF[i] <- SDFset2SDF(value)
	return(x)
})

## Behavior of "[" operator for SDFset 
setMethod(f="[", signature="SDFset", definition=function(x, i, ..., drop) {
	if(is.logical(i)) {
                i <- which(i)
        }
        if(is.character(i)) { 
		ids <-seq(along=x@ID); names(ids) <- x@ID
		i <- ids[i]
	}
	x@SDF <- x@SDF[i]                   
	x@ID <- x@ID[i]                   
	if(any(duplicated(i))) {
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(sdfset) <- makeUnique(cid(sdfset))")
	}
	return(x)
})

## Replacement method for SDFset using "[" operator 
setReplaceMethod(f="[", signature="SDFset", definition=function(x, i, value) {
	x@SDF[i] <- value
	return(x)
})

## Replacement method for SDFset using "[[" operator 
setReplaceMethod(f="[[", signature="SDFset", definition=function(x, i, value) {
	x@SDF[[i]] <- value
	return(x)
})

## Behavior of "[[" operator for SDFset to convert single SDFset component to SDF 
setMethod(f="[[", signature="SDFset", definition=function(x, i, ..., drop) {
	if(is.character(i)) { i <- which(x@ID %in% i) }
	return(x@SDF[[i]])                 
})

## Batch replacement of header, atomblock, bondblock and datablock sections for
## one to all SDF objects in an SDFset.
.gsubsdfsec <- function(sdfset, what, secdata) {
	if(class(secdata) %in% c("data.frame", "matrix")) { 
		secdata <- as.matrix(secdata)
		tmp <- as.list(seq(along=secdata[,1]))
		for(i in seq(along=tmp)) {
			vec <- as.character(secdata[i,])
			names(vec) <- colnames(secdata)
			tmp[[i]] <- vec
		}
		secdata <- tmp
	}
	if(length(sdfset) != length(secdata)) { stop("The length of the two data sets needs to match.") }
	sdfsec <- list(header=header(sdfset), atomblock=atomblock(sdfset), bondblock=bondblock(sdfset), datablock=datablock(sdfset))
	sdfsec[[what]] <- secdata
	sdflist <- lapply(seq(along=sdfsec[[1]]), function(x) as(list(header=sdfsec$header[[x]], atomblock=sdfsec$atomblock[[x]], bondblock=sdfsec$bondblock[[x]], datablock=sdfsec$datablock[[x]]), "SDF")) 
	names(sdflist) <- cid(sdfset)
	return(as(sdflist, "SDFset"))
}

setGeneric(name="header<-", def=function(x, value) standardGeneric("header<-"))
setReplaceMethod(f="header", signature="SDFset", definition=function(x, value) {
	sdfset <- .gsubsdfsec(sdfset=x, what=1, secdata=value) 
	return(sdfset)
})

setGeneric(name="atomblock<-", def=function(x, value) standardGeneric("atomblock<-"))
setReplaceMethod(f="atomblock", signature="SDFset", definition=function(x, value) {
	sdfset <- .gsubsdfsec(sdfset=x, what=2, secdata=value) 
	return(sdfset)
})

setGeneric(name="bondblock<-", def=function(x, value) standardGeneric("bondblock<-"))
setReplaceMethod(f="bondblock", signature="SDFset", definition=function(x, value) {
	sdfset <- .gsubsdfsec(sdfset=x, what=3, secdata=value) 
	return(sdfset)
})

setGeneric(name="datablock<-", def=function(x, value) standardGeneric("datablock<-"))
setReplaceMethod(f="datablock", signature="SDFset", definition=function(x, value) {
	sdfset <- .gsubsdfsec(sdfset=x, what=4, secdata=value) 
	return(sdfset)
})


## Length function
setMethod(f="length", signature="SDFset",
    definition=function(x) {
    	return(length(SDFset2SDF(x)))
})

## Print behavior for SDFset
setMethod(f="show", signature="SDFset",
	definition=function(object) { 
		cat("An instance of ", "\"", class(object), "\"", " with ", length(object), " molecules", "\n", sep="")
		if(any(duplicated(object@ID))) {
			warning("The values in the CMP ID slot are not unique. To fix this, run: cid(sdfset) <- makeUnique(cid(sdfset))")
		}
})

## Concatenate function for SDFset
## Note: is currently limited to 2 arguments!
setMethod(f="c", signature="SDFset", definition=function(x, y) {
	sdflist1 <- as(x, "SDF")
	sdflist2 <- as(y, "SDF")
	sdflist <- c(sdflist1, sdflist2)
	sdfset <- as(sdflist, "SDFset")
	if(any(duplicated(cid(sdfset)))) {
		warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
	}
	return(sdfset)
})

## Convert SDF to SDFstr Class for Export to File
## Function allows to customize output via optional arguments 
setGeneric(name="sdf2str", def=function(sdf, head, ab, bb, db, cid=NULL, sig=FALSE, ...) standardGeneric("sdf2str"))
setMethod(f="sdf2str", signature="SDF", definition = function(sdf, head, ab, bb, db, cid=NULL, sig=FALSE, ...) {
	## Checks
	if(class(sdf)!="SDF") stop("Function expects molecule object of class SDF as input!")	
	
	## Header
	if(missing(head)) {
		head <- as.character(sdf[[1]])
		if(sig==TRUE) head[2] <- paste("ChemmineR-", format(Sys.time(), "%m%d%y%H%M"), "XD", sep="")	
		if(length(cid)==1) head[1] <- cid
	}
	
	## Atom block
	if(missing(ab)) {
		ab <- sdf[[2]]
		#ab <- cbind(Indent="", format(ab[,1:3], width=9, justify="right"), A=format(gsub("_.*", "", rownames(ab)), width=1, justify="right"), Space="", format(ab[,-c(1:3)], width=2, justify="right")) # Changed on 26-Aug-13
		ab <- cbind(Indent="", format(ab[,1:3], width=9, justify="right"), A=format(gsub("_.*", "", rownames(ab)), width=1, justify="left"),  Space="", format(ab[,-c(1:3)], width=2, justify="right"))
		ab <- sapply(seq(along=ab[,1]), function(x) paste(ab[x, ], collapse=" "))
	}

	## Bond block
	if(missing(bb)) {
		bb <- sdf[[3]]
		bb <- cbind(Indent="", format(bb, width=3, justify="right"))
		bb <- sapply(seq(along=bb[,1]), function(x) paste(bb[x, ], collapse=""))
	}

	## Data block
	if(missing(db)) {
		db <- sdf[[4]]
		if(length(db)>0) {
			dbnames <- paste("> <", names(db), ">", sep="")
			dbvalues <- as.character(db)
			db <- as.vector(rbind(dbnames, dbvalues, ""))
		} else {
			db <- NULL
		}
	}
	
	## Assemble in character vector
	sdfstrvec <- c(head, ab, bb, "M  END", db, "$$$$")
        return(sdfstrvec)
})

## Coerce Methods for SDFset Class 
## SDFset to list with lists of SDF sub-components
setAs(from="SDFset", to="list", 
	def=function(from) {
		SDFset2list(from)
})

## SDFset to list with many SDF objects (for summary view)
setAs(from="SDFset", to="SDF", 
	def=function(from) {
		SDFset2SDF(from)
})
setGeneric(name="view", def=function(x) standardGeneric("view"))
setMethod(f="view", signature="SDFset", definition=function(x) { as(x, "SDF") })

## SDFstr to SDFset
setAs(from="SDFstr", to="SDFset", 
	def=function(from) {
		read.SDFset(sdfstr=from) 
})

## SDF to SDFset of length one
setAs(from="SDF", to="SDFset", 
	def=function(from) {
		new("SDFset", SDF=list(from), ID="CMP") 
})

## SDF to SDFstr
setAs(from="SDF", to="SDFstr", 
	def=function(from) {
		as(as(from, "SDFset"), "SDFstr") 
})

## SDFset to character for SDFstr coercion
setAs(from="SDF", to="character", 
	def=function(from) {
       		sdf2str(sdf=from)
})

## SDFset to SDFstr
setAs(from="SDFset", to="SDFstr", 
	def=function(from) {
		from <- lapply(seq(along=from), function(x) as(from[[x]], "character"))
		new("SDFstr", a=from)		
})

## List of SDFs to SDFset
setAs(from="list", to="SDFset", 
	def=function(from) {
		new("SDFset", SDF=from, ID=names(from))
})

## User interface to SDFset() constructor
SDFset <- function(SDFlist=list(), ID=character()) {
	new("SDFset", SDF=SDFlist, ID=ID)
}

setClass("SDFset", representation(SDF="list", ID="character"))

#########################################################
## (4) Class and Method Definitions for SMI and SMIset ##
#########################################################
## Define SMI and SMIset classes
setClass("SMI", representation(smiles="character"))
setClass("SMIset", representation(smilist="list"))

## Methods to return SMI/SMIset objects as character vector or list, respectively
setMethod(f="as.character", signature="SMI", definition=function(x) {return(x@smiles)})
setMethod(f="as.character", signature="SMIset", definition=function(x) {return(unlist(x@smilist))})

## Constructor methods
## Character vector to FP with: as(myvector, "SMI")
setAs(from="character", to="SMI",  
        def=function(from) {
		new("SMI", smiles=from)
})

## List to SMIset with: as(mylist, "SMIset")
setAs(from="list", to="SMIset",  
        def=function(from) {
		new("SMIset", smilist=from)
})

## Character to SMIset with: as(mychar, "SMIset")
setAs(from="character", to="SMIset",  
        def=function(from) {
		new("SMIset", smilist=as.list(from))
})

## Accessor methods for SMIset
## Note: generic for cid() is defined under SDFset class section.
setMethod(f="cid", signature="SMIset", definition=function(x) { return(names(x@smilist)) })

## Replacement method for ID component of SMIset using accessor methods.
setReplaceMethod(f="cid", signature="SMIset", definition=function(x, value) {
	names(x@smilist) <- value 
	if(any(duplicated(names(x@smilist)))) { 
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(smiset) <- makeUnique(cid(smiset))")
	}
	return(x)
})

## Replacement method for SMIset using "[" operator 
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="SMIset", definition=function(x, i, value) {
	x@smilist[i] <- value
	names(x@smilist) <- names(value)
	return(x)
})

## Behavior of "[" operator for SMIset 
setMethod(f="[", signature="SMIset", definition=function(x, i, ..., drop) {
	if(is.logical(i)) {
                i <- which(i)
        }
	x@smilist <- x@smilist[i]                   
	if(any(duplicated(i))) {
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(smiset) <- makeUnique(cid(smiset))")
	}
	return(x)
})

## Behavior of "[[" operator for SMIset to convert single SMIset component to SMI
setMethod(f="[[", signature="SMIset", definition=function(x, i, ..., drop) {
	if(is.character(i)) { i <- which(names(x@fpma) %in% i) }
	return(new("SMI", smiles=x@smilist[[i]]))                 
})

## Length function
setMethod(f="length", signature="SMIset",
    definition=function(x) {
    	return(length(as.character(x)))
})

## Define print behavior for SMIset
setMethod(f="show", signature="SMIset", 
	definition=function(object) {    
		cat("An instance of ", "\"", class(object), "\"", " with ", length(object), " molecules", "\n", sep="")
		if(any(duplicated(names(object@smilist)))) {
			warning("The values in the CMP ID slot are not unique. To fix this, run: cid(smiset) <- makeUnique(cid(smiset))")
		}
})

## Concatenate function for SMIset
## Note: is currently limited to 2 arguments!
setMethod(f="c", signature="SMIset", definition=function(x, y) {
	smilist1 <- as.list(x)
	smilist2 <- as.list(y)
	smilist <- c(smilist1, smilist2)
	smiset <- as(smilist, "SMIset")
	if(any(duplicated(cid(smiset)))) {
		warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
	}
	return(smiset)
})

## Define print behavior for SMI
setMethod(f="show", signature="SMI",                
   definition=function(object) {
         cat("An instance of ", "\"", class(object), "\"", "\n", sep="")
         print(object@smiles)
})

## SMIset to list with many SMI objects (for summary view)
setAs(from="SMIset", to="SMI", 
        def=function(from) {
                tmp <- lapply(seq(along=from), function(x) from[[x]])
		names(tmp) <- cid(from)
		return(tmp)
})
setMethod(f="view", signature="SMIset", definition=function(x) { as(x, "SMI") })
# view(smiset)

## Import SMILES Files and Store them as SMIset Objects
read.SMIset <- function(file, removespaces=TRUE, ...) {
	smisettmp <- readLines(file, ...)
	if(removespaces==TRUE) smisettmp <- gsub(" {1,}", "", smisettmp)
	## Add compound names where they are missing
	index <- !grepl("\t.{1,}$", smisettmp)
	if(any(index)) smisettmp[index] <- paste(smisettmp[index], "\t", "CMP", which(index), sep="")
	## Construct and return SMIset object
	smiset <- gsub("\t.*$", "", smisettmp)
	names(smiset) <- gsub("^.*\t", "", smisettmp)
        smiset <- as(smiset, "SMIset")
        return(smiset)
}

## Write SMI/SMIset Objects to File 
write.SMI <- function(smi, file, cid=TRUE, ...) {
	if(class(smi)=="SMI") smima <- cbind(as.character(smi), "CMP1")
	if(class(smi)=="SMIset") smima <- cbind(as.character(smi), cid(smi), ...)
	if(cid==TRUE) write.table(smima, file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names = FALSE) 
	if(cid==FALSE) write.table(smima[ , 1, drop = TRUE], file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names = FALSE) 
} 

#######################################################
## (5) Class and Method Definitions for AP and APset ##
#######################################################
## Function to coerce SDF class to old non-S4 AP CMP object
SDF2apcmp <- function(SDF) { 
	atoms <- gsub("_.*", "", rownames(atomblock(SDF)))
	u <- as.numeric(bondblock(SDF)[,1])
	n_atoms <- length(atoms)
	n_bonds <- length(u)
	v <- as.numeric(bondblock(SDF)[,2])
	t <- as.numeric(bondblock(SDF)[,3])

	cmp = list(atoms=atoms, bonds=list(u=u, v=v, t=t), n_atoms=n_atoms, n_bonds=n_bonds)

	# assume we have an SDF object, not an SDFset
	sdfstr=as(as(SDF,"SDFstr"),"list")[[1]]
	defs = paste(sdfstr,collapse="\n")
	#defs = unlist(Map(function(x) paste(x,collapse="\n"), sdfstrList) )
	d <- Descriptors()
	if (Descriptors_parse_sdf(self=d, sdf=defs) == 0) {
		stop("SDF format not available or SDF not well-formatted!")
		return(list(n_atoms=0, n_bonds=0, desc_obj=NULL))
	}
	cmp$desc_obj=d

	return(cmp)
}

## Define AP/APset S4 classes for single AP vector and AP list
setClass("AP", representation(AP="numeric"))
setClass("APset", representation(AP="list", ID="character"))

## Create instance of APset form SDFset 
sdf2ap <- function(sdfset, type="AP",uniquePairs=TRUE) {
        if(!class(sdfset) %in% c("SDF", "SDFset")) stop("Functions expects input of classes SDF or SDFset.")
        if(class(sdfset)=="SDF") {
	         if(type=="AP") {
                	return(new("AP",
										AP=genAPDescriptors(sdfset,uniquePairs)))
        	   }
	         if(type=="character") {
                	return(paste(genAPDescriptors(sdfset,uniquePairs), collapse=", "))
        	   }
	     }
        if(class(sdfset)=="SDFset") {
                aplist <- as.list(seq(along=sdfset))
                exception <- FALSE
                for(i in seq(along=aplist)) {
                        tmp <- try(genAPDescriptors(sdfset[[i]],uniquePairs))
                        if(length(tmp) > 0 & class(tmp)!="try-error") {
                                aplist[[i]] <- tmp
                        } else if(length(tmp) == 0 & class(tmp)!="try-error") {
                                aplist[[i]] <- 0 # Value to use if no atom pairs are returned 
                                exception <- TRUE
                        } else if(class(tmp)=="try-error") {
                                aplist[[i]] <- 1 # Value to use if error is returned 
                                exception <- TRUE
                        }
                }
		if(exception) {
                        warning("One or more compounds failed to return APs. To identify them, run: \n\t which(sapply(as(apset, \"list\"), length)==1)")
		}
                if(type=="AP") {
                	return(new("APset", AP=aplist, ID=cid(sdfset)))
        	}
		if(type=="character") {
			names(aplist) <- cid(sdfset)
                	return(sapply(aplist, paste, collapse=", "))
        	}
        }
}

## Create AP Fingerprints
desc2fp <- function(x, descnames=1024, type="FPset") {
	if(length(descnames) == 1) {
		data(apfp)
		descnames <- as.character(apfp$AP)[1:descnames]
	}	
	if(class(x)=="APset") { 
        	apfp <- matrix(0, nrow=length(x), ncol=length(descnames), dimnames=list(cid(x), descnames))
		apsetlist <- ap(x)
                for(i in cid(x)) apfp[i, descnames %in% as.character(apsetlist[[i]])] <- 1
        } else if(class(x)=="list") {
        	apfp <- matrix(0, nrow=length(x), ncol=length(descnames), dimnames=list(names(x), descnames))
		for(i in names(x)) apfp[i, descnames %in% as.character(x[[i]])] <- 1
	} else {
		stop("x needs to be of class APset or list")
	}
        if(type=="FPset") {
                #return(as(apfp, "FPset"))
                return(new("FPset",fpma=apfp,type="apfp"))
        }
        if(type=="matrix") {
                return(apfp)
        }
        if(type=="character") {
                return(sapply(rownames(apfp), function(x) paste(apfp[x,], collapse="")))
        }
}
## Usage: 
# apfpset <- desc2fp(x=apset, descnames=1024, type="matrix")

## Accessor methods for APset class
setGeneric(name="ap", def=function(x) standardGeneric("ap"))

setMethod(f="ap", signature="AP", definition=function(x) { return(x@AP) })
setMethod(f="ap", signature="APset", definition=function(x) { tmp <- x@AP; names(tmp) <- x@ID; return(tmp) })
setMethod(f="cid", signature="APset", definition=function(x) { return(x@ID) })

## Replacement method for ID component of APset using accessor methods.
## Note: generic for cid() is defined under SDFset class section.
setReplaceMethod(f="cid", signature="APset", definition=function(x, value) {
	x@ID <- value 
	if(any(duplicated(x@ID))) { 
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(apset) <- makeUnique(cid(apset))")
	}
	return(x)
})

## Replacement method for APset using "[" operator 
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="APset", definition=function(x, i, j, value) {
	x@AP[i] <- ap(value)
	x@ID[i] <- cid(value)
	return(x)
})

## Behavior of "[" operator for APset 
setMethod(f="[", signature="APset", definition=function(x, i, ..., drop) {
	if(is.logical(i)) {
                i <- which(i)
        }
	if(is.character(i)) { 
		ids <- seq(along=x@ID)
		names(ids) <- x@ID
		i <- ids[i]
	}
	x@AP <- x@AP[i]                   
	x@ID <- x@ID[i]                   
	if(any(duplicated(i))) {
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(apset) <- makeUnique(cid(apset))")
	}
	return(x)
})

## Replacement method for APset using "[[" operator 
setReplaceMethod(f="[[", signature="APset", definition=function(x, i, value) {
	x@AP[[i]] <- value
	return(x)
})

## Behavior of "[[" operator for APset to convert single APset component to AP
setMethod(f="[[", signature="APset", definition=function(x, i, ..., drop) {
	if(is.character(i)) { i <- which(x@ID %in% i) }
	return(new("AP", AP=x@AP[[i]]))                 
})

## Length function
setMethod(f="length", signature="APset",
    definition=function(x) {
    	return(length(ap(x)))
})

## Print behavior for APset
setMethod(f="show", signature="APset",
	definition=function(object) { 
		cat("An instance of ", "\"", class(object), "\"", " with ", length(object), " molecules", "\n", sep="")
		if(any(duplicated(object@ID))) {
			warning("The values in the CMP ID slot are not unique. To fix this, run: cid(apset) <- makeUnique(cid(apset))")
		}
})

## Concatenate function for APset
## Note: is currently limited to 2 arguments!
setMethod(f="c", signature="APset", definition=function(x, y) {
	aplist1 <- as(x, "list")
	aplist2 <- as(y, "list")
	aplist <- c(aplist1, aplist2)
	apset <- as(aplist, "APset")
	if(any(duplicated(cid(apset)))) {
		warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
	}
	return(apset)
})

## Define print behavior for AP
setMethod(f="show", signature="AP",                
   definition=function(object) {
         cat("An instance of ", "\"", class(object), "\"", "\n", sep="")
         cat("<<atom pairs>>", "\n", sep="")
         if(length(object@AP)>=5) {
             cat(c(object@AP[1:5], "... length:", length(object@AP), "\n"))
         } else {
             print(object@AP)}
})

## Coerce Methods for APset Class
## APset to list with many AP objects
setAs(from="APset", to="list",
        def=function(from) {
                ap(from)
})

## List of APs to APset
setAs(from="list", to="APset", 
        def=function(from) {
                new("APset", AP=from, ID=names(from))
})

## APset to list with many AP objects (for summary view)
setAs(from="APset", to="AP", 
        def=function(from) {
                tmp <- lapply(seq(along=from), function(x) from[[x]])
		names(tmp) <- cid(from)
		return(tmp)
})
setMethod(f="view", signature="APset", definition=function(x) { as(x, "AP") })
# view(apset)

## Coerce APset to old list-style descriptor database used by search/cluster functions
apset2descdb <- function(apset) {
                list(descdb=ap(apset), cids=cid(apset), sdfsegs=NULL, source="SDFset", type="SDFset")
}

#######################################################
## (6) Class and Method Definitions for FP and FPset ##
#######################################################
## Define FP and FPset classes
setClass("FP", representation(fp="numeric",type="character",foldCount="numeric"),
			prototype=list(foldCount=0))
setMethod(f="initialize", signature="FP",definition= function(.Object, ...){
			  obj <- callNextMethod(.Object, ...)
			  if(length(obj@type)==0)
				  obj@type=paste("unknown",as.integer(runif(1)*10000),sep="-")
			  obj
			})

setClass("FPset", representation(fpma="matrix",type="character",foldCount="numeric"),
			prototype=list(foldCount=0))
setMethod(f="initialize", signature="FPset",definition= function(.Object, ...){
			  obj <- callNextMethod(.Object, ...)
			  if(length(obj@type)==0)
				  obj@type=paste("unknown",as.integer(runif(1)*10000),sep="-")
			  obj
			})

## Methods to return FPSet as vector or matrix, respectively
setMethod(f="as.vector", signature="FP", definition=function(x) {return(x@fp)})
setMethod(f="as.numeric", signature="FP", definition=function(x) {return(x@fp)})
setMethod(f="as.character", signature="FP", definition=function(x) {return(paste(x@fp, collapse=""))})
setMethod(f="as.matrix", signature="FPset", definition=function(x) {return(x@fpma)})
setMethod(f="as.character", signature="FPset", definition=function(x) {sapply(rownames(x@fpma), function(y) paste(x@fpma[y,], collapse=""))})

## Constructor methods
## Numeric vector to FP with: as(myvector, "FP")
setAs(from="numeric", to="FP",  
        def=function(from) {
		new("FP", fp=from)
})
## Matrix to FPset with: as(mymatrix, "FPset")
setAs(from="matrix", to="FPset",  
        def=function(from) {
		new("FPset", fpma=from)
})

## Character to FPset with: as(mychar, "FPset")
setAs(from="character", to="FPset",  
        def=function(from) {
		read.AP(from, type="fp")
})

## Accessor methods for FPset
## Note: generic for cid() is defined under SDFset class section.
setMethod(f="cid", signature="FPset", definition=function(x) { return(rownames(x@fpma)) })

## Replacement method for ID component of FPset using accessor methods.
setReplaceMethod(f="cid", signature="FPset", definition=function(x, value) {
	rownames(x@fpma) <- value 
	if(any(duplicated(rownames(x@fpma)))) { 
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(fpset) <- makeUnique(cid(fpset))")
	}
	return(x)
})

foldVector <- function(v,count=1,bits=NULL){
	 len = length(v)
	 if(len%%2!=0)
		 stop("must have an even number of bits to fold")

	 if(!is.null(bits))
		 count=foldsNeeded(len,bits)

	 #message("count: ",count," len: ",len)
	 realCount=0
	 for(i in rep(0,count)){
		 if(len==1)
			 break
		 mid = len/2
		 #message("mid: ",mid)
		 v = mapply(function(a,b) if(a||b)1 else 0,v[1:mid],v[(mid+1):len])
		 len = length(v)
		 realCount=realCount+1
	 }
	 if(count!=realCount)
		 warning(count," folds requested but could only fold ",realCount," times")
	 list(fp=v,actualFoldCount=realCount)
}
foldsNeeded <- function(length,bits){
	count = log2(length) - log2(bits)
	if(count != as.integer(count))
		stop("not possible to achieve ",bits," with an integral number of folds")
	count
}
setGeneric(name="fold", def=function(x,count=1,bits=NULL) standardGeneric("fold"))
setMethod(f="fold",signature="FP", definition=function(x,count,bits){
			 result = foldVector(x@fp,count,bits)
			 x@fp = result$fp
			 x@foldCount = x@foldCount+result$actualFoldCount
			 x
})
setMethod(f="fold",signature="FPset", definition=function(x,count,bits) {
			 actualFolds=0
			 data = apply(x@fpma,c(1),function(y){
								 result = foldVector(y,count,bits)
								 actualFolds <<- result$actualFoldCount #just keep the last one, they should all be the same
								 result$fp	 })
			 if(class(data) != "matrix") # account for case when fp gets folded down to 1
				 dim(data) = c(nrow(x@fpma),1)
			 else
				 data = t(data)
			 x@fpma = data
			 x@foldCount = x@foldCount+actualFolds
			 x
})

setGeneric(name="fptype", def=function(x) standardGeneric("fptype"))
setMethod(f="fptype", signature="FP",definition=function(x) x@type)
setMethod(f="fptype", signature="FPset",definition=function(x) x@type)

setGeneric(name="foldCount", def=function(x) standardGeneric("foldCount"))
setMethod(f="foldCount", signature="FP",definition=function(x) x@foldCount)
setMethod(f="foldCount", signature="FPset",definition=function(x) x@foldCount)

setGeneric(name="numBits", def=function(x) standardGeneric("numBits"))
setMethod(f="numBits", signature="FP",definition=function(x) length(x@fp))
setMethod(f="numBits", signature="FPset",definition=function(x) ncol(x@fpma))

## Replacement method for FPset using "[" operator 
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="FPset", definition=function(x, i, value) {
	x@fpma[i,] <- value
	return(x)
})

## Behavior of "[" operator for FPset 
setMethod(f="[", signature="FPset", definition=function(x, i, ..., drop) {
	if(is.logical(i)) {
                i <- which(i)
        }
	x@fpma <- x@fpma[i, , drop=FALSE]                   
	if(any(duplicated(i))) {
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(fpset) <- makeUnique(cid(fpset))")
	}
	return(x)
})

## Behavior of "[[" operator for FPset to convert single FPset component to FP
setMethod(f="[[", signature="FPset", definition=function(x, i, ..., drop) {
	if(is.character(i)) { i <- which(rownames(x@fpma) %in% i) }
	return(new("FP", fp=x@fpma[i,]))                 
})

## Length function
setMethod(f="length", signature="FPset",
    definition=function(x) {
    	return(length(as.matrix(x)[,1]))
})

## Define print behavior for FPset
setMethod(f="show", signature="FPset", 
	definition=function(object) {    
	cat("An instance of a ", length(object@fpma[1,]), " bit ", "\"", class(object), 
		 "\" of type \"",object@type,"\" with ",
		 length(object@fpma[,1]), " molecules", "\n", sep="")
})

setMethod(f="c",signature="FP",definition=function(x,...){
	args = list(x,...)

	type=NULL
	length=0
	fpma = t(sapply(args,function(obj){
		if(!inherits(obj,"FP"))
			stop("all arguments must be of type FP")

		if(is.null(type))
			type<<-obj@type
		if(length==0)
			length <<- numBits(obj)

		if(obj@type != type)
			stop("found differing types: ",obj@type," and ",type)
		if(numBits(obj) != length)
			stop("found differing number of bits: ",numBits(obj)," and ",length)

		obj@fp
	}))
	fpset=new("FPset",fpma=fpma,type=type)
	if(any(duplicated(cid(fpset)))) 
		warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
	fpset
})
## Concatenate function for FPset
setMethod(f="c", signature="FPset", definition=function(x, ...) {
	args = list(x,...)
	type=NULL
	length=0
	fpma = Reduce(rbind,Map(function(obj){
		if(!inherits(obj,"FPset"))
			stop("all arguments must be of type FPset")

		if(is.null(type))
			type<<-obj@type
		if(length==0)
			length <<- numBits(obj)

		if(obj@type != type)
			stop("found differing types: ",obj@type," and ",type)
		if(numBits(obj) != length)
			stop("found differing number of bits: ",numBits(obj)," and ",length)

		obj@fpma
	},args))
	fpset=new("FPset",fpma=fpma,type=type)
	if(any(duplicated(cid(fpset)))) 
		warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")

	fpset
})

## Define print behavior for FP
setMethod(f="show", signature="FP",                
   definition=function(object) {
         cat("An instance of ", "\"", class(object), "\" of type \"",object@type, "\"\n", sep="")
         cat("<<fingerprint>>", "\n", sep="")
         if(length(object@fp)>=20) {
             cat(c(object@fp[1:20], "... length:", length(object@fp), "\n"))
         } else {
             print(object@fp)}
})

## FPset to list with many FP objects (for summary view)
setAs(from="FPset", to="FP", 
        def=function(from) {
                tmp <- lapply(seq(along=from), function(x) from[[x]])
		names(tmp) <- cid(from)
		return(tmp)
})
setMethod(f="view", signature="FPset", definition=function(x) { as(x, "FP") })
# view(fpset)

###################
## (7) Utilities ##
###################

#################################################
## (6.1) Detect Invalid SDFs in SDFset Objects ##
#################################################
validSDF <- function(x, Nabcol = 3, Nbbcol = 3, logic="&", checkNA=TRUE) {
	if(class(x)!="SDFset") 
		warning("x needs to be of class SDFset")
	ab <- atomblock(x); 
	abcol <- sapply(names(ab), function(x) length(ab[[x]][1,]))
	bb <- bondblock(x); 
	bbcol <- sapply(names(bb), function(x) length(bb[[x]][1,]))
	if(logic=="|") 
		validsdf <- abcol >= Nabcol | bbcol >= Nbbcol 
	if(logic=="&") 
		validsdf <- abcol >= Nabcol & bbcol >= Nbbcol 
	if(checkNA==TRUE) {
        	abNA <- sapply(names(ab), function(x) !any(is.na(ab[[x]])))
        	bbNA <- sapply(names(bb), function(x) !any(is.na(bb[[x]])))
		validsdf <- validsdf & abNA & bbNA
	}
	return(validsdf)
}

######################################################################
## (6.2) Create Unique CMP Names by Appending a Counter to Duplates ##
######################################################################
makeUnique <- function(x, silent=FALSE) {
	if(all(!duplicated(x))) {
		if(silent!=TRUE) print("No duplicates detected!")
		return(x)
	} else {
		count <- table(x); count <- count[x]
		dupids <- count[count>1]; dupids <- dupids[!duplicated(names(dupids))]
		for(i in seq(along=dupids)) {
			names(count)[names(count) %in% names(dupids[i])] <- paste(names(dupids)[i], seq(1, dupids[i]), sep="_")
		}
		if(silent!=TRUE) print(paste("Counter appended to", length(dupids), "duplicates!"))
		return(names(count))
	}
}

###############################
## (6.3) Molecule Properties ##
###############################
## (6.3.1) Atom count matrix
atomcountMA <- function(x, ...) {
        if(class(x)=="SDF") x <- as(x, "SDFset")
	atomcountlist <- atomcount(x, ...) 	

	columns <- unique(unlist(lapply(seq(along=atomcountlist), function(x) names(atomcountlist[[x]]))))
        myMA <- matrix(NA, length(atomcountlist), length(columns), dimnames=list(NULL, columns))
        for(i in seq(along=atomcountlist)) myMA[i, names(atomcountlist[[i]])] <- atomcountlist[[i]]
	myMA[is.na(myMA)] <- 0
	rownames(myMA) <- names(atomcountlist)
	return(myMA)
}

## (6.3.2) Molecular weight (MW data from http://iupac.org/publications/pac/78/11/2051/)
data(atomprop); atomprop=atomprop
MW <- function(x, mw=atomprop, ...) {
   if(class(x)=="SDF") x <- as(x, "SDFset")
	
	## Create MW vector with atom symbols in name slot
	AW <- mw$Atomic_weight
	names(AW) <- mw$Symbol

	## Calculate MW
	propma <- atomcountMA(x, ...)
	MW <- rowSums(t(t(propma) * AW[colnames(propma)]), na.rm = TRUE) 
	return(MW)
}

## (6.3.3) Molecular formula
MF <- function(x, ...) {
        if(class(x)=="SDF") x <- as(x, "SDFset")
	propma <- atomcountMA(x, ...)
	propma <- propma[c(1, seq(along=propma[,1])),] # Duplicates first row to support processing of single molecule with same code
	hillorder <- colnames(propma); names(hillorder) <- hillorder
	hillorder <- na.omit(unique(hillorder[c("C", "H", sort(hillorder))]))
	propma <- propma[, hillorder]
	propma[propma==1] <- ""
	MF <- paste(colnames(propma), t(propma), sep="")
	propma <- matrix(MF, nrow=length(propma[,1]), ncol=length(propma[1,]), dimnames=list(rownames(propma), colnames(propma)), byrow=TRUE)
	MF <- seq(along=propma[,1]); names(MF) <- rownames(propma)
	zeroma <- matrix(grepl("[\\*A-Za-z]0$", propma), nrow=length(propma[,1]), ncol=length(propma[1,]), dimnames=list(rownames(propma), colnames(propma)))
	propma[zeroma] <- ""
	for(i in seq(along=MF)) { MF[i] <-  paste(propma[i,], collapse="") }
	return(MF[-1]) # Minus one to remove duplicated entry in first row of propma
}

## (6.3.4) Ring Perception and Aromaticity Assignment
## Implements with some modifications the exhaustive ring perception algorithm 
## from Hanser et al (1996). URL: http://pubs.acs.org/doi/abs/10.1021/ci960322f

## (a) Iterative removal of atoms with single non hydrogen bonds.
## Returns from molecule only its rings and their inter-connections.
.cyclicCore <- function(x) {
	if(length(x) != 1) stop("x needs to be a single molecule")
	if(class(x) == "SDFset") x <- x[[1]]
	path <- conMA(x, exclude="H")
	noconnect <- rowSums(path) != 0 # Removes atoms with no connections
	path <- path[noconnect, noconnect]
	if(all(dim(path) == 0)) { return(path) } 
	term <- which(rowSums(path > 0)==1)
	while(length(term) > 0) {
		path <- path[-term,-term]
		if(any(dim(path) == 0) | is.vector(path)) { break() }
		term <- which(rowSums(path > 0)==1)
	}
	return(path)
}

## (b) Function to return the longest possible linear bond paths where:
##     - internal atoms have only two heavy atom neighbors
##     - terminal atoms are atoms with more than two heavy atom neighbors or they are ring closures
.linearCon <- function(x) {
	secatoms <- rowSums(x > 0) == 2
	secatoms <- names(which(secatoms))
	.linearCon <- function(vertex, x=x) {
		con <- as.list(names(which(x[vertex, ] > 0)))
		for(i in seq(along=con)) {
			termatom <- con[[i]][length(con[[i]])]
			termcon <- names(which(x[termatom, ] > 0))
			termcon <- termcon[!termcon %in% vertex]
			while(length(termcon) == 1 & !any(duplicated(con[[i]]))) { 
				con[[i]] <- c(con[[i]], termcon)						
				termatom <- con[[i]][length(con[[i]])]
				termcon <- names(which(x[termatom, ] > 0))
				termcon <- termcon[!termcon %in% con[[i]][length(con[[i]])-1]]
			}
		}
		if(paste(sort(unique(con[[1]])), collapse="") == paste(sort(unique(con[[2]])), collapse="")) {
			return(con[[1]])
		} else {
			return(c(rev(con[[1]]), vertex, con[[2]]))
		}
	}
	linearconlist <- lapply(secatoms, function(y) .linearCon(vertex=y, x=x))
	nodups <- !duplicated(sapply(linearconlist, function(y) paste(sort(unique(y)), collapse=""))) 
	linearconlist <- linearconlist[nodups]
	return(linearconlist)
}

## (c) Assemble intermediate results in a list
.update <- function(con, path) {
	## Remove non-terminal atoms in each path
	center_atoms <- unique(unlist(lapply(path, function(x) x[-c(1, length(x))])))
	con1 <- con[!rownames(con) %in% center_atoms, !colnames(con) %in% center_atoms]
	## Add atom pairs with three neighbors to connection list
	if(is.matrix(con1)) {
		remainbonds <- con1
		remainbonds[lower.tri(remainbonds)] <- 0
		remainbonds <- lapply(rownames(remainbonds), function(x) names(which(remainbonds[x,] > 0)))
		names(remainbonds) <- rownames(con1)
		remainbonds <- cbind(rep(names(remainbonds), sapply(remainbonds, length)), unlist(remainbonds, use.names=F))
		remainbonds <- split(remainbonds, seq(along=remainbonds[,1]))
		path <- c(path, remainbonds)
		names(path) <- seq(along=path)
	}
	## Collect complete rings and remove them from path object
	index <- unlist(lapply(path, function(y) any(duplicated(y))))
	rings <- path[index]
	path <- path[!index]
	names(path) <- seq(along=path)
	## Connection list for path component
	conpath <- t(sapply(path, function(x) x[c(1, length(x))]))
	ends <- unique(as.vector(conpath))
	conpath <- lapply(ends, function(x) as.numeric(names(which(rowSums(conpath==x) > 0))))
	names(conpath) <- ends
	conpath <- conpath[sapply(conpath, length) > 1] # removes ends that occur only once 
	## Assemble results in list
	return(list(con=con, conpath=conpath, path=path, rings=rings))
}

## (d) Return rings from cyclist object
.rings <- function(cyclist, upper=Inf) {
	## Define data containers
	pathlist <- cyclist$path
	conpath <- cyclist$conpath
	pathlistnew <- list() 
	rings <- list()
	## Loop to join linear paths/fragments stored in pathlist
	for(i in names(conpath)) {
		if(length(conpath) == 0 | !any(names(conpath) == i)) { next() }
		pos <- t(combn(conpath[[i]], m=2))
		for(j in seq(along=pos[,1])) { 
			p1 <- pathlist[[pos[j,1]]]
			p2 <- pathlist[[pos[j,2]]]
			if(sum(p1[-c(1,length(p1))] %in% p2[-c(1,length(p2))]) > 0) {
				next()
			}
			if(p1[1] == i & p2[1] == i) { # matching s1:s2 
				pathlistnew[[length(pathlistnew)+1]] <- c(rev(p2[-1]), p1)
			}
			if(p1[length(p1)] == i & p2[length(p2)] == i) { # matching e1:e2 
				pathlistnew[[length(pathlistnew)+1]] <- c(p1, rev(p2[-length(p2)]))
			}
			if(p1[1] == i & p2[length(p2)] == i) { # matching s1:e2
				pathlistnew[[length(pathlistnew)+1]] <- c(p2, p1[-1])
			}
			if(p1[length(p1)] == i & p2[1] == i) { # matching e1:s2
				pathlistnew[[length(pathlistnew)+1]] <- c(p1, p2[-1])
			}
		}
		## Various postprocessing routines for joined fragments follow
		if(length(pathlistnew) == 0) { next() }
		## Remove duplicates
		dups <- duplicated(sapply(pathlistnew, function(x) paste(sort(unique(x)), collapse="_")))
		pathlistnew <- pathlistnew[!dups]
		## Set maximum ring size; improves time performance if outer rings are not needed
		if(upper != Inf) {   
			l <- sapply(pathlistnew, length)
			pathlistnew <- pathlistnew[l <= upper]
			if(length(pathlistnew) == 0) { next() }
		} 
		## Collect complete rings and remove them from path object
		index <- unlist(lapply(pathlistnew, function(y) any(duplicated(y[c(1, length(y))]))))
		rings[[length(rings)+1]] <- pathlistnew[index]
		pathlistnew <- pathlistnew[!index]
		## Remove paths with internal duplicates 
		if(length(pathlistnew) > 0) {
			index <- unlist(lapply(pathlistnew, function(y) any(duplicated(y))))
			pathlistnew <- pathlistnew[!index]
		}
		## Update pathlist and conpath
		pathlist <- c(pathlist[-conpath[[i]]], pathlistnew)
		dups <- duplicated(sapply(pathlist, function(x) paste(sort(unique(x)), collapse="_")))
		pathlist <- pathlist[!dups]
		names(pathlist) <- seq(along=pathlist)
		conpath <- t(sapply(pathlist, function(x) x[c(1, length(x))]))
		ends <- unique(as.vector(conpath))
		conpath <- lapply(ends, function(x) as.numeric(names(which(rowSums(conpath==x) > 0))))
		names(conpath) <- ends
		conpath <- conpath[sapply(conpath, length) > 1] # removes ends that occur only once
		pathlistnew <- list()
	}
	## Generate proper output format 
	rings <- unlist(rings, recursive=FALSE)
	dups <- duplicated(sapply(rings, function(x) paste(sort(unique(x)), collapse="_")))
	rings <- c(cyclist$rings, rings[!dups])
	l <- sapply(rings, length) 
	rings <- rings[order(l)]
	if(upper != Inf) { rings <- rings[l <= upper] }
	if(length(rings) > 0) { 
		names(rings) <- paste("ring", seq(along=rings), sep="") 
	} else {
		rings <- NULL
	}
	return(rings)
}

## (e) Identify inner rings
## Expected input x is list of rings from call: 
##      x <- rings(x=sdfset[[1]], upper=Inf, type="all", arom=FALSE) 
.is.inner <- function(x) {
        rnames <- rev(names(x)); names(rnames) <- rnames
        r <- x
        names(r) <- paste(names(r), "_", sep="")
        r <- unlist(r)
        for(i in rnames) {
                tmp <- x[[i]]
                r2 <- r[!gsub("_.*", "", names(r)) %in% i] 
                if(all(tmp %in% r2)) {
                        rnames[i] <- "redundant"
                        r <- r2
                }
        }
        return(x[rev(rnames!="redundant")])
}

## (f) Aromaticity assignment for rings
## Approach
##   (i) Identify rings where all atoms are sp2 hybridized. This means each atom has a 
##       double bond or at least one lone electron pair and is attached to a sp2 hybridized atom
##   (ii) Hueckel's rule needs to be true (4n+2=integer): 2, 6, 10, 14, 18, ... pi electrons 
##        per ring.
.is.arom <- function(sdf, rings) {
	if(length(rings)==0) { return(NULL) } 
	con <- conMA(sdf)
	b2 <- bonds(sdf)
	b <- b2[,"Nbondcount"] - b2[,"charge"]
	names(b) <- paste(b2[,"atom"], "_", seq(along=b2[,1]), sep="") 
	.neighborsFct <- function(x) { sapply(rownames(x), function(y) paste(sort(paste(x[y,][x[y,]!=0], sep="_")), collapse="_")) }
	## Determine aromaticity
	.arom <- function(con, b, r=rings[[1]]) {
		## Identify sp2 hybridized atoms
		sp <- .neighborsFct(x=con)[r]
		sp2 <- grepl("1_2$", sp)
		## Identify atoms with lone electron pairs
		el <- c(Al=3, As=5, At=7, B=3, Bi=5, Br=7, C=4, Cl=7, F=7, Ga=3, Ge=4, I=7, In=3, N=5, O=6, P=5, Pb=4, Po=6, S=6, Sb=5, Se=6, Si=4, Sn=4, Te=6, Tl=3)
		bsub <- b[names(sp)]
		lp <- el[gsub("_.*", "", names(bsub))] - bsub >= 2
		lp[is.na(lp)] <- 0 # If an element is not specified under el, then its NA value in lp is set to zero
		sp2lp <- all(sp2 | lp)
		## Hueckel's rule 
		d <- rowSums(con[r, r]==2)
		double <- sum(d) 
		if(length(d[lp]) != 0) {
			lpcount <- sum(lp[d==0]) * 2
		} else {
			lpcount <- 0
		}
		n <- ((double + lpcount) - 2) / 4
		is.wholenumber <- function(x, tol = .Machine$double.eps^0.5)  abs(x - round(x)) < tol
		hueckel <- is.wholenumber(n)
		return(sp2lp & hueckel)
	}
	return(sapply(names(rings), function(x) .arom(con, b, r=rings[[x]])))
}

## (g) Ring and aromaticity perception by running the functions (1)-(5) 
rings <- function(x, upper=Inf, type="all", arom=FALSE, inner=FALSE) {
        if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
	if(inner==TRUE & upper!=Inf) stop("Inner ring prediction requires upper=Inf")
        runAll <- function(x, upper) {
		con <- .cyclicCore(x)
		if(any(dim(con) == 0) | is.vector(con) | all(con == 0)) { # TRUE for linear compounds 
			if(arom==TRUE) {
				if(type=="all") return(list(RINGS=NULL, AROMATIC=NULL))
				if(type=="count") return(c(RINGS=0, AROMATIC=0))
			} else {
				if(type=="all") return(NULL)
				if(type=="count") return(0)
			}
		} else {
			path <- .linearCon(x=con)
			cyclist <- .update(con=con, path=path)
			myrings <- .rings(cyclist, upper+1) # Plus 'upper+1' is required because at this step all rings have duplicated atoms at ring closure
			myrings <- lapply(myrings, function(x) x[-1]) # Removes duplicated atom at ring closure 
			if(upper==Inf & inner==TRUE) myrings <- .is.inner(x=myrings) # Reduces myrings to inner rings only 
			if(arom==TRUE) {
				myarom <- .is.arom(sdf=x, rings=myrings)
				if(type=="all") return(list(RINGS=myrings, AROMATIC=myarom))
				if(type=="arom") return(list(AROMATIC_RINGS=myrings[myarom]))
				if(type=="count") return(c(RINGS=length(myrings), AROMATIC=sum(myarom)))
			} else {
				if(type=="all") return(myrings)
				if(type=="count") return(length(myrings))
			}
		}
	}
	if(class(x)=="SDF") { 
		return(runAll(x, upper))
	}
	if(class(x)=="SDFset" & length(x) == 1) { 
		return(runAll(x[[1]], upper))
	}
	if(class(x)=="SDFset" &  length(x) > 1) {
		myrings <- lapply(1:length(x), function(y) runAll(x[[y]], upper))
		names(myrings) <- cid(x)
		if(type=="all" | type=="arom") return(myrings)
		if(type=="count") {
			mycol <- c("RINGS", "AROMATIC")
			return(matrix(unlist(myrings), ncol=length(myrings[[1]]), dimnames=list(names(myrings), mycol[1:length(myrings[[1]])]), byrow=T))
		}
	}
}
## Usage:
# rings(sdfset[1:4], upper=6, type="all", arom=TRUE)
# rings(sdfset[1:4], upper=6, type="arom", arom=TRUE)
# rings(sdfset[1:4], upper=6, type="count", arom=TRUE)
# plot(sdfset[1], print=F, atomnum=T, no_print_atoms="H") 
# plot(sdfset[1:4], print=F, atomnum=T, no_print_atoms="H") 

## (6.3.5) Enumerate Functional Groups
## (a) Generate neighbor information for each heavy atom in a molecule
.neighbors <- function(x, type="countMA") {
        ## Input checks        
        if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
        if(!any(c("all", "count", "countMA") %in% type)) stop("type can only be assigned: all, count or countMA") 
        ## Return neighbors
        .neighborsFct <- function(x, type=type) {
                colnames(x) <- paste(colnames(x), colSums(x>0), sep="_") # Adds number of bonded heavy atom neighbors (non-hydrogens)
                neighbors <- sapply(rownames(x), function(y) paste(sort(paste(gsub("_.*_", "_", colnames(x)[x[y,]!=0]), x[y,][x[y,]!=0], sep="_")), collapse="_"))
                if(type=="all") {
                        return(neighbors)
                }
                if(type=="count" | type=="countMA") {
                        return(table(paste(gsub("_.*", "", names(neighbors)), neighbors, sep=":")))
                }
        }
        ## Run on SDF object
        if(class(x)=="SDF") {
                x <- conMA(x, exclude="H")
                neighbors <- .neighborsFct(x, type)
                return(neighbors)
        }
        ## Run on SDFset objects containing one or many molecules
        if(class(x)=="SDFset") {
                cid <- cid(x)
                x <- conMA(x, exclude="H")
                neighbor_set <- lapply(seq(along=x), function(y) .neighborsFct(x[[y]], type))
                names(neighbor_set) <- cid
                if(type=="all" | type=="count") {
                        return(neighbor_set)
                }
                if(type=="countMA") {
                        columns <- unique(unlist(lapply(seq(along=neighbor_set), function(x) names(neighbor_set[[x]]))))
                        myMA <- matrix(NA, length(neighbor_set), length(columns), dimnames=list(NULL, columns))
                        for(i in seq(along=neighbor_set)) myMA[i, names(neighbor_set[[i]])] <- neighbor_set[[i]]
                        myMA[is.na(myMA)] <- 0
                        rownames(myMA) <- cid
                        return(myMA)
                }
        }
}
## Usage:
# .neighbors(sdfset[1:4], type="all")
# .neighbors(sdfset[1:4], type="countMA")

## (b) Count functional groups
groups <- function(x, groups="fctgroup", type="countMA") {
        ## Input checks        
        if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
        if(groups=="fctgroup" & (type=="count" | type=="all")) stop("when groups=\"fctgroup\", only type=\"countMA\" can be used")
        mylength <- length(x)
        ## Support for single molecule objects
	if(mylength==1) { x <- as(x, "SDFset"); y <- x; z <- x; cid(z) <- "dummy"; x <- c(y, z) }
        ## Generate neighbor counts
	neighbors <- .neighbors(x, type)
        ## Return neighbor counts if requested
	if(groups[1]=="neighbors") { 
		if(class(neighbors)=="matrix") {
			neighbors <- neighbors[!rownames(neighbors) %in% "dummy", ]
		} else { 
			neighbors <- neighbors[!names(neighbors) %in% "dummy"]
		}
		return(neighbors) 
	
	}
        ## Count functional groups based on data stored in neighbor matrix
        if(groups[1]=="fctgroup") { 
		groups <- c(RNH2="^C:.*N_1_1$", R2NH="^N:C_._1_C_._1$", R3N="^N:C_._1_C_._1_C_._1$", 
                            ROPO3="^P:O_._._O_._._O_._._O_._.$", ROH="(?=^C:.*O_1_1)(?=^(?:(?!(N|O|P|S)_._(2|3)).)*$)", 
		            RCHO="^O:C_2_2", RCOR="^C:C_._1_C_._1.*O_1_2", RCOOH="^C:.*O_1_1_O_1_2", 
			    RCOOR="^C:.*O_1_2_O_2_1", ROR="^O:.*C_._1_C_._1", RCCH="^C:C_2_3", RCN="^N:C_2_3") 
	}
        groupMA <- sapply(names(groups), function(x) rowSums(neighbors[, rep(grep(groups[x], colnames(neighbors), perl=TRUE),2)]/2))
	## Fix counts for ambiguous functional groups
        if(c("ROR" %in% colnames(groupMA))) {
	        groupMA[, "ROR"] <- groupMA[, "ROR"] - groupMA[, "RCOOR"]
                groupMA[groupMA < 0] <- 0
        }
	if(mylength>1) {
                return(groupMA)
        } else {
                return(groupMA[1,])
        }
}
## Usage:
# groups(sdfset[1:20], groups="fctgroup", type="countMA") 
# groups(sdfset[1:4], groups="neighbors", type="countMA")
# groups(sdfset[1:4], groups="neighbors", type="count")
# groups(sdfset[1:4], groups="neighbors", type="all")

##############################################################
## (6.4) Convert SDF Tail to Numeric and Character Matrices ##
##############################################################
## (6.4.1) Store everything in one character matrix
datablock2ma <- function(datablocklist, cleanup=" \\(.*", ...) {
	if(exists("cleanup")) 
		for(i in seq(along=datablocklist)) 
			names(datablocklist[[i]]) <- gsub(cleanup, "", names(datablocklist[[i]])) # Required if name tags contain compound ids
   columns <- unique(unlist(lapply(seq(along=datablocklist), function(x) names(datablocklist[[x]]))))
	myMA <- matrix(NA, length(datablocklist), length(columns), dimnames=list(NULL, columns))
	for(i in seq(along=datablocklist)) 
		myMA[i, names(datablocklist[[i]])] <- datablocklist[[i]]
	rownames(myMA) <- names(datablocklist)
	return(myMA)
}
# Usage:
# blockmatrix <- datablock2ma(datablocklist=datablock(sdfset))

## (6.4.2) Split SDF tail matrix into character and numeric matrices
splitNumChar <- function(blockmatrix) {
	# Define function to check for valid numeric values in a character vector
	numberAble <- function(myvec, type=c("single", "vector"), extras = c(".", "NA")) {
	    type <- match.arg(type)
	    old <- options(warn = -1)
	    on.exit(options(old))
	    myvec <- gsub(" ", "", myvec)
	    myvecsub <- myvec[!myvec %in% c("", extras)]
	    isnum <- !any(is.na(as.numeric(myvecsub)))
	    if (type == "single") 
		isnum
	    else if (isnum) 
		as.numeric(myvec)
	    else myvec
	}
        colindex <- sapply(seq(along=blockmatrix[1,]), function(x) numberAble(blockmatrix[,x]))
        numMA <- blockmatrix[ , colindex]
        storage.mode(numMA) <- "numeric"
        charMA <- blockmatrix[ , !colindex]
        return(list(numMA=numMA, charMA=charMA))
}
# Usage:
# numchar <- splitNumChar(blockmatrix=blockmatrix)

#########################
## (6.5) Bond Matrices ##
#########################
## (6.5.1) Generate bond matrix from SDFset or SDF objects
conMA <- function(x, exclude="none") {
        ## Function for SDF object 
        .conMA <- function(x, exclude=exclude) {
            atoms <- rownames(atomblock(x))
            conma <- matrix(0, length(atoms), length(atoms), dimnames=list(atoms, atoms))
            bondblock <- bondblock(x)
            for(i in seq(along=bondblock[,1])) conma[bondblock[i,1], bondblock[i,2]] <- bondblock[i,3]
            for(i in seq(along=bondblock[,1])) conma[bondblock[i,2], bondblock[i,1]] <- bondblock[i,3]
            index <- !gsub("_.*", "", rownames(conma)) %in% exclude
	    conma <- conma[index, index, drop = FALSE]
            return(conma)
        }   
        ## Run on SDF objects
        if(class(x)=="SDF") {
                conma <- .conMA(x, exclude)
                return(conma)
        }
        ## Run on SDFset objects containing one or many molecules
        if(class(x)=="SDFset") {
                conma_set <- lapply(seq(along=x), function(y) .conMA(x[[y]], exclude))
                names(conma_set) <- cid(x)
                return(conma_set)
        }
}
# Usage:
# conma <- conMA(sdfset[1:2], exclude=c("H"))

## (6.5.2) Compute bond/charge count for each atom in SDFset or SDF objects
## This is used to add hydrogens with methods/functions atomcount, atomcountMA, MW and MF
bonds <- function(x, type="bonds") {
	## Input checks
        if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
	if(!any(c("bonds", "charge", "addNH") %in% type)) stop("type can only be assigned: bonds, charge or addNH") 
	
	## Compute bonds, charges and missing hydrogens
	.bonds <- function(x, type=type) {
		atomMA <- atomblock(x)
		atoms <- gsub("_.*", "", rownames(atomMA))
		bondMA <- bondblock(x)
		Nbonds1 <- cbind(atoms=c(bondMA[,1], bondMA[,2]), bonds=c(bondMA[,3], bondMA[,"C3"]))
		Nbonds1 <- tapply(Nbonds1[, "bonds"], Nbonds1[, "atoms"], sum)
		Nbonds <- rep(0, length(atomMA[,1])); names(Nbonds) <- seq(along=atomMA[,1]); Nbonds[names(Nbonds1)] <- Nbonds1 	
		
		## Valence related to position in periodic table (following octet rule) 
		val <- c("1"=1, "17"=1, "2"=2, "16"=2, "13"=3, "15"=3, "14"=4)
		group <- as.numeric(atomprop$Group); names(group) <- as.character(atomprop$Symbol)
		Nbondrule <- val[as.character(group[atoms])]
		Nbondrule[is.na(Nbondrule)] <- 0 # Atoms with undefined Nbondrule (NAs) are assigned zero
		Nbondrule[Nbondrule < Nbonds] <- Nbonds[Nbondrule < Nbonds] # Set Nbondrule to Nbonds values, where latter is larger 

		## Charges
		charge <- c("0"=0, "1"=3, "2"=2, "3"=1, "4"=0, "5"=-1, "6"=-2, "7"=-3) # 4 is "doublet_radical"
		charge <- charge[as.character(atomMA[,5])]
		Nbonds <- data.frame(atom=atoms, Nbondcount=Nbonds, Nbondrule=Nbondrule, charge=charge) 
		
		## Data type to return
		if(type=="bonds") { return(Nbonds) }
		if(type=="charge") {
			chargeindex <- Nbonds[, "charge"] != 0 
			if(sum(chargeindex) == 0) {
				return(NULL)
			} else {
				chargeDF <- Nbonds[chargeindex, ]
				charge <- chargeDF[, "charge"]
				names(charge) <- chargeDF[, "atom"] 
				return(charge)
			}
		}
		if(type=="addNH") { 
			Nbonds[Nbonds[,"Nbondcount"] >= Nbonds[,"Nbondrule"], "charge"] <- 0 # Ignore charge where Nbondcount greater or equal than Nbondrule
			Nbonds[Nbonds[,"Nbondcount"] == 0, c("Nbondrule", "charge")] <- 0 # Ignore atoms with zero bonds
			NH <- sum((Nbonds[, "Nbondrule"] + Nbonds[, "charge"]) - Nbonds[, "Nbondcount"])
			if(NH < 0) NH <- 0 # Count should not be negative
			return(NH) 
		}
	}
        ## Run on SDF objects
        if(class(x)=="SDF") {
                bonds <- .bonds(x, type)
                return(bonds)
        }
        ## Run on SDFset objects containing one or many molecules
        if(class(x)=="SDFset") {
                bonds_set <- lapply(seq(along=x), function(y) .bonds(x[[y]], type))
                names(bonds_set) <- cid(x)
		if(type=="bonds") {
                	return(bonds_set)
		}
		if(type=="charge") {
                	return(bonds_set)
		}
		if(type=="addNH") {
                	return(unlist(bonds_set))
		}
        }
}
# Usage:
# bondDF <- bonds(sdfset[1], type="df")) 
# bondcount <- bonds(sdfset[1], type="addNH")) 

##########################################################################
## (6.6) Subset SDF/SDFset Objects by Atom Index to Obtain Substructure ##
##########################################################################
## Function to obtain substructure from SDF/SDFset object by providing a row
## index for atom block. Both atom and bond blocks will be subsetted accordingly. 
## Two functions are combined in one: type="new" assigns new atom numbers to
## the subsetted SDF, while type="old" maintains the numbering of the source SDF.
atomsubset <- function (x, atomrows, type="new", datablock = FALSE) {
    ## Variant that assigns new numbers to atoms in subsetted SDF
    if(type=="new") { 
	if(!any(c("SDF", "SDFset") %in% class(x))) 
	    stop("x needs to be of class SDF or SDFset")
	if(class(x) == "SDFset" & class(atomrows) != "list") 
	    stop("if x is of class SDFset, atomrows argument needs to be a list")
	if(class(x) == "SDFset") {
	    if (!all(cid(x) == names(atomrows))) 
		stop("x and atomrows need to have same length and identical component (molecule) names")
	}
	.atomsubset <- function(x, atomrows) {
	    hb <- header(x)
	    ab <- atomblock(x)[atomrows, ]
	    bb <- bondblock(x)
	    index <- rowSums(cbind(bb[, 1] %in% atomrows, bb[, 2] %in% 
		atomrows)) == 2
	    bb <- bb[index, ]
	    pos <- as.numeric(gsub(".*_", "", rownames(ab)))
	    oripos <- 1:length(pos)
	    names(oripos) <- pos
	    tmp <- bb[, 1:2]
	    tmp2 <- tmp
	    for (i in oripos) tmp2[tmp == as.numeric(names(oripos[i]))] <- i
	    bb[, 1:2] <- tmp2
	    if (is.vector(bb)) {
		bb <- t(as.matrix(bb))
	    }
	    countsLine <- hb[4]
	    atomCount <- sprintf("%3d", length(atomrows))
	    bondCount <- sprintf("%3d", length(rowSums(bb)))
	    # update atom count and bond count
	    hb[4] <- paste(atomCount, bondCount, substr(countsLine, 7, 100000L), sep="")
	    # update bond block row names
	    row.names(bb) <- 1:length(rowSums(bb))
	    # update atom block row names
	    row.names(ab) <- paste(gsub("_.*", "",rownames(ab)), 1:length(atomrows), sep="_")
	    if (datablock == FALSE) {
		sdf <- new("SDF", header = hb, atomblock = ab, bondblock = bb)
		return(sdf)
	    }
	    if (datablock == TRUE) {
		sdf <- new("SDF", header = hb, atomblock = ab, bondblock = as.matrix(bb), 
		    datablock = datablock(x))
		return(sdf)
	    }
	}
	if (class(x) == "SDF") {
	    return(.atomsubset(x, atomrows))
	}
	if (class(x) == "SDFset") {
	    ids <- cid(x)
	    sdflist <- lapply(cid(x), function(y) atomsubset(sdfset[[y]], 
		atomrows[[y]]))
	    names(sdflist) <- ids
	    sdfset <- as(sdflist, "SDFset")
	    return(sdfset)
	    }
    }

    ## Variant that maintains atom numbers from source SDF in subsetted SDF
    if(type=="old") { 
        if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
        if(class(x)=="SDFset" & class(atomrows)!="list") stop("if x is of class SDFset, atomrows argument needs to be a list")
        if(class(x)=="SDFset") {
                if(!all(cid(x) == names(atomrows))) stop("x and atomrows need to have same length and identical component (molecule) names")
        }
        .atomsubset <- function(x, atomrows) {
	    hb <- header(x)
	    ab <- atomblock(x)[atomrows, ]
	    bb <- bondblock(x)
	    index <- rowSums(cbind(bb[,1] %in% atomrows, bb[,2] %in% atomrows)) == 2
	    bb <- bb[index,]

	    ## Update bb to positions in ab
	    pos <- as.numeric(gsub(".*_", "", rownames(ab)))
	    oripos <- 1:length(pos)
	    names(oripos) <- pos
	    tmp <- bb[,1:2]; tmp2 <- tmp 
	    for(i in oripos) tmp2[tmp==as.numeric(names(oripos[i]))] <- i
	    bb[,1:2] <- tmp2
	    
	    ## Outputs
	    if(is.vector(bb)) { bb <- t(as.matrix(bb)) }
	    if(datablock==FALSE) {
		sdf <- new("SDF", header=hb, atomblock=ab, bondblock=bb)
		return(sdf)
	    }
	    if(datablock==TRUE) {
		sdf <- new("SDF", header=hb, atomblock=ab, bondblock=as.matrix(bb), datablock=datablock(x))
		return(sdf)
	    }
        }
        if(class(x)=="SDF") {
	    return(.atomsubset(x, atomrows))
        }
        if(class(x)=="SDFset") {
	    ids <- cid(x)
	    sdflist <- lapply(cid(x), function(y) atomsubset(sdfset[[y]], atomrows[[y]]))
	    names(sdflist) <- ids
	    sdfset <- as(sdflist, "SDFset")
	    return(sdfset)
        }
    }
}

## Usage: 
# atomsubset(sdfset[[1]], atomrows=1:18, type="new")
# atomsubset(sdfset[1:2], atomrows=list(CMP1=1:18, CMP2=1:12), type="new")

###################################
## (6.7) Function write.SDFsplit ##
###################################
## Splits SD Files into any number of smaller SD Files                                                                   
write.SDFsplit <- function(x, filetag, nmol) {
        from <- seq(1, length(x), by=nmol)
        splitDF <- data.frame(from=from, to=c(from[-1], length(x)+1)-1, filename=NA)
        splitDF[,"filename"] <- paste(filetag, sprintf(paste("%0", nchar(as.character(length(x))), "d", sep=""), splitDF[,1]), "_", 
                          sprintf(paste("%0", nchar(as.character(length(x))), "d", sep=""), splitDF[,2]), ".sdf", sep="")
        for(i in seq(along=splitDF[,1])) {
                write.SDF(x[splitDF[i,"from"]:splitDF[i,"to"]], splitDF[i,"filename"])
        }                                                                                             
	return(splitDF) # Gives access to file names to simplify import of split SD Files
}                   

## Usage 
# write.SDFsplit(x=sdfstr, filetag="myfile", nmol=10)
# write.SDFsplit(x=sdfsample, filetag="myfile", nmol=10)

######################################
## (6.8) Streaming Through SD Files ##
######################################
## Streaming function to compute descriptors for large SD Files without consuming much memory.
## In addition to descriptor values, it returns a line index that defines the positions of each 
## molecule in the source SD File. This line index can be used by the read.SDFindex function to 
## retrieve specific compounds of interest from large SD Files without reading the entire file 
## into memory. 
sdfStream <- function(input, output, append=FALSE, fct, Nlines=10000, startline=1, restartNlines=10000, silent=FALSE, ...) {
	## Define loop parameters 
	stop <- FALSE 
	f <- file(input, "r")
	n <- Nlines
	offset <- 0
	## For restarting sdfStream at specific line assigned to startline argument. If assigned
        ## startline value does not match the first line of a molecule in the SD file then it 
        ## will be reset to the start position of the next molecule in the SD file.
	if(startline!=1) { 
		fmap <- file(input, "r")
		shiftback <- 2
		chunkmap <- scan(fmap, skip=startline-shiftback, nlines=restartNlines, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n")
		startline <- startline + (which(grepl("^\\${4,4}", chunkmap, perl=TRUE))[1] + 1 - shiftback)
		if(is.na(startline)) stop("Invalid value assigned to startline.")
		dummy <- scan(f, skip=startline-2, nlines=1, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n")
		close(fmap)
		offset <- startline - 1 # Maintains abolut line positions in index
	}
	counter <- 0
	cmpid <- 1
	partial <- NULL
	while(!stop) {
		counter <- counter + 1
		chunk <- readLines(f, n = n) # chunk n can be any number of lines
		# chunk <- scan(f, n=n, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n") # scan has more flexibilities for reading specific line ranges in files.
		if(length(chunk) > 0) {
			if(length(partial) > 0) {
				chunk <- c(partial, chunk)
			}
			## Assure that lines of least 2 complete molecules are stored in chunk if available
			inner <- sum(grepl("^\\${4,4}", chunk, perl=TRUE)) < 2
			while(inner) {
				chunklength <- length(chunk)
				chunk <- c(chunk, readLines(f, n = n))
				if(chunklength == length(chunk)) { 
					inner <- FALSE 
				} else {
					inner <- sum(grepl("^\\${4,4}", chunk, perl=TRUE)) < 2
				}
			}
			y <- regexpr("^\\${4,4}", chunk, perl=TRUE) # identifies all fields that start with a '$$$$' sign
			index <- which(y!=-1)
			indexDF <- data.frame(start=c(1, index[-length(index)]+1), end=index)
			complete <- chunk[1:index[length(index)]]
			if((index[length(index)]+1) <= length(chunk)) {
				partial <- chunk[(index[length(index)]+1):length(chunk)]
			} else {
				partial <- NULL
			}
			index <- index + offset 
			indexDF <- data.frame(SDFlineStart=c(offset + 1, index[-length(index)]+1), SDFlineEnd=index)
			offset <- indexDF[length(indexDF[,2]),2]

			## Coerce file lines stored in character vector to SDFset
			sdfset <- read.SDFset(read.SDFstr(complete))
                        valid <- validSDF(sdfset)
                        sdfset <- sdfset[valid]
                        indexDForig <- indexDF
                        indexDF <- indexDF[valid,]
			## Perform desired computation on SDFset
			if(length(indexDF[,1])==1) {
				suppressWarnings(sdfset <- c(sdfset, sdfset)) # Trick to keep data in matrix format
                                resultMA <- fct(sdfset, ...)
				resultMA <- cbind(as.data.frame(indexDF), as.data.frame(resultMA[1, , drop=FALSE]), row.names=row.names(resultMA)[1])
			} else {
				resultMA <- fct(sdfset, ...)
				resultMA <- cbind(as.data.frame(indexDF), as.data.frame(resultMA), row.names=row.names(resultMA))
			}
			resultMA <- resultMA[names(valid),]
                        if(any(is.na(resultMA))) { # Maintains index for invalid compounds having NAs in descriptor fields
                                resultMA[,1:2] <- indexDForig[,1:2]
                        }
                        rownames(resultMA) <- paste("CMP", cmpid : (cmpid + length(resultMA[,1])-1), sep="")
			cmpid <- cmpid + length(resultMA[,1])
			## Print processing status to screen
                        if(silent==FALSE) {
                                print(rownames(resultMA))
                        }
			## Append results to tabular file
			if(counter==1 & append!=TRUE) {
				unlink(output)
				write.table(resultMA, output, quote=FALSE, col.names=NA, sep="\t")
			} else {	
				write.table(resultMA, output, quote=FALSE, append=TRUE, col.names=FALSE, sep="\t")
			}
		}
		if(length(chunk) == 0) {
			stop <- TRUE
			close(f)
		}
	}
}

## Usage:
# library(ChemmineR)
# data(sdfsample); sdfset <- sdfsample
# write.SDF(sdfset, "test.sdf")
## Choose descriptor set in a simple function:
# desc <- function(sdfset) {
#         cbind(SDFID=sdfid(sdfset),
#               # datablock2ma(datablocklist=datablock(sdfset)),
#               MW=MW(sdfset),
#               groups(sdfset),
#               # AP=sdf2ap(sdfset, type="character"),
#               rings(sdfset, type="count", upper=6, arom=TRUE)
#         )     
# }
# sdfStream(input="test.sdf", output="matrix.xls", fct=desc, Nlines=1000)
# indexDF <- read.delim("matrix.xls", row.names=1)[,1:2]
# sdfset <- read.SDFindex(file="test.sdf", index=indexDF, type="SDFset", outfile="sub.sdf") 


#####################################################################
## (6.9) Read Atom Pair String Representation from File into APset ##
#####################################################################
## Function to convert atom pairs (AP) or atom pair fingerprints (APFP) stored
## as character strings to APset or FPset objects (e.g. generated by sdfStream).
## Alternatively, one can provide the AP/APFP strings in a named character vector.
read.AP <- function(x, type, colid,isFile = class(x)=="character" & length(x)==1) {
        if(type=="ap") {
                if(isFile) {
                        x <- read.delim(x, sep="\t", row.names=1)
                        ids <- row.names(x); x <- x[,colid]; names(x) <- ids
                }
                desclist <- strsplit(as.character(x), ", ", fixed = TRUE)
                desclist <- lapply(desclist, as.numeric)
                names(desclist) <- names(x)
                return(as(desclist, "APset"))
        }
        if(type=="apfp" | type=="fp") {
                if(isFile) {
                        x <- read.delim(x, colClasses="character", sep="\t", row.names=1)
                        ids <- row.names(x); x <- x[,colid]; names(x) <- ids
                }
                descma <- matrix(as.numeric(unlist(strsplit(x, ""))), length(x), nchar(x[1]), byrow=TRUE, dimnames=list(names(x), 1:nchar(x[1])))
                #return(as(descma, "FPset"))
                return(new( "FPset",fpma=descma,type="apfp"))
        }
}

## Usage:
# apset <- read.AP(x="matrix.xls", type="ap", colid="AP")
# apfp <- read.AP(x="matrix.xls", type="apfp", colid="APFP")

#########################################################
## (6.10) Extract Molecules from SD File by Line Index ##
#########################################################
## Extracts specific molecules from SD File based on a line position index computed by the sdfStream function
read.SDFindex <- function(file, index, type="SDFset", outfile) {
	f <- file(file, "r") 
	if(type=="SDFset") {
		sdfset <- SDFset()
	}
	## Index transfromations
	index <- index[order(index[,1]),] # Assures positional sorting, which is expected by the following step!!
	index <- data.frame(skip=index[,1] - c(0, index[-length(index[,1]),2]) - 1 , nlines=index[,2] - index[,1] + 1) 
	## Scan through file using index to retrieve molecules one-by-one
	for(i in seq(along=index[,1])) {
		lines <- scan(f, skip=index[i,1], nlines=index[i,2], what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n")
		#delteme# lines <- scan(file, skip=index[i,1]-1, nlines=index[i,2]-index[i,1] + 1, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n") 
		if(type=="file") {
			if(i == 1) {
				unlink(outfile)
				cat(lines, file=outfile, sep="\n")
			} else {	
				cat(lines, file=outfile, sep="\n", append=TRUE)
			}
		}
		if(type=="SDFset") {
			suppressWarnings(sdfset <- c(sdfset, read.SDFset(read.SDFstr(lines))))	
		}
	}
	close(f)
	if(type=="SDFset") {
		cid(sdfset) <- paste("CMP", 1:length(sdfset), sep="")
                return(sdfset)
	}
}
## Usage: see sdfStream()

#################################
## (6.11) String Search Method ##
#################################
## String search function for SDFset
grepSDFset <- function(pattern, x, field="datablock", mode="subset", ignore.case=TRUE, ...) {
	## Generate search vector and index for desired field in SDFset
	if(field=="header" | field==1) {
		searchfield <- header(x)
		searchstr <- as.character(unlist(searchfield))
		index <- sapply(searchfield, length)
		index <- unlist(sapply(index, function(x) seq(1, x)))
		indexnames <- rep(seq(along=searchfield), sapply(searchfield, length))
		names(index) <- indexnames
	}
	if(field=="atomblock" | field==2) { 
		searchfield <- atomblock(x)
		searchstr <- as.character(unlist(sapply(searchfield, rownames)))
		searchstr <- gsub("_.*", "", searchstr)
		index <- sapply(searchfield, function(x) length(x[,1]))
		index <- unlist(sapply(index, function(x) seq(1, x)))
		indexnames <- rep(seq(along=searchfield), sapply(searchfield, function(x) length(x[,1])))
		names(index) <- indexnames
	}
	if(field=="bondblock" | field==3) { # Not very useful for string searching.
		searchfield <- bondblock(x)
		searchstr <- as.character(unlist(sapply(searchfield, rownames)))
		searchstr <- gsub("_.*", "", searchstr)
		index <- sapply(searchfield, function(x) length(x[,1]))
		index <- unlist(sapply(index, function(x) seq(1, x)))
		indexnames <- rep(seq(along=searchfield), sapply(searchfield, function(x) length(x[,1])))
		names(index) <- indexnames
	}
	if(field=="datablock" | field==4) { 
		searchfield <- datablock(x)
		searchstr <- paste(names(unlist(searchfield)), as.character(unlist(searchfield)), sep="___")
		index <- sapply(searchfield, length)
		index <- unlist(sapply(index, function(x) seq(1, x)))
		indexnames <- rep(seq(along=searchfield), sapply(searchfield, length))
		names(index) <- indexnames
	}
	## Search with grep
	matchpos <- grep(pattern, searchstr, ignore.case=ignore.case, ...)
	if(mode=="index") {
		return(index[matchpos])

	}
	if(mode=="subset") {
		xpos <- as.numeric(unique(names(index[matchpos])))
		return(as(x[xpos], "SDF"))
	}
}

##########################
## (7) Plotting Methods ##
##########################
## Plot single CMP Structure
plotStruc <- function(sdf,
							 atomcex=1.2,
							 atomnum=FALSE,
							 no_print_atoms=c("C"),
							 noHbonds=TRUE,
							 bondspacer=0.12,
							 colbonds=NULL,
							 bondcol="red",
							 regenCoords=FALSE,
							 ...) {

	if(regenCoords && .haveOB())
		sdf = regenCoordsOB(sdf)

   toplot <- list(atomblock=cbind(atomblock(sdf)[,c(1:2)], as.matrix(bonds(sdf, type="bonds")[,-1])), bondblock=cbind(as.matrix(as.data.frame(bondblock(sdf))[,1:3]), bondcol=1))
	## Add bond color
	toplot[[2]][, "bondcol"] <- toplot[[2]][,"bondcol"] + as.numeric((toplot[[2]][,"C1"] %in% colbonds) & (toplot[[2]][,"C2"] %in% colbonds))
	## Create empty plot with proper dimensions
	plot(toplot[[1]], type="n", axes=F, xlab="", ylab="", ...)
	## Remove C-hydrogens including their bonds 
	if(noHbonds==TRUE) {
		nonbonded <- !1:length(toplot[[1]][,1]) %in% sort(unique(as.vector(toplot[[2]][,1:2])))
                nonbonded <- as.data.frame(toplot[[1]])[nonbonded,]
                CHbondindex <- sapply(seq(toplot[[2]][,1]), function(x) paste(sort(gsub("_.*", "", rownames(toplot[[1]]))[toplot[[2]][x,1:2]]), collapse="") == "CH")
		toplot[[1]] <- toplot[[1]][sort(unique(as.numeric(toplot[[2]][!CHbondindex,1:2]))), ]
		toplot[[2]] <- as.matrix(as.data.frame(toplot[[2]])[!CHbondindex,]) 
                toplot[[1]] <- as.matrix(rbind(toplot[[1]], nonbonded))
	}	
	## Plot bonds
	z <- toplot[[2]][, "bondcol"]; z[z==2] <- bondcol # Stores bond coloring data
	for(i in seq(along=toplot[[2]][,1])) {
		x <- toplot[[1]][gsub("*.*_", "", rownames(toplot[[1]])) %in% toplot[[2]][i,1:2],1]
		y <- toplot[[1]][gsub("*.*_", "", rownames(toplot[[1]])) %in% toplot[[2]][i,1:2],2]
		## Plot single bonds
		if(toplot[[2]][i,3]==1) {
			lines(x=x, y=y, lty=1, lwd=3, col=z[i]) 
		} 
		## Plot double bonds
		if(toplot[[2]][i,3]==2) {
			rslope <- (atan(diff(y)/diff(x))*180/pi)/90
			lines(x=x-rslope*bondspacer, y=y+(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
			lines(x=x+rslope*bondspacer, y=y-(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
		}
		## Plot triple bonds
		if(toplot[[2]][i,3]==3) {
			rslope <- (atan(diff(y)/diff(x))*180/pi)/90
			bondspacer <- bondspacer * 2
			lines(x=x, y=y, lty=1, lwd=3, col=z[i]) 
			lines(x=x-rslope*bondspacer, y=y+(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
			lines(x=x+rslope*bondspacer, y=y-(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
		}
		## Plot bonds with labels other than 1-3 (e.g. some ChEMBL SDFs use 4 for aromatic bonds)
		if(!toplot[[2]][i,3] %in% c(1,2,3)) {
			rslope <- (atan(diff(y)/diff(x))*180/pi)/90
			lines(x=x-rslope*bondspacer, y=y+(1-abs(rslope))*bondspacer, lty=3, lwd=3, col=z[i])
			lines(x=x+rslope*bondspacer, y=y-(1-abs(rslope))*bondspacer, lty=3, lwd=3, col=z[i])
		}
	}
	## Exclude certain atoms from being printed
	exclude <- paste("(^", no_print_atoms, "_)", sep="", collapse="|")
	labelMA <- toplot[[1]][!grepl(exclude, rownames(toplot[[1]])), , drop=FALSE] # Added July 31, 2012: 'drop=FALSE'
        ## Add charges 
	charge <- c("0"="", "3"="3+", "2"="2+", "1"="+", "-1"="-", "-2"="2-", "-3"="3-")
        charge <- charge[as.character(labelMA[,"charge"])]
        ## Add hydrogens to non-charged/non-C atoms according to valence rules (some SD files require this)
        Nhydrogens <- c("0"="", "1"="H", "2"="H2", "3"="H3", "4"="H4", "5"="H5", "6"="H6", "7"="H7", "8"="H8") 
        hydrogens <- (labelMA[, "Nbondrule"] + labelMA[, "charge"]) - labelMA[,"Nbondcount"]
        hydrogens[labelMA[,"charge"]!=0] <- 0; hydrogens[hydrogens < 0] <- 0
        hydrogens <- Nhydrogens[as.character(hydrogens)]
        ## Plot data
        if(is.vector(labelMA)) labelMA <- matrix(labelMA, 1, 2, byrow=TRUE, dimnames=list(rownames(toplot[[1]])[!grepl(exclude, rownames(toplot[[1]]))], c("C1", "C2")))
	if(is.matrix(labelMA) & length(labelMA[,1])>=1) {
		atomcol <- gsub("_.*", "", rownames(labelMA)); atomcol[!grepl("N|C|O|H", atomcol)] <- "any"; mycol <- c(C="black", H="black", N="blue", O="red", any="green"); atomcol <- mycol[atomcol]

		## Overplot nodes to display atom labels
		points(x=labelMA[,1], y=labelMA[,2], col="white", pch=16, cex=2.8)
		## Plot atom labels
		if(atomnum==TRUE) {
			text(x=labelMA[,1], y=labelMA[,2], paste(gsub("_", "", rownames(labelMA)), hydrogens, charge, sep=""), cex=atomcex, col=atomcol) 
		} else {
			text(x=labelMA[,1], y=labelMA[,2], paste(gsub("_.*", "", rownames(labelMA)), hydrogens, charge, sep=""), cex=atomcex, col=atomcol)
		}
	}
}
## Usage:
# plotStruc(sdf=sdfset[[2]], atomcex=1.2, atomnum=F, no_print_atoms=c("C"), noHbonds=TRUE, bondspacer=0.08)
# par(mfrow=c(2,3)); for(i in 1:6) plotStruc(sdf=sdfset[[i]], atomcex=1.8, atomnum=F, no_print_atoms=c("C"), noHbonds=TRUE, bondspacer=0.08)

## Plot method for single SDF object
setMethod(f="plot", signature="SDF", definition=function(x, print=TRUE, ...) { 
		plotStruc(sdf=x, ...)
		if(print==TRUE) { return(x) } 
	}
)

## Plot method for multiple SDF objects in SDFset
setMethod(f="plot", signature="SDFset",
	definition=function(x, griddim, print_cid=cid(x), print=TRUE, ...) {
		if(missing(griddim)) {
			mydim <- ceiling(sqrt(length(x)))
			griddim <- c(mydim, mydim)
		}
		par(mfrow=griddim)
		for(i in 1:length(x)) { plotStruc(sdf=x[[i]], main=print_cid[i], ...) }
		if(print==TRUE) { return(SDFset2SDF(x)) }
})

## Object for referencing chemminetools jobs
setClass("jobToken", representation=representation(
    tool_name = "character",
    jobId = "character"
))

## Show method for checking status of a submitted chemminetools job
setMethod("show", signature=signature(
    object="jobToken"),
    function(object){
        response <- status(object)
        cat("tool name:\t", slot(object, "tool_name"), "\n")
        cat("status:\t\t", response, "\n")
    }
)

cstrsplit <- function(line) .Call(cstrsplitSym,line)
#cstrsplit <- cxxfunction(signature(l="character"),includes='
#					#include <R.h>
#					#include <boost/algorithm/string.hpp>
#								',body='
#					std::vector<std::string> strs;
#					const char *line = CHAR(STRING_ELT(l,0));
#					boost::split(strs, line, boost::is_any_of("\t "),boost::token_compress_on);
#
#					he
#					CharacterVector output2(strs.begin(),strs.end());
#					return output2;
#
#					',plugin="Rcpp")