R/AllClasses.R

Defines functions SDFDataTable cstrsplit openBabelPlot plotStruc grepSDFset read.SDFindex read.AP sdfStream write.SDFsplit atomsubset bonds conMA splitNumChar datablock2ma groups neighbors rings containedArom s.arom s.inner rings update linearCon cyclicCore MF MW atomcountMA makeUnique validSDF foldsNeeded foldVector apset2descdb desc2fp sdf2ap SDF2apcmp write.SMI read.SMIset SDFset gsubsdfsec read.SDFset parseAttributes trim parseV3000 findPositions ex2vec sdfParse write.SDF read.SDFstr sdfDownload

Documented in apset2descdb atomcountMA atomsubset bonds conMA datablock2ma desc2fp grepSDFset groups makeUnique MF MW openBabelPlot openBabelPlot plotStruc read.AP read.SDFindex read.SDFset read.SDFstr read.SMIset rings sdf2ap SDF2apcmp SDFDataTable SDFset sdfStream splitNumChar validSDF write.SDF write.SDFsplit write.SMI

################################################
## Class and Method Definitions for ChemmineR ##
################################################
## SDF format definition: 
        # http://www.epa.gov/ncct/dsstox/MoreonSDF.html
	# http://www.symyx.com/downloads/public/ctfile/ctfile.jsp

#################################################
## (1) Class and Method Definitions for SDFstr ##
#################################################
## Download PubChem SDF samples for code testing 
## This function is not intended to be used by users.
.sdfDownload <- function(mypath="ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/", myfile="Compound_00650001_00675000.sdf.gz") {
	system(paste("wget ", mypath, myfile, sep=""))
	system(paste("gunzip ", myfile))
}
# .sdfDownload(mypath="ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/", myfile="Compound_00650001_00675000.sdf.gz") 

## Import SD File and Return SDFstr Class (list-like) 
read.SDFstr <- function(sdfstr) {
        if(length(sdfstr) > 1) { # Support for passing on SD File content as character vector
                mysdf <- sdfstr
        } else {
                mysdf <- readLines(sdfstr) # Reads file line-wise into vector
        }
        y <- regexpr("^\\${4,4}", mysdf, perl=TRUE) # identifies all fields that start with a '$$$$' sign
        index <- which(y!=-1)


		  #if the last non-empty line of mysdf is not "$$$$". Only search from the last "$$$$" found
		  # to the end of the file
		  start = if(length(index)==0) 1 else index[length(index)]
		  if("$$$$" != Find(function(line) line !="", mysdf[start:length(mysdf)]
								 ,right=TRUE)){
			  # we have a MOL file, so just insert the $$$$ at the end
			  mysdf <- c(mysdf,"$$$$")
			  index <- c(index,length(mysdf))
		  }

        indexDF <- data.frame(start=c(1, index[-length(index)]+1), end=index)
        mysdf_list <- lapply(seq(along=indexDF[,1]), function(x) mysdf[seq(indexDF[x,1], indexDF[x,2])])
        if(class(mysdf_list) != "list") { mysdf_list <- list(as.vector(mysdf_list)) }
        names(mysdf_list) <- 1:length(mysdf_list)
        mysdf_list <- new("SDFstr", a=mysdf_list)
        return(mysdf_list)
}

## Define SDFstr class
setClass("SDFstr", representation(a = "list"))

## Accessor method for SDFstr
setGeneric(name="sdfstr2list", def=function(x) standardGeneric("sdfstr2list"))
setMethod(f="sdfstr2list", signature="SDFstr", definition=function(x) {return(x@a)}) 

## Replacement method for SDFstr using accessor method
setGeneric(name="sdfstr2list<-", def=function(x, value) standardGeneric("sdfstr2list<-"))
setReplaceMethod(f="sdfstr2list", signature="SDFstr", definition=function(x, value) {
	x@a <- value 
	return(x)
})

## Replacement method for SDFstr using "[" operator 
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="SDFstr", definition=function(x, i, j, value) {
	x@a[i] <- sdfstr2list(value)
	return(x)
})

## Define behavior of "[" operator for SDFstr 
setMethod(f="[", signature="SDFstr", definition=function(x, i, ..., drop) {
		x@a <- x@a[i]                   
		return(x)
})

## Behavior of "[[" operator to convert single SDFstr component to character vector
setMethod(f="[[", signature="SDFstr",
	definition=function(x, i, ..., drop) {
		return(x@a[[i]])                 
})

## Replacement method for SDFstr using "[" operator 
setReplaceMethod(f="[", signature="SDFstr", definition=function(x, i, value) {
	x@a[i] <- value
	return(x)
})

## Replacement method for SDFstr using "[[" operator 
setReplaceMethod(f="[[", signature="SDFstr", definition=function(x, i, value) {
	x@a[[i]] <- value
	return(x)
})

## Define print behavior for SDFstr
setMethod(f="show", signature="SDFstr",
	definition=function(object) { 
		cat("An instance of ", "\"", class(object), "\"", " with ", length(sdfstr2list(object)), " molecules", "\n", sep="")
})

## Length function
setMethod(f="length", signature="SDFstr",
    definition=function(x) {
    	return(length(sdfstr2list(x)))
})

## Write SDF/SDFstr/SDFset Objects to SD File 
write.SDF <- function(sdf, file, cid=FALSE, ...) {
	if(class(sdf)=="SDF") sdfstr <- as(sdf, "SDFstr")
	if(class(sdf)=="SDFstr") sdfstr <- sdf
	if(class(sdf)=="SDFset") {
		if(cid==TRUE) {	sdflist <- lapply(cid(sdf), function(x) sdf2str(sdf=sdf[[x]], cid=x, ...)) }	
		if(cid==FALSE) { sdflist <- lapply(cid(sdf), function(x) sdf2str(sdf=sdf[[x]], ...)) } 
		sdfstr <- as(sdflist, "SDFstr") 
	}
	cat(unlist(sdfstr2list(sdfstr)), sep="\n", file=file)
} 

## Coerce Methods for SDFstr Class
## SDFstr to list
setAs(from="SDFstr", to="list", 
	def=function(from) { 
		sdfstr2list(from)
})

## List to SDFstr
setAs(from="list", to="SDFstr", 
	def=function(from) {
		new("SDFstr", a=from)
})

## List to SDFstr
setAs(from="character", to="SDFstr", 
	def=function(from) {
		new("SDFstr", a=list(from))
})

################################################
## (2) Class and Method Definitions for SDF ##
################################################
setClass("ExternalReference")
setClassUnion("ExternalReferenceOrNULL",members=c("ExternalReference","NULL"))
setClass("SDF", representation(header="character", atomblock="matrix", 
										 bondblock="matrix", datablock="character",
										 obmolRef="ExternalReferenceOrNULL",
										 version="character"),
			prototype=list(obmolRef=NULL,version="V2000"))

## Convert SDFstr to SDF Class
## SDFstr Parser Function

v2kTimes = new.env()
v2kTimes$header = 0
v2kTimes$atom = 0
v2kTimes$bond = 0
v2kTimes$data = 0


.sdfParse <- function(sdf, datablock=TRUE, tail2vec=TRUE,extendedAttributes=FALSE ) {
	t=Sys.time()
	countpos <- grep("V\\d\\d\\d\\d$", sdf, perl=TRUE)
	if(length(countpos)==0)  
		countpos <- grep("V {0,}\\d\\d\\d\\d$", sdf, perl=TRUE) 
	if(length(countpos)==0 || length(countpos)>1) 
		 countpos <- 4 

	countline <- sdf[countpos]

	if(length(grep("V3000$",countline))!=0) # we have a V3000 formatted file
		return(.parseV3000(sdf,datablock,tail2vec,extendedAttributes))

   if(nchar(gsub("\\d| ", "", substring(countline, 1, 6))) != 0) 
		countline <- "  0  0"  # Create dummy countline if it contains non-numeric values 

	Natom <- as.numeric(substring(countline, 1, 3))
	Nbond <- as.numeric(substring(countline, 4, 6))
	start <- c(header=1, atom=countpos+1, bond=countpos+Natom+1, extradata=countpos+Natom+Nbond+1)
	index <- cbind(start=start, end=c(start[2:3], start[4], length(sdf)+1)-1)
	
	## Header block
	header <- sdf[index["header",1]:index["header",2]]
	if(length(header)==4) names(header) <- c("Molecule_Name", "Source", "Comment", "Counts_Line")	
	v2kTimes$header <- v2kTimes$header + (Sys.time() - t)
	
	t=Sys.time()
	## Atom block
	## format: x y z <atom symbol> 
	ab2matrix <- function(ct=sdf[index["atom",1]:index["atom",2]]) {
		if((index["atom","end"] - index["atom","start"]) < 1) {
			ctma <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
		} else {
			ct <- gsub("^ {1,}", "", ct)
			ctlist <- strsplit(ct, " {1,}")
			ctma <- tryCatch(matrix(unlist(ctlist), 
											ncol=length(ctlist[[1]]), 
											nrow=length(ct), 
											byrow=TRUE), 
								  warning=function(w) NULL)
			# If rows in atom block are of variable length, use alternative/slower approach
			if(length(ctma)==0) { 
			  maxcol <- max(sapply(ctlist, length))
			  ctma <- matrix(0, nrow=length(ct), ncol=maxcol)
			  for(i in seq(along=ctma[,1])) 
				  ctma[i, 1:length(ctlist[[i]])] <- ctlist[[i]]
			}
			myrownames <- paste(ctma[,4], 1:length(ctma[,4]), sep="_")
		   # Ncol <- length(ctlist[[1]])
		   Ncol <- length(ctma[1, , drop = FALSE])
			ctma <- matrix(as.numeric(ctma[,-4]), 
								nrow=length(ct), 
								ncol=Ncol-1, 
								dimnames=list(myrownames, paste("C", c(1:3, 5:Ncol), sep="")))	
		}
		return(ctma)
	}
	atomblock <- ab2matrix(ct=sdf[index["atom",1]:index["atom",2]])
	v2kTimes$atom<- v2kTimes$atom+ (Sys.time() - t)
	
	t=Sys.time()
	## Bond block
	bb2matrix <- function(ct=sdf[index["bond",1]:index["bond",2]]) {
		#if((index["bond","end"] - index["bond","start"]) < 1) {
		if(((index["bond","end"] - index["bond","start"])+1) < 1) {
                        ctma <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
                } else {
                    ct <- gsub("^(...)(...)(...)(...)(...)(...)(...)", "\\1 \\2 \\3 \\4 \\5 \\6 \\7", ct)
                    ct <- gsub("(^..\\d)(\\d)", "\\1 \\2", ct) # Splits bond strings where one or both of the atoms have 3 digit numbers
		    ct <- gsub("^ {1,}", "", ct)
                    ctlist <- strsplit(ct, " {1,}")
                    ctma <- matrix(unlist(ctlist), ncol=length(ctlist[[1]]), nrow=length(ct), byrow=TRUE)
                    Ncol <- length(ctlist[[1]])
                    ctma <- matrix(as.numeric(ctma), nrow=length(ct), ncol=Ncol, dimnames=list(1:length(ct), paste("C", 1:Ncol, sep="")))	
		}
                return(ctma)
	}
	bondblock <- bb2matrix(ct=sdf[index["bond",1]:index["bond",2]])
	v2kTimes$bond<- v2kTimes$bond + (Sys.time() - t)
	
	
	t=Sys.time()
	if(tail2vec==TRUE) {
		extradata <- ex2vec(extradata=sdf[index["extradata",1]:index["extradata",2]])
	} else {	
		extradata <- sdf[index["extradata",1]:index["extradata",2]]
	}
	v2kTimes$data<- v2kTimes$data+ (Sys.time() - t)

	## Assemble components in object of class SDF
	if(datablock==TRUE) {
		sdf <- new("SDF", header=header, atomblock=atomblock, bondblock=bondblock, datablock=extradata)
	} else {
		sdf <- new("SDF", header=header, atomblock=atomblock, bondblock=bondblock)
	}
	return(sdf)
}
## SDF name/value block
ex2vec <- function(extradata) {
                exstart <- grep("^>", extradata)
		if(length(exstart)==0) { 
                        exvec <- vector("character", length=0) 
                } else {
                    names(exstart) <- gsub("^>.*<|>", "", extradata[exstart])
                    exindex <- cbind(start=exstart, end=c(exstart[-1], length(extradata))-1) # Changed 'length(extradata)+1' to 'length(extradata)' on Mar 22, 2014
		    exvec <- sapply(rownames(exindex), function(x) paste(extradata[(exindex[x,1]+1):(exindex[x,2]-1)], collapse=" __ "))
		}
                return(exvec)
	}


findPositions = function(sdf){

	patterns = c("BEGIN CTAB","COUNTS","BEGIN ATOM","END ATOM", 
					 "BEGIN BOND", "END BOND","M  END")

	tagPositions = lapply(patterns,function(pattern) grep(pattern,sdf,fixed=TRUE))
	tagPositions[which(lapply(tagPositions,length)==0)] = -1 # ensure unlist does not remove undefined entries
	tagPositions = unlist(tagPositions)

	unfound = which(tagPositions == -1)
	if(length(unfound) != 0){ # some tags are missing
		warning("malformed SDF V3000 file, could not find tags ",
				  paste("\"",patterns[unfound],"\"",sep="",collapse=","))
	}

	tagPositions

}
v3kTimes = new.env()
v3kTimes$header = 0
v3kTimes$atom = 0
v3kTimes$atomCore = 0
v3kTimes$bond = 0
v3kTimes$data = 0

.parseV3000 <- function(sdf, datablock=TRUE, tail2vec=TRUE,extendedAttributes=FALSE) {
	#message("found V3000 formatted compound")
	sdfLength = length(sdf)

	t=Sys.time()
	tagPositions=findPositions(sdf)


	ctabPos = tagPositions[1]
	if(ctabPos == -1){
		header = sdf[1:4]
	}else
		header = sdf[1:(ctabPos-1)]
	
	if(length(header)==4) 
		names(header) <- c("Molecule_Name", "Source", "Comment", "Counts_Line")	

	countLinePos = tagPositions[2]
	if(countLinePos == -1){
		counts=c(0,0)
	}else{
		counts = as.numeric(cstrsplit(sdf[countLinePos])[c(4,5)])
	}
	#message("class of counts: ",class(counts))
	#print(counts)
	v3kTimes$header <- v3kTimes$header + (Sys.time() - t)


	t=Sys.time()
	#message("parsing atomblock")
	atomPos = tagPositions[3]
	if(atomPos == -1){
		warning("No ATOM block found in ",sdf[1]," returning a dummy atom block")
		atomblock <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
	}else{
		#message("atomPos: ",atomPos," class: ",class(atomPos))
		atomEndPos = tagPositions[4]
		if(atomEndPos == -1){
				warning("Could not find the end of the ATOM block in ",sdf[1])
				atomEndPos = atomPos + 1 #assume and emtpy atom block
		}

		
		data = Reduce(rbind,Map(function(line) {
				parts = cstrsplit(line)
				attrs = if(length(parts) > 8)
								paste(parts[9:length(parts)],collapse=" ")
						  else ""
				c(parts[3:7],attrs)
			}, sdf[(atomPos+1):(atomEndPos-1)]))  #TODO: check for empty range
		#if(extendedAttributes)
		#	extAtomAttrs = parseAttributes(data[,6])
		extAtomAttrs = parseAttributes(data[,6])


		# we need to tranlate certain "Standard" attribute values from v3k back
		# into the v2k format to make things consistent
		#
		# <mass difference>  MASS  v2k: offset from pt v3k: actual value
		# <charge>   CHG           v2k: value of 1-7   v3k: actual charge value
		# <atom stereo parity>  CFG
		# <hydrogen count +1>   HCOUNT
		# <stereo care box>  STBOX
		# <valence>   VAL			v2k: 15 indicates 0  v3: -1 indicates 0
		standardAttrs = matrix(0,nrow(data),7)
		for(i in seq(along=extAtomAttrs)){ # for each atom
			mass = extAtomAttrs[[i]]$MASS
			if(!is.null(mass)){
				massDiff = mass - AW[data[i,2]]
				standardAttrs[i,1] = massDiff
			}
			chg= extAtomAttrs[[i]]$CHG
			if(!is.null(chg)){
				chg= 4 - as.numeric(chg)
				if(chg == 4) # undo shift for 0 values
					chg = 0
				if(chg < 1 || chg > 7) 
					chg = 0 # chg not in [1,7] => 0
				standardAttrs[i,2] = chg 
			}
			cfg = extAtomAttrs[[i]]$CFG
			if(!is.null(cfg)){
				standardAttrs[i,3] = cfg
			}
			hcount = extAtomAttrs[[i]]$HCOUNT
			if(!is.null(hcount)){
				standardAttrs[i,4] = hcount
			}
			stbox = extAtomAttrs[[i]]$STBOX
			if(!is.null(stbox)){
				standardAttrs[i,5] = stbox
			}
			val = extAtomAttrs[[i]]$val
			if(!is.null(val)){
				if(val == -1) # means 0 in v3k
					val = 15   # translate to 0 in v2k 
				standardAttrs[i,6] = val
			}
		}

    	#print(extAtomAttrs)
		atomblock =cbind(data[,3:5],standardAttrs)
		mode(atomblock)="numeric"
		#print(atomblock)
		colnames(atomblock) = paste("C",c(1:3,5:11),sep="")
		rownames(atomblock) = paste(data[,2],data[,1],sep="_")
	}
	v3kTimes$atom<- v3kTimes$atom+ (Sys.time() - t)


	t=Sys.time()
	#message("parsing bond block")
	bondPos = tagPositions[5]
	if(bondPos == -1){
		warning("No BOND block found in ",sdf[1]," returning a dummy bond block")
		bondblock <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
	}else{
		bondEndPos = tagPositions[6]
		if(bondEndPos == -1){
	  		warning("Could not find the end of the BOND block in ",sdf[1])
	  		bondEndPos = bondPos + 1 #assume and emtpy atom block
		}

#t2=Sys.time()
		data = Reduce(rbind,Map(function(line) {
				#cstrsplit(line)[3:6]
				parts = cstrsplit(line)
				attrs=  if(length(parts) > 6)
								paste(parts[7:length(parts)],collapse=" ")
						  else ""
				c(parts[3:6],attrs)
			}, sdf[(bondPos+1):(bondEndPos-1)]))  #TODO: check for empty range
#v3kTimes$atomCore<- v3kTimes$atomCore+ (Sys.time() - t2)
		extBondAttrs = parseAttributes(data[,5])  # +2.6s

		# CFG  bond configuration(stereo)  change: 0 -> 0, 1-> 1, 2-> 4, 3->6
		# empty slot
		# TOPO  same
		# RXCTR same
		standardAttrs = matrix(0,nrow(data),4)  # +0.8s
		for(i in seq(along=extBondAttrs)){  # +1.3s
			cfg=extBondAttrs[[i]]$CFG
			if(!is.null(cfg)){
				#cfg = extBondAttrs[[i]]$CFG
				if(cfg == 2) cfg = 4
				else if(cfg == 3) cfg = 6
				standardAttrs[i,1] = cfg
			}
			topo=extBondAttrs[[i]]$TOPO
			if(!is.null(topo)){
				standardAttrs[i,3] = topo
			}
			rxctr=extBondAttrs[[i]]$rxctr
			if(!is.null(rxctr)){
				standardAttrs[i,4] = rxctr
			}
		}

		bondblock = cbind(data[,c(3,4,2)],standardAttrs) # +0.5s
		#bondblock = data[,c(3,4,2)]
		mode(bondblock)="numeric"
		colnames(bondblock) = paste("C",1:7,sep="")
		rownames(bondblock) = data[,1]
	}
	v3kTimes$bond<- v3kTimes$bond+ (Sys.time() - t)


	#message("parsing extra data")
	t=Sys.time()
	endPos = tagPositions[7]
	if(endPos == -1){
		warning("no END tag found in ",sdf[1])
		extradata = vector("character",length=0)
	}else{
		if(endPos+2 < sdfLength){
			extradata = if(tail2vec==TRUE) ex2vec(sdf[(endPos+1):sdfLength])
							else sdf[(endPos+1):sdfLength]
		}
		else
			extradata = vector("character",length=0)
	}
	v3kTimes$data <- v3kTimes$data + (Sys.time() - t)

	## Assemble components in object of class SDF
	className = "SDF"
	args = list( header=header, atomblock=atomblock, bondblock=bondblock,version="V3000")
	if(datablock==TRUE) 
		args = c(args,datablock=list(extradata))
	
	if(extendedAttributes==TRUE) {
		className = "ExtSDF"
		args = c(args,extendedAtomAttributes =list(extAtomAttrs),
					extendedBondAttributes = list(extBondAttrs) )
	}
	#message("className: ",className," args: ")
	#print(str(args))

	sdf = do.call("new",c(className,args))

	return(sdf)

}
trim <- function(str) gsub("^\\s+|\\s+$","",str)
parseAttributes <- function(attributes){
	#message("attributes: ")
	#print(attributes)

	if(all(attributes=="")){
		return(sapply(seq(along=attributes),function(i) list()))
	}


	starts = gregexpr("(^|\\s)[a-zA-Z0-9]+=",attributes)
	sapply(seq(along=attributes),function(i) {
			 if(attributes[i]=="")
				 list()
			 else{
				 starts[[i]][length(starts[[i]])+1] = nchar(attributes[i])
	#			 message("starts[[",i,"]]:")
	#			 print(starts[[i]])
				 sapply(1:(length(starts[[i]])-1),function(j){
						pair = unlist(strsplit(
											  substring(attributes[i],starts[[i]][j],starts[[i]][j+1])
											  ,"=",fixed=TRUE) )
	#					print(pair)
						if(length(pair) != 2)
							warning("bad key value pair found:
									  ",substring(attributes[i],starts[[i]][j],starts[[i]][j+1]))
						l=list()
						l[[trim(pair[1])]] = trim(pair[2])
						l
				
				})
			 }
	 })
}
## Accessor methods for SDF class
setGeneric(name="sdf2list", def=function(x) standardGeneric("sdf2list"))
setMethod(f="sdf2list", signature="SDF", definition=function(x) {return(list(header=header(x), atomblock=atomblock(x), bondblock=bondblock(x), datablock=datablock(x)))}) 
setGeneric(name="header", def=function(x) standardGeneric("header"))
setMethod(f="header", signature="SDF", definition=function(x) {return(x@header)}) 
setGeneric(name="sdfid", def=function(x, tag=1) standardGeneric("sdfid"))
setMethod(f="sdfid", signature="SDF", definition=function(x, tag=1) {return(x@header[tag])}) 
setGeneric(name="atomblock", def=function(x) standardGeneric("atomblock"))
setMethod(f="atomblock", signature="SDF", definition=function(x) {return(x@atomblock)}) 
setGeneric(name="atomcount", def=function(x, addH=FALSE, ...) standardGeneric("atomcount"))
setMethod(f="atomcount", signature="SDF", definition=function(x, addH=FALSE, ...) {
	if(addH==TRUE) { 
		return(table(c(gsub("_.*", "", rownames(x@atomblock)), rep("H", bonds(x, type="addNH")))))
	} else {
		return(table(gsub("_.*", "", rownames(x@atomblock))))
	}
})
setGeneric(name="bondblock", def=function(x) standardGeneric("bondblock"))
setMethod(f="bondblock", signature="SDF", definition=function(x) {return(x@bondblock)}) 
setGeneric(name="datablock", def=function(x) standardGeneric("datablock"))
setMethod(f="datablock", signature="SDF", definition=function(x) {return(x@datablock)}) 
setGeneric(name="datablocktag", def=function(x, tag) standardGeneric("datablocktag"))
setMethod(f="datablocktag", signature="SDF", definition=function(x, tag) {return(x@datablock[tag])}) 

setGeneric(name="obmol",def=function(x) standardGeneric("obmol"))
setMethod(f="obmol",signature="SDF",definition= function(x) {
	.ensureOB()
	if(is.null(x@obmolRef))
		x@obmolRef = sdf2OBMol(x)
	x@obmolRef
})

## Define print behavior for SDF
setMethod(f="show", signature="SDF",                
   definition=function(object) {
         cat("An instance of ", "\"", class(object), "\"", "\n", sep="")
         cat("\n<<header>>", "\n", sep="")
         print(header(object))
         cat("\n<<atomblock>>", "\n", sep="")
         if(length(atomblock(object)[,1])>=5) {
             print(as.data.frame(rbind(atomblock(object)[1:2,], ...=rep("...", length(atomblock(object)[1,])), 
             atomblock(object)[(length(atomblock(object)[,1])-1):length(atomblock(object)[,1]),])))
         } else {
             print(atomblock(object))}
         cat("\n<<bondblock>>", "\n", sep="")
         if(length(bondblock(object)[,1])>=5) {
             print(as.data.frame(rbind(bondblock(object)[1:2,], ...=rep("...", length(bondblock(object)[1,])), 
             bondblock(object)[(length(bondblock(object)[,1])-1):length(bondblock(object)[,1]),])))
         } else {
             print(bondblock(object))}
         cat("\n<<datablock>> (", length(datablock(object)), " data items)", "\n", sep="")
         if(length(datablock(object))>=5) {
         	print(c(datablock(object)[1:4], "..."))
         } else {
         	print(datablock(object))}
})

## Behavior of "[" operator for SDF
setMethod(f="[", signature="SDF",
	definition=function(x, i, ..., drop) {
		return(sdf2list(x)[i]) 
})

## Replacement method for SDF using "[" operator 
setReplaceMethod(f="[", signature="SDF", definition=function(x, i, j, value) {
	if(i==1) x@header <- value
	if(i==2) x@atomblock <- value 
	if(i==3) x@bondblock <- value
	if(i==4) x@datablock <- value
	if(i=="header") x@header <- value
	if(i=="atomblock") x@atomblock <- value 
	if(i=="bondblock") x@bondblock <- value
	if(i=="datablock") x@datablock <- value
	obmolRef=NULL
	return(x)
})

## Behavior of "[[" operator for SDF
setMethod(f="[[", signature="SDF",
	definition=function(x, i, ..., drop) {
		return(sdf2list(x)[[i]]) 
})

## Replacement method for SDF using "[[" operator 
setReplaceMethod(f="[[", signature="SDF", definition=function(x, i, j, value) {
	if(i==1) x@header <- value
	if(i==2) x@atomblock <- value 
	if(i==3) x@bondblock <- value
	if(i==4) x@datablock <- value
	if(i=="header") x@header <- value
	if(i=="atomblock") x@atomblock <- value 
	if(i=="bondblock") x@bondblock <- value
	if(i=="datablock") x@datablock <- value
	obmolRef=NULL
	return(x)
})

## Coerce Methods for SDF Class 
## Character vector to SDF
setAs(from="character", to="SDF", 
	def=function(from) {
		.sdfParse(sdf=from) 
})

## SDF to list
setAs(from="SDF", to="list", 
	def=function(from) {
		list(header=header(from), atomblock=atomblock(from), bondblock=bondblock(from), datablock=datablock(from))
})

## SDF to SDFstr
setAs(from="SDF", to="SDFstr", 
	def=function(from) {
		new("SDFstr", a=list(as(from, "character")))		
})

## list (w. SDF components) to SDF  
setAs(from="list", to="SDF", 
	def=function(from) {
		new("SDF", bondblock=from$bondblock, header=from$header, atomblock=from$atomblock, datablock=from$datablock)
})

################################################# 
##  ExtSDF
################################################# 

setClass("ExtSDF",representation(extendedAtomAttributes="list",
											extendedBondAttributes="list"),contains="SDF")

setGeneric(name="getAtomAttr", def=function(x,atomId,tag) standardGeneric("getAtomAttr"))
setMethod(f="getAtomAttr", signature="ExtSDF",definition = 
			 function(x,atomId,tag) x@extendedAtomAttributes[[atomId]][[tag]])

setGeneric(name="getBondAttr", def=function(x,bondId,tag) standardGeneric("getBondAttr"))
setMethod(f="getBondAttr", signature="ExtSDF",definition = 
			 function(x,bondId,tag) x@extendedBondAttributes[[bondId]][[tag]])
setMethod(f="show", signature="ExtSDF",                
   definition=function(object) {
		callNextMethod(object)
		nonEmptyAtoms = which(sapply(object@extendedAtomAttributes,
											  function(x) length(x)!=0))
		message("<<Extended Atom Attributes>>")
		count = min(5,length(nonEmptyAtoms))
		for(i in nonEmptyAtoms[seq(1,count,length.out=count)]){
			message("Atom ",i)
			print(object@extendedAtomAttributes[[i]])
		}

		nonEmptyBonds = which(sapply(object@extendedBondAttributes,
											  function(x) length(x)!=0))
		message("<<Extended Bond Attributes>>")
		count = min(5,length(nonEmptyBonds))
		for(i in nonEmptyBonds[seq(1,count,length.out=count)]){
			message("Bond ",i)
			print(object@extendedBondAttributes[[i]])
		}
	})
 
################################################# 
## (3) Class and Method Definitions for SDFset ##
#################################################
setClass("SDFset", representation(SDF="list", ID="character"))

## Store many SDFs in SDFset Object
read.SDFset <- function(sdfstr=sdfstr,skipErrors=FALSE, ...) {
	## If a file name is provided run read.SDFstr function first
	if(is.character(sdfstr)) { 
		sdfstr <- read.SDFstr(sdfstr=sdfstr) 
	}
	## Iterate over SDFstr components	
	sdfset <- lapply(seq(along=sdfstr@a), 
						  function(x){
							  tryCatch(
								  .sdfParse(sdfstr2list(sdfstr)[[x]], ...),
								  error=function(error){
									  if(skipErrors){
										  warning("failed to parse item ",x,": ",error)
										  NA
									  }else
										  stop("failed to parse item ",x,": ",error)
								  }
								)
						  })
	failedToParse <- is.na(sdfset)
	if(sum(failedToParse) != 0)
		warning("Failed to parse input compounds at indexe(s) (",paste(which(failedToParse),collapse=", "),
				  "). These have been removed from the output.")
	sdfset <- sdfset[!failedToParse]
	sdfset <- new("SDFset", SDF=sdfset, ID=paste("CMP", seq(along=sdfset), sep=""))
	## Validity check of SDFs based on atom/bond block column numbers
	badsdf <- sum(!validSDF(sdfset))
	if(sum(badsdf)!=0) warning(paste(c(sum(badsdf), " invalid SDFs detected. To fix, run: valid <- validSDF(sdfset); sdfset <- sdfset[valid]")))
	return(sdfset)
}

## Accessor methods for SDFset class
setGeneric(name="SDFset2list", def=function(x) standardGeneric("SDFset2list"))
setMethod(f="SDFset2list", signature="SDFset", definition=function(x) {
	SDFlist <- x@SDF
	charlist <- lapply(seq(along=SDFlist), function(x) as(SDFlist[[x]], "list"))
	names(charlist) <- x@ID
	return(charlist)
}) 
setGeneric(name="SDFset2SDF", def=function(x) standardGeneric("SDFset2SDF"))
setMethod(f="SDFset2SDF", signature="SDFset", definition=function(x) { tmp <- x@SDF; names(tmp) <- x@ID; return(tmp)}) 
setMethod(f="header", signature="SDFset", definition=function(x) { return(lapply(SDFset2SDF(x), header))}) 
setMethod(f="sdfid", signature="SDFset", definition=function(x, tag=1) {return(as.vector(sapply(SDFset2SDF(x), sdfid, tag)))}) 
setGeneric(name="cid", def=function(x) standardGeneric("cid"))
setMethod(f="cid", signature="SDFset", definition=function(x) {return(x@ID)}) 
setMethod(f="atomblock", signature="SDFset", definition=function(x) {return(lapply(SDFset2SDF(x), atomblock))}) 
setMethod(f="atomcount", signature="SDFset", definition=function(x, addH, ...) {
	atomcounts <- lapply(SDFset2SDF(x), function(y) atomcount(y, addH, ...))
	return(atomcounts)
}) 
setMethod(f="bondblock", signature="SDFset", definition=function(x) {return(lapply(SDFset2SDF(x), bondblock))}) 
setMethod(f="datablock", signature="SDFset", definition=function(x) {return(lapply(SDFset2SDF(x), datablock))}) 
setMethod(f="datablocktag", signature="SDFset", definition=function(x, tag) {return(as.vector(sapply(SDFset2SDF(x), datablocktag, tag)))}) 
setMethod(f="obmol",signature="SDFset",definition= function(x) lapply(SDFset2SDF(x),obmol))

## Replacement method for SDF component of SDFset using accessor methods
setGeneric(name="SDFset2SDF<-", def=function(x, value) standardGeneric("SDFset2SDF<-"))
setReplaceMethod(f="SDFset2SDF", signature="SDFset", definition=function(x, value) {
	x@SDF <- value 
	return(x)
})

## Replacement method for ID component of SDFset using accessor methods
setGeneric(name="cid<-", def=function(x, value) standardGeneric("cid<-"))
setReplaceMethod(f="cid", signature="SDFset", definition=function(x, value) {
	x@ID <- value 
	if(any(duplicated(x@ID))) { 
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(sdfset) <- makeUnique(cid(sdfset))")
	}
	return(x)
})

## Replacement method for SDFset using "[" operator 
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="SDFset", definition=function(x, i, j, value) {
	x@SDF[i] <- SDFset2SDF(value)
	return(x)
})

## Behavior of "[" operator for SDFset 
setMethod(f="[", signature="SDFset", definition=function(x, i, ..., drop) {
	if(is.logical(i)) {
                i <- which(i)
        }
        if(is.character(i)) { 
		ids <-seq(along=x@ID); names(ids) <- x@ID
		i <- ids[i]
	}
	x@SDF <- x@SDF[i]                   
	x@ID <- x@ID[i]                   
	if(any(duplicated(i))) {
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(sdfset) <- makeUnique(cid(sdfset))")
	}
	return(x)
})

## Replacement method for SDFset using "[" operator 
setReplaceMethod(f="[", signature="SDFset", definition=function(x, i, value) {
	x@SDF[i] <- value
	return(x)
})

## Replacement method for SDFset using "[[" operator 
setReplaceMethod(f="[[", signature="SDFset", definition=function(x, i, value) {
	x@SDF[[i]] <- value
	return(x)
})

## Behavior of "[[" operator for SDFset to convert single SDFset component to SDF 
setMethod(f="[[", signature="SDFset", definition=function(x, i, ..., drop) {
	if(is.character(i)) { i <- which(x@ID %in% i) }
	return(x@SDF[[i]])                 
})

## Batch replacement of header, atomblock, bondblock and datablock sections for
## one to all SDF objects in an SDFset.
.gsubsdfsec <- function(sdfset, what, secdata) {
	if(class(secdata) %in% c("data.frame", "matrix")) { 
		secdata <- as.matrix(secdata)
		tmp <- as.list(seq(along=secdata[,1]))
		for(i in seq(along=tmp)) {
			vec <- as.character(secdata[i,])
			names(vec) <- colnames(secdata)
			tmp[[i]] <- vec
		}
		secdata <- tmp
	}
	if(length(sdfset) != length(secdata)) { stop("The length of the two data sets needs to match.") }
	sdfsec <- list(header=header(sdfset), atomblock=atomblock(sdfset), bondblock=bondblock(sdfset), datablock=datablock(sdfset))
	sdfsec[[what]] <- secdata
	sdflist <- lapply(seq(along=sdfsec[[1]]), function(x) as(list(header=sdfsec$header[[x]], atomblock=sdfsec$atomblock[[x]], bondblock=sdfsec$bondblock[[x]], datablock=sdfsec$datablock[[x]]), "SDF")) 
	names(sdflist) <- cid(sdfset)
	return(as(sdflist, "SDFset"))
}

setGeneric(name="header<-", def=function(x, value) standardGeneric("header<-"))
setReplaceMethod(f="header", signature="SDFset", definition=function(x, value) {
	sdfset <- .gsubsdfsec(sdfset=x, what=1, secdata=value) 
	return(sdfset)
})

setGeneric(name="atomblock<-", def=function(x, value) standardGeneric("atomblock<-"))
setReplaceMethod(f="atomblock", signature="SDFset", definition=function(x, value) {
	sdfset <- .gsubsdfsec(sdfset=x, what=2, secdata=value) 
	return(sdfset)
})

setGeneric(name="bondblock<-", def=function(x, value) standardGeneric("bondblock<-"))
setReplaceMethod(f="bondblock", signature="SDFset", definition=function(x, value) {
	sdfset <- .gsubsdfsec(sdfset=x, what=3, secdata=value) 
	return(sdfset)
})

setGeneric(name="datablock<-", def=function(x, value) standardGeneric("datablock<-"))
setReplaceMethod(f="datablock", signature="SDFset", definition=function(x, value) {
	sdfset <- .gsubsdfsec(sdfset=x, what=4, secdata=value) 
	return(sdfset)
})


## Length function
setMethod(f="length", signature="SDFset",
    definition=function(x) {
    	return(length(SDFset2SDF(x)))
})

## Print behavior for SDFset
setMethod(f="show", signature="SDFset",
	definition=function(object) { 
		cat("An instance of ", "\"", class(object), "\"", " with ", length(object), " molecules", "\n", sep="")
		if(any(duplicated(object@ID))) {
			warning("The values in the CMP ID slot are not unique. To fix this, run: cid(sdfset) <- makeUnique(cid(sdfset))")
		}
})

## Concatenate function for SDFset
## Note: is currently limited to 2 arguments!
setMethod(f="c", signature="SDFset", definition=function(x, y) {
	sdflist1 <- as(x, "SDF")
	sdflist2 <- as(y, "SDF")
	sdflist <- c(sdflist1, sdflist2)
	sdfset <- as(sdflist, "SDFset")
	if(any(duplicated(cid(sdfset)))) {
		warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
	}
	return(sdfset)
})

## Convert SDF to SDFstr Class for Export to File
## Function allows to customize output via optional arguments 
setGeneric(name="sdf2str", def=function(sdf, head, ab, bb, db, cid=NULL, sig=FALSE, ...) standardGeneric("sdf2str"))
setMethod(f="sdf2str", signature="SDF", definition = function(sdf, head, ab, bb, db, cid=NULL, sig=FALSE, ...) {
	## Checks
	if(class(sdf)!="SDF") stop("Function expects molecule object of class SDF as input!")	
	
	## Header
	if(missing(head)) {
		head <- as.character(sdf[[1]])
		if(sig==TRUE) head[2] <- paste("ChemmineR-", format(Sys.time(), "%m%d%y%H%M"), "XD", sep="")	
		if(length(cid)==1) head[1] <- cid
	}
	
	## Atom block
	if(missing(ab)) {
		ab <- sdf[[2]]
		#ab <- cbind(Indent="", format(ab[,1:3], width=9, justify="right"), A=format(gsub("_.*", "", rownames(ab)), width=1, justify="right"), Space="", format(ab[,-c(1:3)], width=2, justify="right")) # Changed on 26-Aug-13
		ab <- cbind(Indent="", format(ab[,1:3], width=9, justify="right"), A=format(gsub("_.*", "", rownames(ab)), width=1, justify="left"),  Space="", format(ab[,-c(1:3)], width=2, justify="right"))
		ab <- sapply(seq(along=ab[,1]), function(x) paste(ab[x, ], collapse=" "))
	}

	## Bond block
	if(missing(bb)) {
		bb <- sdf[[3]]
		bb <- cbind(Indent="", format(bb, width=3, justify="right"))
		bb <- sapply(seq(along=bb[,1]), function(x) paste(bb[x, ], collapse=""))
	}

	## Data block
	if(missing(db)) {
		db <- sdf[[4]]
		if(length(db)>0) {
			dbnames <- paste("> <", names(db), ">", sep="")
			dbvalues <- as.character(db)
			db <- as.vector(rbind(dbnames, dbvalues, ""))
		} else {
			db <- NULL
		}
	}
	
	## Assemble in character vector
	sdfstrvec <- c(head, ab, bb, "M  END", db, "$$$$")
        return(sdfstrvec)
})

## Coerce Methods for SDFset Class 
## SDFset to list with lists of SDF sub-components
setAs(from="SDFset", to="list", 
	def=function(from) {
		SDFset2list(from)
})

## SDFset to list with many SDF objects (for summary view)
setAs(from="SDFset", to="SDF", 
	def=function(from) {
		SDFset2SDF(from)
})
setGeneric(name="view", def=function(x) standardGeneric("view"))
setMethod(f="view", signature="SDFset", definition=function(x) { as(x, "SDF") })

## SDFstr to SDFset
setAs(from="SDFstr", to="SDFset", 
	def=function(from) {
		read.SDFset(sdfstr=from) 
})

## SDF to SDFset of length one
setAs(from="SDF", to="SDFset", 
	def=function(from) {
		new("SDFset", SDF=list(from), ID="CMP") 
})

## SDF to SDFstr
setAs(from="SDF", to="SDFstr", 
	def=function(from) {
		as(as(from, "SDFset"), "SDFstr") 
})

## SDFset to character for SDFstr coercion
setAs(from="SDF", to="character", 
	def=function(from) {
       		sdf2str(sdf=from)
})

## SDFset to SDFstr
setAs(from="SDFset", to="SDFstr", 
	def=function(from) {
		from <- lapply(seq(along=from), function(x) as(from[[x]], "character"))
		new("SDFstr", a=from)		
})

## List of SDFs to SDFset
setAs(from="list", to="SDFset", 
	def=function(from) {
		new("SDFset", SDF=from, ID=names(from))
})

## User interface to SDFset() constructor
SDFset <- function(SDFlist=list(), ID=character()) {
	new("SDFset", SDF=SDFlist, ID=ID)
}

setClass("SDFset", representation(SDF="list", ID="character"))

#########################################################
## (4) Class and Method Definitions for SMI and SMIset ##
#########################################################
## Define SMI and SMIset classes
setClass("SMI", representation(smiles="character"))
setClass("SMIset", representation(smilist="list"))

## Methods to return SMI/SMIset objects as character vector or list, respectively
setMethod(f="as.character", signature="SMI", definition=function(x) {return(x@smiles)})
setMethod(f="as.character", signature="SMIset", definition=function(x) {return(unlist(x@smilist))})

## Constructor methods
## Character vector to FP with: as(myvector, "SMI")
setAs(from="character", to="SMI",  
        def=function(from) {
		new("SMI", smiles=from)
})

## List to SMIset with: as(mylist, "SMIset")
setAs(from="list", to="SMIset",  
        def=function(from) {
		new("SMIset", smilist=from)
})

## Character to SMIset with: as(mychar, "SMIset")
setAs(from="character", to="SMIset",  
        def=function(from) {
		new("SMIset", smilist=as.list(from))
})

## Accessor methods for SMIset
## Note: generic for cid() is defined under SDFset class section.
setMethod(f="cid", signature="SMIset", definition=function(x) { return(names(x@smilist)) })

## Replacement method for ID component of SMIset using accessor methods.
setReplaceMethod(f="cid", signature="SMIset", definition=function(x, value) {
	names(x@smilist) <- value 
	if(any(duplicated(names(x@smilist)))) { 
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(smiset) <- makeUnique(cid(smiset))")
	}
	return(x)
})

## Replacement method for SMIset using "[" operator 
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="SMIset", definition=function(x, i, value) {
	x@smilist[i] <- value
	names(x@smilist) <- names(value)
	return(x)
})

## Behavior of "[" operator for SMIset 
setMethod(f="[", signature="SMIset", definition=function(x, i, ..., drop) {
	if(is.logical(i)) {
                i <- which(i)
        }
	x@smilist <- x@smilist[i]                   
	if(any(duplicated(i))) {
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(smiset) <- makeUnique(cid(smiset))")
	}
	return(x)
})

## Behavior of "[[" operator for SMIset to convert single SMIset component to SMI
setMethod(f="[[", signature="SMIset", definition=function(x, i, ..., drop) {
	if(is.character(i)) { i <- which(names(x@fpma) %in% i) }
	return(new("SMI", smiles=x@smilist[[i]]))                 
})

## Length function
setMethod(f="length", signature="SMIset",
    definition=function(x) {
    	return(length(as.character(x)))
})

## Define print behavior for SMIset
setMethod(f="show", signature="SMIset", 
	definition=function(object) {    
		cat("An instance of ", "\"", class(object), "\"", " with ", length(object), " molecules", "\n", sep="")
		if(any(duplicated(names(object@smilist)))) {
			warning("The values in the CMP ID slot are not unique. To fix this, run: cid(smiset) <- makeUnique(cid(smiset))")
		}
})

## Concatenate function for SMIset
## Note: is currently limited to 2 arguments!
setMethod(f="c", signature="SMIset", definition=function(x, y) {
	smilist1 <- as.list(x)
	smilist2 <- as.list(y)
	smilist <- c(smilist1, smilist2)
	smiset <- as(smilist, "SMIset")
	if(any(duplicated(cid(smiset)))) {
		warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
	}
	return(smiset)
})

## Define print behavior for SMI
setMethod(f="show", signature="SMI",                
   definition=function(object) {
         cat("An instance of ", "\"", class(object), "\"", "\n", sep="")
         print(object@smiles)
})

## SMIset to list with many SMI objects (for summary view)
setAs(from="SMIset", to="SMI", 
        def=function(from) {
                tmp <- lapply(seq(along=from), function(x) from[[x]])
		names(tmp) <- cid(from)
		return(tmp)
})
setMethod(f="view", signature="SMIset", definition=function(x) { as(x, "SMI") })
# view(smiset)

## Import SMILES Files and Store them as SMIset Objects
read.SMIset <- function(file, removespaces=TRUE, ...) {
	smisettmp <- readLines(file, ...)
	if(removespaces==TRUE) smisettmp <- gsub(" {1,}", "", smisettmp)
	## Add compound names where they are missing
	index <- !grepl("\t.{1,}$", smisettmp)
	if(any(index)) smisettmp[index] <- paste(smisettmp[index], "\t", "CMP", which(index), sep="")
	## Construct and return SMIset object
	smiset <- gsub("\t.*$", "", smisettmp)
	names(smiset) <- gsub("^.*\t", "", smisettmp)
        smiset <- as(smiset, "SMIset")
        return(smiset)
}

## Write SMI/SMIset Objects to File 
write.SMI <- function(smi, file, cid=TRUE, ...) {
	if(class(smi)=="SMI") smima <- cbind(as.character(smi), "CMP1")
	if(class(smi)=="SMIset") smima <- cbind(as.character(smi), cid(smi), ...)
	if(cid==TRUE) write.table(smima, file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names = FALSE) 
	if(cid==FALSE) write.table(smima[ , 1, drop = TRUE], file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names = FALSE) 
} 

#######################################################
## (5) Class and Method Definitions for AP and APset ##
#######################################################
## Function to coerce SDF class to old non-S4 AP CMP object
SDF2apcmp <- function(SDF) { 
	atoms <- gsub("_.*", "", rownames(atomblock(SDF)))
	u <- as.numeric(bondblock(SDF)[,1])
	n_atoms <- length(atoms)
	n_bonds <- length(u)
	v <- as.numeric(bondblock(SDF)[,2])
	t <- as.numeric(bondblock(SDF)[,3])

	cmp = list(atoms=atoms, bonds=list(u=u, v=v, t=t), n_atoms=n_atoms, n_bonds=n_bonds)

	# assume we have an SDF object, not an SDFset
	sdfstr=as(as(SDF,"SDFstr"),"list")[[1]]
	defs = paste(sdfstr,collapse="\n")
	#defs = unlist(Map(function(x) paste(x,collapse="\n"), sdfstrList) )
	d <- Descriptors()
	if (Descriptors_parse_sdf(self=d, sdf=defs) == 0) {
		stop("SDF format not available or SDF not well-formatted!")
		return(list(n_atoms=0, n_bonds=0, desc_obj=NULL))
	}
	cmp$desc_obj=d

	return(cmp)
}

## Define AP/APset S4 classes for single AP vector and AP list
setClass("AP", representation(AP="numeric"))
setClass("APset", representation(AP="list", ID="character"))

## Create instance of APset form SDFset 
sdf2ap <- function(sdfset, type="AP",uniquePairs=TRUE) {
        if(!class(sdfset) %in% c("SDF", "SDFset")) stop("Functions expects input of classes SDF or SDFset.")
        if(class(sdfset)=="SDF") {
	         if(type=="AP") {
                	return(new("AP",
										AP=genAPDescriptors(sdfset,uniquePairs)))
        	   }
	         if(type=="character") {
                	return(paste(genAPDescriptors(sdfset,uniquePairs), collapse=", "))
        	   }
	     }
        if(class(sdfset)=="SDFset") {
                aplist <- as.list(seq(along=sdfset))
                exception <- FALSE
                for(i in seq(along=aplist)) {
                        tmp <- try(genAPDescriptors(sdfset[[i]],uniquePairs))
                        if(length(tmp) > 0 & class(tmp)!="try-error") {
                                aplist[[i]] <- tmp
                        } else if(length(tmp) == 0 & class(tmp)!="try-error") {
                                aplist[[i]] <- 0 # Value to use if no atom pairs are returned 
                                exception <- TRUE
                        } else if(class(tmp)=="try-error") {
                                aplist[[i]] <- 1 # Value to use if error is returned 
                                exception <- TRUE
                        }
                }
		if(exception) {
                        warning("One or more compounds failed to return APs. To identify them, run: \n\t which(sapply(as(apset, \"list\"), length)==1)")
		}
                if(type=="AP") {
                	return(new("APset", AP=aplist, ID=cid(sdfset)))
        	}
		if(type=="character") {
			names(aplist) <- cid(sdfset)
                	return(sapply(aplist, paste, collapse=", "))
        	}
        }
}

## Create AP Fingerprints
desc2fp <- function(x, descnames=1024, type="FPset") {
	if(length(descnames) == 1) {
		data(apfp)
		descnames <- as.character(apfp$AP)[1:descnames]
	}	
	if(class(x)=="APset") { 
        	apfp <- matrix(0, nrow=length(x), ncol=length(descnames), dimnames=list(cid(x), descnames))
		apsetlist <- ap(x)
                for(i in cid(x)) apfp[i, descnames %in% as.character(apsetlist[[i]])] <- 1
        } else if(class(x)=="list") {
        	apfp <- matrix(0, nrow=length(x), ncol=length(descnames), dimnames=list(names(x), descnames))
		for(i in names(x)) apfp[i, descnames %in% as.character(x[[i]])] <- 1
	} else {
		stop("x needs to be of class APset or list")
	}
        if(type=="FPset") {
                #return(as(apfp, "FPset"))
                return(new("FPset",fpma=apfp,type="apfp"))
        }
        if(type=="matrix") {
                return(apfp)
        }
        if(type=="character") {
                return(sapply(rownames(apfp), function(x) paste(apfp[x,], collapse="")))
        }
}
## Usage: 
# apfpset <- desc2fp(x=apset, descnames=1024, type="matrix")

## Accessor methods for APset class
setGeneric(name="ap", def=function(x) standardGeneric("ap"))

setMethod(f="ap", signature="AP", definition=function(x) { return(x@AP) })
setMethod(f="ap", signature="APset", definition=function(x) { tmp <- x@AP; names(tmp) <- x@ID; return(tmp) })
setMethod(f="cid", signature="APset", definition=function(x) { return(x@ID) })

## Replacement method for ID component of APset using accessor methods.
## Note: generic for cid() is defined under SDFset class section.
setReplaceMethod(f="cid", signature="APset", definition=function(x, value) {
	x@ID <- value 
	if(any(duplicated(x@ID))) { 
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(apset) <- makeUnique(cid(apset))")
	}
	return(x)
})

## Replacement method for APset using "[" operator 
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="APset", definition=function(x, i, j, value) {
	x@AP[i] <- ap(value)
	x@ID[i] <- cid(value)
	return(x)
})

## Behavior of "[" operator for APset 
setMethod(f="[", signature="APset", definition=function(x, i, ..., drop) {
	if(is.logical(i)) {
                i <- which(i)
        }
	if(is.character(i)) { 
		ids <- seq(along=x@ID)
		names(ids) <- x@ID
		i <- ids[i]
	}
	x@AP <- x@AP[i]                   
	x@ID <- x@ID[i]                   
	if(any(duplicated(i))) {
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(apset) <- makeUnique(cid(apset))")
	}
	return(x)
})

## Replacement method for APset using "[[" operator 
setReplaceMethod(f="[[", signature="APset", definition=function(x, i, value) {
	x@AP[[i]] <- value
	return(x)
})

## Behavior of "[[" operator for APset to convert single APset component to AP
setMethod(f="[[", signature="APset", definition=function(x, i, ..., drop) {
	if(is.character(i)) { i <- which(x@ID %in% i) }
	return(new("AP", AP=x@AP[[i]]))                 
})

## Length function
setMethod(f="length", signature="APset",
    definition=function(x) {
    	return(length(ap(x)))
})

## Print behavior for APset
setMethod(f="show", signature="APset",
	definition=function(object) { 
		cat("An instance of ", "\"", class(object), "\"", " with ", length(object), " molecules", "\n", sep="")
		if(any(duplicated(object@ID))) {
			warning("The values in the CMP ID slot are not unique. To fix this, run: cid(apset) <- makeUnique(cid(apset))")
		}
})

## Concatenate function for APset
## Note: is currently limited to 2 arguments!
setMethod(f="c", signature="APset", definition=function(x, y) {
	aplist1 <- as(x, "list")
	aplist2 <- as(y, "list")
	aplist <- c(aplist1, aplist2)
	apset <- as(aplist, "APset")
	if(any(duplicated(cid(apset)))) {
		warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
	}
	return(apset)
})

## Define print behavior for AP
setMethod(f="show", signature="AP",                
   definition=function(object) {
         cat("An instance of ", "\"", class(object), "\"", "\n", sep="")
         cat("<<atom pairs>>", "\n", sep="")
         if(length(object@AP)>=5) {
             cat(c(object@AP[1:5], "... length:", length(object@AP), "\n"))
         } else {
             print(object@AP)}
})

## Coerce Methods for APset Class
## APset to list with many AP objects
setAs(from="APset", to="list",
        def=function(from) {
                ap(from)
})

## List of APs to APset
setAs(from="list", to="APset", 
        def=function(from) {
                new("APset", AP=from, ID=names(from))
})

## APset to list with many AP objects (for summary view)
setAs(from="APset", to="AP", 
        def=function(from) {
                tmp <- lapply(seq(along=from), function(x) from[[x]])
		names(tmp) <- cid(from)
		return(tmp)
})
setMethod(f="view", signature="APset", definition=function(x) { as(x, "AP") })
# view(apset)

## Coerce APset to old list-style descriptor database used by search/cluster functions
apset2descdb <- function(apset) {
                list(descdb=ap(apset), cids=cid(apset), sdfsegs=NULL, source="SDFset", type="SDFset")
}

#######################################################
## (6) Class and Method Definitions for FP and FPset ##
#######################################################
## Define FP and FPset classes
setClass("FP", representation(fp="numeric",type="character",foldCount="numeric"),
			prototype=list(foldCount=0))
setMethod(f="initialize", signature="FP",definition= function(.Object, ...){
			  obj <- callNextMethod(.Object, ...)
			  if(length(obj@type)==0)
				  obj@type=paste("unknown",as.integer(runif(1)*10000),sep="-")
			  obj
			})

setClass("FPset", representation(fpma="matrix",type="character",foldCount="numeric"),
			prototype=list(foldCount=0))
setMethod(f="initialize", signature="FPset",definition= function(.Object, ...){
			  obj <- callNextMethod(.Object, ...)
			  if(length(obj@type)==0)
				  obj@type=paste("unknown",as.integer(runif(1)*10000),sep="-")
			  obj
			})

## Methods to return FPSet as vector or matrix, respectively
setMethod(f="as.vector", signature="FP", definition=function(x) {return(x@fp)})
setMethod(f="as.numeric", signature="FP", definition=function(x) {return(x@fp)})
setMethod(f="as.character", signature="FP", definition=function(x) {return(paste(x@fp, collapse=""))})
setMethod(f="as.matrix", signature="FPset", definition=function(x) {return(x@fpma)})
setMethod(f="as.character", signature="FPset", definition=function(x) {sapply(rownames(x@fpma), function(y) paste(x@fpma[y,], collapse=""))})

## Constructor methods
## Numeric vector to FP with: as(myvector, "FP")
setAs(from="numeric", to="FP",  
        def=function(from) {
		new("FP", fp=from)
})
## Matrix to FPset with: as(mymatrix, "FPset")
setAs(from="matrix", to="FPset",  
        def=function(from) {
		new("FPset", fpma=from)
})

## Character to FPset with: as(mychar, "FPset")
setAs(from="character", to="FPset",  
        def=function(from) {
		read.AP(from, type="fp")
})

## Accessor methods for FPset
## Note: generic for cid() is defined under SDFset class section.
setMethod(f="cid", signature="FPset", definition=function(x) { return(rownames(x@fpma)) })

## Replacement method for ID component of FPset using accessor methods.
setReplaceMethod(f="cid", signature="FPset", definition=function(x, value) {
	rownames(x@fpma) <- value 
	if(any(duplicated(rownames(x@fpma)))) { 
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(fpset) <- makeUnique(cid(fpset))")
	}
	return(x)
})

foldVector <- function(v,count=1,bits=NULL){
	 len = length(v)
	 if(len%%2!=0)
		 stop("must have an even number of bits to fold")

	 if(!is.null(bits))
		 count=foldsNeeded(len,bits)

	 #message("count: ",count," len: ",len)
	 realCount=0
	 for(i in rep(0,count)){
		 if(len==1)
			 break
		 mid = len/2
		 #message("mid: ",mid)
		 v = mapply(function(a,b) if(a||b)1 else 0,v[1:mid],v[(mid+1):len])
		 len = length(v)
		 realCount=realCount+1
	 }
	 if(count!=realCount)
		 warning(count," folds requested but could only fold ",realCount," times")
	 list(fp=v,actualFoldCount=realCount)
}
foldsNeeded <- function(length,bits){
	count = log2(length) - log2(bits)
	if(count != as.integer(count))
		stop("not possible to achieve ",bits," with an integral number of folds")
	count
}
setGeneric(name="fold", def=function(x,count=1,bits=NULL) standardGeneric("fold"))
setMethod(f="fold",signature="FP", definition=function(x,count,bits){
			 result = foldVector(x@fp,count,bits)
			 x@fp = result$fp
			 x@foldCount = x@foldCount+result$actualFoldCount
			 x
})
setMethod(f="fold",signature="FPset", definition=function(x,count,bits) {
			 actualFolds=0
			 data = apply(x@fpma,c(1),function(y){
								 result = foldVector(y,count,bits)
								 actualFolds <<- result$actualFoldCount #just keep the last one, they should all be the same
								 result$fp	 })
			 if(class(data) != "matrix") # account for case when fp gets folded down to 1
				 dim(data) = c(nrow(x@fpma),1)
			 else
				 data = t(data)
			 x@fpma = data
			 x@foldCount = x@foldCount+actualFolds
			 x
})

setGeneric(name="fptype", def=function(x) standardGeneric("fptype"))
setMethod(f="fptype", signature="FP",definition=function(x) x@type)
setMethod(f="fptype", signature="FPset",definition=function(x) x@type)

setGeneric(name="foldCount", def=function(x) standardGeneric("foldCount"))
setMethod(f="foldCount", signature="FP",definition=function(x) x@foldCount)
setMethod(f="foldCount", signature="FPset",definition=function(x) x@foldCount)

setGeneric(name="numBits", def=function(x) standardGeneric("numBits"))
setMethod(f="numBits", signature="FP",definition=function(x) length(x@fp))
setMethod(f="numBits", signature="FPset",definition=function(x) ncol(x@fpma))

## Replacement method for FPset using "[" operator 
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="FPset", definition=function(x, i, value) {
	x@fpma[i,] <- value
	return(x)
})

## Behavior of "[" operator for FPset 
setMethod(f="[", signature="FPset", definition=function(x, i, ..., drop) {
	if(is.logical(i)) {
                i <- which(i)
        }
	x@fpma <- x@fpma[i, , drop=FALSE]                   
	if(any(duplicated(i))) {
		warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(fpset) <- makeUnique(cid(fpset))")
	}
	return(x)
})

## Behavior of "[[" operator for FPset to convert single FPset component to FP
setMethod(f="[[", signature="FPset", definition=function(x, i, ..., drop) {
	if(is.character(i)) { i <- which(rownames(x@fpma) %in% i) }
	return(new("FP", fp=x@fpma[i,]))                 
})

## Length function
setMethod(f="length", signature="FPset",
    definition=function(x) {
    	return(length(as.matrix(x)[,1]))
})

## Define print behavior for FPset
setMethod(f="show", signature="FPset", 
	definition=function(object) {    
	cat("An instance of a ", length(object@fpma[1,]), " bit ", "\"", class(object), 
		 "\" of type \"",object@type,"\" with ",
		 length(object@fpma[,1]), " molecules", "\n", sep="")
})

setMethod(f="c",signature="FP",definition=function(x,...){
	args = list(x,...)

	type=NULL
	length=0
	fpma = t(sapply(args,function(obj){
		if(!inherits(obj,"FP"))
			stop("all arguments must be of type FP")

		if(is.null(type))
			type<<-obj@type
		if(length==0)
			length <<- numBits(obj)

		if(obj@type != type)
			stop("found differing types: ",obj@type," and ",type)
		if(numBits(obj) != length)
			stop("found differing number of bits: ",numBits(obj)," and ",length)

		obj@fp
	}))
	fpset=new("FPset",fpma=fpma,type=type)
	if(any(duplicated(cid(fpset)))) 
		warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
	fpset
})
## Concatenate function for FPset
setMethod(f="c", signature="FPset", definition=function(x, ...) {
	args = list(x,...)
	type=NULL
	length=0
	fpma = Reduce(rbind,Map(function(obj){
		if(!inherits(obj,"FPset"))
			stop("all arguments must be of type FPset")

		if(is.null(type))
			type<<-obj@type
		if(length==0)
			length <<- numBits(obj)

		if(obj@type != type)
			stop("found differing types: ",obj@type," and ",type)
		if(numBits(obj) != length)
			stop("found differing number of bits: ",numBits(obj)," and ",length)

		obj@fpma
	},args))
	fpset=new("FPset",fpma=fpma,type=type)
	if(any(duplicated(cid(fpset)))) 
		warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")

	fpset
})

## Define print behavior for FP
setMethod(f="show", signature="FP",                
   definition=function(object) {
         cat("An instance of ", "\"", class(object), "\" of type \"",object@type, "\"\n", sep="")
         cat("<<fingerprint>>", "\n", sep="")
         if(length(object@fp)>=20) {
             cat(c(object@fp[1:20], "... length:", length(object@fp), "\n"))
         } else {
             print(object@fp)}
})

## FPset to list with many FP objects (for summary view)
setAs(from="FPset", to="FP", 
        def=function(from) {
                tmp <- lapply(seq(along=from), function(x) from[[x]])
		names(tmp) <- cid(from)
		return(tmp)
})
setMethod(f="view", signature="FPset", definition=function(x) { as(x, "FP") })
# view(fpset)

###################
## (7) Utilities ##
###################

#################################################
## (6.1) Detect Invalid SDFs in SDFset Objects ##
#################################################
validSDF <- function(x, Nabcol = 3, Nbbcol = 3, logic="&", checkNA=TRUE) {
	if(class(x)!="SDFset") 
		warning("x needs to be of class SDFset")
	ab <- atomblock(x); 
	abcol <- sapply(names(ab), function(x) length(ab[[x]][1,]))
	bb <- bondblock(x); 
	bbcol <- sapply(names(bb), function(x) length(bb[[x]][1,]))
	if(logic=="|") 
		validsdf <- abcol >= Nabcol | bbcol >= Nbbcol 
	if(logic=="&") 
		validsdf <- abcol >= Nabcol & bbcol >= Nbbcol 
	if(checkNA==TRUE) {
        	abNA <- sapply(names(ab), function(x) !any(is.na(ab[[x]])))
        	bbNA <- sapply(names(bb), function(x) !any(is.na(bb[[x]])))
		validsdf <- validsdf & abNA & bbNA
	}
	return(validsdf)
}

######################################################################
## (6.2) Create Unique CMP Names by Appending a Counter to Duplates ##
######################################################################
makeUnique <- function(x, silent=FALSE) {
	if(all(!duplicated(x))) {
		if(silent!=TRUE) print("No duplicates detected!")
		return(x)
	} else {
		count <- table(x); count <- count[x]
		dupids <- count[count>1]; dupids <- dupids[!duplicated(names(dupids))]
		for(i in seq(along=dupids)) {
			names(count)[names(count) %in% names(dupids[i])] <- paste(names(dupids)[i], seq(1, dupids[i]), sep="_")
		}
		if(silent!=TRUE) print(paste("Counter appended to", length(dupids), "duplicates!"))
		return(names(count))
	}
}

###############################
## (6.3) Molecule Properties ##
###############################
## (6.3.1) Atom count matrix
atomcountMA <- function(x, ...) {
        if(class(x)=="SDF") x <- as(x, "SDFset")
	atomcountlist <- atomcount(x, ...) 	

	columns <- unique(unlist(lapply(seq(along=atomcountlist), function(x) names(atomcountlist[[x]]))))
        myMA <- matrix(NA, length(atomcountlist), length(columns), dimnames=list(NULL, columns))
        for(i in seq(along=atomcountlist)) myMA[i, names(atomcountlist[[i]])] <- atomcountlist[[i]]
	myMA[is.na(myMA)] <- 0
	rownames(myMA) <- names(atomcountlist)
	return(myMA)
}

## (6.3.2) Molecular weight (MW data from http://iupac.org/publications/pac/78/11/2051/)
data(atomprop); atomprop=atomprop
MW <- function(x, mw=atomprop, ...) {
   if(class(x)=="SDF") x <- as(x, "SDFset")
	
	## Create MW vector with atom symbols in name slot
	AW <- mw$Atomic_weight
	names(AW) <- mw$Symbol

	## Calculate MW
	propma <- atomcountMA(x, ...)
	MW <- rowSums(t(t(propma) * AW[colnames(propma)]), na.rm = TRUE) 
	return(MW)
}

## (6.3.3) Molecular formula
MF <- function(x, ...) {
        if(class(x)=="SDF") x <- as(x, "SDFset")
	propma <- atomcountMA(x, ...)
	propma <- propma[c(1, seq(along=propma[,1])),] # Duplicates first row to support processing of single molecule with same code
	hillorder <- colnames(propma); names(hillorder) <- hillorder
	hillorder <- na.omit(unique(hillorder[c("C", "H", sort(hillorder))]))
	propma <- propma[, hillorder]
	propma[propma==1] <- ""
	MF <- paste(colnames(propma), t(propma), sep="")
	propma <- matrix(MF, nrow=length(propma[,1]), ncol=length(propma[1,]), dimnames=list(rownames(propma), colnames(propma)), byrow=TRUE)
	MF <- seq(along=propma[,1]); names(MF) <- rownames(propma)
	zeroma <- matrix(grepl("[\\*A-Za-z]0$", propma), nrow=length(propma[,1]), ncol=length(propma[1,]), dimnames=list(rownames(propma), colnames(propma)))
	propma[zeroma] <- ""
	for(i in seq(along=MF)) { MF[i] <-  paste(propma[i,], collapse="") }
	return(MF[-1]) # Minus one to remove duplicated entry in first row of propma
}

## (6.3.4) Ring Perception and Aromaticity Assignment
## Implements with some modifications the exhaustive ring perception algorithm 
## from Hanser et al (1996). URL: http://pubs.acs.org/doi/abs/10.1021/ci960322f

## (a) Iterative removal of atoms with single non hydrogen bonds.
## Returns from molecule only its rings and their inter-connections.
.cyclicCore <- function(x) {
	if(length(x) != 1) stop("x needs to be a single molecule")
	if(class(x) == "SDFset") x <- x[[1]]
	path <- conMA(x, exclude="H")
	noconnect <- rowSums(path) != 0 # Removes atoms with no connections
	path <- path[noconnect, noconnect]
	if(all(dim(path) == 0)) { return(path) } 
	term <- which(rowSums(path > 0)==1)
	while(length(term) > 0) {
		path <- path[-term,-term]
		if(any(dim(path) == 0) | is.vector(path)) { break() }
		term <- which(rowSums(path > 0)==1)
	}
	return(path)
}

## (b) Function to return the longest possible linear bond paths where:
##     - internal atoms have only two heavy atom neighbors
##     - terminal atoms are atoms with more than two heavy atom neighbors or they are ring closures
.linearCon <- function(x) {
	secatoms <- rowSums(x > 0) == 2
	secatoms <- names(which(secatoms))
	.linearCon <- function(vertex, x=x) {
		con <- as.list(names(which(x[vertex, ] > 0)))
		for(i in seq(along=con)) {
			termatom <- con[[i]][length(con[[i]])]
			termcon <- names(which(x[termatom, ] > 0))
			termcon <- termcon[!termcon %in% vertex]
			while(length(termcon) == 1 & !any(duplicated(con[[i]]))) { 
				con[[i]] <- c(con[[i]], termcon)						
				termatom <- con[[i]][length(con[[i]])]
				termcon <- names(which(x[termatom, ] > 0))
				termcon <- termcon[!termcon %in% con[[i]][length(con[[i]])-1]]
			}
		}
		if(paste(sort(unique(con[[1]])), collapse="") == paste(sort(unique(con[[2]])), collapse="")) {
			return(con[[1]])
		} else {
			return(c(rev(con[[1]]), vertex, con[[2]]))
		}
	}
	linearconlist <- lapply(secatoms, function(y) .linearCon(vertex=y, x=x))
	nodups <- !duplicated(sapply(linearconlist, function(y) paste(sort(unique(y)), collapse=""))) 
	linearconlist <- linearconlist[nodups]
	return(linearconlist)
}

## (c) Assemble intermediate results in a list
.update <- function(con, path) {
	## Remove non-terminal atoms in each path
	center_atoms <- unique(unlist(lapply(path, function(x) x[-c(1, length(x))])))
	con1 <- con[!rownames(con) %in% center_atoms, !colnames(con) %in% center_atoms]
	## Add atom pairs with three neighbors to connection list
	if(is.matrix(con1)) {
		remainbonds <- con1
		remainbonds[lower.tri(remainbonds)] <- 0
		remainbonds <- lapply(rownames(remainbonds), function(x) names(which(remainbonds[x,] > 0)))
		names(remainbonds) <- rownames(con1)
		remainbonds <- cbind(rep(names(remainbonds), sapply(remainbonds, length)), unlist(remainbonds, use.names=F))
		remainbonds <- split(remainbonds, seq(along=remainbonds[,1]))
		path <- c(path, remainbonds)
		names(path) <- seq(along=path)
	}
	## Collect complete rings and remove them from path object
	index <- unlist(lapply(path, function(y) any(duplicated(y))))
	rings <- path[index]
	path <- path[!index]
	names(path) <- seq(along=path)
	## Connection list for path component
	conpath <- t(sapply(path, function(x) x[c(1, length(x))]))
	ends <- unique(as.vector(conpath))
	conpath <- lapply(ends, function(x) as.numeric(names(which(rowSums(conpath==x) > 0))))
	names(conpath) <- ends
	conpath <- conpath[sapply(conpath, length) > 1] # removes ends that occur only once 
	## Assemble results in list
	return(list(con=con, conpath=conpath, path=path, rings=rings))
}

## (d) Return rings from cyclist object
.rings <- function(cyclist, upper=Inf) {
	## Define data containers
	pathlist <- cyclist$path
	conpath <- cyclist$conpath
	pathlistnew <- list() 
	rings <- list()
	## Loop to join linear paths/fragments stored in pathlist
	for(i in names(conpath)) {
		if(length(conpath) == 0 | !any(names(conpath) == i)) { next() }
		pos <- t(combn(conpath[[i]], m=2))
		for(j in seq(along=pos[,1])) { 
			p1 <- pathlist[[pos[j,1]]]
			p2 <- pathlist[[pos[j,2]]]
			if(sum(p1[-c(1,length(p1))] %in% p2[-c(1,length(p2))]) > 0) {
				next()
			}
			if(p1[1] == i & p2[1] == i) { # matching s1:s2 
				pathlistnew[[length(pathlistnew)+1]] <- c(rev(p2[-1]), p1)
			}
			if(p1[length(p1)] == i & p2[length(p2)] == i) { # matching e1:e2 
				pathlistnew[[length(pathlistnew)+1]] <- c(p1, rev(p2[-length(p2)]))
			}
			if(p1[1] == i & p2[length(p2)] == i) { # matching s1:e2
				pathlistnew[[length(pathlistnew)+1]] <- c(p2, p1[-1])
			}
			if(p1[length(p1)] == i & p2[1] == i) { # matching e1:s2
				pathlistnew[[length(pathlistnew)+1]] <- c(p1, p2[-1])
			}
		}
		## Various postprocessing routines for joined fragments follow
		if(length(pathlistnew) == 0) { next() }
		## Remove duplicates
		dups <- duplicated(sapply(pathlistnew, function(x) paste(sort(unique(x)), collapse="_")))
		pathlistnew <- pathlistnew[!dups]
		## Set maximum ring size; improves time performance if outer rings are not needed
		if(upper != Inf) {   
			l <- sapply(pathlistnew, length)
			pathlistnew <- pathlistnew[l <= upper]
			if(length(pathlistnew) == 0) { next() }
		} 
		## Collect complete rings and remove them from path object
		index <- unlist(lapply(pathlistnew, function(y) any(duplicated(y[c(1, length(y))]))))
		rings[[length(rings)+1]] <- pathlistnew[index]
		pathlistnew <- pathlistnew[!index]
		## Remove paths with internal duplicates 
		if(length(pathlistnew) > 0) {
			index <- unlist(lapply(pathlistnew, function(y) any(duplicated(y))))
			pathlistnew <- pathlistnew[!index]
		}
		## Update pathlist and conpath
		pathlist <- c(pathlist[-conpath[[i]]], pathlistnew)
		dups <- duplicated(sapply(pathlist, function(x) paste(sort(unique(x)), collapse="_")))
		pathlist <- pathlist[!dups]
		names(pathlist) <- seq(along=pathlist)
		conpath <- t(sapply(pathlist, function(x) x[c(1, length(x))]))
		ends <- unique(as.vector(conpath))
		conpath <- lapply(ends, function(x) as.numeric(names(which(rowSums(conpath==x) > 0))))
		names(conpath) <- ends
		conpath <- conpath[sapply(conpath, length) > 1] # removes ends that occur only once
		pathlistnew <- list()
	}
	## Generate proper output format 
	rings <- unlist(rings, recursive=FALSE)
	dups <- duplicated(sapply(rings, function(x) paste(sort(unique(x)), collapse="_")))
	rings <- c(cyclist$rings, rings[!dups])
	l <- sapply(rings, length) 
	rings <- rings[order(l)]
	if(upper != Inf) { rings <- rings[l <= upper] }
	if(length(rings) > 0) { 
		names(rings) <- paste("ring", seq(along=rings), sep="") 
	} else {
		rings <- NULL
	}
	return(rings)
}

## (e) Identify inner rings
## Expected input x is list of rings from call: 
##      x <- rings(x=sdfset[[1]], upper=Inf, type="all", arom=FALSE) 
.is.inner <- function(x) {
        rnames <- rev(names(x)); names(rnames) <- rnames
        r <- x
        names(r) <- paste(names(r), "_", sep="")
        r <- unlist(r)
        for(i in rnames) {
                tmp <- x[[i]]
                r2 <- r[!gsub("_.*", "", names(r)) %in% i] 
                if(all(tmp %in% r2)) {
                        rnames[i] <- "redundant"
                        r <- r2
                }
        }
        return(x[rev(rnames!="redundant")])
}

## (f) Aromaticity assignment for rings
## Approach
##   (i) Identify rings where all atoms are sp2 hybridized. This means each atom has a 
##       double bond or at least one lone electron pair and is attached to a sp2 hybridized atom
##   (ii) Hueckel's rule needs to be true (4n+2=integer): 2, 6, 10, 14, 18, ... pi electrons 
##        per ring.
.is.arom <- function(sdf, rings) {
	if(length(rings)==0) { return(NULL) } 
	con <- conMA(sdf)
	b2 <- bonds(sdf)
	b <- b2[,"Nbondcount"] - b2[,"charge"]
	names(b) <- paste(b2[,"atom"], "_", seq(along=b2[,1]), sep="") 
	.neighborsFct <- function(x) { sapply(rownames(x), function(y) paste(sort(paste(x[y,][x[y,]!=0], sep="_")), collapse="_")) }
	## Determine aromaticity
	.arom <- function(con, b, r=rings[[1]]) {
		## Identify sp2 hybridized atoms
		sp <- .neighborsFct(x=con)[r]
		sp2 <- grepl("1_2$", sp)
		## Identify atoms with lone electron pairs
		el <- c(Al=3, As=5, At=7, B=3, Bi=5, Br=7, C=4, Cl=7, F=7, Ga=3, Ge=4, I=7, In=3, N=5, O=6, P=5, Pb=4, Po=6, S=6, Sb=5, Se=6, Si=4, Sn=4, Te=6, Tl=3)
		bsub <- b[names(sp)]
		lp <- el[gsub("_.*", "", names(bsub))] - bsub >= 2
		lp[is.na(lp)] <- 0 # If an element is not specified under el, then its NA value in lp is set to zero
		sp2lp <- all(sp2 | lp)
		## Hueckel's rule 
		d <- rowSums(con[r, r]==2)
		double <- sum(d) 
		if(length(d[lp]) != 0) {
			lpcount <- sum(lp[d==0]) * 2
		} else {
			lpcount <- 0
		}
		n <- ((double + lpcount) - 2) / 4
		is.wholenumber <- function(x, tol = .Machine$double.eps^0.5)  abs(x - round(x)) < tol
		hueckel <- is.wholenumber(n)
		return(sp2lp & hueckel)
	}
	return(sapply(names(rings), function(x) .arom(con, b, r=rings[[x]])))
}

## Inner rings inherit aromaticity assignment from super rings if they are fully contained in them
## This is necessary since the aromaticity assignment for inner rings may miss context information required
## for Hueckel's rule. Note, for this to work the input data has to be generated with .rings(x, Inf)
.containedArom <- function(myrings, myarom) {
    if(!all(names(myrings) %in% names(myarom))) stop("Both inputs need to have the same number of rings with identical names")
    if(all(!myarom)) {
        return(myarom)
    } else {
        nonaromrings <- myrings[!myarom] # Restricts loop to non-aromatic entries
        aromrings <- myrings[myarom]
        for(i in seq_along(nonaromrings)) {
            updatearom  <- sapply(aromrings, function(x) all(nonaromrings[[i]] %in% x))
            if(any(updatearom)) myarom[names(nonaromrings[i])] <- TRUE
        }
        return(myarom)
    }
}

## (g) Ring and aromaticity perception by running the functions (1)-(5) 
rings <- function(x, upper=Inf, type="all", arom=FALSE, inner=FALSE) {
    if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
    if(inner==TRUE & upper!=Inf) stop("Inner ring prediction requires upper=Inf")
    if(type=="arom" & arom==FALSE) stop("type='arom' requires arom=TRUE")
    runAll <- function(x, upper) {
        con <- .cyclicCore(x)
        if(any(dim(con) == 0) | is.vector(con) | all(con == 0)) { # TRUE for linear compounds 
            if(arom==TRUE) {
                if(type=="all") return(list(RINGS=NULL, AROMATIC=NULL))
                if(type=="count") return(c(RINGS=0, AROMATIC=0))
            } else {
                if(type=="all") return(NULL)
                if(type=="count") return(0)
            }
        } else {
            path <- .linearCon(x=con)
            cyclist <- .update(con=con, path=path)
            if(arom==TRUE) {
                myrings <- .rings(cyclist, Inf) # Full ring set required to identify remaining aromatic rings with .containedArom 
                myrings <- lapply(myrings, function(x) x[-1]) # Removes duplicated atom at ring closure 
                myarom <- .is.arom(sdf=x, rings=myrings)
                myarom <- .containedArom(myrings, myarom)
                if(upper==Inf & inner==TRUE) {
                    myringnames <- names(.is.inner(x=myrings)) # Returns names of inner rings only
                    myrings <- myrings[myringnames]
                    myarom <- myarom[myringnames]
                }
                if(upper!=Inf) {
                    uppernames <- names(myrings[sapply(myrings, length) <= upper])
                    myrings <- myrings[uppernames] 
                    myarom <- myarom[uppernames]
                }
                if(type=="all") return(list(RINGS=myrings, AROMATIC=myarom))
                if(type=="arom") return(list(AROMATIC_RINGS=myrings[myarom]))
                if(type=="count") return(c(RINGS=length(myrings), AROMATIC=sum(myarom)))
            } else {
                myrings <- .rings(cyclist, upper+1) # Plus 'upper+1' is required because at this step all rings have duplicated atoms at ring closure
                myrings <- lapply(myrings, function(x) x[-1]) # Removes duplicated atom at ring closure 
                if(upper==Inf & inner==TRUE) myrings <- .is.inner(x=myrings) # Reduces myrings to inner rings only 
                if(type=="all") return(myrings)
                if(type=="count") return(length(myrings))
            }
        }
    }
    if(class(x)=="SDF") { 
        return(runAll(x, upper))
    }
    if(class(x)=="SDFset" & length(x) == 1) { 
        return(runAll(x[[1]], upper))
    }
    if(class(x)=="SDFset" &  length(x) > 1) {
        myrings <- lapply(1:length(x), function(y) runAll(x[[y]], upper))
        names(myrings) <- cid(x)
        if(type=="all" | type=="arom") return(myrings)
        if(type=="count") {
            mycol <- c("RINGS", "AROMATIC")
            return(matrix(unlist(myrings), ncol=length(myrings[[1]]), dimnames=list(names(myrings), mycol[1:length(myrings[[1]])]), byrow=T))
        }
    }
}
## Usage:
# rings(sdfset[1:4], upper=6, type="all", arom=TRUE)
# rings(sdfset[1:4], upper=6, type="arom", arom=TRUE)
# rings(sdfset[1:4], upper=6, type="count", arom=TRUE)
# plot(sdfset[1], print=F, atomnum=T, no_print_atoms="H") 
# plot(sdfset[1:4], print=F, atomnum=T, no_print_atoms="H") 

## (6.3.5) Enumerate Functional Groups
## (a) Generate neighbor information for each heavy atom in a molecule
.neighbors <- function(x, type="countMA") {
        ## Input checks        
        if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
        if(!any(c("all", "count", "countMA") %in% type)) stop("type can only be assigned: all, count or countMA") 
        ## Return neighbors
        .neighborsFct <- function(x, type=type) {
                colnames(x) <- paste(colnames(x), colSums(x>0), sep="_") # Adds number of bonded heavy atom neighbors (non-hydrogens)
                neighbors <- sapply(rownames(x), function(y) paste(sort(paste(gsub("_.*_", "_", colnames(x)[x[y,]!=0]), x[y,][x[y,]!=0], sep="_")), collapse="_"))
                if(type=="all") {
                        return(neighbors)
                }
                if(type=="count" | type=="countMA") {
                        return(table(paste(gsub("_.*", "", names(neighbors)), neighbors, sep=":")))
                }
        }
        ## Run on SDF object
        if(class(x)=="SDF") {
                x <- conMA(x, exclude="H")
                neighbors <- .neighborsFct(x, type)
                return(neighbors)
        }
        ## Run on SDFset objects containing one or many molecules
        if(class(x)=="SDFset") {
                cid <- cid(x)
                x <- conMA(x, exclude="H")
                neighbor_set <- lapply(seq(along=x), function(y) .neighborsFct(x[[y]], type))
                names(neighbor_set) <- cid
                if(type=="all" | type=="count") {
                        return(neighbor_set)
                }
                if(type=="countMA") {
                        columns <- unique(unlist(lapply(seq(along=neighbor_set), function(x) names(neighbor_set[[x]]))))
                        myMA <- matrix(NA, length(neighbor_set), length(columns), dimnames=list(NULL, columns))
                        for(i in seq(along=neighbor_set)) myMA[i, names(neighbor_set[[i]])] <- neighbor_set[[i]]
                        myMA[is.na(myMA)] <- 0
                        rownames(myMA) <- cid
                        return(myMA)
                }
        }
}
## Usage:
# .neighbors(sdfset[1:4], type="all")
# .neighbors(sdfset[1:4], type="countMA")

## (b) Count functional groups
groups <- function(x, groups="fctgroup", type="countMA") {
        ## Input checks        
        if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
        if(groups=="fctgroup" & (type=="count" | type=="all")) stop("when groups=\"fctgroup\", only type=\"countMA\" can be used")
        mylength <- length(x)
        ## Support for single molecule objects
	if(mylength==1) { x <- as(x, "SDFset"); y <- x; z <- x; cid(z) <- "dummy"; x <- c(y, z) }
        ## Generate neighbor counts
	neighbors <- .neighbors(x, type)
        ## Return neighbor counts if requested
	if(groups[1]=="neighbors") { 
		if(class(neighbors)=="matrix") {
			neighbors <- neighbors[!rownames(neighbors) %in% "dummy", ]
		} else { 
			neighbors <- neighbors[!names(neighbors) %in% "dummy"]
		}
		return(neighbors) 
	
	}
        ## Count functional groups based on data stored in neighbor matrix
        if(groups[1]=="fctgroup") { 
		groups <- c(RNH2="^C:.*N_1_1$", R2NH="^N:C_._1_C_._1$", R3N="^N:C_._1_C_._1_C_._1$", 
                            ROPO3="^P:O_._._O_._._O_._._O_._.$", ROH="(?=^C:.*O_1_1)(?=^(?:(?!(N|O|P|S)_._(2|3)).)*$)", 
		            RCHO="^O:C_2_2", RCOR="^C:C_._1_C_._1.*O_1_2", RCOOH="^C:.*O_1_1_O_1_2", 
			    RCOOR="^C:.*O_1_2_O_2_1", ROR="^O:.*C_._1_C_._1", RCCH="^C:C_2_3", RCN="^N:C_2_3") 
	}
        groupMA <- sapply(names(groups), function(x) rowSums(neighbors[, rep(grep(groups[x], colnames(neighbors), perl=TRUE),2)]/2))
	## Fix counts for ambiguous functional groups
        if(c("ROR" %in% colnames(groupMA))) {
	        groupMA[, "ROR"] <- groupMA[, "ROR"] - groupMA[, "RCOOR"]
                groupMA[groupMA < 0] <- 0
        }
	if(mylength>1) {
                return(groupMA)
        } else {
                return(groupMA[1,])
        }
}
## Usage:
# groups(sdfset[1:20], groups="fctgroup", type="countMA") 
# groups(sdfset[1:4], groups="neighbors", type="countMA")
# groups(sdfset[1:4], groups="neighbors", type="count")
# groups(sdfset[1:4], groups="neighbors", type="all")

##############################################################
## (6.4) Convert SDF Tail to Numeric and Character Matrices ##
##############################################################
## (6.4.1) Store everything in one character matrix
datablock2ma <- function(datablocklist, cleanup=" \\(.*", ...) {
	if(exists("cleanup")) 
		for(i in seq(along=datablocklist)) 
			names(datablocklist[[i]]) <- gsub(cleanup, "", names(datablocklist[[i]])) # Required if name tags contain compound ids
   columns <- unique(unlist(lapply(seq(along=datablocklist), function(x) names(datablocklist[[x]]))))
	myMA <- matrix(NA, length(datablocklist), length(columns), dimnames=list(NULL, columns))
	for(i in seq(along=datablocklist)) 
		myMA[i, names(datablocklist[[i]])] <- datablocklist[[i]]
	rownames(myMA) <- names(datablocklist)
	return(myMA)
}
# Usage:
# blockmatrix <- datablock2ma(datablocklist=datablock(sdfset))

## (6.4.2) Split SDF tail matrix into character and numeric matrices
splitNumChar <- function(blockmatrix) {
	# Define function to check for valid numeric values in a character vector
	numberAble <- function(myvec, type=c("single", "vector"), extras = c(".", "NA")) {
	    type <- match.arg(type)
	    old <- options(warn = -1)
	    on.exit(options(old))
	    myvec <- gsub(" ", "", myvec)
	    myvecsub <- myvec[!myvec %in% c("", extras)]
	    isnum <- !any(is.na(as.numeric(myvecsub)))
	    if (type == "single") 
		isnum
	    else if (isnum) 
		as.numeric(myvec)
	    else myvec
	}
        colindex <- sapply(seq(along=blockmatrix[1,]), function(x) numberAble(blockmatrix[,x]))
        numMA <- blockmatrix[ , colindex]
        storage.mode(numMA) <- "numeric"
        charMA <- blockmatrix[ , !colindex]
        return(list(numMA=numMA, charMA=charMA))
}
# Usage:
# numchar <- splitNumChar(blockmatrix=blockmatrix)

#########################
## (6.5) Bond Matrices ##
#########################
## (6.5.1) Generate bond matrix from SDFset or SDF objects
conMA <- function(x, exclude="none") {
        ## Function for SDF object 
        .conMA <- function(x, exclude=exclude) {
            atoms <- rownames(atomblock(x))
            conma <- matrix(0, length(atoms), length(atoms), dimnames=list(atoms, atoms))
            bondblock <- bondblock(x)
            for(i in seq(along=bondblock[,1])) conma[bondblock[i,1], bondblock[i,2]] <- bondblock[i,3]
            for(i in seq(along=bondblock[,1])) conma[bondblock[i,2], bondblock[i,1]] <- bondblock[i,3]
            index <- !gsub("_.*", "", rownames(conma)) %in% exclude
	    conma <- conma[index, index, drop = FALSE]
            return(conma)
        }   
        ## Run on SDF objects
        if(class(x)=="SDF") {
                conma <- .conMA(x, exclude)
                return(conma)
        }
        ## Run on SDFset objects containing one or many molecules
        if(class(x)=="SDFset") {
                conma_set <- lapply(seq(along=x), function(y) .conMA(x[[y]], exclude))
                names(conma_set) <- cid(x)
                return(conma_set)
        }
}
# Usage:
# conma <- conMA(sdfset[1:2], exclude=c("H"))

## (6.5.2) Compute bond/charge count for each atom in SDFset or SDF objects
## This is used to add hydrogens with methods/functions atomcount, atomcountMA, MW and MF
bonds <- function(x, type="bonds") {
	## Input checks
        if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
	if(!any(c("bonds", "charge", "addNH") %in% type)) stop("type can only be assigned: bonds, charge or addNH") 
	
	## Compute bonds, charges and missing hydrogens
	.bonds <- function(x, type=type) {
		atomMA <- atomblock(x)
		atoms <- gsub("_.*", "", rownames(atomMA))
		bondMA <- bondblock(x)
		Nbonds1 <- cbind(atoms=c(bondMA[,1], bondMA[,2]), bonds=c(bondMA[,3], bondMA[,"C3"]))
		Nbonds1 <- tapply(Nbonds1[, "bonds"], Nbonds1[, "atoms"], sum)
		Nbonds <- rep(0, length(atomMA[,1])); names(Nbonds) <- seq(along=atomMA[,1]); Nbonds[names(Nbonds1)] <- Nbonds1 	
		
		## Valence related to position in periodic table (following octet rule) 
		val <- c("1"=1, "17"=1, "2"=2, "16"=2, "13"=3, "15"=3, "14"=4)
		group <- as.numeric(atomprop$Group); names(group) <- as.character(atomprop$Symbol)
		Nbondrule <- val[as.character(group[atoms])]
		Nbondrule[is.na(Nbondrule)] <- 0 # Atoms with undefined Nbondrule (NAs) are assigned zero
		Nbondrule[Nbondrule < Nbonds] <- Nbonds[Nbondrule < Nbonds] # Set Nbondrule to Nbonds values, where latter is larger 

		## Charges
		charge <- c("0"=0, "1"=3, "2"=2, "3"=1, "4"=0, "5"=-1, "6"=-2, "7"=-3) # 4 is "doublet_radical"
		charge <- charge[as.character(atomMA[,5])]
		Nbonds <- data.frame(atom=atoms, Nbondcount=Nbonds, Nbondrule=Nbondrule, charge=charge) 
		
		## Data type to return
		if(type=="bonds") { return(Nbonds) }
		if(type=="charge") {
			chargeindex <- Nbonds[, "charge"] != 0 
			if(sum(chargeindex) == 0) {
				return(NULL)
			} else {
				chargeDF <- Nbonds[chargeindex, ]
				charge <- chargeDF[, "charge"]
				names(charge) <- chargeDF[, "atom"] 
				return(charge)
			}
		}
		if(type=="addNH") { 
			Nbonds[Nbonds[,"Nbondcount"] >= Nbonds[,"Nbondrule"], "charge"] <- 0 # Ignore charge where Nbondcount greater or equal than Nbondrule
			Nbonds[Nbonds[,"Nbondcount"] == 0, c("Nbondrule", "charge")] <- 0 # Ignore atoms with zero bonds
			NH <- sum((Nbonds[, "Nbondrule"] + Nbonds[, "charge"]) - Nbonds[, "Nbondcount"])
			if(NH < 0) NH <- 0 # Count should not be negative
			return(NH) 
		}
	}
        ## Run on SDF objects
        if(class(x)=="SDF") {
                bonds <- .bonds(x, type)
                return(bonds)
        }
        ## Run on SDFset objects containing one or many molecules
        if(class(x)=="SDFset") {
                bonds_set <- lapply(seq(along=x), function(y) .bonds(x[[y]], type))
                names(bonds_set) <- cid(x)
		if(type=="bonds") {
                	return(bonds_set)
		}
		if(type=="charge") {
                	return(bonds_set)
		}
		if(type=="addNH") {
                	return(unlist(bonds_set))
		}
        }
}
# Usage:
# bondDF <- bonds(sdfset[1], type="df")) 
# bondcount <- bonds(sdfset[1], type="addNH")) 

##########################################################################
## (6.6) Subset SDF/SDFset Objects by Atom Index to Obtain Substructure ##
##########################################################################
## Function to obtain substructure from SDF/SDFset object by providing a row
## index for atom block. Both atom and bond blocks will be subsetted accordingly. 
## Two functions are combined in one: type="new" assigns new atom numbers to
## the subsetted SDF, while type="old" maintains the numbering of the source SDF.
atomsubset <- function (x, atomrows, type="new", datablock = FALSE) {
    ## Variant that assigns new numbers to atoms in subsetted SDF
    if(type=="new") { 
	if(!any(c("SDF", "SDFset") %in% class(x))) 
	    stop("x needs to be of class SDF or SDFset")
	if(class(x) == "SDFset" & class(atomrows) != "list") 
	    stop("if x is of class SDFset, atomrows argument needs to be a list")
	if(class(x) == "SDFset") {
	    if (!all(cid(x) == names(atomrows))) 
		stop("x and atomrows need to have same length and identical component (molecule) names")
	}
	.atomsubset <- function(x, atomrows) {
	    hb <- header(x)
	    ab <- atomblock(x)[atomrows, ]
	    bb <- bondblock(x)
	    index <- rowSums(cbind(bb[, 1] %in% atomrows, bb[, 2] %in% 
		atomrows)) == 2
	    bb <- bb[index, ]
	    pos <- as.numeric(gsub(".*_", "", rownames(ab)))
	    oripos <- 1:length(pos)
	    names(oripos) <- pos
	    tmp <- bb[, 1:2]
	    tmp2 <- tmp
	    for (i in oripos) tmp2[tmp == as.numeric(names(oripos[i]))] <- i
	    bb[, 1:2] <- tmp2
	    if (is.vector(bb)) {
		bb <- t(as.matrix(bb))
	    }
	    countsLine <- hb[4]
	    atomCount <- sprintf("%3d", length(atomrows))
	    bondCount <- sprintf("%3d", length(rowSums(bb)))
	    # update atom count and bond count
	    hb[4] <- paste(atomCount, bondCount, substr(countsLine, 7, 100000L), sep="")
	    # update bond block row names
	    row.names(bb) <- 1:length(rowSums(bb))
	    # update atom block row names
	    row.names(ab) <- paste(gsub("_.*", "",rownames(ab)), 1:length(atomrows), sep="_")
	    if (datablock == FALSE) {
		sdf <- new("SDF", header = hb, atomblock = ab, bondblock = bb)
		return(sdf)
	    }
	    if (datablock == TRUE) {
		sdf <- new("SDF", header = hb, atomblock = ab, bondblock = as.matrix(bb), 
		    datablock = datablock(x))
		return(sdf)
	    }
	}
	if (class(x) == "SDF") {
	    return(.atomsubset(x, atomrows))
	}
	if (class(x) == "SDFset") {
	    ids <- cid(x)
	    sdflist <- lapply(cid(x), function(y) atomsubset(sdfset[[y]], 
		atomrows[[y]]))
	    names(sdflist) <- ids
	    sdfset <- as(sdflist, "SDFset")
	    return(sdfset)
	    }
    }

    ## Variant that maintains atom numbers from source SDF in subsetted SDF
    if(type=="old") { 
        if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
        if(class(x)=="SDFset" & class(atomrows)!="list") stop("if x is of class SDFset, atomrows argument needs to be a list")
        if(class(x)=="SDFset") {
                if(!all(cid(x) == names(atomrows))) stop("x and atomrows need to have same length and identical component (molecule) names")
        }
        .atomsubset <- function(x, atomrows) {
	    hb <- header(x)
	    ab <- atomblock(x)[atomrows, ]
	    bb <- bondblock(x)
	    index <- rowSums(cbind(bb[,1] %in% atomrows, bb[,2] %in% atomrows)) == 2
	    bb <- bb[index,]

	    ## Update bb to positions in ab
	    pos <- as.numeric(gsub(".*_", "", rownames(ab)))
	    oripos <- 1:length(pos)
	    names(oripos) <- pos
	    tmp <- bb[,1:2]; tmp2 <- tmp 
	    for(i in oripos) tmp2[tmp==as.numeric(names(oripos[i]))] <- i
	    bb[,1:2] <- tmp2
	    
	    ## Outputs
	    if(is.vector(bb)) { bb <- t(as.matrix(bb)) }
	    if(datablock==FALSE) {
		sdf <- new("SDF", header=hb, atomblock=ab, bondblock=bb)
		return(sdf)
	    }
	    if(datablock==TRUE) {
		sdf <- new("SDF", header=hb, atomblock=ab, bondblock=as.matrix(bb), datablock=datablock(x))
		return(sdf)
	    }
        }
        if(class(x)=="SDF") {
	    return(.atomsubset(x, atomrows))
        }
        if(class(x)=="SDFset") {
	    ids <- cid(x)
	    sdflist <- lapply(cid(x), function(y) atomsubset(sdfset[[y]], atomrows[[y]]))
	    names(sdflist) <- ids
	    sdfset <- as(sdflist, "SDFset")
	    return(sdfset)
        }
    }
}

## Usage: 
# atomsubset(sdfset[[1]], atomrows=1:18, type="new")
# atomsubset(sdfset[1:2], atomrows=list(CMP1=1:18, CMP2=1:12), type="new")

###################################
## (6.7) Function write.SDFsplit ##
###################################
## Splits SD Files into any number of smaller SD Files                                                                   
write.SDFsplit <- function(x, filetag, nmol) {
        from <- seq(1, length(x), by=nmol)
        splitDF <- data.frame(from=from, to=c(from[-1], length(x)+1)-1, filename=NA)
        splitDF[,"filename"] <- paste(filetag, sprintf(paste("%0", nchar(as.character(length(x))), "d", sep=""), splitDF[,1]), "_", 
                          sprintf(paste("%0", nchar(as.character(length(x))), "d", sep=""), splitDF[,2]), ".sdf", sep="")
        for(i in seq(along=splitDF[,1])) {
                write.SDF(x[splitDF[i,"from"]:splitDF[i,"to"]], splitDF[i,"filename"])
        }                                                                                             
	return(splitDF) # Gives access to file names to simplify import of split SD Files
}                   

## Usage 
# write.SDFsplit(x=sdfstr, filetag="myfile", nmol=10)
# write.SDFsplit(x=sdfsample, filetag="myfile", nmol=10)

######################################
## (6.8) Streaming Through SD Files ##
######################################
## Streaming function to compute descriptors for large SD Files without consuming much memory.
## In addition to descriptor values, it returns a line index that defines the positions of each 
## molecule in the source SD File. This line index can be used by the read.SDFindex function to 
## retrieve specific compounds of interest from large SD Files without reading the entire file 
## into memory. 
sdfStream <- function(input, output, append=FALSE, fct, Nlines=10000, startline=1, restartNlines=10000, silent=FALSE, ...) {
	## Define loop parameters 
	stop <- FALSE 
	f <- file(input, "r")
	n <- Nlines
	offset <- 0
	## For restarting sdfStream at specific line assigned to startline argument. If assigned
        ## startline value does not match the first line of a molecule in the SD file then it 
        ## will be reset to the start position of the next molecule in the SD file.
	if(startline!=1) { 
		fmap <- file(input, "r")
		shiftback <- 2
		chunkmap <- scan(fmap, skip=startline-shiftback, nlines=restartNlines, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n")
		startline <- startline + (which(grepl("^\\${4,4}", chunkmap, perl=TRUE))[1] + 1 - shiftback)
		if(is.na(startline)) stop("Invalid value assigned to startline.")
		dummy <- scan(f, skip=startline-2, nlines=1, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n")
		close(fmap)
		offset <- startline - 1 # Maintains abolut line positions in index
	}
	counter <- 0
	cmpid <- 1
	partial <- NULL
	while(!stop) {
		counter <- counter + 1
		chunk <- readLines(f, n = n) # chunk n can be any number of lines
		# chunk <- scan(f, n=n, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n") # scan has more flexibilities for reading specific line ranges in files.
		if(length(chunk) > 0) {
			if(length(partial) > 0) {
				chunk <- c(partial, chunk)
			}
			## Assure that lines of least 2 complete molecules are stored in chunk if available
			inner <- sum(grepl("^\\${4,4}", chunk, perl=TRUE)) < 2
			while(inner) {
				chunklength <- length(chunk)
				chunk <- c(chunk, readLines(f, n = n))
				if(chunklength == length(chunk)) { 
					inner <- FALSE 
				} else {
					inner <- sum(grepl("^\\${4,4}", chunk, perl=TRUE)) < 2
				}
			}
			y <- regexpr("^\\${4,4}", chunk, perl=TRUE) # identifies all fields that start with a '$$$$' sign
			index <- which(y!=-1)
			indexDF <- data.frame(start=c(1, index[-length(index)]+1), end=index)
			complete <- chunk[1:index[length(index)]]
			if((index[length(index)]+1) <= length(chunk)) {
				partial <- chunk[(index[length(index)]+1):length(chunk)]
			} else {
				partial <- NULL
			}
			index <- index + offset 
			indexDF <- data.frame(SDFlineStart=c(offset + 1, index[-length(index)]+1), SDFlineEnd=index)
			offset <- indexDF[length(indexDF[,2]),2]

			## Coerce file lines stored in character vector to SDFset
			sdfset <- read.SDFset(read.SDFstr(complete))
                        valid <- validSDF(sdfset)
                        sdfset <- sdfset[valid]
                        indexDForig <- indexDF
                        indexDF <- indexDF[valid,]
			## Perform desired computation on SDFset
			if(length(indexDF[,1])==1) {
				suppressWarnings(sdfset <- c(sdfset, sdfset)) # Trick to keep data in matrix format
                                resultMA <- fct(sdfset, ...)
				resultMA <- cbind(as.data.frame(indexDF), as.data.frame(resultMA[1, , drop=FALSE]), row.names=row.names(resultMA)[1])
			} else {
				resultMA <- fct(sdfset, ...)
				resultMA <- cbind(as.data.frame(indexDF), as.data.frame(resultMA), row.names=row.names(resultMA))
			}
			resultMA <- resultMA[names(valid),]
                        if(any(is.na(resultMA))) { # Maintains index for invalid compounds having NAs in descriptor fields
                                resultMA[,1:2] <- indexDForig[,1:2]
                        }
                        rownames(resultMA) <- paste("CMP", cmpid : (cmpid + length(resultMA[,1])-1), sep="")
			cmpid <- cmpid + length(resultMA[,1])
			## Print processing status to screen
                        if(silent==FALSE) {
                                print(rownames(resultMA))
                        }
			## Append results to tabular file
			if(counter==1 & append!=TRUE) {
				unlink(output)
				write.table(resultMA, output, quote=FALSE, col.names=NA, sep="\t")
			} else {	
				write.table(resultMA, output, quote=FALSE, append=TRUE, col.names=FALSE, sep="\t")
			}
		}
		if(length(chunk) == 0) {
			stop <- TRUE
			close(f)
		}
	}
}

## Usage:
# library(ChemmineR)
# data(sdfsample); sdfset <- sdfsample
# write.SDF(sdfset, "test.sdf")
## Choose descriptor set in a simple function:
# desc <- function(sdfset) {
#         cbind(SDFID=sdfid(sdfset),
#               # datablock2ma(datablocklist=datablock(sdfset)),
#               MW=MW(sdfset),
#               groups(sdfset),
#               # AP=sdf2ap(sdfset, type="character"),
#               rings(sdfset, type="count", upper=6, arom=TRUE)
#         )     
# }
# sdfStream(input="test.sdf", output="matrix.xls", fct=desc, Nlines=1000)
# indexDF <- read.delim("matrix.xls", row.names=1)[,1:2]
# sdfset <- read.SDFindex(file="test.sdf", index=indexDF, type="SDFset", outfile="sub.sdf") 


#####################################################################
## (6.9) Read Atom Pair String Representation from File into APset ##
#####################################################################
## Function to convert atom pairs (AP) or atom pair fingerprints (APFP) stored
## as character strings to APset or FPset objects (e.g. generated by sdfStream).
## Alternatively, one can provide the AP/APFP strings in a named character vector.
read.AP <- function(x, type, colid,isFile = class(x)=="character" & length(x)==1) {
        if(type=="ap") {
                if(isFile) {
                        x <- read.delim(x, sep="\t", row.names=1)
                        ids <- row.names(x); x <- x[,colid]; names(x) <- ids
                }
                desclist <- strsplit(as.character(x), ", ", fixed = TRUE)
                desclist <- lapply(desclist, as.numeric)
                names(desclist) <- names(x)
                return(as(desclist, "APset"))
        }
        if(type=="apfp" | type=="fp") {
                if(isFile) {
                        x <- read.delim(x, colClasses="character", sep="\t", row.names=1)
                        ids <- row.names(x); x <- x[,colid]; names(x) <- ids
                }
                descma <- matrix(as.numeric(unlist(strsplit(x, ""))), length(x), nchar(x[1]), byrow=TRUE, dimnames=list(names(x), 1:nchar(x[1])))
                #return(as(descma, "FPset"))
                return(new( "FPset",fpma=descma,type="apfp"))
        }
}

## Usage:
# apset <- read.AP(x="matrix.xls", type="ap", colid="AP")
# apfp <- read.AP(x="matrix.xls", type="apfp", colid="APFP")

#########################################################
## (6.10) Extract Molecules from SD File by Line Index ##
#########################################################
## Extracts specific molecules from SD File based on a line position index computed by the sdfStream function
read.SDFindex <- function(file, index, type="SDFset", outfile) {
	f <- file(file, "r") 
	if(type=="SDFset") {
		sdfset <- SDFset()
	}
	## Index transfromations
	index <- index[order(index[,1]),] # Assures positional sorting, which is expected by the following step!!
	index <- data.frame(skip=index[,1] - c(0, index[-length(index[,1]),2]) - 1 , nlines=index[,2] - index[,1] + 1) 
	## Scan through file using index to retrieve molecules one-by-one
	for(i in seq(along=index[,1])) {
		lines <- scan(f, skip=index[i,1], nlines=index[i,2], what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n")
		#delteme# lines <- scan(file, skip=index[i,1]-1, nlines=index[i,2]-index[i,1] + 1, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n") 
		if(type=="file") {
			if(i == 1) {
				unlink(outfile)
				cat(lines, file=outfile, sep="\n")
			} else {	
				cat(lines, file=outfile, sep="\n", append=TRUE)
			}
		}
		if(type=="SDFset") {
			suppressWarnings(sdfset <- c(sdfset, read.SDFset(read.SDFstr(lines))))	
		}
	}
	close(f)
	if(type=="SDFset") {
		cid(sdfset) <- paste("CMP", 1:length(sdfset), sep="")
                return(sdfset)
	}
}
## Usage: see sdfStream()

#################################
## (6.11) String Search Method ##
#################################
## String search function for SDFset
grepSDFset <- function(pattern, x, field="datablock", mode="subset", ignore.case=TRUE, ...) {
	## Generate search vector and index for desired field in SDFset
	if(field=="header" | field==1) {
		searchfield <- header(x)
		searchstr <- as.character(unlist(searchfield))
		index <- sapply(searchfield, length)
		index <- unlist(sapply(index, function(x) seq(1, x)))
		indexnames <- rep(seq(along=searchfield), sapply(searchfield, length))
		names(index) <- indexnames
	}
	if(field=="atomblock" | field==2) { 
		searchfield <- atomblock(x)
		searchstr <- as.character(unlist(sapply(searchfield, rownames)))
		searchstr <- gsub("_.*", "", searchstr)
		index <- sapply(searchfield, function(x) length(x[,1]))
		index <- unlist(sapply(index, function(x) seq(1, x)))
		indexnames <- rep(seq(along=searchfield), sapply(searchfield, function(x) length(x[,1])))
		names(index) <- indexnames
	}
	if(field=="bondblock" | field==3) { # Not very useful for string searching.
		searchfield <- bondblock(x)
		searchstr <- as.character(unlist(sapply(searchfield, rownames)))
		searchstr <- gsub("_.*", "", searchstr)
		index <- sapply(searchfield, function(x) length(x[,1]))
		index <- unlist(sapply(index, function(x) seq(1, x)))
		indexnames <- rep(seq(along=searchfield), sapply(searchfield, function(x) length(x[,1])))
		names(index) <- indexnames
	}
	if(field=="datablock" | field==4) { 
		searchfield <- datablock(x)
		searchstr <- paste(names(unlist(searchfield)), as.character(unlist(searchfield)), sep="___")
		index <- sapply(searchfield, length)
		index <- unlist(sapply(index, function(x) seq(1, x)))
		indexnames <- rep(seq(along=searchfield), sapply(searchfield, length))
		names(index) <- indexnames
	}
	## Search with grep
	matchpos <- grep(pattern, searchstr, ignore.case=ignore.case, ...)
	if(mode=="index") {
		return(index[matchpos])

	}
	if(mode=="subset") {
		xpos <- as.numeric(unique(names(index[matchpos])))
		return(as(x[xpos], "SDF"))
	}
}

##########################
## (7) Plotting Methods ##
##########################
## Plot single CMP Structure
plotStruc <- function(sdf,
							 atomcex=1.2,
							 atomnum=FALSE,
							 no_print_atoms=c("C"),
							 noHbonds=TRUE,
							 bondspacer=0.12,
							 colbonds=NULL,
							 bondcol="red",
							 regenCoords=FALSE,
							 ...) {

	if(regenCoords && .haveOB())
		sdf = regenCoordsOB(sdf)

   toplot <- list(atomblock=cbind(atomblock(sdf)[,c(1:2)], as.matrix(bonds(sdf, type="bonds")[,-1])), bondblock=cbind(as.matrix(as.data.frame(bondblock(sdf))[,1:3]), bondcol=1))
	## Add bond color
	toplot[[2]][, "bondcol"] <- toplot[[2]][,"bondcol"] + as.numeric((toplot[[2]][,"C1"] %in% colbonds) & (toplot[[2]][,"C2"] %in% colbonds))
	## Create empty plot with proper dimensions
	plot(toplot[[1]], type="n", axes=F, xlab="", ylab="", ...)
	## Remove C-hydrogens including their bonds 
	if(noHbonds==TRUE) {
		nonbonded <- !1:length(toplot[[1]][,1]) %in% sort(unique(as.vector(toplot[[2]][,1:2])))
                nonbonded <- as.data.frame(toplot[[1]])[nonbonded,]
                CHbondindex <- sapply(seq(toplot[[2]][,1]), function(x) paste(sort(gsub("_.*", "", rownames(toplot[[1]]))[toplot[[2]][x,1:2]]), collapse="") == "CH")
		toplot[[1]] <- toplot[[1]][sort(unique(as.numeric(toplot[[2]][!CHbondindex,1:2]))), ]
		toplot[[2]] <- as.matrix(as.data.frame(toplot[[2]])[!CHbondindex,]) 
                toplot[[1]] <- as.matrix(rbind(toplot[[1]], nonbonded))
	}	
	## Plot bonds
	z <- toplot[[2]][, "bondcol"]; z[z==2] <- bondcol # Stores bond coloring data
	for(i in seq(along=toplot[[2]][,1])) {
		x <- toplot[[1]][gsub("*.*_", "", rownames(toplot[[1]])) %in% toplot[[2]][i,1:2],1]
		y <- toplot[[1]][gsub("*.*_", "", rownames(toplot[[1]])) %in% toplot[[2]][i,1:2],2]
		## Plot single bonds
		if(toplot[[2]][i,3]==1) {
			lines(x=x, y=y, lty=1, lwd=3, col=z[i]) 
		} 
		## Plot double bonds
		if(toplot[[2]][i,3]==2) {
			rslope <- (atan(diff(y)/diff(x))*180/pi)/90
			lines(x=x-rslope*bondspacer, y=y+(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
			lines(x=x+rslope*bondspacer, y=y-(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
		}
		## Plot triple bonds
		if(toplot[[2]][i,3]==3) {
			rslope <- (atan(diff(y)/diff(x))*180/pi)/90
			bondspacer <- bondspacer * 2
			lines(x=x, y=y, lty=1, lwd=3, col=z[i]) 
			lines(x=x-rslope*bondspacer, y=y+(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
			lines(x=x+rslope*bondspacer, y=y-(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
		}
		## Plot bonds with labels other than 1-3 (e.g. some ChEMBL SDFs use 4 for aromatic bonds)
		if(!toplot[[2]][i,3] %in% c(1,2,3)) {
			rslope <- (atan(diff(y)/diff(x))*180/pi)/90
			lines(x=x-rslope*bondspacer, y=y+(1-abs(rslope))*bondspacer, lty=3, lwd=3, col=z[i])
			lines(x=x+rslope*bondspacer, y=y-(1-abs(rslope))*bondspacer, lty=3, lwd=3, col=z[i])
		}
	}
	## Exclude certain atoms from being printed
	exclude <- paste("(^", no_print_atoms, "_)", sep="", collapse="|")
	labelMA <- toplot[[1]][!grepl(exclude, rownames(toplot[[1]])), , drop=FALSE] # Added July 31, 2012: 'drop=FALSE'
        ## Add charges 
	charge <- c("0"="", "3"="3+", "2"="2+", "1"="+", "-1"="-", "-2"="2-", "-3"="3-")
        charge <- charge[as.character(labelMA[,"charge"])]
        ## Add hydrogens to non-charged/non-C atoms according to valence rules (some SD files require this)
        Nhydrogens <- c("0"="", "1"="H", "2"="H2", "3"="H3", "4"="H4", "5"="H5", "6"="H6", "7"="H7", "8"="H8") 
        hydrogens <- (labelMA[, "Nbondrule"] + labelMA[, "charge"]) - labelMA[,"Nbondcount"]
        hydrogens[labelMA[,"charge"]!=0] <- 0; hydrogens[hydrogens < 0] <- 0
        hydrogens <- Nhydrogens[as.character(hydrogens)]
        ## Plot data
        if(is.vector(labelMA)) labelMA <- matrix(labelMA, 1, 2, byrow=TRUE, dimnames=list(rownames(toplot[[1]])[!grepl(exclude, rownames(toplot[[1]]))], c("C1", "C2")))
	if(is.matrix(labelMA) & length(labelMA[,1])>=1) {
		atomcol <- gsub("_.*", "", rownames(labelMA)); atomcol[!grepl("N|C|O|H", atomcol)] <- "any"; mycol <- c(C="black", H="black", N="blue", O="red", any="green"); atomcol <- mycol[atomcol]

		## Overplot nodes to display atom labels
		points(x=labelMA[,1], y=labelMA[,2], col="white", pch=16, cex=2.8)
		## Plot atom labels
		if(atomnum==TRUE) {
			text(x=labelMA[,1], y=labelMA[,2], paste(gsub("_", "", rownames(labelMA)), hydrogens, charge, sep=""), cex=atomcex, col=atomcol) 
		} else {
			text(x=labelMA[,1], y=labelMA[,2], paste(gsub("_.*", "", rownames(labelMA)), hydrogens, charge, sep=""), cex=atomcex, col=atomcol)
		}
	}
}
## Usage:
# plotStruc(sdf=sdfset[[2]], atomcex=1.2, atomnum=F, no_print_atoms=c("C"), noHbonds=TRUE, bondspacer=0.08)
# par(mfrow=c(2,3)); for(i in 1:6) plotStruc(sdf=sdfset[[i]], atomcex=1.8, atomnum=F, no_print_atoms=c("C"), noHbonds=TRUE, bondspacer=0.08)
# disabled for now as rsvg is not available on bioC
openBabelPlot=function(sdfset,
							 height=600,
							 noHbonds=TRUE,
							 regenCoords=FALSE
){
	.ensureOB()

	tempF = tempfile()
	sdf2image(sdfset,tempF,format="SVG",height=height,noHbonds,regenCoords)
	img = rsvg(tempF)
	file.remove(tempF)
	plot(c(0,100),c(0,100), type="n", axes=F, xlab="", ylab="")
	rasterImage(img,0,0,100,100)
}

## Plot method for single SDF object
setMethod(f="plot", signature="SDF", definition=function(x, print=TRUE, ...) { 
		plotStruc(sdf=x, ...)
		if(print==TRUE) { return(x) } 
	}
)

## Plot method for multiple SDF objects in SDFset
setMethod(f="plot", signature="SDFset",
	definition=function(x, griddim, print_cid=cid(x), print=TRUE, ...) {
		if(missing(griddim)) {
			mydim <- ceiling(sqrt(length(x)))
			griddim <- c(mydim, mydim)
		}
		par(mfrow=griddim)
		for(i in 1:length(x)) { plotStruc(sdf=x[[i]], main=print_cid[i], ...) }
		if(print==TRUE) { return(SDFset2SDF(x)) }
})

## Object for referencing chemminetools jobs
setClass("jobToken", representation=representation(
    tool_name = "character",
    jobId = "character"
))

## Show method for checking status of a submitted chemminetools job
setMethod("show", signature=signature(
    object="jobToken"),
    function(object){
        response <- status(object)
        cat("tool name:\t", slot(object, "tool_name"), "\n")
        cat("status:\t\t", response, "\n")
    }
)

cstrsplit <- function(line) .Call(cstrsplitSym,line)
#cstrsplit <- cxxfunction(signature(l="character"),includes='
#					#include <R.h>
#					#include <boost/algorithm/string.hpp>
#								',body='
#					std::vector<std::string> strs;
#					const char *line = CHAR(STRING_ELT(l,0));
#					boost::split(strs, line, boost::is_any_of("\t "),boost::token_compress_on);
#
#					he
#					CharacterVector out