R/sd2gram.R

Defines functions sd2gram

Documented in sd2gram

#' @title sd2gram - Similarity of molecules by the marginalized kernel and 
#' proposed extensions.
#' 
#' @description This tools compute the marginalized kernel (\cite{Kashima, 2004})
#' and its proposed extensions (\cite{Mahe, 2005)}.
#' 
#' 
#' @param sdf File containing the molecules. Must be in MDL file format
#' (MOL and SDF files). For more information on the file format see 
#' http://en.wikipedia.org/wiki/Chemical_table_file. 
#' @param sdf2 A second file containing molecules. Must also be in SDF format.
#' If specified the molecules of the first file will be compared with the 
#' molecules of this second file. Default = "missing".
#' @param stopP The probability that a random walk stops. The higher the value
#' the more weigth is put on shorter walks. Default = 0.1.
#' @param filterTottering A logical specifying whether tottering paths should
#' be removed. Default = FALSE.
#' @param converg A numeric value specifying when convergence is reached. The
#' algorithm stops when the kernel value does not change by more
#' than 1/c, where c is the value specified by the converg option. 
#' Default = 1000.
#' @param atomKernelMatrix A string that sets the similarity measure between
#' atoms that should be used. Default = "missing". 
#' @param flagRemoveH A logical that indicates whether H-atoms should be 
#' removed or not. Default = FALSE. 
#' @param morganOrder The order of the DeMorgan indices to be used. If set to
#' zero, no DeMorgan indices are used. The higher the order the more 
#' types of atoms exist and consequently the more dissimilar will be the molecules.
#' Default = 0.
#' @param silentMode Whether or not the program should print progress reports
#' to the standart output. Default = FALSE.
#' @param returnNormalized A logical specifying whether a normalized kernel
#' matrix should be returned. Default = TRUE.
#' @param detectArom Whether aromatic rings should be detected and aromatic
#' bonds should a special bond type. If large molecules are in the data set
#' the detection of aromatic rings can be very time-consuming. (Default = FALSE).
#' @examples 
#' sdfolder <- system.file("extdata",package="Rchemcpp")
#' sdf <- list.files(sdfolder,full.names=TRUE,pattern="tiny")
#' K <- sd2gram(sdf)
#' @return A numeric matrix containing the similarity values between the
#' molecules.
#' @author Michael Mahr <[email protected]@bioinf.jku.at> 
#' c++ function written by Jean-Luc Perret and Pierre Mahe.
#' @references
#' (Kashima, 2004) -- H. Kashima, K. Tsuda, and A. Inokuchi. Kernels for graphs. 
#' In B. Schoelkopf, K. Tsuda, and J.P.
#' Vert, editors, Kernel Methods in Computational Biology, pages 155-170.
#' MIT Press, 2004.
#' 
#' (Mahe, 2005) -- P. Mahe, N. Ueda, T. Akutsu, J.-L. Perret, and J.-P. Vert. 
#' Graph kernels for molecular structure-
#' activity relationship analysis with support vector machines. 
#' J Chem Inf Model, 45(4):939-51, 2005.
#' 
#' 
#' @export


sd2gram <-  function(sdf, sdf2, stopP = 0.1, 
		filterTottering = FALSE, converg = as.integer(1000),  
		atomKernelMatrix = "", flagRemoveH = FALSE, morganOrder = as.integer(0),
		silentMode = FALSE, returnNormalized = TRUE, detectArom=FALSE)
{
	
	nbThreadsWanted = as.integer(1)
	fromN = as.integer(1)
	if(!is.numeric(stopP)) stop("stopP must be numeric")
	if(!is.logical(filterTottering)) stop("filterTottering must be logical")
	if(!is.numeric(converg)) stop("converg must be integer")
	converg <- as.integer(converg)
	if(!is.character(atomKernelMatrix)) stop("atomKernelMatrix must be string")
	if(!is.logical(flagRemoveH)) stop("flagRemoveH must be logical")	
	if(!is.numeric(morganOrder)) stop("morganOrder must be integer")
	morganOrder <- as.integer(morganOrder)	
	if(!is.logical(silentMode)) stop("silentMode must be logical")
	if(!is.logical(returnNormalized)) stop("returnNormalized must be logical")	
	
	if(!is.logical(detectArom)) stop("\"detectArom\" must be logical.")
	
	if(inherits(sdf,"SDFset")){
		datablock(sdf) <- lapply(1:length(sdf),function(x) return(""))		
		aSet <- SDFsetToRmoleculeset(sdf,detectArom=detectArom)[[1]]
		
		molnames <- ChemmineR::sdfid(sdf)
		molnames2 <- molnames
		if (!missing(sdf2)){
			if (inherits(sdf2,"SDFset")){
				datablock(sdf2) <- lapply(1:length(sdf2),function(x) return(""))			
				aSet2 <- SDFsetToRmoleculeset(sdf2,detectArom=detectArom)[[1]]
				
				molnames2 <- ChemmineR::sdfid(sdf2)
			} else 
				stop("Input must be existing SDF files or \"SDFset\" objects.")	
		}
		inputMode <- "SDFset"
		
	} else if (inherits(sdf,"Rcpp_Rmoleculeset")) {
		aSet <- sdf
		molnames <- NULL
		molnames2 <- NULL
		if (!missing(sdf2)){
			if (inherits(sdf2,"Rcpp_Rmoleculeset")){
				aSet2 <- sdf2
			} else 
				stop("Input must be existing SDF files or \"SDFset\" objects.")
		}
		
		inputMode <- "Rmoleculeset"
		
	} else if (is.character(sdf) & file.exists(sdf)) {
		
		
		aSetList <- readRmoleculeset(sdf,detectArom=detectArom)
		aSet <- aSetList[[1]]
		molnames <- aSetList[[3]]
		molnames2 <- aSetList[[3]]
		if (!missing(sdf2)){
			if (is.character(sdf2) & file.exists(sdf2)){
				aSetList2  <-  readRmoleculeset(sdf2,detectArom=detectArom)
				aSet2 <- aSetList2[[1]]
				molnames2 <- aSetList2[[3]]
			} else 
				stop("Input must be existing SDF files or \"SDFset\" objects.")	
		} 
		
		inputMode <- "fileName"
		
		
	} else {
		stop("Input must be existing SDF files or \"SDFset\" objects.")
	}
	
	
#aSet = new (Rchemcpp::Rmoleculeset);
	
	
	if( atomKernelMatrix != "" ){
		loadGramAtoms( atomKernelMatrix );	# atomKernelMatrix is set by the
		# -k argument on the command line
	}
	
	
	if( !silentMode ){
		print("reading file");
	}
	
	if( flagRemoveH == TRUE ) {
		if( !silentMode ){
			print("removing Hydrogens");
		}
		aSet$hideHydrogens();
	}
	
	
	
	if( !silentMode ){
		print("reading file done");
	}
	
	if(missing(sdf2)){
		# if the gram matrix is to be computed with all mol files in a single directory
		
		if( !silentMode ){
			print("setting morgan labels");
		}
		aSet$setMorganLabels(morganOrder);
		
		
		if( atomKernelMatrix != "" ){
			
			if( !silentMode ){
				print("using external atomKernel");
			}
			
			
			aSet$gramCompute(stopP, converg, fromN, nbThreadsWanted, 
					silentMode, filterTottering, TRUE); #external			
			
		}else{
			if( !silentMode ){
				print("using moleculeKernel Kashima");
			}
			
			aSet$gramCompute(stopP, converg, fromN, nbThreadsWanted, 
					silentMode, filterTottering, FALSE); #morgan			
		}
		
		
		# write the gram matrix to a file
		# raw matrix and normalised matrix
		
		
		#aSet$deleteAll(); #Should be done by destructor!!!
		
		
	}else{
		if( !silentMode ){
			print("test set mode");
		}
		
		if( flagRemoveH == TRUE ) {
			if( !silentMode ){
				print("removing Hydrogens");
			}
			aSet2$hideHydrogens();
		}
		
		
		if( !silentMode ){
			print("reading second file done");
		}
		
		
		# if the gram matrix is to be computed between two datasets of mol files
		# in two separate directories
		#deleted
		
		
		if( !silentMode ){
			print ("setting morgan labels");
		}
		
		aSet$setMorganLabels( morganOrder );
		aSet2$setMorganLabels( morganOrder );
		
		#FIXEd
		aSet2$initializeSelfKernel( 0.0);  
		aSet$initializeSelfKernel( 0.0);   
		aSet$setComparisonSetCopy( aSet2 );
		aSet$initializeGram( 0.0 );
		
		if( atomKernelMatrix != "" ){
			
			if( !silentMode ){
				print("using external atomKernel");
			}
			
			aSet$gramCompute(stopP, converg, fromN, nbThreadsWanted, silentMode, filterTottering, TRUE); #external			
			
		}else{
			
			if( !silentMode ){
				print("using moleculeKernel Kashima");
			}
			
			aSet$gramCompute(stopP, converg, fromN, nbThreadsWanted, silentMode, filterTottering, FALSE); #morgan			
			
		}
		
		
		#aSet$deleteAll(); #Should be done by destructor!!!
		#aSet2$deleteAll(); #Should be done by destructor!!!
	}
	
	if( !silentMode ){
		print("end");
	}
	if (returnNormalized == FALSE)
	{	
		K <- do.call(rbind,aSet$getGram() )
		
	}
	else
	{		
		K <- do.call(rbind,aSet$getGramNormal() )
		
	}
	
	
	if (inputMode == "fileName" | inputMode == "SDFset"){
		aSet$deleteAll()
		if (exists("aSet2"))
			aSet2$deleteAll()
	}
	
	try({
				rownames(K) <- molnames
				colnames(K) <- molnames2
			})
	
	return ( K )
	
	
}

Try the Rchemcpp package in your browser

Any scripts or data that you put into this service are public.

Rchemcpp documentation built on Nov. 1, 2018, 2:34 a.m.