parser: Detailed R source code parser

#' Detailed R code parser
#' 
#' This is a parser for R code using the same grammar as the usual parser
#' (\code{\link{parse}}) but structuring the information differently.
#' 
#' 
#' @param file Source file to parse
#' @param encoding Encoding for the file
#' @param text If \code{text} is given, then its content is sent to a temporary
#' file, which is then used as the \code{file} argument.
#' @return The output is a list of expressions similar to the output of
#' \code{\link{parse}} except that expressions are not associated with an
#' inline \code{srcref} information. Instead, each expression has an \code{id}
#' attribute that refers to the data.
#' 
#' The \code{data} attribute is a data frame with the following columns:
#' \item{line1,col1,byte1}{First line, column, and byte of the symbol (token or
#' expression)} \item{line2,col2,byte2}{Last line, column, and byte of the
#' symbol (token or expression)} \item{token}{Token type.} \item{id}{Unique
#' identifier given to this token or expression} \item{parent}{identifier for
#' the expression that contains this token or expression} \item{token.desc}{A
#' description of this token type. This is read from the output generated by
#' bison}
#' @note The location information is different to that of R's
#' \code{\link{parse}} function, \code{\link{parse}} structures columns and
#' bytes information to comply with the \code{\link{substring}} function
#' whereas this function returns the information as an offset to the start of
#' the line.
#' @author Romain Francois <romain@@r-enthusiasts.com>
#' @seealso The usual R parser \code{\link{parse}}
#' @references This function is largely inspired from the R core
#' \code{\link{parse}} function. The C code uses the same grammar as the one
#' used by \code{\link{parse}}.
#' 
#' The Gnu bison parser generator (\url{http://www.gnu.org/software/bison/}) is
#' used to create the C code from the grammar.
#' @keywords manip
#' @examples
#' 	
#' 
#' f <- system.file( "example", "f.R", package = "parser" )
#' out <- parser( f )
#' 
#' 
#' @export
parser <- function( file, encoding = "unknown", text ){
	# go to a temporary file to deal with the text argument, 
	# functions and connections  (I know it is ugly)
	if( !missing( text ) ){
		file <- tempfile( );
		cat( text , file = file, sep = "\n" )
	} else if( inherits(file,"function") ){
		source <- attr( file, "source" )
		if( is.null(source ) ){
			source <- deparse( file )
		}
		file <- tempfile( )
		writeLines( source, file )
	} else if( inherits( file, "connection" ) ){
		sc <- summary( file )
		text <- readLines( file )
		file <- tempfile()
		writeLines( text, file )
	}
	
	# this is where the actual work takes place
	p <- .External( "do_parser", file = file, encoding = encoding )
	
	# formatting the results a bit
	data <- as.data.frame( t(attr(p,"data")) )
	colnames( data ) <- c( "line1", "col1", "byte1", 
		 	"line2", "col2", "byte2", "token", "id", "parent" )
	
	data[["top_level"]] <- .Call( "top_level", data$parent, PACKAGE = "parser" )
	
	# populate token.desc and terminal
	m <- match( data$token, grammar_symbols$token )
	data$token.desc <- as.character(grammar_symbols$desc)[ m ]
	data$terminal <- grammar_symbols$terminal[m]
	
	# go back to C to grab the actual tokens
	data$text     <- character( nrow(data) )
	toks <- getTokens( data= data[ data$terminal, , drop = FALSE], 
		encoding = encoding, file = file, 
		sort = FALSE )
	data$text[ match( toks$id, data$id) ] <- toks$text
	attr( p, "data" ) <- data
	attr( p, "file" ) <- file
	attr( p, "encoding") <- encoding
	oldClass( p ) <- "parser"
	p
}

#' Get the child expression of an expression in the parse tree
#' 
#' 
#' @param x Output from the \code{\link{parser}} function
#' @param i index of the top-level expressions we want the childs of. The
#' default value (0) means all top level expressions
#' @param parent id of the parent expressions as found in the id column
#' @return An integer vector giving all the ids of symbols that are childs of
#' one of the parent expression
#' @author Romain Francois <romain@@r-enthusiasts.com>
#' @seealso \code{\link{parser}}
#' @keywords manip
#' @export
getChilds <- function( x, i = 0, 
	parent = sapply( x[i], function(.) attr(.,"id" ) ) ){
	
	if(missing(parent) && ( missing(i) || is.null(i) || i==0 ) ){
		return( attr(x, "data")[,'id'] )
	}
	all.childs <- c()
	data    <- attr( x, "data" )
	parents <- abs( data[, "parent"] )
	id      <- data[, "id" ]
	childs  <- function( index ){
		kids <- id[ parents %in% index ]
		if( length(kids) ){
			all.childs <<- c( all.childs, kids )
			childs( kids )
		}
	}
	childs( parent )
	sort( all.childs )
}

getDirectChilds <- function( x, parent = 0, data = attr(x, "data") ){
	parents <- abs( data[, "parent"] )
	id      <- data[, "id" ]
	id[ parents %in% parent ]
}

isTerminal <- function(x, id = 0, data = attr(x, "data") ){
	terminal  <- data[["terminal"]]
	ids       <- data[["id"]]
	if( ! id %in% ids ){
		FALSE
	} else{
		terminal[ ids %in% id ]
	}
}


#' Gets the terminal tokens
#' 
#' Retrieve the terminal nodes from a parser object.
#' 
#' @param x the parser object.
#' @param data the data attribute of x
#' @param encoding The encoding used on the file
#' @param file the file x came from.
#' @param sort Does the data need to be sorted?
#' 
#' @export
getTokens <- function( x, 
	data = attr( x, "data" )[ attr(x, "data")[["terminal"]], ], 
	encoding = attr( x, "encoding"), 
	file = attr( x, "file" ), 
	sort = TRUE ){
	
	if( sort ){
		data <- data[ do.call( order, data[, c("line1", "col1") ] ), ]
	} else{
		if( is.unsorted(data[["line1"]] ) ){
			stop( "data is not in increasing order of line1" )
		}
	}
	data.frame( id = data[, "id"], 
		text = .External( "do_getTokens", 
			file = file, 
			encoding = encoding, 
			line1 = data[, "line1" ], 
			col1  = data[, "byte1"  ], 
			line2 = data[, "line2" ], 
			col2  = data[, "byte2"  ],
            PACKAGE="parser"), 
		stringsAsFactors = FALSE
		)
	
}

#' @name grammar_symbols
#' @title Parser Grammar Symbols
#' @docType data
#' @aliases grammar_symbols
#' @description
#'  A data table with the symbols used in the parser.
#' @usage
#'  grammar_symbols
#' @format
#'  A data frame with \code{desc}, the description;
#'  \code{token}, the number identifier of the class;
#'  and \code{terminal} if the class constitutes terminal nodes.
#' @keywords datasets
NULL