R/readRules.R

Defines functions build_sif_table_from_rule readBND readBNET

Documented in build_sif_table_from_rule readBND readBNET

# library(data.table)
# library(dplyr)
# library(tidyr)
# library(readr)

#' Read network from BNET file
#' 
#' @param filename BNET file. The file is a tab delimited file with two columns,
#' `targets` and `factors`. `factors` contains the logic rule and `targets` the node
#' activated by the logic rule.
#'
#' @return CellNOpt network
#'
#' @author Luis Tobalina
readBNET <- function(filename){
	

	if (!requireNamespace("dplyr", quietly = TRUE)) {
		stop("Package \"dplyr\" needed for this function to work. Please install it.",
			 call. = FALSE)
	}
	if (!requireNamespace("tidyr", quietly = TRUE)) {
		stop("Package \"tidyr\" needed for this function to work. Please install it.",
			 call. = FALSE)
	}
	if (!requireNamespace("readr", quietly = TRUE)) {
		stop("Package \"readr\" needed for this function to work. Please install it.",
			 call. = FALSE)
	}
		
	
	
	bnet <- data.table::fread(filename, header=TRUE)
	
	# count number of `and` gates in each logic rule
	bnet <- bnet %>% dplyr::rowwise() %>% 
		mutate(i_and_gates = (nchar(factors) - nchar(gsub("&", "", factors, perl=TRUE))))
	# count how many `and` gates have appeared up to the current rule
	bnet <- bnet %>% transform(i_and_gates = c(0, i_and_gates[-nrow(bnet)])) %>% 
		mutate(i_and_gates = cumsum(i_and_gates))
	# parse each logic rule
	sif <- bnet %>% dplyr::rowwise() %>% 
		dplyr::do(sif_df = build_sif_table_from_rule(.$factors, .$targets, last_and_num=.$i_and_gates)$sif_str) %>% 
		tidyr::unnest(cols = sif_df) %>% 
		dplyr::ungroup() 
	
	# use `readSIF()` to get the network ready for CellNOpt
	fh <- tempfile()
	write.table(sif$sif_df, file=fh,
				row.names=FALSE, col.names=FALSE, quote=FALSE, sep="\t")
	model <- readSIF(fh)
	
	return(model)
}

#' Read network from BND file
#' 
#' BND is a file format used by MaBoSS to store the boolean network definition.
#' The reader works if the logic for the activation of the node is stated withing the parameter `logic` of the node.
#' An example file can be found in `https://maboss.curie.fr/pub/example.bnd`.
#' 
#' @param filename BND file. 
#' 
#' @examples
#' \dontrun{
#' model = readBND("https://maboss.curie.fr/pub/example.bnd")
#' }
#'
#' @return CellNOpt network
#' 
#' @author Luis Tobalina
readBND <- function(filename){
	required_pcks = list("dplyr","tidyr","readr")
	
	if(!all(unlist(lapply(required_pcks,function(str) {require(str,character.only = TRUE)})))){
		print("the following packages need to be installed to use readBND:")	
		print(unlist(required_pcks))
		print("Please, install the packages manually for this feature.")
		return(-1)
	}
	
	
	bnd <- readr::read_file(filename)
	# remove all line breaks
	bnd <- gsub("\\n", "", bnd, perl=TRUE)
	# remove parenthesis enclosing a single node (which is composed of alphanumeric characters and "_")
	bnd <- gsub("\\((!?[[:alnum:]|_]*)\\)", "\\1", bnd, perl=TRUE)
	# get the information in bnet format 
	targets <- regmatches(bnd, gregexpr("(?<=Node ).*?(?= {)", bnd, perl = TRUE))[[1]]
	factors <- regmatches(bnd, gregexpr("(?<=logic = ).*?(?=;)", bnd, perl = TRUE))[[1]]
	bnet <- data.frame(targets, factors)
	
	# use `readBNET()` to get the network ready for CellNOpt
	fh <- tempfile()
	write.table(bnet, file=fh,
				row.names=FALSE, col.names=TRUE, quote=FALSE, sep="\t")
	model <- readBNET(fh)
}


# logic rule parser written following explanation of boolean expression evaluator in:
# https://unnikked.ga/how-to-build-a-boolean-expression-evaluator-518e9e068a65
# also of interest:
# https://www.strchr.com/expression_evaluator
# Unless the boolean expression is written in Disjunctive Normal Form (DNF), in order
# to get a correct `*.sif` file we will need to process the first interpretation we do of
# the tree into sif format.
#
#' Build a SIF table from a logic rule written in a string
#' 
#' @param rule_str String containing the rule to be parsed 
#' @param target Name of the node affected by the rule
#' @param last_and_num If the rule contains `and` gates, their numeration will start after the number provided here (default is 0)
#'
#' @return data.frame with the network structure derived from the rule.
#' The column `sif_str` contains the string that can be written to a file 
#' and then read with `readSIF()` in order to load a CellNOpt compatible network.
#' 
#' @examples
#' CellNOptR:::build_sif_table_from_rule("B & (C | D)", "A", last_and_num=2)
#' 
#' test_rule <- list()
#' test_rule[[1]] <- "AMP_ATP | (ATM & ATR) | HIF1 | !(EGFR | FGFR3)"
#' test_rule[[2]] <- "A & ((B | C) & !(D & E))"
#' test_rule[[3]] <- "A & B | C"
#' test_rule[[4]] <- "A & B & C"
#' test_rule[[5]] <- "A & (B | C)"
#' test_rule[[6]] <- "(A | B) & (C | D)"
#' test_rule[[7]] <- "!(C & D) | (E & F)"
#' test_rule[[8]] <- "(A | B) & (C | !D) & (E | F)"
#' parsed_rule <- list()
#' for (i in c(1:length(test_rule))){
#'   parsed_rule[[i]] <- CellNOptR:::build_sif_table_from_rule(test_rule[[i]], "T")
#' }
#' 
build_sif_table_from_rule <- function(rule_str, target, last_and_num=0) {
	required_pcks = list("dplyr","tidyr","readr")
	
	if(!all(unlist(lapply(required_pcks,function(str) {require(str,character.only = TRUE)})))){
		print("the following packages need to be installed to use readBND:")	
		print(unlist(required_pcks))
		print("Please, install the packages manually for this feature.")
		return(-1)
	}
	# remove all whitespaces
	rule_str <-  gsub(" ", "", rule_str)
	# split rule in its different atomic components
	tokens <- strsplit(rule_str, "(?=[\\|\\&\\(\\)!])", perl=TRUE)[[1]]
	# the next set of variables will be used and/or modified by the inner functions
	current_token_pos <- 0
	symbol <- ""
	root <- ""
	negation_flag <- FALSE
	sif_str <- ""
	sif_list <- list()
	or_list <- list()
	and_list <- list()
	not_list <- list()
	sif_num <- 0
	or_num <- 0
	and_num <- last_and_num
	not_num <- 0
	
	# This function is used to retrieve the next token from the rule string
	get_next_token <- function(){
		current_token_pos <<- current_token_pos + 1
		#cat("\n", current_token_pos, tokens[current_token_pos], sep=" ")
		if (current_token_pos<=length(tokens)){
			symbol <<- tokens[current_token_pos]
			# apply de Morgan's Law if necessary
			if (negation_flag){
				if (symbol=="|"){
					symbol <<- "&"
				} else if (symbol=="&"){
					symbol <<- "|"
				}
			}
		} else {
			symbol <<- ""
		}
	}
	
	# helper function to parse each expression
	get_expression <- function(){
		get_term()
		while (symbol=="|") {
			left_part <- root
			get_term()
			right_part <- root
			
			or_num <<- or_num + 1
			or_list[[or_num]] <<- c(left_part, right_part)
			root <<- paste("or", or_num, sep="")
			
			sif_num <<- sif_num + 1
			sif_list[[sif_num]] <<- c(left_part, right_part, root)
		}
	}
	
	# helper function to parse each term
	get_term <- function(){
		get_factor()
		while (symbol=="&") {
			left_part <- root
			get_factor()
			right_part <- root
			
			and_num <<- and_num + 1
			and_list[[and_num]] <<- c(left_part, right_part)
			root <<- paste("and", and_num, sep="")
			
			sif_num <<- sif_num + 1
			sif_list[[sif_num]] <<- c(left_part, right_part, root)
		}
	}
	
	# helper function
	get_factor <- function(){
		get_next_token()
		if (symbol=="!") {
			# negate next expression
			negation_flag <<- !negation_flag
			get_factor()
			negation_flag <<- !negation_flag
			# revert changes to & and | symbols if they where acquired before the reset of the negation flag
			if (symbol=="|"){
				symbol <<- "&"
			} else if (symbol=="&"){
				symbol <<- "|"
			}
		} else if (symbol=="(") {
			get_expression()
			get_next_token()
		} else if (symbol==")") {
			# we don't care about ')'
		} else{
			if (negation_flag) {
				symbol <<- paste("!", symbol, sep="")
			}
			root <<- symbol
			get_next_token()
		}
	}
	
	# helper function to rename `and` or `or` gates (i.e. reset how they are numbered)
	rename_gates <- function(sif_df, type, last_num=0){
		if (!(type %in% c("and", "or"))){
			stop("type should be \"and\" or \"or\".")
		}
		type_gates <- unique(c(sif_df$node_in, sif_df$node_out))
		type_gates <- type_gates[grepl(paste("^", type, "\\d{1,}", sep=""), type_gates, perl=TRUE)]
		if (length(type_gates)>0){
			type_gates <- cbind(type_gates, paste("and", (c(1:length(type_gates))+last_num), sep=""))
			type_gates <- data.frame(type_gates)
			colnames(type_gates) <- c("old_name", "new_name")
			type_gates <- type_gates %>% dplyr::mutate_if(is.factor, as.character)
			
			dic = type_gates$new_name
			names(dic) = type_gates$old_name
			
			sif_df <- sif_df %>% 
				dplyr::mutate(node_in = dplyr::recode(node_in, !!!dic)) %>% 
				dplyr::mutate(node_out = dplyr::recode(node_out,  !!!dic))
		}
		return(sif_df)
	}
	
	# helper function to simplify a chain of `and` or `or` gates
	simplify_gates <- function(sif_df, type){
		if (!(type %in% c("and", "or"))){
			stop("type should be \"and\" or \"or\".")
		}
		
		type_pattern <- paste("^", type, "\\d{1,}", sep="")
		while (any(grepl(type_pattern, sif_df$node_in, perl=TRUE) &
				   grepl(type_pattern, sif_df$node_out, perl=TRUE))){
			consecutive_type_gates <- sif_df %>% filter(grepl(type_pattern, sif_df$node_in, perl=TRUE), grepl(type_pattern, sif_df$node_out, perl=TRUE))
			
			dic = consecutive_type_gates$node_out
			names(dic) = consecutive_type_gates$node_in
			
			sif_df <- sif_df %>% 
				dplyr::mutate(node_in = dplyr::recode(node_in, !!!dic)) %>% 
				dplyr::mutate(node_out = dplyr::recode(node_out, !!!dic)) %>% 
				filter(node_in!=node_out)
		}
		return(sif_df)
	}
	
	# helper function to interpret the parsed expression stored in the `sif_list` variable as a data.frame
	# `sif_list` contains the parsed expression in a tree like structure and this function takes care of making
	# several simplifications until a SIF format friendly version is reached (i.e. no concatenated `and` nodes and
	# no `or` nodes).
	interpret_sif_list <- function(sif_list){
		tree_df <- data.frame(matrix(unlist(sif_list), nrow=length(sif_list), byrow=TRUE))
		colnames(tree_df) <- c("left_part", "right_part", "root")
		
		sif_df <- tree_df[,c("left_part", "root")]
		colnames(sif_df) <- c("node_in", "node_out")
		sif_df <- rbind(sif_df, setNames(tree_df[,c("right_part", "root")], names(sif_df)))
		# add target
		sif_df <- rbind(sif_df, data.frame(node_in=root, node_out=target))
		sif_df <- sif_df %>% dplyr::mutate_if(is.factor, as.character)
		
		# simplify cascade of "and" and "or" gates
		sif_df <- simplify_gates(sif_df, "and")
		#sif_df <- simplify_gates(sif_df, "or")
		
		# we need to have the expression in disjunctive normal form (DNF)
		# find if there are or gates (node_in) connected to and gates (node_out)
		or_to_and_gates <- sif_df %>% dplyr::filter(grepl("^or\\d{1,}", sif_df$node_in, perl=TRUE), grepl("^and\\d{1,}", sif_df$node_out, perl=TRUE))
		while (dim(or_to_and_gates)[1]>0){
			current_pair <- or_to_and_gates[1,]
			or_group_df <- sif_df %>% dplyr::filter(grepl(current_pair$node_in, sif_df$node_out, perl=TRUE)) %>% 
				dplyr::mutate(num_or = 1) %>% 
				dplyr::mutate(num_or = cumsum(num_or)) %>% 
				dplyr::mutate(new_node_out = paste(current_pair$node_out, node_out, num_or, sep="_")) %>% 
				dplyr::mutate(old_node_out = node_out) %>% 
				dplyr::mutate(node_out = new_node_out)
			and_group_df <- sif_df %>% dplyr::filter(grepl(current_pair$node_out, sif_df$node_out, perl=TRUE) &
											  	!grepl(current_pair$node_in, sif_df$node_in, perl=TRUE)) %>% 
				dplyr::mutate(new_node_out = paste(c(or_group_df$new_node_out), collapse=",")) %>% 
				dplyr::mutate(new_node_out = strsplit(as.character(new_node_out), ",")) %>% 
				tidyr::unnest(cols = new_node_out) %>% 
				dplyr::mutate(old_node_out = node_out) %>% 
				dplyr::mutate(node_out = new_node_out)
			gate_group_df <- sif_df %>% dplyr::filter(grepl(current_pair$node_out, sif_df$node_out, perl=TRUE) &
											   	grepl(current_pair$node_in, sif_df$node_in, perl=TRUE)) %>% 
				dplyr::mutate(new_node_out = node_in) %>% 
				dplyr::mutate(new_node_in = paste(c(or_group_df$new_node_out), collapse=",")) %>% 
				dplyr::mutate(new_node_in = strsplit(as.character(new_node_in), ",")) %>% 
				tidyr::unnest(cols = new_node_in) %>% 
				dplyr::mutate(old_node_in = node_in) %>% 
				dplyr::mutate(old_node_out = node_out) %>% 
				dplyr::mutate(node_in = new_node_in) %>% 
				dplyr::mutate(node_out = new_node_out)
			root_group_df <- sif_df %>% dplyr::filter(grepl(current_pair$node_out, sif_df$node_in, perl=TRUE)) %>% 
				dplyr::mutate(new_node_in = current_pair$node_in) %>% 
				dplyr::mutate(old_node_in = node_in) %>% 
				dplyr::mutate(node_in = new_node_in)
			new_sif_df_part <- rbind(or_group_df[,c("node_in", "node_out")],
									 and_group_df[,c("node_in", "node_out")],
									 gate_group_df[,c("node_in", "node_out")],
									 root_group_df[,c("node_in", "node_out")])
			
			# if the or gate is connected to a different and gate than the one being processed now
			# we will rename it in those instances so that we don't lose these connections when filtering
			# the part of the dataframe that we are just about to change
			if (nrow(or_to_and_gates %>% filter(node_in==current_pair$node_in))>1) {
				duplicate_sif_df_part <- sif_df %>% dplyr::filter((grepl(current_pair$node_in, sif_df$node_in, perl=TRUE) &
																!grepl(current_pair$node_out, sif_df$node_out, perl=TRUE)) |
																	(grepl(current_pair$node_in, sif_df$node_out, perl=TRUE)))
				or_num <<- or_num + 1
				new_or_gate_name <- paste("or", or_num, sep="")
				
				dic = new_or_gate_name
				names(dic) = current_pair$node_in
				
				duplicate_sif_df_part <- duplicate_sif_df_part %>% 
					dplyr::mutate(node_in = dplyr::recode(node_in, !!!dic)) %>% 
					dplyr::mutate(node_out = dplyr::recode(node_out, !!!dic))
				sif_df <- sif_df %>% filter(!(grepl(current_pair$node_in, sif_df$node_in, perl=TRUE) &
											  	!grepl(current_pair$node_out, sif_df$node_out, perl=TRUE))) %>% 
					rbind(duplicate_sif_df_part)
			}
			
			# substitute the corresponding part of sif_df with the newly calculated df
			sif_df <- sif_df %>% dplyr::filter(!((grepl(paste("^", current_pair$node_in, "$", sep=""), sif_df$node_out, perl=TRUE)) |
										  	(grepl(paste("^", current_pair$node_out, "$", sep=""), sif_df$node_out, perl=TRUE) &
										  	 	!grepl(paste("^", current_pair$node_in, "$", sep=""), sif_df$node_in, perl=TRUE)) |
										  	(grepl(paste("^", current_pair$node_out, "$", sep=""), sif_df$node_out, perl=TRUE) &
										  	 	grepl(paste("^", current_pair$node_in, "$", sep=""), sif_df$node_in, perl=TRUE)) |
										  	grepl(paste("^", current_pair$node_out, "$", sep=""), sif_df$node_in, perl=TRUE)))
			
			sif_df <- rbind(sif_df, new_sif_df_part)
			sif_df <- sif_df %>% distinct()
			
			# simplify cascade of "and" and "or" gates
			sif_df <- simplify_gates(sif_df, "and")
			sif_df <- simplify_gates(sif_df, "or")
			
			# update list of or to and gates
			or_to_and_gates <- sif_df %>% dplyr::filter(grepl("^or\\d{1,}", sif_df$node_in, perl=TRUE), grepl("^and\\d{1,}", sif_df$node_out, perl=TRUE))
		}
		
		sif_df <- rename_gates(sif_df, type="and", last_num=last_and_num)
		
		# list or gates
		or_gates <- sif_df %>% dplyr::filter(grepl("^or\\d{1,}", node_out, perl=TRUE)) %>% 
			dplyr::group_by(node_out) %>% 
			dplyr::summarise(or_members = paste(node_in, collapse=",")) %>% 
			dplyr::mutate(root = node_out) %>% 
			dplyr::select(root, or_members) %>% 
			dplyr::ungroup()
		
		
		# substitute or gates by their inputs
		while (any(grepl("^or\\d{1,}", sif_df$node_in, perl=TRUE))) {
			
			dic = or_gates$or_members
			names(dic) = or_gates$root
			
			sif_df <- sif_df %>% dplyr::mutate(node_in = dplyr::recode(node_in, !!!dic))
			sif_df <- sif_df %>% dplyr::filter(!grepl("^or\\d{1,}", node_out, perl=TRUE))
			sif_df <- sif_df %>% dplyr::mutate(node_in = strsplit(as.character(node_in), ",")) %>% tidyr::unnest(cols = node_in)
		}
		return(sif_df)
	}
	
	# helper function to write the column with the string for the SIF file
	write_sif <- function(sif_df){
		sif_df <- sif_df %>% dplyr::mutate(sign1 = !grepl("^!", node_in, perl=TRUE)) %>% 
			dplyr::mutate(sign2 = !grepl("^!", node_out, perl=TRUE)) %>% 
			dplyr::mutate(sign = ifelse(sign1 & sign2, "1", "-1")) %>% 
			dplyr::mutate(sif_str = paste(node_in, sign, node_out, sep="\t")) %>% 
			dplyr::mutate(sif_str = gsub("^!", "", sif_str, perl=TRUE))
	}
	
	# After having defined all the helper functions, we parse the input expression.
	
	# if the rule only has one input (or a negated input), we output the sif format directly
	if (length(tokens)==1){
		sif_df <- data.frame(node_in=tokens, node_out=target)
		sif_df <- write_sif(sif_df)
		return(sif_df)
	} else if (length(tokens)==2){
		if (tokens[1]=="!"){
			sif_df <- data.frame(node_in=paste("!", tokens[2], sep=""), node_out=target)
			sif_df <- write_sif(sif_df)
			return(sif_df)
		}
	}
	
	# if there is more than one element, we parse the expression
	get_expression()
	sif_df <- interpret_sif_list(sif_list)
	sif_df <- write_sif(sif_df)
	
	#return(list(sif_df, sif_list))
	return(sif_df)
}
saezlab/CellNOptR documentation built on April 16, 2024, 5:21 a.m.