Nothing
#' Incorporating prior knowledge in the estimation process
#'
#' Learns a univariate MoTBF function using prior information.
#'
#' @param priorData A \code{"numeric"} vector which contains the prior information.
#' @param data A \code{"numeric"} vector containing the observed data.
#' @param s A \code{"numeric"} value which specifies the expert confidence in the prior knowledge.
#' This argument takes values on the interval \emph{[0, N]}, where \emph{N} is the sample size, and is used
#' to synchronize the support of the prior knowledge and the sample.
#' @param POTENTIAL_TYPE A \code{"character"} string, either \emph{MOP} or \emph{MTE}, corresponding to the type of basis function.
#' @param domain A \code{"numeric"} vector which contains the bounding values to fit the function.
#' By default, it is the range of the data.
#' @param coeffversion A \code{"numeric"} value between \code{1--4} which contains the used version for computing
#' the coefficients of the linear opinion pool to combine the prior function and the data function.
#' By default, \code{coeffversion = "4"} is used, so the combination
#' depends on the goodness of the model versus another random model.
#' @param restrictDomain A logical value. This argument allows to choose if the domain is used joining both domains,
#' the prior one and the data domain or trimming them. By default, \code{TRUE} is used, so
#' the domain will be trimmed.
#' @param maxParam A positive integer which indicates the maximum number of coefficients in the function.
#' If specified, the output is the function which gets the best BIC with, at most, this number of parameters.
#' By default, it is set to \code{NULL}.
#' @return A list with the elements
#' \item{coeffs}{An \code{"numeric"} vector with the two coefficients of the linear opinion pool}
#' \item{posteriorFunction}{The final function after combining.}
#' \item{priorFunction}{The fit of the prior data.}
#' \item{dataFunction}{The fit of the original data.}
#' \item{rangeNewPriorData}{A \code{"numeric"} vector which contains the final domain where the functions are defined.}
#' @seealso \link{getCoefficients}
#' @export
#' @examples
#'
#' ## Data
#' X <- rnorm(15)
#'
#' ## Prior Data
#' priordata <- rnorm(5000)
#'
#' ## Test data
#' test <- rnorm(1000)
#' testData <- test[test>=min(X)&test<=max(X)]
#'
#' ## Learning
#' type <- "MOP"
#' confident <- 3 ## confident <- 1,2,...,length(X)
#' f <- learnMoTBFpriorInformation(priorData = priordata, data = X, s = confident,
#' POTENTIAL_TYPE = type)
#' attributes(f)
#'
#' ## Log-likelihood
#' sum(log(as.function(f$dataFunction)(testData)))
#' sum(log(as.function(f$posteriorFunction)(testData))) ## best loglikelihood
#'
#'
learnMoTBFpriorInformation <- function(priorData, data, s, POTENTIAL_TYPE, domain=range(data), coeffversion=4, restrictDomain=TRUE, maxParam=NULL)
{
## Learning prior function
fPI <- univMoTBF(priorData,POTENTIAL_TYPE, range(priorData, domain))
## Computing the new domain
if (is.null(restrictDomain)||(restrictDomain==FALSE)) rangeNewPriorData <- range(priorData)
else rangeNewPriorData <- newRangePriorData(fPI,priorData, length(data), domain, s, POTENTIAL_TYPE)
## Normalizing the prior function
iP <- integrate(as.function(fPI), min(domain, rangeNewPriorData), max(domain, rangeNewPriorData))$value
if(POTENTIAL_TYPE=="MOP"){
f=coef(fPI)/iP
fPI <- asMOPString(f)
fPI <- motbf(fPI)
} else {
f <- coef(fPI)/iP
fPI <- asMTEString(f)
fPI <- motbf(fPI)
}
## Learning data function
fD <- univMoTBF(data,POTENTIAL_TYPE, range(rangeNewPriorData, domain), maxParam=maxParam)
## Learning posterior function
coeffs <- getCoefficients(fPI, rangeNewPriorData, fD, data, domain, coeffversion)
if(coeffs[1]==0){
fX <- fD
} else if (coeffs[2]==0){
fX <- fPI
} else {
fPI1 <- coef(fPI); fD1=coef(fD)
maxLength <- max(length(fPI1), length(fD1))
if(length(fD1)<maxLength){
fD1 <- c(fD1, rep(0, maxLength-length(fD1)))
} else {
fPI1 <- c(fPI1, rep(0,maxLength-length(fPI1)))
}
fX <- fPI1*coeffs[1]+fD1*coeffs[2]
if(POTENTIAL_TYPE=="MOP") fX <- asMOPString(fX) else fX <- asMTEString(fX)
fX <- motbf(fX)
}
return(list(coeffs=coeffs, posteriorFunction=fX,
priorFunction=fPI, dataFunction=fD,
domain=range(domain, rangeNewPriorData)))
}
#' Redefining the Domain
#'
#' Computes the new domain of two datasets.
#'
#' @param fPI The function fitted to the prior data, of class \code{"motbf"}.
#' @param priorData A \code{"numeric"} array with the values to be included as prior information.
#' @param N A \code{"numeric"} value equal to the data size.
#' @param domain A \code{"numeric"} array with the domain of the data density.
#' @param s A \code{"numeric"} value which is the expert's confidence on the prior information. It is a number between 0 and the data size.
#' @param POTENTIAL_TYPE A \code{"character"} string giving the potential of the model, i.e. \code{"MOP"} if the basis functions are polynomials,
#' or \code{"MTE"} if they are exponentials.
#' @return A \code{"numeric"} array which contains the new domain of the prior function.
#' @export
#' @examples
#'
#' ## Data
#' X <- rnorm(15)
#'
#' ## Prior Data
#' priordata <- rnorm(5000)
#'
#' ## Learning
#' type = "MTE"
#' fPrior <- univMoTBF(priordata, POTENTIAL_TYPE = type)
#'
#' ## New range
#' confident <- 5 ## confident <- 1,2,...,length(X)
#' domain <- range(X)
#' N <- length(X)
#' newRange <- newRangePriorData(fPrior, priorData = priordata, N = N,
#' domain = domain, s = confident, POTENTIAL_TYPE = type)
#' newRange
#'
newRangePriorData <- function(fPI, priorData, N, domain, s, POTENTIAL_TYPE)
{
if(s<N){
coeff <- 1-s/N
diffmin <- 0; diffmax <- 0
if((min(priorData)<min(domain))) diffmin <- min(domain) - min(priorData)
if((max(priorData)>max(domain))) diffmax <- max(priorData) - max(domain)
if((diffmax==0)&&(diffmin==0)) {
## Prior data range is inside of data range
return(range(priorData))
} else{
## Left tail
if(diffmin!=0){
coeff1 <- coeff*diffmin/(diffmax + diffmin)
fx <- coef(fPI)
CDF <- integralMoTBF(fPI)
min1 <- as.function(CDF)(min(domain, priorData))
int <- integrate(as.function(fPI), min(priorData, domain), min(domain))$value
y1 <- int*coeff1
if(-min1-y1>=0) sign = "+" else sign = ""
CDF1 <- noquote(paste(CDF,sign,-min1-y1 , sep=""))
CDF1 <- motbf(CDF1)
q1 <- uniroot(as.function(CDF1), range(domain, priorData))$root
}else {
q1 <- min(priorData)
}
## Right tail
if(diffmax!=0){
coeff2 <- coeff*diffmax/(diffmax + diffmin)
CDF <- integralMoTBF(fPI)
min2 <- as.function(CDF)(min(domain, priorData))
int <- integrate(as.function(fPI), min(domain, priorData), max(domain))$value
y2 <- (1-int)*coeff2
if(-min2-(1-y2)>=0) sign = "+" else sign = ""
CDF1 <- noquote(paste(CDF,sign,-min2-(1-y2), sep=""))
CDF1 <- motbf(CDF1)
q2 <- uniroot(as.function(CDF1), range(priorData, domain))$root
} else {
q2 <- max(priorData)
}
## New Prior domain
newDomain <- c(q1,q2)
}
return(range(newDomain))
} else{
## Parameter s is greater than the data size
return(range(priorData))
}
}
#' Get the coefficients
#'
#' Compute the coefficients for the linear opinion pool
#'
#' @param fPI The function fitted to the prior data, of class \code{"motbf"}.
#' @param rangeNewPriorData An array of length 2 with the new domain of the prior function.
#' @param fD The function fitted to the original data, of class \code{"motbf"}.
#' @param data A \code{"numeric"} array which contains the sample.
#' @param domain A \code{"numeric"} array with the domain of the data density function.
#' @param coeffversion A \code{"numeric"} value between \code{1--4} which contains the used version for computing the coefficients in the linear
#' opinion pool to combine the prior function and the data function. By default \code{coeffversion = "4"} is used, so the combination
#' depends on the goodness of the model versus another random positive MoTBF model.
#' @details \code{coeffversion} can be:
#' \code{"1"} coef1 and coef2 are the sum of the probabilities of one of the function over the sum of all probabilities, respectively;
#' \code{"2"} coef1 and coef2 are the solution of a linear optimization problem which tries to maximize the sum 1 for each row of probabilities;
#' \code{"3"} coef1 and coef2 are the difference of the log-likelihood of the evaluated model and a random uniform model over the sum of both differences, respectively;
#' \code{"4"} coef1 and coef2 are the difference of the log-likelihood of the evaluated model and a ramdom positive MoTBF model over the sum of both differences, respectively.
#' @return A \code{"numeric"} value of length 2 giving the coefficients which are the weigth of the two function to combine.
#' @seealso \link{learnMoTBFpriorInformation}
#' @importFrom lpSolve lp
#' @export
#' @examples
#'
#' ## Data
#' X <- rnorm(15)
#'
#' ## Prior Data
#' priordata <- rnorm(5000)
#'
#' ## Learning
#' confident <- 5
#' type <- "MOP"
#' f <- learnMoTBFpriorInformation(priorData = priordata, data = X, s = confident,
#' POTENTIAL_TYPE = type)
#' attributes(f)
#'
#' ## Coefficients: linear opinion pool
#' getCoefficients(fPI = f$priorFunction, rangeNewPriorData = f$domain, fD = f$dataFunction,
#' data = X, domain = range(X), coeffversion = 4)
#'
#' getCoefficients(fPI = f$priorFunction, rangeNewPriorData = f$domain, fD = f$dataFunction,
#' data = X, domain = range(X), coeffversion = 1)
#'
#' getCoefficients(fPI = f$priorFunction, rangeNewPriorData = f$domain, fD = f$dataFunction,
#' data = X, domain = range(X), coeffversion = 3)
#'
#' getCoefficients(fPI = f$priorFunction, rangeNewPriorData = f$domain, fD = f$dataFunction,
#' data = X, domain = range(X), coeffversion = 2)
#'
getCoefficients=function(fPI, rangeNewPriorData, fD, data, domain, coeffversion)
{
## Density values for both functions
pfPI <- as.function(fPI)(data)
pfD <- as.function(fD)(data)
switch(coeffversion,
## coeffversion <- 1
{coef1 <- sum(pfPI)/(sum(pfPI,pfD)); coef2 <- sum(pfD)/(sum(pfPI,pfD))},
## coeffversion <- 2
{XX <- matrix(c(pfPI, pfD), ncol=2); maxL=c()
for(i in 1:nrow(XX)){
maxLL <- lp("max", objective.in=XX[i,], const.mat=matrix(c(1,1), nrow=1), const.rhs=c(1), const.dir=c("="))$solution
maxL <- matrix(rbind(maxL,maxLL), ncol=2)
}
coef1 <- mean(maxL[,1])
coef2 <- 1-coef1},
## coeffversion <- 3
{probUnif <- dunif(data, min(data, rangeNewPriorData), max(data, rangeNewPriorData));
a <- sum(log(pfPI))-sum(log(probUnif)); if(a<0) a=0;
b <- sum(log(pfD))-sum(log(probUnif)); if(b<0) b=0;
coef1 <- a/(a+b); coef2 <- b/(a+b)},
## coeffversion <- 4
{NonNormalisedRandomMoTBF <- getNonNormalisedRandomMoTBF(length(coef(fPI))-1)
a <- sum(log(pfPI))-UpperBoundLogLikelihood(NonNormalisedRandomMoTBF,data, min(domain, rangeNewPriorData),max(domain, rangeNewPriorData)); if(a<0) a=0;
b <- sum(log(pfD))- UpperBoundLogLikelihood(NonNormalisedRandomMoTBF,data, min(domain, rangeNewPriorData),max(domain, rangeNewPriorData)); if(b<0) b=0;
coef1 <- a/(a+b); coef2 <- b/(a+b)}
)
return(c(coef1, coef2))
}
#' Ramdom MoTBF
#'
#' Generates a non normalized (i.e. not integrating to 1) positive MoTBF function.
#'
#' @param degree A \code{"numeric"} value containing the degree of the random function.
#' @param POTENTIAL_TYPE A \code{"character"} string specifying the posibles potential
#' types, must be one of \code{"MOP"} or \code{"MTE"}.
#' @return A \code{"numeric"} vector of length 2 giving the coefficients.
#' @export
#' @examples
#'
#' getNonNormalisedRandomMoTBF(8, POTENTIAL_TYPE = "MOP")
#' getNonNormalisedRandomMoTBF(11, POTENTIAL_TYPE = "MTE")
#'
getNonNormalisedRandomMoTBF <- function(degree, POTENTIAL_TYPE="MOP")
{
if(degree%%2!=0) degree <- degree-1
parameters=c()
if(POTENTIAL_TYPE=="MOP"){
for(i in 1:((degree/2)+1)) parameters <- c(parameters,0.5,0)
pol <- asMOPString(parameters)
pol <- motbf(pol)
} else {
parameters <- rep(0.5, degree/2+1)
pol <- asMTEString(parameters)
pol <- motbf(pol)
}
return(pol)
}
#' Upper bound of the loglikelihood
#'
#' Computes an upper bound of the expected loglikelihood of a dataset given a randomly
#' generated MoTBF density.
#'
#' @param f A function to evaluate of class \code{"character"}, \code{"motbf"} or others.
#' @param data A \code{"numeric"} array which contains the values to evaluate.
#' @param min A \code{"numeric"} value giving the lower limit of the domain.
#' @param max A \code{"numeric"} value giving the upper limit of the domain.
#' @return A \code{"numeric"} value which is the log-likelihood of the evaluated ramdom function.
#' @seealso \link{getNonNormalisedRandomMoTBF}
#' @export
#' @examples
#'
#' data <- rnorm(20)
#' f <- getNonNormalisedRandomMoTBF(degree = 8, POTENTIAL_TYPE = "MOP")
#' UpperBoundLogLikelihood(f, data, min = -2.5, max = 3.2)
#'
#' data <- rexp(20)
#' f <- getNonNormalisedRandomMoTBF(degree = 8, POTENTIAL_TYPE = "MTE")
#' UpperBoundLogLikelihood(f, data, min = 0, max = 5)
#'
UpperBoundLogLikelihood <- function(f,data, min, max){
n <- length(data)
k <- integrate(as.function(f),min,max)$value
prob <- as.function(f)(data)
return(sum(log(prob))-n*log(k))
}
#' Multivariate Normal sampling
#'
#' Generate a multivariate normal data vector taking into account the real data and the
#' relationships with other variables in the dataset.
#'
#' @param n A \code{"numeric"} value which is the size of the prior data to generate.
#' @param dataParents A data set of class \code{"data.frame"} giving the data of the set of coditional parent variables.
#' @param dataChild A \code{"numeric"} vector containing the original data of the child variable.
#' @return A \code{"numeric"} vector giving the prior data values.
#' @seealso \link{generateNormalPriorData}
#' @export
#' @examples
#'
#' ## Data
#' data(ecoli)
#' data <- ecoli[,-c(1,9)] ## remove sequece.name and class
#'
#' ## DAG
#' dag <- LearningHC(data)
#' plot(dag)
#' getChildParentsFromGraph(dag)
#'
#' ## 1. Random sample
#' parents <- "mcg"
#' child <- "alm1"
#' n <- 1000
#' rnormMultiv(n, dataParents = data.frame(data[,parents]), dataChild = data[,child])
#'
#' ## 2. Random sample
#' parents <- "alm1"
#' child <- "aac"
#' n <- 256
#' rnormMultiv(n, dataParents = data.frame(data[,parents]), dataChild = data[,child])
#'
rnormMultiv <- function(n, dataParents, dataChild)
{
ma <- as.data.frame(cbind(dataParents, dataChild))
mu <- colMeans(ma); covar <- cov(ma)
## Covariate matrix
cc <- covar[ncol(ma),c(1:(ncol(ma)-1))]
## Computing the alphas for the linear regression
ro <- sapply(1:(ncol(ma)-1), function(i) cc[i]/diag(covar)[i])
alpha <- c(); for(i in 1:length(ro)) alpha <- paste(alpha,"-ro[", i, "]*mu[",i, "]",sep="")
## Computing the means for the linear regression
muZ <- c(); for(i in 1:length(ro)) muZ <- paste(muZ,"+ro[",i,"]*dataParents[,",i,"]", sep="")
muZ <- paste("mu[", ncol(ma), "]", alpha, muZ, sep="")
Z <- rnorm(n,eval(parse(text=muZ)), sd(dataChild))
return(Z)
}
#' Prior data generation
#'
#' Generate a prior dataset taking in to account the relationships
#' between the varibles in a given network.
#'
#' @param graph A network of the class \code{"bn"}, \code{"graphNEL"} or \code{"network"}.
#' @param data An object of class \code{"data.frame"} containing the continuous variables in the dataset.
#' @param size A positive integer indicating the number of records to generate for each variable in the dataset.
#' @param means A \code{"numeric"} vector with the average of the variables whose prior information is available.
#' The names in the vector must be the same as the names of the variables in the data.frame.
#' @param deviations A \code{"numeric"} vector with the standard deviations of the variables whose prior information is available.
#' The names of the vector must be the same as the names of the variables in the data.frame.
#' If not specified, the standard deviation of each variable is computed from 'data'.
#' @seealso \link{rnormMultiv}
#' @return A normal prior data set of class \code{"data.frame"}.
#' @export
#' @examples
#'
#' ## Data
#' data(ecoli)
#' data <- ecoli[,-c(1,9)] ## remove sequece.name and class
#' X <- TrainingandTestData(data, percentage_test = 0.95)
#' Xtraining <- X$Training
#' Xtest <- X$Test
#'
#' ## DAG
#' dag <- LearningHC(data)
#' plot(dag)
#'
#' ## Means and desviations
#' colnames(data)
#'
#' m <- sapply(data, function(x){ifelse(is.numeric(x), mean(x),NA)})
#' d <- sapply(data, function(x){ifelse(is.numeric(x), sd(x),NA)})
#'
#'
#' ## Prior Dataset
#' n <- 5600
#' priorData <- generateNormalPriorData(dag, data = Xtraining, size = n, means = m)
#' summary(priorData)
#' ncol(priorData)
#' nrow(priorData)
#' class(priorData)
#'
generateNormalPriorData <- function(graph, data, size, means, deviations=NULL)
{
vdag <- names(graph$nodes)
vdf <- colnames(data)
orden <- match(vdag, vdf)
means <- means[orden]
if(is.null(deviations)){
#options(warn=-1)
suppressWarnings({
for(i in 1:length(means)){
if(is.na(means[i])) deviations[i] <- NA
else deviations[i] <- sd(data[,i])
}
#deviations <- sapply(data, sd)
})
}
columnsNames <- colnames(data)[orden]; XX=c()
for(i in 1:length(columnsNames)){
if(is.na(means[i])||is.na(deviations[i])) X <- NA
else X <- rnorm(size, means[i], deviations[i])
XX <- matrix(cbind(XX,X), nrow=size)
}
XX <- as.data.frame(XX)
colnames(XX) <- columnsNames
noNormal <- colnames(XX[sapply(XX[1,], is.na)])
childrenAndParents <- getChildParentsFromGraph(graph,columnsNames)
YY <- c()
for(i in 1:length(columnsNames)){
if(length(childrenAndParents[[i]])==1){
XX[,i] <- XX[,i]
} else {
child <- childrenAndParents[[i]][1]
dataChild <- cbind(XX[,child])
if(child%in%noNormal){
XX[,i] <- cbind(XX[,child])
next
}
parents <- childrenAndParents[[i]][2:length(childrenAndParents[[i]])]
if(all(parents%in%noNormal)){
XX[,i] <- cbind(XX[,child])
next
}
if(any(parents%in%noNormal)){
parents <- parents[-which(parents%in%noNormal)]
}
dataParents <- as.data.frame(XX[,parents])
XX[,i] <- rnormMultiv(size,dataParents, dataChild)
}
YY <- XX
}
YY <- YY[,match(vdf, vdag)]
pos <- which(sapply(YY[1,], is.na))
if(length(pos)!=0) YY <- YY[,-pos, drop = F]
return(as.data.frame(YY))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.