Nothing
#
# Copyright (c) 2010, 2014, 2016, 2017 IBM Corp. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
idaLm <- function(form, idadf, id = "id", modelname = NULL, dropModel = TRUE, limit = 25){
storeCovMat <- TRUE
clearExisting <- TRUE
#SANITY-CHECKS
if(!is.logical(storeCovMat)){
stop("Parameter storeCovMat has to be logical.")
}
if(!is.logical(dropModel)){
stop("Parameter dropModel has to be logical.")
}
if(dropModel&&(!is.null(modelname))) {
stop("If dropModel is TRUE, you cannot set a model name.")
}
if(!is.logical(clearExisting)){
stop("Parameter clearExisting has to be logical.")
}
#if(!idaCheckProcedure("LINEAR_REGRESSION","idaLm",FALSE)){
# stop("Function not available.")
#}
if(!is.ida.data.frame(idadf)){
stop("This function can only be applied to ida.data.frame objects.")
}
idaCheckConnection()
formOut <- idaParseRFormula(form,idadf)
if(length(formOut$response) == 0) {
stop("No target variable specified.")
}
if(length(formOut$cols) > 78){
stop("This function is currently not supported for more than 78 columns.")
}
# internal function that calls the stored procedure.
idaLmSP <- function(formOut, idadf, id = "id", modelname = NULL, storeCovMat = TRUE, dropModel = FALSE,
clearExisting = TRUE, ...) {
if(idaIsDb2z()) {
encoding <- idaGetAcceleratorEncoding();
if (encoding != "UNICODE"){
stop(paste("For DB2 for z/OS connections only UNICODE data are supported. The encoding of the input data, however is ", encoding, sep=""))
}
}
formOut$cols <- formOut$cols[formOut$cols != id]
# check if given id is valid
colu = idadf@cols
tmpId = FALSE
# if no id was provided generate a temporary id
if (!(id %in% colu)) {
if (idaIsDb2z() || is.null(id)) {
stop(simpleError(paste("Id variable is not available in ida.data.frame:", id)))
} else {
warning("No valid id column specified for ida.data.frame, using a temporary id column.")
tmpId = TRUE
}
}
id.quoted <- quoteSQLIdentifier(id)
#CREATE MODELNAME
if (is.null(modelname)) {
modelname <- idaGetValidModelName('LINEAR_REGRESSION_')
} else {
if(grepl(" ",modelname)) {
stop("Space in modelname not allowed.")
}
xx <- parseTableName(modelname);
if (idaIsDb2z()) {
modelname <- paste('"',xx$table,'"',sep='');
} else {
modelname <- paste('"',xx$schema,'"."',xx$table,'"',sep='');
}
}
#ClearExisting
if(idaModelExists(modelname)) {
if(clearExisting){
idaDropModel(modelname)
} else {
stop("Model already exists.")
}
}
#Create View
if (tmpId == TRUE) {
# Add temporary id to view
input <- idaCreateView(idadf, newColumn = paste0('ROW_NUMBER() OVER () AS ', id.quoted))
} else {
input <- idaCreateView(idadf)
}
#Create formula with all attributes for the SP
#Catching column-names with ' or " inside
formOut$cols <- gsub('"', '""', formOut$cols)
attr <- paste0("\"", formOut$cols, "\"", collapse=";")
attr <- gsub("'", "''", attr)
formOut$response <- gsub('"', '""', formOut$response)
target <- paste0("\"", formOut$response, "\"")
target <- gsub("'", "''", target)
intercept <- "FALSE"
if(formOut$intercept){
intercept <- "TRUE"
}
# CALL STORED PROCEDURE
nrows<- -1
tryCatch({
if(idaIsDb2z()) {
res <- callSP("LINEAR_REGRESSION ",
model = modelname,
intable = input,
id=id.quoted,
target = target,
coldefrole = "ignore",
incolumn = attr,
intercept = intercept,
calculateDiagnostics=TRUE)
nrows <- nrow(idadf)
} else {
res <- callSP("LINEAR_REGRESSION ",
model = modelname,
intable = input,
id=id.quoted,
target = target,
coldefrole = "ignore",
incolumn = attr,
storecovariancematrix = storeCovMat,
intercept = intercept)
}
}, error = function(e) {
# in case of error, let user know what happend
stop(e)
}, finally = {
# drop view
idaDropView(input)
})
LinMod <- idaRetrieveidaLmModel(modelname, nrows=nrows)
if(dropModel){
idaDropModel(modelname)
}
return(LinMod)
}
idaLmSQL <- function(form, idadf, intercept) {
#calculates linear model for all continous variables and no in database model
idaDotProductSQL <- function(idadf,colNames,intercept) {
cols <- colnames(idadf);
if(length(cols) > 42)
stop("Only up to 40 predictors are currently supported.");
queryList <- c();
if(intercept) {
for(i in 1:length(cols)) {
for(j in i:length(cols)) {
queryList <- c(queryList,paste("SUM(\"", cols[i] ,"\"*\"", cols[j],"\") ",sep=''));
}
queryList <- c(queryList,paste("SUM(\"", cols[i] ,"\")",sep=''));
}
queryList <- c(queryList,"COUNT(*)");
query <- paste("SELECT ", paste(queryList,sep=',',collapse=',')," FROM ",idadf.from(idadf),sep='');
res <- idaQuery(query);
res[[length(res)]] <- as.numeric(res[[length(res)]])
if (nrow(res[complete.cases(res),]) == 0) {
stop("There is no valid data in the input table for building the model. Note that the rows with missing values in your input data are ignored.");
}
mdat <- matrix(1:(length(cols)+1)*(length(cols)+1),nrow=length(cols)+1,ncol=length(cols)+1,dimnames = list(c(colNames, "Intercept"),c(colNames, "Intercept")),byrow=T)
r <- 1;
c <- 1;
for(i in 1:ncol(res)) {
mdat[r,c] <- as.numeric(res[[i]][1]);
mdat[c,r] <- mdat[r,c];
c <- c + 1;
if(c>length(cols)+1) {
r <- r+1;
c <- r;
}
}
mdat
} else {
for(i in 1:length(cols)) {
for(j in i:length(cols)) {
queryList <- c(queryList,paste("SUM(\"", cols[i] ,"\"*\"", cols[j],"\") ",sep=''));
}
}
query <- paste("SELECT ", paste(queryList,sep=',',collapse=',')," FROM ",idadf.from(idadf),sep='');
res <- idaQuery(query);
if (nrow(res[complete.cases(res),]) == 0) {
stop("There is no valid data in the input table for building the model. Note that the rows with missing values in your input data are ignored.");
}
mdat <- matrix(1:(length(cols))*(length(cols)),nrow=length(cols),ncol=length(cols),dimnames = list(colNames,colNames),byrow=T)
r <- 1;
c <- 1;
for(i in 1:ncol(res)) {
mdat[r,c] <- as.numeric(res[[i]][1]);
mdat[c,r] <- mdat[r,c];
c <- c + 1;
if(c>length(cols)) {
r <- r+1;
c <- r;
}
}
mdat
}
}
#Creates a View with all columns that are in the formula form and deletes all
#rows that have NA values. This might lead to an empty table!
input <- idaCreateInputDataView(form,idadf)
inputDf <- ida.data.frame(input$view)
mat <- NULL
tryCatch({
mat <- idaDotProductSQL(inputDf,input$colNames,intercept)
if(intercept){
numrow <- mat[nrow(mat), ncol(mat)]
}else{
numrow <- nrow(inputDf)
}
newidaLm <- idaLmStatistics(mat, numrow = numrow)
},error=function(e){
stop(e)
},finally={
idaDropView(input$view)
})
card <- vector(mode = "integer", length = length(formOut$cols)) + 1
names(card) <- formOut$cols
newidaLm <- c(newidaLm, list(CovMat = mat, card = card, model = NULL))
class(newidaLm) <- "idaLm"
return(newidaLm)
}
#Checking for categorical columns so we know which method to choose.
tableDef <- idaTableDef(idadf, FALSE) #all columns with their datatype
tableDef <- tableDef[match(formOut$cols, tableDef$name), ] #look for the columns out of form
numOfCatCols <- nrow(tableDef[tableDef$valType == "CATEGORICAL", ])
if(idaIsDb2z() && idaGetAcceleratorEncoding() != "UNICODE") {
if (numOfCatCols == 0) {
LinMod <- idaLmSQL(form, idadf, formOut$intercept)
} else{
stop("At least one of the input columns is categorical with non-UNICODE encoding which is not supported for DB2 z/OS.")
}
} else {
if (numOfCatCols == 0 && dropModel && nrow(tableDef) < 42) {
LinMod <- idaLmSQL(form, idadf, formOut$intercept)
} else{
# id is needed for idaLmSP
LinMod <- idaLmSP(formOut, idadf, id=id, modelname = modelname, storeCovMat = storeCovMat,
dropModel = dropModel, clearExisting = clearExisting)
}
}
if(dropModel){
LinMod$model <- NULL
}
return(LinMod)
}
idaLmStatistics<-function(mat, coeff = NULL, numrow){
#This function calculates the statistics for the linear model.
p <- ncol(mat)-1
n <- numrow
YTY <- mat[1, 1]
XTX <- mat[-1, -1, drop = FALSE]
XTY <- mat[1, -1]
idaInvXTX <- NULL
getLogLikelihood <- function(RSS, n) {
-n/2*log(2*pi) - n/2*log(RSS/n) - n/2
}
getAIC <- function(L, n, p) {
-2*L + 2*(p+1)
}
getBIC <- function(L, n, p) {
-2*L + (p+1)*log(n)
}
getTest <- function(idaInvXTX, beta, n, p, sigmasq=1) {
sdbeta = sqrt(diag(idaInvXTX)*sigmasq)
tval = beta/sdbeta
pval = 1 - abs(1 - 2*pt(tval, n-p))
cbind(sdbeta, tval, pval)
}
getSigma <- function(RSS, df.residuals){
sqrt(RSS / df.residuals)
}
#Try solve first, only if it fails use ginv
try({idaInvXTX <- solve(XTX)}, silent = T)
if(is.null(idaInvXTX)) {
idaInvXTX <- ginv(XTX);
}
if(is.null(coeff)){
coeff<-idaInvXTX %*% XTY
rownames(coeff) = colnames(XTX)
}
beta <- coeff
RSS <- (YTY - 2* t(beta) %*% XTY + t(beta) %*% XTX %*% beta)[1, 1]
names(RSS) <- NULL
ran <- sum(abs(eigen(XTX, only.values = TRUE)$values) > 10^-8)#Every linear dependant attribute
p <- min(ran, n-1) #has a zero eigenvalue.
rank <- p
df.residuals <- n-p
likelihood <- getLogLikelihood(RSS, n)
AIC <- getAIC(likelihood, n, p)
BIC <- getBIC(likelihood, n, p)
tests <- getTest(idaInvXTX, beta, n, p, RSS/(n-p))
coefftab <- data.frame(Estimate = beta, Std.Error = tests[,1], t.value = tests[,2], p.value = tests[,3])
names(coefftab) <- c("Estimate", "Std.Error", "t.value", "Pr(>|t|)")
rownames(coefftab)<-rownames(coeff)
sigma <- getSigma(RSS, df.residuals)
newidaLm <- list(coefficients = coeff, RSS = RSS, sigma = sigma, effects = NULL, rank = rank,
df.residuals = df.residuals, coefftab = coefftab, Loglike = likelihood,
AIC = AIC, BIC = BIC, numrow = numrow)
return(newidaLm)
}
idaRetrieveidaLmModel<-function(modelname, nrows=-1){
#modelname with schema: "schema.table"
#parse the modelname to get the table and schema names
xx <- parseTableName(modelname)
model <- xx$table
modelSchema <- xx$schema
modelColList <- "VAR_ID, VAR_NAME, LEVEL_ID, LEVEL_NAME, PREDICTED_ID, PREDICTED_NAME, PREDICTED_LEVEL_ID, PREDICTED_LEVEL_NAME, VALUE, ST_DEV, TVAL, PVAL"
covmatColList <- "ROW, COL, VALUE, DISTR"
#read all data from the model tables.
calcstats <- FALSE
intercept <- 0
y_var_est <- NULL
rss <- NULL
r_squared <- NULL
if(idaIsDb2z()) {
exportModelTable <- idaGetValidTableName(prefix = "IDAR_MODEL_TABLE_")
tryCatch({
res <- callSP("EXPORT_MODEL_TO_TABLE", model=model, outtable=exportModelTable)
modelColExpr <- "case when LEVEL_NAME is NULL then VAR_NAME else VAR_NAME || ' - ' || LEVEL_NAME end, VALUE, ST_DEV, TVAL, PVAL "
modelquery <- paste('SELECT ', modelColExpr, ' FROM ', exportModelTable,' where MODELUSAGE= \'Model\' ORDER BY VAR_ID',sep="")
Lmdata <- idaQuery(modelquery, na.strings = NULL)
coeff <- Lmdata[-(1:3),]
names(coeff) <- c("", "Estimate", "Std.Error", "t.value", "Pr(>|t|")
y_var_est <- Lmdata[1,2]
rss <- Lmdata[2,2]
r_squared <- Lmdata[3,2]
numrow <- nrows
}, error = function(e) {
# in case of error, let user know what happend
stop(e)
}, finally = {
idaDeleteTable(exportModelTable)
}
)
} else {
try({
modelquery <- paste0('SELECT * FROM "',modelSchema,'"."',model,'_MODEL"')
Lmdata <- idaQuery(modelquery, na.strings = NULL)
covmatquery <- paste0('SELECT * FROM "',modelSchema,'"."',model,'_COVARIANCE_MATRIX"')
interceptpos <- match(-1,Lmdata$VAR_ID)
if(!is.na(interceptpos)){
Lmdata$VAR_NAME[interceptpos] <- "Intercept" #changing it from (Intercept)
intercept <- 1
}
coeff<-c()
calcstats <- FALSE
if(!is.null(covmatquery)) {
#Covariance-matrix
covMat <- idaQuery(covmatquery)
nrowMat <- sqrt(nrow(covMat))#if full matrix saved
#Change RCV-format into a matrix
mat <- matrix(as.double(1:(nrowMat^2)),nrow=nrowMat,ncol=nrowMat)
rowNames <- vector(mode = "character", length=nrowMat)
covMat[, c(1, 3, 5)] <- data.matrix(covMat[, c(1, 3, 5)])
for(i in 1:nrow(covMat)){
mat[covMat$ROW_ID[i], covMat$COL_ID[i]] <- covMat$VAL[i]
if(covMat$COL_ID[i] == 1){
rowNames[covMat$ROW_ID[i]] <- covMat$ROW_NAME[i]
}
}
colnames(mat) <- rowNames
rownames(mat) <- rowNames
#intercept is in line 2 instead of the last one.
if(intercept){
interceptpos <- match("Intercept", rowNames)
switchIntercept <- c((1:nrowMat)[-interceptpos], interceptpos)
mat <- mat[switchIntercept, switchIntercept]
}
}
#orders the coeff table in the order of the CovMat if it exists
coeff <- Lmdata$VALUE
if(!is.null(mat)){
names(coeff) <- rownames(mat)[-1]#ASSUMPTION: Target variable in the first row
calcstats <- TRUE
}
},silent=TRUE)
if(intercept){
numrow <- mat[nrowMat, nrowMat]
} else {
modelinfo <- idaQuery(paste0("call idax.list_params('schema=", modelSchema, ", format=short,
where=modelname=''", model, "''')"))
numrow <- as.numeric(modelinfo[match("valid_rows_in_intable", modelinfo$PARAMETERNAME), ]$PARAMETERVALUE)
}
#puts the coefficient of the model into a vector with the parsed names
for(i in 1:length(Lmdata$VAR_NAME)){
# added nchar length check, since there can be "" entries as levels
if(!is.na(Lmdata$LEVEL_NAME[i]) && nchar(Lmdata$LEVEL_NAME[i]) > 0){
tempname <- paste0(Lmdata$VAR_NAME[i],"___",Lmdata$LEVEL_NAME[i])
coeff[tempname] <- Lmdata$VALUE[i]
} else {
coeff[Lmdata$VAR_NAME[i]] <- Lmdata$VALUE[i]
}
}
coeff<-as.matrix(coeff)
colnames(coeff) <- c("Coefficients")
}
newidaLm <- list(coefficients = coeff, coefftab = NULL, RSS = rss, sigma = NULL, effects = NULL,
rank = NULL, df.residuals = NULL, likelihood = NULL, AIC = NULL,
BIC = NULL, card = NULL, CovMat = NULL, model = modelname, numrow = numrow)
if(calcstats){
#Get the attribute names and their cardinality in order of the CovMat
card <- table(Lmdata$VAR_NAME)[unique(Lmdata$VAR_NAME)]
card <- card[!"Intercept" == names(card)]
covmatnames <- rownames(mat)[-1]
covmatnames <- covmatnames[!"Intercept" == covmatnames]
#Get the original attribute names for all the dummy-variables
splitted <- sapply(covmatnames, function(x){strsplit(x, split = "___")[[1]][1]})
#find the first dummy-variable for each attribute
startpos <- match(names(card), splitted)
#reorder card in that order
card <- card[order(startpos)]
#Statistic functions from old idaLm function
statistics <- idaLmStatistics(mat, coeff, numrow)
#prepare object of the class idaLm
newidaLm <- c(statistics, list(CovMat = mat, card = card, model = modelname))
}
class(newidaLm) <- c("idaLm")
return(newidaLm)
}
print.idaLm <- function(x, ...) {
cat("\nCoefficients:\n")
if(is.null(x$coefftab)){
print(x$coefficients)
} else {
coeff<-x$coefftab
coeff$" " <-rep(" ", nrow(coeff))
coeff[coeff$"Pr(>|t|)" < 0.1, 5] <- "."
coeff[coeff$"Pr(>|t|)" < 0.05, 5] <- "*"
coeff[coeff$"Pr(>|t|)" < 0.01, 5] <- "**"
coeff[coeff$"Pr(>|t|)" < 0.001, 5] <- "***"
print(coeff)
cat("\n---")
cat("\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")
cat("\nResidual standard error: ", x$sigma, " on ", x$df.residuals, " degrees of freedom\n\n")
cat("Log-likelihood: ", x$Loglike, "\n")
cat("AIC: ", x$AIC, "\n")
cat("BIC: ", x$BIC, "\n")
}
}
predict.idaLm <- function(object, newdata, id, outtable=NULL, ...){
idaLm <- object
#calculates the prediction of a given linear model for all datapoints in the input
#
#Args:
#newdata: ida.dat.frame that refers to a table containing data you want to be predicted
#idaLm: linear model calculated by idaLm that has the same attributes as newdata
#id: name of the id column in newdata so we can match the prediction to the input
#outtable: name of the table the results will be written in
#
#return:
#ida.data.frame that refers to the table with the predicted values sorted by the id column
if(is.null(outtable)){
outtable <- idaGetValidTableName("PREDICT_LM_")
}
if(!"idaLm" %in% class(idaLm)){
stop("object is not an object of class idaLm.")
}
if(!is.ida.data.frame(newdata)){
stop("Parameter newdata is not an ida.data.frame.")
}
#newdata needs id column
colu = newdata@cols
if (!(id %in% colu))
stop(simpleError(paste("Id variable is not available in ida.data.frame:", id)))
#TODO: Add more verbose error message
if(is.null(idaLm$model)){
stop("Only models that are stored in-database can be applied. Re-train the model with the dropModel parameter set to FALSE to use the predict function.")
}
tmpview <- idaCreateView(newdata)
#add quotation marks to support mixed case statements
model <- paste0("\"",idaLm$model,"\"")
intable <- paste0("\"",tmpview,"\"")
id <- paste0("\"",id,"\"")
outtable <- paste0("\"",outtable,"\"")
tryCatch({
callSP("PREDICT_LINEAR_REGRESSION",
model = model,
id = id,
intable = intable,
outtable = outtable, ...)
}, error = function(e) {
# in case of error, let user know what happend
stop(e)
}, finally = {
# drop view
idaDropView(tmpview)
}
)
object.pred <- ida.data.frame(outtable)
return(object.pred)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.