#' Random Loss function
#'
#' Based on a given data set, loss is computed by comparing the original data to the data binned at the specified level.
#' Loss is computed as the sum of squared differences between all values in the original data and the binned data.
#' Values that fall in between two bins are randomly assigned to one of them by a Bernoulli trial, with the probability
#' of moving to the upper bin equal to the value's distance from the lower bin divided by the binwidth; a toy
#' illustration of this assignment is included at the end of the examples.
#' Percent loss compares the loss to the total sum of squares in the original data.
#'
#' @param data data frame of values and their frequencies (must contain a Freq column)
#' @param binning vector of binwidths, one per binned variable
#' @param newData logical; if TRUE, the binned data are returned as part of the result
#' @importFrom plyr ddply summarize
#' @export
#' @examples
#' connect <- dbConnect(dbDriver("MySQL"), user="2009Expo",
#' password="R R0cks", port=3306, dbname="baseball",
#' host="headnode.stat.iastate.edu")
#' pitch <- new("dataDB", co=connect, table="Pitching")
#' d1 <- dbData(pitch, vars=c( "G", "SO"))
#' qplot(G,SO, fill=log10(Freq), data=d1, geom="tile")+scale_fill_gradient2()
#' lossRandom(d1, binning=c(2,5))
#' lossRandom(d1, binning=c(1,5))
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(1,5))
#' qplot(G,SO, fill=log10(Freq), data=d2, geom="tile")+scale_fill_gradient2()
#' lossRandom(d1, binning=c(1,10))
#' lossRandom(d2, binning=c(1,2))
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(1,10))
#' qplot(G,SO, fill=log10(Freq), data=d2, geom="tile")+scale_fill_gradient2()
#' ## some more exploration of loss
#' bins <- expand.grid(x=c(1:10), y=c(1:10))
#' losses <- mdply(bins, function(x,y) lossRandom(data=d1, binning=c(x, y)))
#' qplot(x, percent.loss, group=y, data=losses, geom="line")
#' qplot(y, percent.loss, group=x, data=losses, geom="line")
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(10,10))
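#' ## a minimal sketch, on toy values, of the random bin assignment used by lossRandom
#' x <- c(3, 7, 12)
#' width <- 5
#' # relative position of each value within its bin
#' p <- (x - width*floor(x/width))/width
#' # keep the lower bin or move up one bin with probability p
#' width*(floor(x/width) + rbinom(length(x), 1, p))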
#'
lossRandom <- function(data, binning, newData=FALSE){
  dnew <- data
  vars <- data*0    # per-cell squared-loss contributions, one column per variable
  names(vars) <- paste("Var", names(data), sep="")
  nl <- ncol(data)
  if(length(binning) != nl - 1)
    binning <- rep(binning, length=nl-1)
  for(x in 1:(nl-1)){
    # snap each value to its lower bin edge, then move it up one bin with
    # probability equal to its relative position within the bin
    dnew[,x] <- binning[x]*(floor(data[,x]/binning[x]) +
      sapply((data[,x]%%binning[x])/binning[x], function(p) rbinom(1, 1, p)))
    vars[,x] <- data$Freq*(dnew[,x] - data[,x])^2
  }
  # collapse to the new bins: number of original cells (n) and total frequency (fsum) per bin
  dnew$id <- interaction(dnew[,1:(nl-1)])
  res <- ddply(dnew, .(id), summarize, n=length(id), fsum=sum(Freq))
  dnew <- merge(dnew, res, by = "id")
  vars$VarFreq <- with(dnew, (log(Freq)-log(fsum)/n)^2)  # frequency component of the loss (log scale)
  vars$id <- dnew$id
  # sum the numeric loss columns within each bin
  vfun <- function(v){
    t <- sapply(1:ncol(v), function(i) is.numeric(v[,i]))
    return(colSums(v[t]))
  }
  var.res <- ddply(vars, .(id), vfun)
  dcond <- merge(res, var.res, by = "id")
  namesdcond <- names(dcond)
  dcond <- cbind(do.call("rbind", lapply(strsplit(as.character(dcond$id), ".", fixed=TRUE), as.numeric)), dcond[,-1])
  names(dcond) <- c(names(dnew)[2:(nl)], namesdcond[-1])
  Loss <- colSums(dcond[,(nl+2):ncol(dcond)])
  names(Loss) <- paste(c("", "", "Log"), names(data[,-(nl+1)]), sep="")
  # total sum of squares in the original data (log scale for the frequencies)
  TSS <- as.data.frame(t(c(sapply(1:(nl-1), function(i) sum(data$Freq*(data[,i] - mean(data[,i]))^2)), sum((log(dnew$Freq)-mean(log(dnew$Freq)))^2))))
  names(TSS) <- names(data)
  retdata <- dcond[,c(1:(nl-1), (nl+1))]    # binned data: bin positions plus total frequency
  names(retdata) <- names(data)
  if(newData)
    return(list(loss = Loss, totalssq = TSS, percent.loss = Loss/TSS*100, NewData = retdata))
  else return(list(loss = Loss, totalssq = TSS, percent.loss = Loss/TSS*100))
}
#' Partitioned Standard Loss function
#'
#' Based on a given data set, loss is computed by comparing the original data to the data binned at the specified level.
#' Loss is computed as the sum of squared differences between all values in the original data and the mean of the data
#' in their new bin; the end of the examples sketches this per-bin computation on toy values.
#' Percent loss compares the loss to the total sum of squares in the original data.
#'
#' @param data data frame of values and their frequencies (must contain a Freq column)
#' @param binning vector of binwidths, one per binned variable
#' @param newData logical; if TRUE, the binned data are returned as part of the result
#' @importFrom plyr ddply
#' @export
#' @examples
#' connect <- dbConnect(dbDriver("MySQL"), user="2009Expo",
#' password="R R0cks", port=3306, dbname="baseball",
#' host="headnode.stat.iastate.edu")
#' pitch <- new("dataDB", co=connect, table="Pitching")
#' d1 <- dbData(pitch, vars=c( "G", "SO"))
#' qplot(G,SO, fill=log10(Freq), data=d1, geom="tile")+scale_fill_gradient2()
#' lossMean(d1, binning=c(2,5))
#' lossMean(d1, binning=c(1,5))
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(1,5))
#' qplot(G,SO, fill=log10(Freq), data=d2, geom="tile")+scale_fill_gradient2()
#' lossMean(d1, binning=c(1,10))
#' lossMean(d2, binning=c(1,2))
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(1,10))
#' qplot(G,SO, fill=log10(Freq), data=d2, geom="tile")+scale_fill_gradient2()
#' ## some more exploration of loss
#' bins <- expand.grid(x=c(1:10), y=c(1:10))
#' losses <- mdply(bins, function(x,y) lossMean(data=d1, binning=c(x, y)))
#' qplot(x, percent.loss, group=y, data=losses, geom="line")
#' qplot(y, percent.loss, group=x, data=losses, geom="line")
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(10,10))
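#' ## a minimal sketch, on toy values, of the within-bin sum of squares that
#' ## lossMean accumulates per bin: fsum*(E[X^2] - E[X]^2)
#' x <- c(1, 2, 4); f <- c(10, 5, 1)
#' ex  <- sum(x*f)/sum(f)      # frequency-weighted mean
#' ex2 <- sum(x^2*f)/sum(f)    # frequency-weighted second moment
#' sum(f)*(ex2 - ex^2)         # same as the direct weighted sum of squares:
#' sum(f*(x - ex)^2)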
#'
lossMean <- function(data, binning, newData=FALSE){
  dnew <- data
  vars <- data*0
  names(vars) <- paste("Var", names(data), sep="")
  nl <- ncol(data)
  if(length(binning) != nl - 1)
    binning <- rep(binning, length=nl-1)
  for(x in 1:(nl-1)){
    # snap each value to the nearest multiple of the binwidth
    dnew[,x] <- binning[x]*round(data[,x]/binning[x],0)
  }
  data$id <- dnew$id <- interaction(dnew[,1:(nl-1)])
  # "visual" loss: squared distance between each value and its bin position, weighted by frequency
  vloss <- as.data.frame(data$Freq*(data[,1:(nl-1)]-dnew[,1:(nl-1)])^2)
  names(vloss) <- names(data)[1:(nl-1)]
  vloss$id <- data$id
  # per-bin summary: frequency-weighted mean, total frequency, and within-bin sum of squares
  mfun <- function(v){
    nl <- ncol(v)-1
    t <- data.frame(t(sapply(1:(nl-1), function(i) sum(v[,i]*v[,nl])/sum(v[,nl])))) # calculate E[X]
    tsq <- data.frame(t(sapply(1:(nl-1), function(i) sum(v[,i]^2*v[,nl])/sum(v[,nl])))) # calculate E[X^2]
    tsq <- tsq - t^2 # Var X = E[X^2] - (E[X])^2
    t$fsum <- sum(v[,nl])
    tsq <- tsq * t$fsum # get SSQ from Freq
    names(t) <- c(names(v)[-c(nl, nl+1)], "fsum")
    t$id <- unique(v$id)
    t$n <- length(v$id)
    names(tsq) <- paste(names(t)[1:(nl-1)], "Var", sep="")
    t <- cbind(t, tsq)
    return(t)
  }
  res <- ddply(data, .(id), mfun)
  dnew2 <- merge(res, dnew[-c(1:(nl-1))], by="id")
  Loss <- c(colSums(res[,(ncol(res)-(nl-2)):ncol(res)]), sum((log(dnew2$Freq) - log(dnew2$fsum)/dnew2$n)^2))
  names(Loss) <- paste(c("", "", "Log"), names(data[,-(nl+1)]), sep="")
  visLoss <- colSums(vloss[,1:(nl-1)])
  # total sum of squares in the original data (log scale for the frequencies)
  TSS <- as.data.frame(t(c(sapply(1:(nl-1), function(i) sum(data$Freq*(data[,i] - mean(data[,i]))^2)), sum((log(dnew$Freq)-mean(log(dnew$Freq)))^2))))
  names(TSS) <- names(data[,-(nl+1)])
  dnew3 <- as.data.frame(cbind(do.call("rbind", lapply(strsplit(as.character(res$id), ".", fixed=TRUE), as.numeric)), res$fsum))
  names(dnew3) <- names(data[,1:(nl)])
  if(newData)
    return(list(math.loss = Loss, visualLoss = visLoss, loss = Loss + c(visLoss,0), totalssq = TSS, percent.mathLoss = Loss/TSS*100, percent.visualLoss = 100*visLoss/TSS[1:(nl-1)], percent.loss = (Loss + c(visLoss, 0))/TSS*100, NewData = dnew3))
  else return(list(math.loss = Loss, visualLoss = visLoss, loss = Loss + c(visLoss,0), totalssq = TSS, percent.mathLoss = Loss/TSS*100, percent.visualLoss = 100*visLoss/TSS[1:(nl-1)], percent.loss = (Loss + c(visLoss, 0))/TSS*100))
}
#' Standard Loss function (efficient, no partitioning)
#'
#' Based on a given data set, loss is computed by comparing the original data to the data binned at the specified level.
#' Loss is computed as the sum of squared differences between all values in the original data and the value of the bin
#' they are assigned to; the end of the examples sketches this computation on toy values.
#' Percent loss compares the loss to the total sum of squares in the original data.
#'
#' @param data data frame of values and their frequencies (must contain a Freq column)
#' @param binning vector of binwidths, one per binned variable
#' @param newData logical; if TRUE, the binned data are returned as part of the result
#' @importFrom plyr ddply summarize
#' @export
#' @examples
#' connect <- dbConnect(dbDriver("MySQL"), user="2009Expo",
#' password="R R0cks", port=3306, dbname="baseball",
#' host="headnode.stat.iastate.edu")
#' pitch <- new("dataDB", co=connect, table="Pitching")
#' d1 <- dbData(pitch, vars=c( "G", "SO"))
#' qplot(G,SO, fill=log10(Freq), data=d1, geom="tile")+scale_fill_gradient2()
#' lossStandard(d1, binning=c(2,5))
#' lossStandard(d1, binning=c(1,5))
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(1,5))
#' qplot(G,SO, fill=log10(Freq), data=d2, geom="tile")+scale_fill_gradient2()
#' lossStandard(d1, binning=c(1,10))
#' lossStandard(d2, binning=c(1,2))
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(1,10))
#' qplot(G,SO, fill=log10(Freq), data=d2, geom="tile")+scale_fill_gradient2()
#' ## some more exploration of loss
#' bins <- expand.grid(x=c(1:10), y=c(1:10))
#' losses <- mdply(bins, function(x,y) lossStandard(data=d1, binning=c(x, y)))
#' qplot(x, percent.loss, group=y, data=losses, geom="line")
#' qplot(y, percent.loss, group=x, data=losses, geom="line")
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(10,10))
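#' ## a minimal sketch, on toy values, of the per-variable loss contribution:
#' ## values are snapped to the nearest multiple of the binwidth and the squared
#' ## differences are weighted by frequency
#' x <- c(3, 7, 12); f <- c(2, 1, 4); width <- 5
#' xb <- width*round(x/width)
#' sum(f*(xb - x)^2)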
#'
lossStandard <- function(data, binning, newData=FALSE){
  dnew <- data
  vars <- data*0    # per-cell squared-loss contributions, one column per variable
  names(vars) <- paste("Var", names(data), sep="")
  nl <- ncol(data)
  if(length(binning) != nl - 1)
    binning <- rep(binning, length=nl-1)
  for(x in 1:(nl-1)){
    # snap each value to the nearest multiple of the binwidth
    dnew[,x] <- binning[x]*round(data[,x]/binning[x],0)
    vars[,x] <- data$Freq*(dnew[,x] - data[,x])^2
  }
  # collapse to the new bins: number of original cells (n) and total frequency (fsum) per bin
  dnew$id <- interaction(dnew[,1:(nl-1)])
  res <- ddply(dnew, .(id), summarize, n=length(id), fsum=sum(Freq))
  dnew <- merge(dnew, res, by = "id")
  vars$VarFreq <- with(dnew, (log(Freq)-log(fsum)/n)^2)  # frequency component of the loss (log scale)
  vars$id <- dnew$id
  # sum the numeric loss columns within each bin
  vfun <- function(v){
    t <- sapply(1:ncol(v), function(i) is.numeric(v[,i]))
    return(colSums(v[t]))
  }
  var.res <- ddply(vars, .(id), vfun)
  dcond <- merge(res, var.res, by = "id")
  namesdcond <- names(dcond)
  dcond <- cbind(do.call("rbind", lapply(strsplit(as.character(dcond$id), ".", fixed=TRUE), as.numeric)), dcond[,-1])
  names(dcond) <- c(names(dnew)[2:(nl)], namesdcond[-1])
  Loss <- colSums(dcond[,(nl+2):ncol(dcond)])
  names(Loss) <- paste(c("", "", "Log"), names(data[,-(nl+1)]), sep="")
  # total sum of squares in the original data (log scale for the frequencies)
  TSS <- as.data.frame(t(c(sapply(1:(nl-1), function(i) sum(data$Freq*(data[,i] - mean(data[,i]))^2)), sum((log(dnew$Freq)-mean(log(dnew$Freq)))^2))))
  names(TSS) <- names(data)
  retdata <- dcond[,c(1:(nl-1), (nl+1))]    # binned data: bin positions plus total frequency
  names(retdata) <- names(data)
  if(newData)
    return(list(loss = Loss, totalssq = TSS, percent.loss = Loss/TSS*100, NewData = retdata))
  else return(list(loss = Loss, totalssq = TSS, percent.loss = Loss/TSS*100))
}