R/lossfunctions2.R

#' Standard Binning Algorithm
#'
#' Bins data based on the Standard binning algorithm, that is, points are allocated to the closest bin with weight 1.
#' 
#' @param d1 data frame
#' @param binning vector of binwidths
#' @param newData binned data available?
#' @export
#' @examples
#' connect <- dbConnect(dbDriver("MySQL"), user="2009Expo", 
#' password="R R0cks", port=3306, dbname="baseball", 
#' host="headnode.stat.iastate.edu")
#' pitch <- new("dataDB", co=connect, table="Pitching")
#' d1 <- dbData(pitch, vars=c( "G", "SO"))
#' qplot(G,SO, fill=log10(Freq), data=d1, geom="tile")+scale_fill_gradient2()
#' d1binned <- binStd(d1, binning=c(2,5))
#' d1binned.reduced <- ddply(d1binned, .(G, SO), summarise, Freq=sum(Freq))
#' qplot(G,SO, fill=log10(Freq), data=d1binned.reduced, geom="tile")+scale_fill_gradient2()
#' d1binned <- binStd(d1, binning=c(1,5))
#' d1binned.reduced <- ddply(d1binned, .(G, SO), summarise, Freq=sum(Freq))
#' qplot(G,SO, fill=log10(Freq), data=d1binned.reduced, geom="tile")+scale_fill_gradient2()
binStd <- function(data, binning){
  dnew <- data
  nl <- ncol(data)
  if(length(binning) != nl)
    binning <- rep(binning, length=nl-1)
  for(x in 1:(nl-1)){
    dnew[,x] <- binning[x]*round(data[,x]/binning[x],0)
  }
  return(dnew)
}

#' Random Binning Algorithm
#'
#' Bins data based on the Random binning algorithm, that is, points are allocated to the closest bin using a bernoulli trial with p inversely proportional to the distance from the point to the bin.
#' 
#' @param d1 data frame
#' @param binning vector of binwidths
#' @param newData binned data available?
#' @export
#' @examples
#' connect <- dbConnect(dbDriver("MySQL"), user="2009Expo", 
#' password="R R0cks", port=3306, dbname="baseball", 
#' host="headnode.stat.iastate.edu")
#' pitch <- new("dataDB", co=connect, table="Pitching")
#' d1 <- dbData(pitch, vars=c( "G", "SO"))
#' qplot(G,SO, fill=log10(Freq), data=d1, geom="tile")+scale_fill_gradient2()
#' d1binned <- binRdm(d1, binning=c(2,5))
#' d1binned.reduced <- ddply(d1binned, .(G, SO), summarise, Freq=sum(Freq))
#' qplot(G,SO, fill=log10(Freq), data=d1binned.reduced, geom="tile")+scale_fill_gradient2()
#' d1binned <- binRdm(d1, binning=c(1,5))
#' d1binned.reduced <- ddply(d1binned, .(G, SO), summarise, Freq=sum(Freq))
#' qplot(G,SO, fill=log10(Freq), data=d1binned.reduced, geom="tile")+scale_fill_gradient2()
binRdm <- function(data, binning){
  dnew <- data
  nl <- ncol(data)
  if(length(binning) != nl)
    binning <- rep(binning, length=nl-1)
  for(x in 1:(nl-1)){
    dnew[,x] <- binning[x]*(floor(data[,x]/binning[x]) + 
                              sapply((data[,x]%%binning[x])/binning[x], 
                                     function(p) rbinom(1, 1, p)))
  }
  return(dnew)
}

#' Partitioned Loss function
#'
#' Based on a given data set, loss is computed from comparing the original data to the data binned at the specified level using either the Random (default) or Standard binning algorithm.
#' Loss is computed as sum of squared differences between all values at the original data and the mean of the data in the new bin.
#' Percent loss compares loss to the total sum of squares in the original data.
#' 
#' @param d1 data frame
#' @param binning vector of binwidths
#' @export
#' @examples
#' connect <- dbConnect(dbDriver("MySQL"), user="2009Expo", 
#' password="R R0cks", port=3306, dbname="baseball", 
#' host="headnode.stat.iastate.edu")
#' pitch <- new("dataDB", co=connect, table="Pitching")
#' d1 <- dbData(pitch, vars=c( "G", "SO"))
#' qplot(G,SO, fill=log10(Freq), data=d1, geom="tile")+scale_fill_gradient2()
#' lossCalc(d1, binning=c(2,5))
#' lossCalc(d1, binning=c(2,5), type="standard")
#' lossCalc(d1, binning=c(1,5))
#' lossCalc(d1, binning=c(1,5), type="standard")
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(1,5))
#' qplot(G,SO, fill=log10(Freq), data=d2, geom="tile")+scale_fill_gradient2()
#' lossCalc(d1, binning=c(1,10))
#' lossCalc(d2, binning=c(1,2))
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(1,10))
#' qplot(G,SO, fill=log10(Freq), data=d2, geom="tile")+scale_fill_gradient2()
#' ## some more exploration of loss
#' bins <- expand.grid(x=c(1:10), y=c(1:10))
#' losses <- mdply(bins, function(x,y) lossCalc(data=d1, binning=c(x, y)))
#' qplot(x, percent.loss, group=y, data=losses, geom="line")
#' qplot(y, percent.loss, group=x, data=losses, geom="line")
#' d2 <- dbData(pitch, vars=c( "G", "SO"), binwidth=c(10,10))
#' 
lossCalc <- function(data, binning, type="standard", newData=FALSE){
  #   browser()
  if(sum(grepl("freq", tolower(names(data)), fixed=TRUE))==0) data$Freq <- 1
  nl <- ncol(data)
  
  min.bin <- sapply(1:(nl-1), function(i){
    diffs <- diff(sort(data[,i]))
    diffs <- subset(diffs, diffs>0)
    min(diffs)
  })
  numMinimalBins <- prod(sapply(1:(nl-1), function(i) diff(range(data[,i]))/min.bin[i]))
  
  data.old <- data
  data <- ddply(data, c(1:(nl-1)), summarise, Freq=sum(Freq))
  
  if(type=="random"){dnew <- binRdm(data, binning)
  }else if(type=="standard"){dnew <- binStd(data, binning)
  }else{
    warning("Type not 'standard' or 'random' - proceeding with standard binning")
    dnew <- binStd(data, binning)
  }
  
#  dmin <- ddply(binStd(data, min.bin), c(1:(nl-1)), summarise, Freq=sum(Freq))
  
  vars <- data*0
  names(vars) <- paste("Var", names(data), sep="")
  
  data$id <- dnew$id <- interaction(dnew[,1:(nl-1)], sep=".split.")
  total.loss <- as.data.frame(data$Freq*(data[,1:(nl-1)]-dnew[,1:(nl-1)])^2)
  names(total.loss) <- names(data)[1:(nl-1)]
  total.loss$id <- data$id
  
  
  mfun <- function(v){
    nl <- ncol(v)-1
    # Calculate EX
    t <- data.frame(t(sapply(1:(nl-1), function(i) sum(v[,i]*v[,nl])/sum(v[,nl])))) 
    # t <- data.frame(t(sapply(1:(nl-1), function(i) weighted.mean(v[,i], v[,nl]))))
    t$fsum <- sum(v[,nl])
    # Calculate EX^2
    tsq <- data.frame(t(sapply(1:(nl-1), function(i) sum(v[,i]^2*v[,nl])/sum(v[,nl])))) 
    # tsq <- data.frame(t(sapply(1:(nl-1), function(i) weighted.mean(v[,i]^2, v[,nl]))))
    tsq <- tsq - t^2 # VarX = EX^2 - (EX)^2
    tsq <- tsq * t$fsum # Get SSQ from Freq
    names(t) <- c(names(v)[-c(nl, nl+1)], "fsum")
    t$id <- unique(v$id)
    t$n <- length(v$id)
    names(tsq) <- paste(names(t)[1:(nl-1)], "Var", sep="")
    t <- cbind(t, tsq)
    return(t)
  }
  
  res <- ddply(data, .(id), mfun)
  names(res) <- c(paste("recover", names(res)[1:(nl-1)], sep=""), names(res)[-c(1:(nl-1))])
  dnew2 <- merge(dnew, res, by="id")
  
  TotalLoss <- colSums(total.loss[,1:(nl-1)])
  
  emptybinstop <- max(prod(binning)/prod(min.bin), nrow(dnew2), max(dnew2$n))
#   even.binning.prob <- sum(emptybinstop<dnew2$n)>0
#   
#   if(even.binning.prob){
#     emptybinstop <- apply(sapply(1:(nl-1), function(i) if(binning[i]%%2==0) binning[i] - ((dnew2[,i+1]%%binning[i]^2)>0) +((dnew2[,i+1]%%binning[i]^2)==0) else rep(binning[i], nrow(dnew))) , 1, prod)
#   }
  NumLoss <- c(colSums(res[,(ncol(res)-(nl-2)):ncol(res)]), sum(
    (log(dnew2$Freq+1) - log((dnew2$fsum+1)/emptybinstop))^2) + sum(
      (emptybinstop-dnew2$n)*(log(dnew2$fsum+1)/emptybinstop)^2
    ))
  names(NumLoss) <- paste(c("", "", "Log"), names(data[,-(nl+1)]), sep="")
  #   rm(list="dnew2")
  TotalLoss <- c(TotalLoss, NumLoss[nl])
  NumLoss <- NumLoss[1:(nl-1)]
  
  VisLoss <- TotalLoss[1:(nl-1)] - NumLoss
  
  nbins <- prod(sapply(1:(nl-1), function(i) ceiling(diff(range(data[,i]))+1)/binning[i]))*emptybinstop
  numEmptyBins <- nbins-nrow(data)
  TSS <- as.data.frame(t(c(
    sapply(1:(nl-1), function(i) 
      sum(data$Freq*(data[,i] - weighted.mean(data[,i], data[,nl]))^2)), 
    sum((log(data$Freq+1)-log(sum(data$Freq)/nbins))^2)+sum(numEmptyBins*(log(sum(data$Freq)/nbins))^2))))
  names(TSS) <- paste(c("", "", "Log"), names(data[,-(nl+1)]), sep="")
  
  if(newData){
    dnew3 <- as.data.frame(cbind(do.call("rbind", lapply(strsplit(as.character(res$id), ".split.", fixed=TRUE), as.numeric)), res$fsum))
    names(dnew3) <- names(data[,1:(nl)])
  }
  
  LossAll <- data.frame(c(NumLoss/TSS[1:2], VisLoss/TSS[1:2], TotalLoss/TSS))
  names(LossAll) <- c(paste("NumLoss.", names(NumLoss), sep=""),
                      paste("VisLoss.", names(VisLoss), sep=""),
                      paste("TotalLoss.", names(TotalLoss), sep=""))
  if(newData)
    return(list(Loss=LossAll, NewData = dnew2[,c(2:(nl+1))]))
  else return(LossAll)
}
heike/dbData documentation built on May 17, 2019, 3:23 p.m.