qeML: Quick and Easy Machine Learning Tools

Documented in qeMittalGraph qePlotCurves wideToLongWithTime

### ###########################  qePlotCurves (OLD) ###########################
### 
### # plot several curves having the same X domain; based on the inputted
### # (X,Y) data; that data can be smoothed into a curve via the 'loess' 
### # option, assumed here to be typical
### 
### # a legend is automatically produced, based on curveData[,3]
### 
### # example use case: 
### 
### #    plotting the -Utility or Fairness-Utility tradeoff
### #    curves, for comparing several different methods
### 
### # example use case: 
### 
### #    plotting quantile regression, for several quantile levels
### 
### # arguments
### 
### #    curveData: column data frame or equivalent
### #    xCol,yCol,grpCol: column numbers in curveData of
### #       the X and Y axes data, and the group membership data
### #       (a group ID, character, numeric etc.)
### #    xlab, ylab:  X,Y axis labels
### #    loess: if TRUE, plot the loess-fitted curve, not the points
### #    legendSpace: expand plot grid by this amount to fit in a legend
### #    legendPos: as in R plot()
### 
### # value
### 
### #    none; this is purely a plotting routine
### 
### qePlotCurves <- function(curveData,xCol=1,yCol=2,grpCol=3,
###    xlab=names(curveData)[xCol],ylab=names(curveData)[yCol],
###    loess=TRUE,legendTitle=names(curveData)[grpCol],
###    legendSpace=1.1,legendPos='topright') 
### {
### 
###    nms <- names(curveData)
###    if (is.character(xCol)) xCol <- which(nms == xCol)
###    if (is.character(yCol)) yCol <- which(nms == yCol)
###    if (is.character(grpCol)) grpCol <- which(nms == grpCol)
### 
###    tmpDF <- curveData[,c(xCol,yCol,grpCol)]
###    briefCurveData <- tmpDF
###    if (!is.factor(briefCurveData[,3])) 
###       briefCurveData[,3] <- as.factor(briefCurveData[,3])
### 
###    xlim <- c(min(briefCurveData[,1]),max(briefCurveData[,1]))
###    tmp <- max(briefCurveData[,2])
###    # leave room at top for legend
###    topY <- if (tmp > 0) legendSpace*tmp else tmp / legendSpace
###    ylim <- c(min(briefCurveData[,2]),topY)
###    plot(NULL,xlim=xlim,ylim=ylim,xlab=xlab,ylab=ylab)
### 
###    curves <- split(briefCurveData,briefCurveData[,3])
###    nCurves <- length(curves)
### 
###    cols <- rainbow(nCurves)
### 
###    nms <- as.factor(names(curves))
###    for (i in 1:nCurves) {
###       cvsi <- curves[[i]]
###       if (loess) {
###          toExec <- 
###             sprintf('loess(%s ~ %s,cvsi)',names(cvsi)[2],names(cvsi)[1])
###          tmp <- evalr(toExec)
###          cvsi[,2] <- predict(tmp,cvsi[,1])
###       }
###       cvsiOrdered <- cvsi[order(cvsi[,1]),]
###       lines(cvsiOrdered,col=cols[i])
###    }
### 
###    legend(legendPos,title=legendTitle,
###       legend=levels(nms),col=cols,lty=rep(1,nCurves))
### 
### }

# generate data for f(x) = x and g(x) = x^, plot
test1 <- function() 
{
   x <- runif(100)
   y1 <- x
   y2 <- x^2
   outdf <- data.frame(x=c(x,x),y=c(y1,y2),
      z=c(rep('1',100),rep('2',100)))
   qePlotCurves(outdf)
}

# fit 4 qe* ftns on lsa data, plot; call form is 
# test2(5), or put anything else instead of 5;
# zzz just a dummy for arg 1, not used; do
#
#    w <- test2(5)
#    qePlotCurves(w)
#
# to run
test2 <- defmacro(zzz,  
   expr = {
     data(lsa)
     lsa1 <- lsa[sample(1:nrow(lsa),1000),]
     data(svcensus)
     svc <- svcensus[sample(1:nrow(lsa),1000),]
     svc <- na.exclude(svc)
     xvals <- seq(5,75,5)
     outDF <- data.frame(x=NULL,y=NULL,z=NULL)
     for (i in 1:15) {
        tmp <- replicMeans(25,"qeXGBoost(svc,'wageinc',
           params=list(max_depth=i))$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='XGB'))
        tmp <- replicMeans(25,"qeKNN(svc,'wageinc',k=xvals[i])$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='KNN'))
        tmp <- replicMeans(25,"qeRFranger(svc,'wageinc',
           minNodeSize=xvals[i])$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='RF'))
        tmp <- replicMeans(25,"qePolyLin(svc,'wageinc',
           deg=i,maxInteractDeg=2)$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='polyLin'))
     }
     outDF$z <- as.factor(outDF$z)
     outDF
   }
)

# generate data to go into qePlotCurves(); finds mean testAcc over nreps
# runs, each with a different random training set

# dataName and qeFtnName should be changed to data and qeFtn soon

genQeAcc <- function(nreps,dataName,yName,qeFtnName,opts=NULL)
{
  # goal: set up do.call
   data <- get(dataName)
   qeFtn <- get(qeFtnName)
   dcArgs <- list(data=data,yName=yName)
   if (!is.null(opts)) {
      nms <- names(opts)
      for (nm in nms) {
         dcArgs[[nm]] <- opts[[nm]]
      }
   }
   tmp <- sapply(1:nreps,function(i) 
      {cmdOut <- do.call(qeFtn,dcArgs); cmdOut$testAcc})
   mean(tmp)
}

# planned replacement of regtools::replicMeans
# extension of replicate() code
# charExpr is a quoted string
replicMeans1old <- function (n, charExpr, simplify = "array") {
   expr <- eval(parse(text=charExpr))
   tmp <- sapply(integer(n), eval.parent(substitute(function(...) expr)),
      simplify = simplify)
   if (!is.matrix(tmp)) tmp <- matrix(tmp,nrow=n)
   rowMeans(tmp)
}

# generate data for f(x) = x and g(x) = x^, plot
test1 <- function() 
{
   x <- runif(100)
   y1 <- x
   y2 <- x^2
   outdf <- data.frame(x=c(x,x),y=c(y1,y2),
      z=c(rep('1',100),rep('2',100)))
   qePlotCurves(outdf)
}

# fit 4 qe* ftns on lsa data, plot; call form is 
# test2(5), or put anything else instead of 5;
# zzz just a dummy for arg 1, not used; do
#
#    w <- test2(5)
#    qePlotCurves(w)
#
# to run
test2 <- defmacro(zzz,  
   expr = {
     data(lsa)
     lsa1 <- lsa[sample(1:nrow(lsa),1000),]
     data(svcensus)
     svc <- svcensus[sample(1:nrow(lsa),1000),]
     svc <- na.exclude(svc)
     xvals <- seq(5,75,5)
     outDF <- data.frame(x=NULL,y=NULL,z=NULL)
     for (i in 1:15) {
        tmp <- replicMeans(25,"qeXGBoost(svc,'wageinc',
           params=list(max_depth=i))$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='XGB'))
        tmp <- replicMeans(25,"qeKNN(svc,'wageinc',k=xvals[i])$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='KNN'))
        tmp <- replicMeans(25,"qeRFranger(svc,'wageinc',
           minNodeSize=xvals[i])$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='RF'))
        tmp <- replicMeans(25,"qePolyLin(svc,'wageinc',
           deg=i,maxInteractDeg=2)$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='polyLin'))
     }
     outDF$z <- as.factor(outDF$z)
     outDF
   }
)

# generate data to go into qePlotCurves(); finds mean testAcc over nreps
# runs, each with a different random training set

# dataName and qeFtnName should be changed to data and qeFtn soon

genQeAcc <- function(nreps,dataName,yName,qeFtnName,opts=NULL)
{
  # goal: set up do.call
   data <- get(dataName)
   qeFtn <- get(qeFtnName)
   dcArgs <- list(data=data,yName=yName)
   if (!is.null(opts)) {
      nms <- names(opts)
      for (nm in nms) {
         dcArgs[[nm]] <- opts[[nm]]
      }
   }
   tmp <- sapply(1:nreps,function(i) 
      {cmdOut <- do.call(qeFtn,dcArgs); cmdOut$testAcc})
   mean(tmp)
}

# planned replacement of regtools::replicMeans
# extension of replicate() code
# charExpr is a quoted string
replicMeans1old <- function (n, charExpr, simplify = "array") {
   expr <- eval(parse(text=charExpr))
   tmp <- sapply(integer(n), eval.parent(substitute(function(...) expr)),
      simplify = simplify)
   if (!is.matrix(tmp)) tmp <- matrix(tmp,nrow=n)
   rowMeans(tmp)
}

###########################  qeMittalGraph  #################################

# plots several curves, one for each gorup, against a common X-axis, as
# in qePlotCurves, but showing the change in each variable, relative to
# the variable's value at min X

# X is typically input in ascending numerical order, but need not be

# X is required to be in col 1; col 2 is for Y of group 1, etc.

# 'dataMitt' must be a data frame or equivalent, in which for each X value
# there is exactly one Y value for each group; format is wide, e.g.

#    x y1 y2 y3

#    w <- data.frame(x=c(3:5,2),y1=c(5:7,4),y2=c(4,12,15,5),y3=10:7)
#    qeMittalGraph(w)

qeMittalGraph <- function(dataMitt,xlab='x',ylab='y',legendTitle='curve',
   loess=TRUE)
{
   data <- dataMitt

   x <- data[,1]
   nc <- ncol(data)
   argMinX <- which.min(x)
   nms <- names(data)[-1]

   z <- lapply(1:(nc-1),
      function(i) {
         tmp <- data[,i+1] / data[argMinX,i+1]
         tmpDF <- data.frame(x=x,curveNum=nms[i],y=tmp)
         tmpDF
      }
   )
   zz <- do.call(rbind,z)

   qePlotCurves(zz,1,3,2,xlab=xlab,ylab=ylab,legendTitle=legendTitle,
      loess=loess)
}


# basically a wide-to-long reshape, but with various options for a time
# column

# arguments:

#    data: data frame or equivalent
#    timeColName: name of time column, including if not yet formed
#    timeColPresent: TRUE means column already in 'data'
#    timeColSeq:  (m,n) means created column will have the values
#       m,m+n,m+2n,m+3n,...

wideToLongWithTime <- function(data,timeColName,timeColPresent=TRUE,
   timeColSeq=c(1,1),grpColName=NULL,valueColName=NULL) 
{
   qeML:::getSuggestedLib('reshape2')

   if (!timeColPresent) {
      first <- timeColSeq[1]
      inc <- timeColSeq[2]
      tmp <- 1:nrow(data)
      timecol <- first + inc*tmp
      newdata <- cbind(timecol,data)
      names(newdata)[1] <- timeColName
   }

   longData <- reshape2::melt(newdata,id.vars=timeColName)
   if (!is.null(grpColName)) names(longData)[2] <- grpColName
   if (!is.null(valueColName)) names(longData)[3] <- valueColName
   return(longData)

}

###########################  qePlotCurves  #################################

# plot several curves having the same X domain; based on the inputted
# (X,Y) data; that data can be smoothed into a curve via the 'loess' 
# option, assumed here to be typical

# a legend is automatically produced, based on curveData[,3]

# example use case: 

#    plotting the -Utility or Fairness-Utility tradeoff
#    curves, for comparing several different methods

# example use case: 

#    plotting quantile regression, for several quantile levels

# arguments

#    curveData: column data frame or equivalent
#    xCol,yCol,grpCol: column numbers in curveData of
#       the X and Y axes data, and the group membership data
#       (a group ID, character, numeric etc.)
#    xlab, ylab:  X,Y axis labels
#    loess: if TRUE, plot the loess-fitted curve, not the points
#    legendSpace: expand plot grid by this amount to fit in a legend
#    legendPos: as in R plot()

# value

#    none; this is purely a plotting routine

# examples

#    data(lsa)
#    qePlotCurves(lsa,6,5,9,legendSpace=1.35)
#    
#    data(currency)
#    curr <- currency
#    qePlotCurves(curr,1,3,2,wide=T,wideTimeColName='weeknum',
#       wideTimeColPresent=F,wideGrpColName='country')

# d1 <- day1[,c(1,14,15)]
# qePlotCurves(d1,wide=T,wideTimeColName='instant',
#    wideTimeColPresent=F,wideGrpColName='group')


qePlotCurves <- function(curveData,xCol=1,yCol=2,grpCol=3,
   xlab=names(curveData)[xCol],ylab=names(curveData)[yCol],
   loess=TRUE,legendTitle=names(curveData)[grpCol],
   legendSpace=1.1,legendPos='topright',
   wide=FALSE,wideTimeColName=NULL,wideTimeColPresent=NULL,
   wideTimeColBase=1:nrow(curveData),wideGrpColName=NULL,
      wideValueColName=NULL) 
{

   if(wide) {
      tmp <- wideToLongWithTime(curveData,wideTimeColName,
         wideTimeColPresent,wideTimeColBase,
         grpColName=wideGrpColName,
         valueColName=wideValueColName)
      curveData <- tmp
      xCol <- 1; yCol <- 3; grpCol <- 2
   }

   nms <- names(curveData)
   if (is.character(xCol)) xCol <- which(nms == xCol)
   if (is.character(yCol)) yCol <- which(nms == yCol)
   if (is.character(grpCol)) grpCol <- which(nms == grpCol)

   tmpDF <- curveData[,c(xCol,yCol,grpCol)]
   briefCurveData <- tmpDF
   if (!is.factor(briefCurveData[,3])) 
      briefCurveData[,3] <- as.factor(briefCurveData[,3])

   xlim <- c(min(briefCurveData[,1]),max(briefCurveData[,1]))
   tmp <- max(briefCurveData[,2])
   # leave room at top for legend
   topY <- if (tmp > 0) legendSpace*tmp else tmp / legendSpace
   ylim <- c(min(briefCurveData[,2]),topY)
   plot(NULL,xlim=xlim,ylim=ylim,xlab=xlab,ylab=ylab)

   curves <- split(briefCurveData,briefCurveData[,3])
   nCurves <- length(curves)

   cols <- rainbow(nCurves)

   nms <- as.factor(names(curves))
   for (i in 1:nCurves) {
      cvsi <- curves[[i]]
      if (loess) {
         toExec <- 
            sprintf('loess(%s ~ %s,cvsi)',names(cvsi)[2],names(cvsi)[1])
         tmp <- evalr(toExec)
         cvsi[,2] <- predict(tmp,cvsi[,1])
      }
      cvsiOrdered <- cvsi[order(cvsi[,1]),]
      lines(cvsiOrdered,col=cols[i])
   }

   legend(legendPos,title=legendTitle,
      legend=levels(nms),col=cols,lty=rep(1,nCurves))

}

# generate data for f(x) = x and g(x) = x^, plot
test1 <- function() 
{
   x <- runif(100)
   y1 <- x
   y2 <- x^2
   outdf <- data.frame(x=c(x,x),y=c(y1,y2),
      z=c(rep('1',100),rep('2',100)))
   qePlotCurves(outdf)
}

# fit 4 qe* ftns on lsa data, plot; call form is 
# test2(5), or put anything else instead of 5;
# zzz just a dummy for arg 1, not used; do
#
#    w <- test2(5)
#    qePlotCurves(w)
#
# to run
test2 <- defmacro(zzz,  
   expr = {
     data(lsa)
     lsa1 <- lsa[sample(1:nrow(lsa),1000),]
     data(svcensus)
     svc <- svcensus[sample(1:nrow(lsa),1000),]
     svc <- na.exclude(svc)
     xvals <- seq(5,75,5)
     outDF <- data.frame(x=NULL,y=NULL,z=NULL)
     for (i in 1:15) {
        tmp <- replicMeans(25,"qeXGBoost(svc,'wageinc',
           params=list(max_depth=i))$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='XGB'))
        tmp <- replicMeans(25,"qeKNN(svc,'wageinc',k=xvals[i])$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='KNN'))
        tmp <- replicMeans(25,"qeRFranger(svc,'wageinc',
           minNodeSize=xvals[i])$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='RF'))
        tmp <- replicMeans(25,"qePolyLin(svc,'wageinc',
           deg=i,maxInteractDeg=2)$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='polyLin'))
     }
     outDF$z <- as.factor(outDF$z)
     outDF
   }
)

# generate data to go into qePlotCurves(); finds mean testAcc over nreps
# runs, each with a different random training set

# dataName and qeFtnName should be changed to data and qeFtn soon

genQeAcc <- function(nreps,dataName,yName,qeFtnName,opts=NULL)
{
  # goal: set up do.call
   data <- get(dataName)
   qeFtn <- get(qeFtnName)
   dcArgs <- list(data=data,yName=yName)
   if (!is.null(opts)) {
      nms <- names(opts)
      for (nm in nms) {
         dcArgs[[nm]] <- opts[[nm]]
      }
   }
   tmp <- sapply(1:nreps,function(i) 
      {cmdOut <- do.call(qeFtn,dcArgs); cmdOut$testAcc})
   mean(tmp)
}

# planned replacement of regtools::replicMeans
# extension of replicate() code
# charExpr is a quoted string
replicMeans1old <- function (n, charExpr, simplify = "array") {
   expr <- eval(parse(text=charExpr))
   tmp <- sapply(integer(n), eval.parent(substitute(function(...) expr)),
      simplify = simplify)
   if (!is.matrix(tmp)) tmp <- matrix(tmp,nrow=n)
   rowMeans(tmp)
}

# generate data for f(x) = x and g(x) = x^, plot
test1 <- function() 
{
   x <- runif(100)
   y1 <- x
   y2 <- x^2
   outdf <- data.frame(x=c(x,x),y=c(y1,y2),
      z=c(rep('1',100),rep('2',100)))
   qePlotCurves(outdf)
}

# fit 4 qe* ftns on lsa data, plot; call form is 
# test2(5), or put anything else instead of 5;
# zzz just a dummy for arg 1, not used; do
#
#    w <- test2(5)
#    qePlotCurves(w)
#
# to run
test2 <- defmacro(zzz,  
   expr = {
     data(lsa)
     lsa1 <- lsa[sample(1:nrow(lsa),1000),]
     data(svcensus)
     svc <- svcensus[sample(1:nrow(lsa),1000),]
     svc <- na.exclude(svc)
     xvals <- seq(5,75,5)
     outDF <- data.frame(x=NULL,y=NULL,z=NULL)
     for (i in 1:15) {
        tmp <- replicMeans(25,"qeXGBoost(svc,'wageinc',
           params=list(max_depth=i))$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='XGB'))
        tmp <- replicMeans(25,"qeKNN(svc,'wageinc',k=xvals[i])$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='KNN'))
        tmp <- replicMeans(25,"qeRFranger(svc,'wageinc',
           minNodeSize=xvals[i])$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='RF'))
        tmp <- replicMeans(25,"qePolyLin(svc,'wageinc',
           deg=i,maxInteractDeg=2)$testAcc")
        outDF <- rbind(outDF,data.frame(x=i,y=tmp,z='polyLin'))
     }
     outDF$z <- as.factor(outDF$z)
     outDF
   }
)

# generate data to go into qePlotCurves(); finds mean testAcc over nreps
# runs, each with a different random training set

# dataName and qeFtnName should be changed to data and qeFtn soon

genQeAcc <- function(nreps,dataName,yName,qeFtnName,opts=NULL)
{
  # goal: set up do.call
   data <- get(dataName)
   qeFtn <- get(qeFtnName)
   dcArgs <- list(data=data,yName=yName)
   if (!is.null(opts)) {
      nms <- names(opts)
      for (nm in nms) {
         dcArgs[[nm]] <- opts[[nm]]
      }
   }
   tmp <- sapply(1:nreps,function(i) 
      {cmdOut <- do.call(qeFtn,dcArgs); cmdOut$testAcc})
   mean(tmp)
}

# planned replacement of regtools::replicMeans
# extension of replicate() code
# charExpr is a quoted string
replicMeans1old <- function (n, charExpr, simplify = "array") {
   expr <- eval(parse(text=charExpr))
   tmp <- sapply(integer(n), eval.parent(substitute(function(...) expr)),
      simplify = simplify)
   if (!is.matrix(tmp)) tmp <- matrix(tmp,nrow=n)
   rowMeans(tmp)
}

###########################  qeMittalGraph  #################################

# plots several curves, one for each gorup, against a common X-axis, as
# in qePlotCurves, but showing the change in each variable, relative to
# the variable's value at min X

# X is typically input in ascending numerical order, but need not be

# X is required to be in col 1; col 2 is for Y of group 1, etc.

# 'data' must be a data frame or equivalent, in which for each X value
# there is exactly one Y value for each group; format is wide, e.g.

#    x y1 y2 y3

#    w <- data.frame(x=c(3:5,2),y1=c(5:7,4),y2=c(4,12,15,5),y3=10:7)
#    qeMittalGraph(w)

qeMittalGraph <- function(data,xlab='x',ylab='y',legendTitle='curve',
   loess=TRUE)
{

   x <- data[,1]
   nc <- ncol(data)
   argMinX <- which.min(x)
   nms <- names(data)[-1]

   z <- lapply(1:(nc-1),
      function(i) {
         tmp <- data[,i+1] / data[argMinX,i+1]
         tmpDF <- data.frame(x=x,curveNum=nms[i],y=tmp)
         tmpDF
      }
   )
   zz <- do.call(rbind,z)

   qePlotCurves(zz,1,3,2,xlab=xlab,ylab=ylab,legendTitle=legendTitle,
      loess=loess)
}