R/alteryxreport.R

#' Create a data frame with lm model object summary
#'
#'
#' The function Alteryx.ReportLM creates a data frame of an lm model's summary
#' output that can more easily be handled by Alteryx's reporting tools. The
#' function returns a data frame that is a set of key-value pairs for the
#' components of the summary report as well as the model's coefficients in a
#' JSON formatted character string.
#'
#' @param lm.obj lm model object whose summary output is put into a data frame
#' @author Dan Putler
#' @export
#' @family Alteryx.Report
Alteryx.ReportLM <- function (lm.obj){
  if (class(lm.obj) != "lm") {
    stop.Alteryx2(
      XMSG(
        in.targetString_sc = "The object provided is not a glm class object."
      )
    )
  }
  full.sum <- summary(lm.obj)
  the.call <- paste(capture.output(full.sum$call), collapse = "")
  the.call = gsub("\\s\\s", "", the.call)
  f <- formula(lm.obj)
  the.call <- sub("formula = ([^\\,]+)",
    paste("formula =", paste(f[2], "~", f[3])), the.call)
  resid.sum <- paste(format(summary(full.sum$residuals)[-4] , digits = 3), collapse = " ")
  p.stars <- pStars(full.sum$coefficients[,4])
  coef.est <- paste(dimnames(full.sum$coefficients)[[1]], format(full.sum$coefficients[,1], digits = 4),
                    format(full.sum$coefficients[,2], digits = 4), format(full.sum$coefficients[,3], digits = 4),
                    as.character(p.stars$p_txt), as.character(p.stars$Stars), sep = "|")
  coef.lab <- XMSG(in.targetString_sc = "Coefficients:")
  # Address variables that were omitted due to singularities
  omitted <- names(full.sum$aliased)[full.sum$aliased]
  if (length(omitted) > 0) {
    coef.lab <- XMSG(
      in.targetString_sc = "@1 (@2 not defined because of singularities)",
      in.firstBindVariable_sc = coef.lab,
      in.secondBindVariable_sc = length(omitted)
      )
    for (i in omitted)
      coef.est <- c(coef.est, paste(i, "NA|NA|NA|NA|  ", sep = "|"))
  }
  resid.se <- XMSG(
    in.targetString_sc = "Residual standard error: @1 on @2 degrees of freedom",
    in.firstBindVariable_sc = format(
      full.sum$sigma,
      digits = 5
    ),
    in.secondBindVariable_sc = full.sum$df[2]
  )
  r.sq <- XMSG(
    in.targetString_sc = "Multiple R-squared: @1, Adjusted R-Squared: @2",
    in.firstBindVariable_sc = format(full.sum$r.squared, digits = 4),
    in.secondBindVariable_sc = format(full.sum$adj.r.squared, digits = 4)
  )
  p.f1 <- 1 - pf(full.sum$fstatistic[1], full.sum$fstatistic[2], full.sum$fstatistic[3])
  p.f2 <- format(p.f1, digits = 4)
  p.f2[p.f1 < 2.2e-16] <- "< 2.2e-16"
  f.stat <- XMSG(
    in.targetString_sc = "F-statistic: @1 on @2 and @3 degrees of freedom (DF), p-value @4",
    in.firstBindVariable_sc = format(full.sum$fstatistic[1], digits = 4),
    in.secondBindVariable_sc = as.integer(full.sum$fstatistic[2]),
    in.thirdBindVariable_sc = as.integer(full.sum$fstatistic[3]),
    in.fourthBindVariable_sc = p.f2
  )
  sum.grps <- c("Call", "Residuals", "Coef_Label", rep("Coef_Est", length(coef.est)), rep("Fit_Stats", 3))
  sum.out <- c(the.call, resid.sum, coef.lab, coef.est, resid.se, r.sq, f.stat)
  summary.df <- data.frame(grp = sum.grps, out = sum.out)
  summary.df$grp <- as.character(summary.df$grp)
  summary.df$out <- as.character(summary.df$out)
  json.str <- paste("\"", names(lm.obj$coefficients), "\":\"",
                    lm.obj$coefficients, "\"", sep = "", collapse = ", ")
  json.str <- paste("{", json.str, "}")
  coef.str <- c("Coef_JSON", json.str)
  summary.df <- rbind(summary.df, coef.str)
  summary.df
}


#' Create a data frame with glm model object summary
#'
#'
#' The function Alteryx.ReportGLM creates a data frame of an glm model's summary
#' output that can more easily be handled by Alteryx's reporting tools. The
#' function returns a data frame that is a set of key-value pairs for the
#' components of the summary report as well as the model's coefficients in a
#' JSON formatted character string. The coefficient JSON string is really for
#' future option value
#'
#' @param glm.obj glm model whose summary output is put into a data frame
#' @author Dan Putler
#' @export
#' @family Alteryx.Report
Alteryx.ReportGLM <- function (glm.obj){
  if (class(glm.obj)[1] != "glm" && class(glm.obj)[2] != "glm") {
    stop.Alteryx2(
      XMSG(
        in.targetString_sc = "The object provided is not a glm class object."
        )
      )
  }
  full.sum <- summary(glm.obj)
  the.call <- paste(capture.output(full.sum$call), collapse = "")
  the.call = gsub("\\s\\s", "", the.call)
  f <- formula(glm.obj)
  the.call <- sub("formula = ([^\\,]+)",
    paste("formula =", paste(f[2], "~", f[3])), the.call)
  resid.sum <- paste(format(summary(full.sum$deviance.resid)[-4], digits = 3), collapse = " ")
  p.stars <- pStars(full.sum$coefficients[,4])
  coef.est <- paste(dimnames(full.sum$coefficients)[[1]], format(full.sum$coefficients[,1], digits = 4),
                    format(full.sum$coefficients[,2], digits = 4), format(full.sum$coefficients[,3], digits = 4),
                    as.character(p.stars$p_txt), as.character(p.stars$Stars), sep = "|")
  coef.lab <- XMSG(in.targetString_sc = "Coefficients:")
  # Address variables that were omitted due to singularities
  omitted <- names(full.sum$aliased)[full.sum$aliased]
  if (length(omitted) > 0) {
    coef.lab <- XMSG(
      in.targetString_sc = "@1 (@2 not defined because of singularities)",
      in.firstBindVariable_sc = coef.lab,
      in.secondBindVariable_sc = length(omitted)
    )
    for (i in omitted)
      coef.est <- c(coef.est, paste(i, "NA|NA|NA|NA|  ", sep = "|"))
  }
  # If there is a theta and SE.theta elements of the object it is glm.nb model
  # and add the estimates of theta and its se to ceof.est
  if (!is.null(glm.obj$theta))
    coef.est <- c(coef.est, paste("theta", format(glm.obj$theta, digits = 6), format(glm.obj$SE.theta, digits = 6), "| | ", sep = "|"))
  dispersion <- XMSG(
    in.targetString_sc = "(Dispersion parameter for @1 taken to be @2 )",
    in.firstBindVariable_sc = full.sum$family$family,
    in.secondBindVariable_sc = full.sum$dispersion
  )
  df.null <- full.sum$df[2] + full.sum$df[3] - 1
  null.dev <- XMSG(
    in.targetString_sc = "Null deviance: @1 on @2 degrees of freedom",
    in.firstBindVariable_sc = format(full.sum$null.deviance, digits = 5),
    in.secondBindVariable_sc = df.null
  )
  mod.dev <- XMSG(
    in.targetString_sc = "Residual deviance: @1 on @2 degrees of freedom",
    in.firstBindVariable_sc = format(full.sum$deviance, digits = 5),
    in.secondBindVariable_sc = full.sum$df[2]
  )
  McF.R2 <- 1 - (glm.obj$deviance/glm.obj$null.deviance)
  mod.fit <- XMSG(
    in.targetString_sc = "McFadden R-Squared: @1, Akaike Information Criterion @2",
    in.firstBindVariable_sc = format(McF.R2, digits = 4),
    in.secondBindVariable_sc = format(full.sum$aic, digits = 4)
  )
  fisher.it <- XMSG(
    in.targetString_sc = "Number of Fisher Scoring iterations: @1",
    in.firstBindVariable_sc = full.sum$iter
  )
  sum.grps <- c("Call", "Residuals", "Coef_Label", rep("Coef_Est", length(coef.est)), "Dispersion", rep("Fit_Stats", 3), "Fisher")
  sum.out <- c(the.call, resid.sum, coef.lab, coef.est, dispersion, null.dev, mod.dev, mod.fit, fisher.it)
  summary.df <- data.frame(grp = sum.grps, out = sum.out)
  summary.df$grp <- as.character(summary.df$grp)
  summary.df$out <- as.character(summary.df$out)
  json.str <- paste("\"", names(glm.obj$coefficients), "\":\"",
    glm.obj$coefficients, "\"", sep = "", collapse = ", "
  )
  json.str <- paste("{", json.str, "}")
  coef.str <- c("Coef_JSON", json.str)
  summary.df <- rbind(summary.df, coef.str)
  singular <- length(omitted) > 0
  list(summary.df = summary.df, singular = singular)
}

#' Create a data frame with model object summary
#'
#'
#' The function Alteryx.ReportAnova creates a data frame of key-value pairs for
#' the purpose of assisting Alteryx's reporting tools to report the results of a
#' type II ANOVA for a LM or GLM model object. The last key- value indicates
#' whether the report is for a lm or glm model which is relevant for labeling
#' columns in the output created by Alteryx.
#'
#' @param model.obj model object whose summary output is put into a data frame
#' @author Dan Putler
#' @export
#' @family Alteryx.Report
Alteryx.ReportAnova <- function (model.obj)
{
  if (class(model.obj)[1] != "lm" && class(model.obj)[1] !=
      "glm" && class(model.obj)[2] != "glm") {
    stop.Alteryx2(
      XMSG(
        in.targetString_sc = "The object provided is not a lm or glm class object."
      )
    )
  }
  #
  the.anova <- car::Anova(model.obj, type = "II")
  response <- attributes(the.anova)$heading[2]
  if (class(model.obj)[1] == "lm") {
    p.vals <- the.anova[[4]]
    p.vals <- p.vals[1:(length(p.vals) - 1)]
    p.stars <- pStars(p.vals)
    f.vals1 <- the.anova[[3]]
    f.vals <- as.character(round(f.vals1[1:(length(f.vals1) - 1)], 2))
    the.table <- paste(attributes(the.anova)$row.names, round(the.anova[[1]], 2),
      round(the.anova[[2]], 0), c(f.vals,""), c(as.character(p.stars$p_txt), ""),
      c(as.character(p.stars$Stars), ""),	sep = "|"
    )
  } else {
    p.stars <- pStars(the.anova[[3]])
    the.table <- paste(attributes(the.anova)$row.names, round(the.anova[[1]], 3),
      round(the.anova[[2]], 0), as.character(p.stars$p_txt),
      as.character(p.stars$Stars), sep = "|"
    )
  }
  anova_grps <- c("Anova_Resp", rep("Anova_Test", length(the.table)))
  anova.df <- data.frame(grp = anova_grps, out = c(response, the.table))
  anova.df$grp <- as.character(anova.df$grp)
  anova.df$out <- as.character(anova.df$out)
  anova.df <- rbind(anova.df, c("Model_Class", class(model.obj)[1]))
  #The only user-facing strings in anova.df come from R, so there's no need
  #to localize them.
  return(anova.df)
}


# The function Alteryx.ParseCoefSum parses and formats the lines from the
# coefficient summary portion of an lm or glm model object. The output is
# placed into pipe (|) delimited fields for further processing within an
# Alteryx macro
# Author: Dan Putler
Alteryx.ParseCoefSum <- function(coef_sum) {
  if(!is.character(coef_sum)) {
    stop.Alteryx2(
      XMSG(
        in.targetString_sc = "The argument to the function must be a character vector."
        )
    )
  }
  # parseRow is the function that is used in the apply function
  parseRow <- function(a_row) {
    cs_vec <- unlist(strsplit(a_row, "\\s"))
    # Get the significant level indicator
    cs_stars <- cs_vec[length(cs_vec)]
    # If there is only a single star or a period, then there is a last space
    if(cs_vec[(length(cs_vec) - 1)] == "*" | cs_vec[(length(cs_vec) - 1)] == ".") {
      cs_stars = cs_vec[(length(cs_vec) - 1)]
    }
    cs_stars[cs_stars == ""] <- " "
    if(cs_stars == "*" | cs_stars == ".") {
      cs_vec <- cs_vec[1:(length(cs_vec) - 2)]
    } else {cs_vec <- cs_vec[1:(length(cs_vec) - 1)]}
    cs_vec <- cs_vec[cs_vec != ""]
    # Deal with the "bottom-ending" of the p-values
    if(cs_vec[(length(cs_vec) - 1)] == "<") { # limit p-value
      pval <- paste(cs_vec[(length(cs_vec) - 1)], cs_vec[length(cs_vec)])
    } else {pval <- cs_vec[length(cs_vec)]}
    # Deal with variable indicators that have embedded spaces
    var_name <- cs_vec[1]
    i <- 2
    repeat{
      print(cs_vec[i])
      if(is.na(as.numeric(cs_vec[i])) & cs_vec[i] != "NA") {
        var_name <- paste(var_name, cs_vec[i])
        i <- i + 1
      } else {break}
    }
    # Construct the parsed, pipe-delimited line
    out_vec <- paste(var_name, cs_vec[i], cs_vec[(i + 1)], cs_vec[(i + 2)],
                     pval, cs_stars, sep="|")
    return(out_vec)
  }
  # Apply the function
  parsed_rows <- sapply(coef_sum, parseRow)
  names(parsed_rows) <- NULL
  return(parsed_rows)
}


# The function Alteryx.ParseAnova parses and formats the lines from Anova
# Type II results for an lm or glm model object. The output is placed into
# pipe (|) delimited fields for further processing within an Alteryx macro
# Author: Dan Putler
Alteryx.ParseAnova <- function(the_anova, obj.class) {
  if(!is.character(the_anova)) {
    stop.Alteryx2(
      XMSG(
        in.targetString_sc = "The argument to the function must be a character vector."
      )
    )
  }
  # parseRow is the function that is used in the apply function
  parseRow <- function(a_row, obj.class) {
    anova_vec <- unlist(strsplit(a_row, "\\s"))
    anova_vec <- anova_vec[anova_vec != ""]
    if(obj.class == "lm") {
      # Deal with the "bottom-end" of the reported p-values
      if(anova_vec[5] == "<" & !is.na(anova_vec[5])) {
        anova_vec <- c(anova_vec[1:4], paste(anova_vec[5], anova_vec[6]),
                       anova_vec[7])
      }
      # Deal with the final row of the table for an lm object
      if(is.na(anova_vec[4])) anova_vec <- c(anova_vec, rep(" ", 3))
      # Deal with reported significance for insignificant tests
      if(is.na(anova_vec[6])) anova_vec <- c(anova_vec, " ")
      # Create the reformatted rows
      anova_out <- paste(anova_vec[1], anova_vec[2], anova_vec[3], anova_vec[4],
                         anova_vec[5], anova_vec[6], sep="|")
    } else { # glm class objects
      # Deal with the "bottom-end" of the reported p-values
      if(anova_vec[4] == "<" & !is.na(anova_vec[4])) {
        anova_vec <- c(anova_vec[1:3], paste(anova_vec[4], anova_vec[5]),
                       anova_vec[6])
      }
      # Deal with reported significance for insignificant tests
      if(is.na(anova_vec[5])) anova_vec <- c(anova_vec, " ")
      # Create the reformatted rows
      anova_out <- paste(anova_vec[1], anova_vec[2], anova_vec[3], anova_vec[4],
                         anova_vec[5], sep="|")
    }
    return(anova_out)
  }
  # Apply the function
  parsed_rows <- sapply(the_anova, parseRow, obj.class)
  names(parsed_rows) <- NULL
  return(parsed_rows)
}

#' Create report for rx model objects.
#'
#' @param rx.obj model object of class rxLinMod, rxLogit or rxGlm
#' @param null.deviance null deviance
#' @export
AlteryxReportRx <- function (rx.obj, null.deviance = NULL) {
  if (!(class(rx.obj) %in% c("rxLinMod","rxLogit","rxGlm")))
    stop.Alteryx2(
      XMSG(
        in.targetString_sc = "The object provided is not an appropriate RevoScaleR class object."
      )
    )
  the.call <- paste(capture.output(rx.obj$call), collapse = "")
  the.call = gsub("\\s\\s", "", the.call)
  # The coefficients and related estimates need to be done by class
  if (class(rx.obj) == "rxLinMod") {
    param.names <- attributes(rx.obj$coefficients)$dimnames[[1]]
    coefs1 <- rx.obj$coefficients[,1]
    the.coefs <- format(coefs1, digits = 4)
    the.coefs[is.na(coefs1)] <- XMSG(in.targetString_sc = "Dropped")
    the.se <- format(rx.obj$coef.std.error[,1], digits = 4)
    the.se[is.na(coefs1)] <- XMSG(in.targetString_sc = "Dropped")
    the.t <- format(rx.obj$coef.t.value[,1], digits = 4)
    the.t[is.na(coefs1)] <- XMSG(in.targetString_sc = "Dropped")
    p.stars <- pStars(rx.obj$coef.p.value[,1])
    p.stars$p_txt <- as.character(p.stars$p_txt)
    p.stars$p_txt[is.na(coefs1)] <- XMSG(in.targetString_sc = "Dropped")
    p.stars$Stars <- as.character(p.stars$Stars)
    p.stars$Stars[is.na(coefs1)] <- " "
  } else {
    param.names <- names(rx.obj$coefficients)
    the.coefs <- format(rx.obj$coefficients, digits = 4)
    the.coefs[is.na(rx.obj$coefficients)] <- XMSG(in.targetString_sc = "Dropped")
    the.se <- format(rx.obj$coef.std.error, digits = 4)
    the.se[is.na(rx.obj$coefficients)] <- XMSG(in.targetString_sc = "Dropped")
    the.t <- format(rx.obj$coef.t.value, digits = 4)
    the.t[is.na(rx.obj$coefficients)] <- XMSG(in.targetString_sc = "Dropped")
    p.stars <- pStars(rx.obj$coef.p.value)
    p.stars$p_txt <- as.character(p.stars$p_txt)
    p.stars$p_txt[is.na(rx.obj$coefficients)] <- XMSG(in.targetString_sc = "Dropped")
    p.stars$Stars <- as.character(p.stars$Stars)
    p.stars$Stars[is.na(rx.obj$coefficients)] <- " "
  }
  coef.est <- paste(param.names, the.coefs, the.se, the.t, p.stars$p_txt, p.stars$Stars, sep = "|")
  coef.lab <- XMSG(in.targetString_sc = "Coefficients:")
  omitted <- names(rx.obj$aliased)[rx.obj$aliased]
  if (length(omitted) > 0) {
    coef.lab <- XMSG(
      in.targetString_sc = "@1 (@2  not defined because of singularities)",
      in.firstBindVariable_sc = coef.lab,
      in.secondBindVariable_sc = length(omitted)
    )
  }
  # Model summary, slightly different for glm based objects versus lm objects
  if (class(rx.obj) != "rxLinMod") {
    if (class(rx.obj) == "rxGlm") {
      dispersion <- XMSG(
        in.targetString_sc = "(Dispersion parameter for @1 taken to be @2)",
        in.firstBindVariable_sc = rx.obj$family$family,
        in.secondBindVariable_sc = rx.obj$dispersion
      )
    }
    if (class(rx.obj) == "rxLogit") {
      dispersion <- XMSG(in.targetString_sc = "(Dispersion parameter for binomial taken to be 1)")
    }
    df.null <- rx.obj$nValidObs - 1
    df.mod <- rx.obj$nValidObs - length(param.names)
    null.dev <- XMSG(
      in.targetString_sc = "Null deviance: @1 on @2 degrees of freedom",
      in.firstBindVariable_sc = format(null.deviance, digits = 5),
      in.secondBindVariable_sc = df.null
    )
    mod.dev <- XMSG(
      in.targetString_sc = "Residual deviance: @1 on @2 degrees of freedom",
      in.firstBindVariable_sc = format(rx.obj$deviance, digits = 5),
      in.secondBindVariable_sc = df.mod
    )
    McF.R2 <- 1 - (rx.obj$deviance/null.deviance)
    mod.fit <- XMSG(
      in.targetString_sc = "McFadden R-Squared: @1, Akaike Information Criterion: @2",
      in.firstBindVariable_sc = format(McF.R2, digits = 4),
      in.secondBindVariable_sc = format(rx.obj$aic, digits = 4)
    )
    fisher.it <- XMSG(
      in.targetString_sc = "Number of IRLS iterations: @1",
      in.firstBindVariable_sc = rx.obj$iter
    )
    sum.grps <- c("Call", "Coef_Label", rep("Coef_Est", length(coef.est)), "Dispersion", rep("Fit_Stats", 3), "Fisher")
    sum.out <- c(the.call, coef.lab, coef.est, dispersion, null.dev, mod.dev, mod.fit, fisher.it)
    summary.df <- data.frame(grp = sum.grps, out = sum.out)
    summary.df$grp <- as.character(summary.df$grp)
    summary.df$out <- as.character(summary.df$out)
  } else {
    resid.se <- XMSG(
      in.targetString_sc = "Residual standard error: @1 on @2 degrees of freedom",
      format(rx.obj$sigma, digits = 5),
      rx.obj$df[2]
    )
    r.sq <- XMSG(
      "Multiple R-squared: @1, Adjusted R-Squared: @2",
      in.firstBindVariable_sc = format(rx.obj$r.squared, digits = 4),
      in.secondBindVariable_sc = format(rx.obj$adj.r.squared, digits = 4)
    )
    p.f <- format(rx.obj$f.pvalue, digits = 4)
    p.f[rx.obj$f.pvalue < 2.2e-16] <- "< 2.2e-16"
    f.stat <- XMSG(
      in.targetString_sc = "F-statistic: @1 on @2 and @3 degrees of freedom (DF), p-value @4",
      in.firstBindVariable_sc = format(rx.obj$fstatistic$value, digits = 4),
      in.secondBindVariable_sc = as.integer(rx.obj$fstatistic$numdf),
      in.thirdBindVariable_sc = as.integer(rx.obj$fstatistic$dendf),
      in.fourthBindVariable_sc = p.f
    )
    sum.grps <- c("Call", "Coef_Label", rep("Coef_Est", length(coef.est)), rep("Fit_Stats", 3))
    sum.out <- c(the.call, coef.lab, coef.est, resid.se, r.sq, f.stat)
    summary.df <- data.frame(grp = sum.grps, out = sum.out)
    summary.df$grp <- as.character(summary.df$grp)
    summary.df$out <- as.character(summary.df$out)
  }
  json.str <- paste("\"", names(rx.obj$coefficients), "\":\"", rx.obj$coefficients, "\"", sep = "", collapse = ", ")
  json.str <- paste("{", json.str, "}")
  coef.str <- c("Coef_JSON", json.str)
  summary.df <- rbind(summary.df, coef.str)
  summary.df
}
alteryx/AlteryxPredictive documentation built on May 12, 2019, 1:37 a.m.