R/dglm.R
In glm.ddR: Distributed 'glm' for Big Data using 'ddR' API

Documented in dglm dglm.control dglm.fit fitted.dglm predict.dglm print.dglm print.summary.dglm residuals.dglm summary.dglm weights.dglm

#########################################################
#  File dglm.R
#  Distributed version of glm 
#
#  This code is a distributed version of glm function available in R stats package.
#  

#########################################################
## A copy of glm.R license:
#
#  Copyright (C) 1995-2012 The R Core Team
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  http://www.r-project.org/Licenses/


## responses: The darray that contains the vector of responses. It is single column, with values 0 or 1 for binomial regression.
## predictors: The darray that contains the vector of predictors. It may have many columns, but the number of rows and its number of blocks should be the same as responses.
## weights: It is an optional darray for weights on the samples. It has a single column. The number of rows and its number of blocks should be the same as responses.
## na_action: It is an optional argument which indicates what should happen when the data contain unsuitable values; e.g., NA, NaN, or Inf. The default is set to 'exclude'. The other option is 'fail'.
## start: Starting values for coefficients. It is optional.
## etastart: Starting values for parameter 'eta' which is used for computing deviance. It is optional.
## mustart: Starting values for mu 'parameter' which is used for computing deviance. It is optional.
## offset: An optional darray which can be used to specify an _a priori_ known component to be included in the linear predictor during fitting.
## control: An optional list of controlling arguments. The optional elements of the list and their default values are: epsilon = 1e-8, maxit = 25, trace = FALSE, rigorous = FALSE.
## method: This argument can be used for the future improvement. The only available fitting method at the moment is 'dglm.fit.Newton'. In the future, if we have new developed algorithms, this algorith can be used to switch between them. 
## completeModel: When it is FALSE (default), the output values that are in darray structure will be discarded.

dglm <- function(responses, predictors, family = gaussian, weights = NULL,
                 na_action="exclude", start = NULL, etastart = NULL, mustart = NULL, offset = NULL,		
                 control = list(...), method = "dglm.fit.Newton", completeModel = FALSE, ...)
{

  startTotalTime <- proc.time()

  call <- match.call()

  ## check the type of responses and predictors
  if( is.null(responses) || is.null(predictors) )
    stop("'responses' and 'predictors' are required arguments")
  if(! is.darray(responses) || ! is.darray(predictors))
    stop("'responses' and 'predictors' should be distributed arrays")
  if(is.sparse_darray(responses) || is.sparse_darray(predictors))
    stop("Sparse darray is not supported for 'responses' and 'predictors'")
  if(ncol(responses) != 1)
    stop("Multivariate is not supported")

  ## family
  if(is.character(family))
    family <- get(family, mode = "function", envir = parent.frame())
  if(is.function(family)) family <- family()
  if(is.null(family$family))
    stop("'family' not recognized")
  if(!( (family$family == "binomial" && family$link == "logit") || 
       (family$family == "gaussian" && family$link == "identity") ||
       (family$family == "poisson" && family$link == "log") ))
    stop("Only binomial(logit), poisson(log), and gaussian(identity) are supported for now")
  ## control parameters
  control <- do.call("dglm.control", control)
  trace <- ifelse(control$trace, TRUE, FALSE)
  nparts <- totalParts(responses)

  ## check weights
  binaryWeights <- TRUE
  if(!is.null(weights)){
    if(! is.darray(weights))
      stop("'weights' should be of type darray")
    if(weights@sparse)
      stop("Sparse darray is not supported for 'weights'")

    if( any(psize(weights) != psize(responses)) )
      stop(gettextf("'weights' should have the same partitioning pattern as 'responses', and single column"), domain = NA)
    ## check that there is no negative value for weights
    if (trace) {
      message("Checking weights")
      starttime <- proc.time()
    }
    testArray1 <- dmapply(function(weightsi) as.numeric(all(weightsi >= 0)),
                          weightsi = parts(weights), output.type = "darray", 
                          combine = "cbind", nparts = c(1,totalParts(weights)))
    testArray2 <- dmapply(function(weightsi) if(all(weightsi >= 0)) as.numeric(all(weightsi == 0 || weightsi == 1)) else 0,
                          weightsi = parts(weights), output.type = "darray", 
                          combine = "cbind", nparts = c(1,totalParts(weights)))

    if (trace) {    # end of timing step
      endtime <- proc.time()
      spentTime <- endtime-starttime
      message("Spent time: ",(spentTime)[3]," sec")
    }

    if( ! all( collect(testArray1) == 1 ) )
      stop("all weights should be non-negative")
    if( ! all( collect(testArray2) == 1 ) )
      binaryWeights <- FALSE
  }
  ## check etastart
  if(!is.null(etastart)) {
    if(! is.darray(etastart))
      stop("'etastart' should be of type darray")
    if(etastart@sparse)
      stop("Sparse darray is not supported for 'etastart'")
    if( any(psize(etastart) != psize(responses)) )
      stop(gettextf("'etastart' should have the same pattern as 'respnses'"), domain = NA)
  }
  ## check mustart
  if(!is.null(mustart)) {
    if(! is.darray(mustart))
      stop("'mustart' should be of type darray")
    if(mustart@sparse)
      stop("Sparse darray is not supported for 'mustart'")
    if( any(psize(mustart) != psize(responses)) )
      stop(gettextf("'mustart' should have the same partitioning pattern as 'respnses'"), domain = NA)
    if(family$link == "logit" || family$link == "log") {
      if (trace) {
        message("Checking mustart")
        starttime <- proc.time()
      }
      testArray <- dmapply(function(mustarti) as.numeric(all(mustarti > 0 & mustarti < 1)),
                           mustarti = parts(mustart), output.type = "darray", 
                           combine = "cbind", nparts = c(1,totalParts(mustart)))

      if (trace) {    # end of timing step
        endtime <- proc.time()
        spentTime <- endtime-starttime
        message("Spent time: ",(spentTime)[3]," sec")
      }
      
      if( ! all( collect(testArray) == 1 ) )
        stop("mustart has values out of valid range (0,1)")
    }        
  }

  ## check offset
  if(!is.null(offset)) {
    if(! is.darray(offset))
      stop("'offset' should be of type darray")
    if( any(psize(offset) != psize(responses)) )
      stop(gettextf("'offset' should have the same pattern as 'responses'"), domain = NA)
  }

  ## The call to method (dglm.fit)
  if (trace) {
    startFitTime <- proc.time()
  }
  fit <- dglm.fit(predictors, responses, weights = weights, start = start,
                  etastart = etastart, mustart = mustart,
                  offset = offset, family = family, control = control,
                  intercept = TRUE, method = method, binaryWeights = binaryWeights, 
                  na_action=na_action, completeModel=completeModel)


                                        #V    fit <- call(if(is.function(method)) "method" else "dglm.fit",
                                        #                     predictors, responses, weights = weights, start = start,
                                        #                     etastart = etastart, mustart = mustart,
                                        #                     offset = offset, family = family, control = control,
                                        #                     intercept = TRUE, method = method, binaryWeights = binaryWeights, 
                                        #                     na_action=na_action, completeModel=completeModel)
  
  if (trace) {
    endFitTime <- proc.time()
    message("Fitting time:",(endFitTime - startFitTime)[3]," sec")
  }

  if(! is.null(colnames(predictors)))
    rownames(fit$coefficients) <- c("(Intercept)", colnames(predictors))
  ## adding some parameters to fit
  fit <- c(fit, list(call = call, offset = offset, control = control, method = method))

  class(fit) <- c("dglm", "glm", "lm")

  if (trace) {
    endTotalTime <- proc.time()
    totalTime <- endTotalTime - startTotalTime
    message("*****************************")
    message("Total running time:",(totalTime)[3]," sec")
  }

  fit
}

dglm.control <- function(epsilon = 1e-8, maxit = 25, trace = FALSE, rigorous = FALSE)
{
  if(!is.numeric(epsilon) || epsilon <= 0)
    stop("value of 'epsilon' must be > 0")
  if(!is.numeric(maxit) || maxit <= 0)
    stop("maximum number of iterations must be > 0")
  list(epsilon = epsilon, maxit = maxit, trace = trace, rigorous = rigorous)
}

dglm.fit <-
  function (X, Y, weights = NULL, start = NULL,
            etastart = NULL, mustart = NULL, offset = NULL,
            family = gaussian(), control = list(), intercept = TRUE, method = "dglm.fit.Newton", 
            binaryWeights = TRUE, na_action="exclude", completeModel=FALSE)
{
  control <- do.call("dglm.control", control)
  conv <- FALSE       # converged
  nobs <- nrow(Y)     # number of observations
  nvars <- ncol(X) + 1    # adding 1 to the number of variances because of the intercept
  nparts <- totalParts(X)  # number of partitions (blocks)

  if (nparts != totalParts(Y) || !all(psize(X)[,1] == psize(Y)[,1]))
    stop("'predictors' and 'responses' should be partitioned row-wise with the same pattern. If the samples are loaded from a database, you can use the db2darrays() function to load the darrays with the same partitioning pattern.", call. = FALSE)

  if(nvars < 2) {
    stop("Empty model is not supported", call. = FALSE)
  }
  if(method != "dglm.fit.Newton")
    stop("Unknown method")

  trace <- ifelse(control$trace, TRUE, FALSE)
  if (trace) {
    starttime_preLoop<-proc.time()
  }

  ## define weights
  if (is.null(weights)) 
    weights <- dmapply(function(nrow) matrix(1,nrow=nrow), 
                       nrow = psize(X)[,1],
                       output.type = "darray", combine = "rbind", nparts = c(totalParts(X),1))


                                        # .naCheck function may alter weights
  .naCheck_output <- .naCheck(X, Y, weights, trace)
  nOmit <- .naCheck_output$nOmit
  weights <- .naCheck_output$weights
  ngoodObs <- .naCheck_output$ngoodObs
  
  
                                        # decision about na_action
  switch(na_action,
         "exclude" = {
                                        # do nothing            
         },
         "fail" = {
           if(nOmit > 0)
             stop("missing values in samples")
         },
         {
           stop("only 'exclude' and 'fail' are valid values for na_action")
         }
         )

  ## stop if not enough parameters    
  if (ngoodObs < nvars)
    stop(paste("Number of variables is", nvars, ", but only", ngoodObs, "observations (samples with NA/NaN/Inf values are excluded)"))

  ## define offset
  isInitialOffset <- TRUE
  if (is.null(offset)) {

    offset <- dmapply(function(nrow) matrix(0,nrow = nrow,ncol = 1),
                      nrow = psize(X)[,1],
                      output.type="darray", combine = "rbind", nparts = c(totalParts(X),1))
    isInitialOffset <- FALSE
  }

  ## get family functions:
  variance <- family$variance # parall safe for binomial, gaussian, poisson
  linkinv  <- family$linkinv # parall safe for logit, identity, log, probit, cauchit, cloglog, sqrt, 1/mu^2, inverse
  if (!is.function(variance) || !is.function(linkinv) )
    stop("'family' argument seems not to be a valid family object", call. = FALSE)
  dev.resids <- family$dev.resids # parall safe for binomial, gaussian, poisson
  mu.eta <- family$mu.eta # parall safe for logit, identity, probit, cauchit, cloglog, sqrt, 1/mu^2, inverse
  unless.null <- function(x, if.null) if(is.null(x)) if.null else x
  valideta <- unless.null(family$valideta, function(eta) TRUE) # parall safe for logit, identity, probit, cauchit, cloglog, sqrt, 1/mu^2, inverse
  validmu  <- unless.null(family$validmu,  function(mu) TRUE) # parall safe for binomial, gaussian, poisson


  ## calculates mustart if not present
  if(is.null(mustart)) 
    mustart <- .dglm.fit.initialize.mustart(family, Y, weights, trace)
  else 
    .dglm.fit.initialize.mustart(family, Y, weights, trace)
  
  if(!is.null(start))
    {            
      if (length(start) != nvars)
        stop(paste("length of 'start' should equal", nvars, "and correspond to initial coefs")) 
      coefold <- start
    }
  else
    coefold <- NULL

  ## calculates etastart if not present
  if(!is.null(etastart)) 
    eta <- etastart
  else
    eta <- .dglm.fit.initialize.eta(X, family, start, mustart,  weights, trace)

  ## Setting mu
  if(is.null(etastart)) 
    mu <- mustart
  else 
    mu <- .dglm.fit.initialize.mu(family, eta, weights, trace)


  if(control$rigorous) .dglm.fit.validate.mu_eta(family, mu, eta, weights, trace)


  ## calculate initial deviance and coefficient
  devold <- .dglm.fit.initialize.dev(Y, mu, weights, dev.resids,trace)
  dev <- devold   # the value of deviance

  boundary <- conv <- FALSE   # indicators for boundary situation and convergence
  if (is.null(start))
    start <- matrix(0,nvars)
  coef <- coefold <- start

  ## it should be faster if some darrays are allocated out of the iteration

  if (trace) {
    stoptime_preLoop<-proc.time()
    preLoopTime <- stoptime_preLoop-starttime_preLoop
    message("***** Pre-Loop time:",(preLoopTime)[3],"sec *****")
    startItTime <- proc.time()
  }

  ##------------- THE Iteratively Reweighting L.S. iteration -----------
  for (iter in 1L:control$maxit) {

    if(control$rigorous) .dglm.fit.validate.Vmu(mu, weights, variance, trace)

    if(! binaryWeights || isInitialOffset) {
      ## The routine when weights are not binary or there is an initial offset. It is more time consuming.
      mu.eta.val <- .dglm.fit.calculate.mu.eta.val(eta, mu.eta, weights, trace)
      
      ## drop observations for which w will be zero
      temp_output <- .dglm.fit.calculate.dGood(mu.eta.val, weights, trace)
      dGood <- temp_output$dGood
      if(temp_output$no_valid)
        {
          conv <- FALSE
          warning("no observations informative at iteration ", iter)
          break
        }


                                        #O          z <- (eta - offset)[good] + (y - mu)[good]/mu.eta.val[good]       
      Z <-  .dglm.fit.calculate.Z(Y, mu, eta, mu.eta.val, offset, dGood, trace)
      
      W <- .dglm.fit.calculate.W(variance, mu, mu.eta.val, dGood, weights, trace)
      family = gaussian(identity)
    } # !binaryWeights
    else
      {
        W <- weights # in this routine W and weights are the same
        Z <- Y
      } # binaryWeights
    
    coefficients <- .LSPSolution_Newton(Z, X, start, family, W, trace)

    if (any(!is.finite(coefficients))) {
      conv <- FALSE
      warning(paste("non-finite coefficients at iteration", iter))
      break
    }
    start <- coefficients
    notValid <- TRUE
    boundary <- FALSE
    ii <- 0
    while (notValid) {
      if (ii >= control$maxit)
        stop("inner loop 1; cannot correct step size", call. = FALSE)
      ii <- ii + 1
      if(ii != 1)
        start <- (start + coefold)/2
      ## calculating deviance
      dev_output <- .dglm.fit.calculate.deviance(X, Y, weights, offset,
                                                 eta, mu, devArray, start, linkinv, dev.resids, trace)
      eta = dev_output$eta
      mu = dev_output$mu
      devArray = dev_output$devArray
      rm(dev_output)
      if (control$trace)
        message("Deviance = ", dev, " Iterations = ", iter)

      dev <- sum(devArray)
      if(control$rigorous) 
        notValid <- .dglm.fit.validate.mu_eta(family, mu, eta, weights, trace)
      else
        notValid <- !is.finite(dev)
      if( notValid & is.null(coefold))
        stop("no valid set of coefficients has been found: please supply starting values", call. = FALSE)
      if(notValid)
        boundary <- TRUE
    }


    ## check for convergence
    if (abs(dev - devold)/(0.1 + abs(dev)) < control$epsilon) {
      conv <- TRUE
      coef <- start
      break
    } else {
      devold <- dev
      coef <- coefold <- start
    }

  } ##-------------- end IRLS iteration -------------------------------
  if (trace) {
    endItTime <- proc.time()
    message("***** LS Iteration time:",(endItTime - startItTime)[3],"sec *****")
    starttime_postLoop<-proc.time()
  }

  if(! is.null(colnames(X)))
    rownames(coef) <- c("(Intercept)", colnames(X)) 

  if(!conv && iter==1)
    stop("dglm.fit failed to converge at the first iteration.")
  if (!conv)
    warning("dglm.fit: algorithm did not converge", call. = FALSE)

  if (boundary)
    warning("dglm.fit: algorithm stopped at boundary value", call. = FALSE)

  ## calculate df
  nulldf <- ngoodObs - as.integer(intercept)

  ## The accurate rank cannot be found through our approach. So it is assumed equal to nvars.
  rank <- if(nvars < 2) 0 else min(nvars, ngoodObs)
  resdf  <- ngoodObs - rank

                                        # cov-matrix will be more accurate if we use initial weight when it is binary
  if(binaryWeights && isInitialOffset)
    W = weights

  eps <- 10*.Machine$double.eps

  if(completeModel) {

    if(control$rigorous) .dglm.fit.validate.mu_final(family, mu, weights, eps, trace)


    ## update by accurate calculation
    residuals <- .dglm.fit.calculate.residuals(Y, mu, eta, mu.eta, weights, trace)

    ## calculate null deviance -- corrected in glm() if offset and intercept
    nulldev <- .dglm.fit.calculate.null_deviance(Y, offset, 
                                                 dev.resids, weights, intercept, linkinv,trace)

    ## calculate AIC
    aic.model <- .dglm.fit.calculate.aic(family, Y, mu, weights, dev, trace, ngoodObs) + 2 * rank

    ## effects, Rmat, qr are not computed through our approach in comparison to glm
    
    fit <- list(coefficients = coef, d.residuals = residuals, d.fitted.values = mu,
                family = family, d.linear.predictors = eta, deviance = dev, aic = aic.model,
                null.deviance = nulldev, iter = iter, weights = W,
                prior.weights = weights, df.residual = resdf, df.null = nulldf,
                converged = conv, boundary = boundary, responses = Y, predictors = X)
  }
  else { # test
    fit <- list(coefficients = coef, d.fitted.values = mu, 
                family = family, d.linear.predictors = eta, deviance = dev, aic = NA,
                null.deviance = NA, iter = iter, weights = W,
                prior.weights = weights, df.residual = resdf, df.null = nulldf,
                converged = conv, boundary = boundary, responses = Y, predictors = X)
  }
  
  if(nOmit > 0)
    fit$na_action <- list(type="exclude", numbers=nOmit)

  class(fit) <- c("dglm", "glm", "lm")

  if (trace) {
    endtime_postLoop <- proc.time()
    message("***** Post-Loop time:",(endtime_postLoop - starttime_postLoop)[3],"sec *****")
  }
  fit
} # end of dglm.fit function


print.dglm <- function(x, digits= max(3, getOption("digits") - 3), ...)
{
  cat("\nCall:  ",
      paste(deparse(x$call), sep="\n", collapse = "\n"), "\n\n", sep="")
  if(length(coef(x))) {
    cat("Coefficients")
    if(is.character(co <- x$contrasts))
      cat("  [contrasts: ",
          apply(cbind(names(co),co), 1L, paste, collapse="="), "]")
    cat(":\n")
    print.default(format(x$coefficients, digits=digits),
                  print.gap = 2, quote = FALSE)
  } else cat("No coefficients\n\n")
  cat("\nDegrees of Freedom:", x$df.null, "Total (i.e. Null); ",
      x$df.residual, "Residual\n")
  if(nzchar(mess <- naprint(x$na.action))) cat("  (",mess, ")\n", sep="")
  cat("Null Deviance:	   ",	format(signif(x$null.deviance, digits)),
      "\nResidual Deviance:", format(signif(x$deviance, digits)),
      "\tAIC:", format(signif(x$aic, digits)), "\n")
  invisible(x)
}

summary.dglm <- function(object, dispersion = NULL,
                         correlation = FALSE, symbolic.cor = FALSE, trace=FALSE, ...)
{
  est.disp <- FALSE
  df.r <- object$df.residual
  completeModel <- !is.null(object$d.residuals)

  if(completeModel) { # runs only in complete mode
    if(is.null(dispersion)){	# calculate dispersion if needed
      if(object$family$family %in% c("poisson", "binomial"))
        dispersion <- 1
      else if(df.r > 0) {
        est.disp <- TRUE
        calc_dispersion <- function(wi, ri, df.r)
          {
            tempArrayi = 0
            good <- wi > 0
            tempArrayi <- sum((wi * ri^2)[good,])/ df.r
            return(tempArrayi)
          }
        tempArray <- dmapply(calc_dispersion, wi = parts(object$weights), ri = parts(object$d.residuals), 
                             MoreArgs = list(df.r = df.r), 
                             output.type = "darray", combine = "cbind", nparts = c(totalParts(object$weights),1))
        dispersion <- sum(tempArray)
      } else {
        est.disp <- TRUE
        dispersion <- NaN
      }
    }
  } # runs only in complete mode
  ## calculate scaled and unscaled covariance matrix

  aliased <- is.na(coef(object))  # used in print method
  p <- length(object$coefficients)

  if (p > 0) {
    if(!completeModel) { # incomplete mode
      coef.table <- cbind(object$coefficients, NA, NA, NA)
      dimnames(coef.table) <- list(rownames(object$coefficients), c("Estimate", "Std. Error", "t value","Pr(>|t|)"))
      covmat.unscaled <- covmat <- matrix(, 0L, 0L)           
    } else { # complete mode

      coef.p <- object$coefficients
      covmat.unscaled <- .covariantMatrix(object, trace=trace)
      covmat <- dispersion * covmat.unscaled
      var.cf <- diag(covmat)

      ## calculate coef table

      s.err <- sqrt(var.cf)
      tvalue <- coef.p/s.err

      dn <- c("Estimate", "Std. Error")
      if(!est.disp) { # known dispersion
        pvalue <- 2*pnorm(-abs(tvalue))
        coef.table <- cbind(coef.p, s.err, tvalue, pvalue)
        dimnames(coef.table) <- list(rownames(coef.p),
                                     c(dn, "z value","Pr(>|z|)"))
      } else if(df.r > 0) {
        pvalue <- 2*pt(-abs(tvalue), df.r)
        coef.table <- cbind(coef.p, s.err, tvalue, pvalue)
        dimnames(coef.table) <- list(rownames(coef.p),
                                     c(dn, "t value","Pr(>|t|)"))
      } else { # df.r == 0
        coef.table <- cbind(coef.p, NaN, NaN, NaN)
        dimnames(coef.table) <- list(rownames(coef.p),
                                     c(dn, "t value","Pr(>|t|)"))
      }
    }          
    df.f <- p

  } else {
    coef.table <- matrix(, 0L, 4L)
    dimnames(coef.table) <-
      list(NULL, c("Estimate", "Std. Error", "t value", "Pr(>|t|)"))
    covmat.unscaled <- covmat <- matrix(, 0L, 0L)
    df.f <- length(aliased)
  }
  ## return answer

  ## these need not all exist, e.g. na.action.
  keep <- match(c("call","family","deviance", "aic",
                  "df.residual","null.deviance","df.null",
                  "iter", "na_action"), names(object), 0L)
  if(completeModel) { # complete mode
    deviance.resid <- residuals.dglm(object, type = "deviance", trace=FALSE)
  } else { # incomplete mode
    deviance.resid <- NA
  }
  ans <- c(object[keep],
           list(deviance.resid = deviance.resid,
                coefficients = coef.table,
                aliased = aliased,
                dispersion = dispersion,
                df = c(p, df.r, df.f),
                cov.unscaled = covmat.unscaled,
                cov.scaled = covmat))

  if(correlation && p > 0 && completeModel) {
    dd <- sqrt(diag(covmat.unscaled))
    ans$correlation <-
      covmat.unscaled/outer(dd,dd)
    ans$symbolic.cor <- symbolic.cor
  }

                                        # it was better if it was quantiles
  if(completeModel) { # incomplete mode
    ans$minMax <- drop(cbind(min(ans$deviance.resid, na.rm=TRUE),max(ans$deviance.resid, na.rm=TRUE)))
  } else { # incomplete mode
    ans$minMax <- c(NA,NA)
  }
  names(ans$minMax) <- c("Min", "Max")

  class(ans) <- "summary.dglm"
  return(ans)
}

print.summary.dglm <-
  function (x, digits = max(3, getOption("digits") - 3),
            symbolic.cor = x$symbolic.cor,
            signif.stars = getOption("show.signif.stars"), ...)
{
  cat("\nCall:\n",
      paste(deparse(x$call), sep="\n", collapse = "\n"), "\n\n", sep="")

  xx <- zapsmall(x$minMax, digits + 1)
  cat("Deviance Residuals: \n")
  print.default(xx, digits=digits, na.print = "", print.gap = 2)

  if(length(x$aliased) == 0L) {
    cat("\nNo Coefficients\n")
  } else {
    ## df component added in 1.8.0
    ## partial matching problem here.
    df <- if ("df" %in% names(x)) x[["df"]] else NULL
    if (!is.null(df) && (nsingular <- df[3L] - df[1L]))
      cat("\nCoefficients: (", nsingular,
          " not defined because of singularities)\n", sep = "")
    else cat("\nCoefficients:\n")
    coefs <- x$coefficients
    if(!is.null(aliased <- x$aliased) && any(aliased)) {
      cn <- names(aliased)
      coefs <- matrix(NA, length(aliased), 4L,
                      dimnames=list(cn, colnames(coefs)))
      coefs[!aliased, ] <- x$coefficients
    }
    printCoefmat(coefs, digits=digits, signif.stars=signif.stars,
                 na.print="NA", ...)
  }
  
  cat("\n(Dispersion parameter for ", x$family$family,
      " family taken to be ", format(x$dispersion), ")\n\n",
      apply(cbind(paste(format(c("Null","Residual"), justify="right"),
                        "deviance:"),
                  format(unlist(x[c("null.deviance","deviance")]),
                         digits= max(5, digits+1)), " on",
                  format(unlist(x[c("df.null","df.residual")])),
                  " degrees of freedom\n"),
            1L, paste, collapse=" "), sep="")
  if(nzchar(mess <- naprint(x$na.action))) cat("  (",mess, ")\n", sep="")
  cat("AIC: ", format(x$aic, digits= max(4, digits+1)),"\n\n",
      "Number of Fisher Scoring iterations: ", x$iter,
      "\n", sep="")

  correl <- x$correlation
  if(!is.null(correl)) {
    p <- NCOL(correl)
    if(p > 1) {
      cat("\nCorrelation of Coefficients:\n")
      if(is.logical(symbolic.cor) && symbolic.cor) {# NULL < 1.7.0 objects
        print(symnum(correl, abbr.colnames = NULL))
      } else {
        correl <- format(round(correl, 2), nsmall = 2, digits = digits)
        correl[!lower.tri(correl)] <- ""
        print(correl[-1, -p, drop=FALSE], quote = FALSE)
      }
    }
  }
  cat("\n")
  invisible(x)
}


## GLM Methods for Generic Functions :

residuals.dglm <-
  function(object, type = c("deviance", "pearson", "working", "response", "partial"), 
           trace=FALSE, ...)
{
  if(is.null(object$d.residuals))
    stop("This function is only available for complete models.")
  type <- match.arg(type)
  Y <- object$responses
  r <- object$d.residuals
  mu	<- object$d.fitted.values
  wts <- object$prior.weights
  nparts <- totalParts(mu)

  switch(type,
         deviance=,pearson=,response=
         if(is.null(Y)) {
           mu.eta <- object$family$mu.eta
           eta <- object$d.linear.predictors
	   if(trace)
             message("Building Y")

	   Y <- dmapply(function(mui, ri, etai) matrix(mui + ri * mu.eta(etai),ncol = 1),
                        mui = parts(mu), ri = parts(r), etai = parts(eta),
                        output.type = "darray", combine = "rbind", nparts = c(totalParts(mu),1))
         }
         )
  
  switch(type,
         deviance = if(object$df.residual > 0) {
           if(trace)
             message("Building residuals")

           calculating_d.res <- function(Yi, mui, wtsi, func)
             {
               d.resi <- matrix(0, nrow = nrow(mui), ncol = ncol(mui))
               good <- wtsi > 0
               d.resi[good,] <- sqrt(pmax(func(Yi[good,], mui[good,], wtsi[good,]), 0))
               return(d.resi)
             }
           calculating_res <- function(Yi, mui, wtsi, func, d.resi)
             {
               resi <- matrix(NA, nrow = nrow(mui), ncol = ncol(mui))
               good <- wtsi > 0
               resi[good,] <- ifelse(Yi[good,1] > mui[good,], d.resi[good,], -d.resi[good,])
               return(resi)
             }
           d.res <- dmapply(calculating_d.res, Yi = parts(Y), mui = parts(mu), wtsi = parts(wts),
                            MoreArgs = list(func = object$family$dev.resids),
                            output.type = "darray", combine = "rbind", nparts = c(totalParts(Y),1))
           res <- dmapply(calculating_res, Yi = parts(Y), mui = parts(mu), wtsi = parts(wts),
                          d.resi = parts(d.res), MoreArgs = list(func = object$family$dev.resids),
                          output.type = "darray", combine = "rbind", nparts = c(totalParts(Y),1))
         },
         pearson = {
           if(trace)
             message("Building residuals")

           calculating_resi <- function(Yi, mui, wtsi, func)
             {
               resi <- (Yi[,1]-mui)*sqrt(wtsi)/sqrt(func(mui))
             }
           res <- dmapply(calculating_resi, Yi = parts(Y), mui = parts(mu), wtsi = parts(wts),
                          MoreArgs = list(func = object$family$dev.resids),
                          output.type = "darray", combine = "rbind", nparts = c(totalParts(Y),1))

         },
         working = {
           res <- r
         },
         response = {
           res <- Y - mu
         },
         partial = {
           res <- r
         }
         )
  res
}

weights.dglm <- function(object, type = c("prior", "working"), ...)
{
  type <- match.arg(type)
  res <- if(type == "prior") object$prior.weights else object$weights
  if(is.null(object$na.action)) res
  else naresid(object$na.action, res)
}


#########################################################
                                        #        Functions specifically added for dglm        #
#########################################################

## Distributed version of family.initilize

## Solution provided for Least Square Problem (LSP)
## Ref for the algorithm: http://www.cs.purdue.edu/homes/alanqi/Courses/ML-11/CS59000-ML-13.pdf
.LSPSolution_Newton <- function (Y, X, globalTheta, family, weights, trace=TRUE) {

  nobservation <- NROW(X)
  npredictors <- NCOL(X)
  npart <- totalParts(X)

                                        # distGrad Stores the partial gradiants distributed across workers: (npredictors+1 x 1) matrix
                                        # distHessian Stores the partial hessian distributed across workers:  (npredictors+1 x npredictors+1) matrix flattened out 

                                        # Calculate gradient and hessian locally
  if (trace) {
    message("Calculating local gradient and hessian")
    starttime<-proc.time()
  }
  calculate_gradient_hessian <- function(x, y, theta, m, weightsi, family, npredictors)
    {
      good <- (weightsi > 0)
      g <- matrix(0,nrow = npredictors+1)
      hess <- matrix(0,nrow = (npredictors+1)*(npredictors+1), ncol = 1)

      if(sum(good) == 1)
        x <- cbind(1, matrix(x[good,],1,)) * weightsi[good,]
      else
        x <- cbind(1, x[good,]) * weightsi[good,]

      trans <- t(x)   # transpose of matrix x

      switch(family$family,
             "binomial"={
                                        # dim(h) = dim(y)
               h <- family$linkinv(x %*% theta)
                                        #Gradient calcualation
               g <- 1/m * (trans %*% (h - y[good,1] * weightsi[good,1]))  
                                        #Hessian calcualation
               hsum <- as.numeric(h * (1 - h))
               hess <- matrix(1/m * (trans %*% (hsum * x ))) # converted into a single column matrix
             },
             "gaussian"={
               g <- 1/m * trans %*% (y[good,1] * weightsi[good,1])
               hess <- matrix(1/m * trans %*% x) # converted into a single column matrix
             },
             "poisson"={
               h <- family$linkinv(x %*% theta)
                                        #Gradient calcualation
               g <- 1/m * (trans %*% (h - y[good,1] * weightsi[good,1]))
                                        #Hessian calcualation
               hsum <- as.numeric(h)
               hess <- matrix(1/m * (trans %*% (hsum * x ))) # converted into a single column matrix                
             },
             stop("The specified family is not supported for now")
             )
      return(list(g=g, hess = hess))              
    }
  grad_hess <- dmapply(calculate_gradient_hessian, x = parts(X), y = parts(Y), 
                       weightsi = parts(weights),
                       MoreArgs = list(theta=globalTheta, m=nobservation, family=family, npredictors=npredictors))

  distGrad <- dmapply(function(x) x[[1]]$g, x = parts(grad_hess),
                      output.type = "darray", combine = "cbind", nparts = c(1,totalParts(grad_hess)))
  distHessian <- dmapply(function(x) x[[1]]$hess, x = parts(grad_hess),
                         output.type = "darray", combine = "cbind", nparts = c(1,totalParts(grad_hess)))

                                        #Fetch all gradients and sum them up to get one vector: 1 x (npredictors+1)
  grad<-rowSums(distGrad)

                                        #Fetch all hessians, sum them up, and convert to a (npredictors+1 x npredictors+1) matrix
                                        #We should idealy use reduce() when number of partitions is large
  hessian<-matrix(rowSums(distHessian),nrow=npredictors+1) 
  
  if (trace) {    # end of timing step
    endtime <- proc.time()
    spentTime <- endtime-starttime
    message("Spent time: ",(spentTime)[3]," sec")
  }

  switch(family$family,
         "binomial"={
           globalTheta = globalTheta - solve(hessian) %*% matrix(grad, ncol=1)
         },
         "gaussian"={
           globalTheta = solve(hessian) %*% matrix(grad, ncol=1)
         },
         "poisson"={
           globalTheta = globalTheta - solve(hessian) %*% matrix(grad, ncol=1)
         },
         stop("The specified family is not supported for now")
         )

}

## Calculating the co-variant matrix
.covariantMatrix <- function (object, trace=TRUE) {
  X <- object$predictors
  nobs <- NROW(X)
  nparts <- totalParts(X)
  npredictors <- NCOL(X)
  W <- object$weights   # samples with weight==0 should not be considered in the calculation

  switch( object$family$family,
         "binomial" = {               
           if (trace) message("Calculating the covariant matrix")

                                        #Stores the partial hessian distributed across workers:  (npredictors+1 x npredictors+1) matrix flattened out 
           calc_error <- function(Xi, theta, Wi, linkinv, npredictors)
             {
               hess <- matrix(0,nrow = (npredictors+1)*(npredictors+1),ncol = 1)
               good <- Wi > 0
               if(sum(good) == 1)
                 x <- cbind(1,matrix(Xi[good,],1,)) * Wi[good,1]
               else
                 x <- cbind(1,Xi[good,]) * Wi[good,1]
               z <- x %*% theta
               h <- linkinv(z)

                                        #Hessian calcualation for the covariance
               hsum <- as.numeric(h * (1 - h))
               hess <- matrix((t(x) %*% (hsum * x ))) # converted into a single column matrix
               return(hess)
             }
         },
         "quasibinomial" = {
           stop(sQuote(family$family), " family not supported yet")
         },
         "poisson" = {
                                        #Stores the partial hessian distributed across workers:  (npredictors+1 x npredictors+1) matrix flattened out 
           calc_error <- function(Xi, theta, Wi, linkinv, npredictors)
             {
               hess <- matrix(0,nrow = (npredictors+1)*(npredictors+1),ncol = 1)
               good <- Wi > 0
               if(sum(good) == 1)
                 x <- cbind(1,matrix(Xi[good,],1,)) * Wi[good,1]
               else
                 x <- cbind(1,Xi[good,]) * Wi[good,1]

               z <- x %*% theta
               h <- linkinv(z)

                                        #Hessian calcualation for the covariance
               hsum <- as.numeric(h)
               hess <- matrix((t(x) %*% (hsum * x ))) # converted into a single column matrix
               return(hess)
               
             }
         },
         "quasipoisson" = {
           stop(sQuote(family$family), " family not supported yet")
         },
         "gaussian" = {
                                        #Stores the partial hessian distributed across workers:  (npredictors+1 x npredictors+1) matrix flattened out 
           calc_error <- function(Xi, theta, Wi, linkinv, npredictors)
             {
               hess <- matrix(0,nrow = (npredictors+1)*(npredictors+1),ncol = 1)
               good <- Wi > 0
               if(sum(good) == 1)
                 x <- cbind(1,matrix(Xi[good,],1,)) * Wi[good,1]
               else
                 x <- cbind(1,Xi[good,]) * Wi[good,1]

                                        #Hessian calcualation for the covariance
               hess <- matrix(t(x) %*% x) # converted into a single column matrix 
               return(hess)
             }

         },
         "Gamma" = {
           stop(sQuote(family$family), " family not supported yet")
         },
         "inverse.gaussian" = {
           stop(sQuote(family$family), " family not supported yet")
         },
         "quasi" = {
           stop(sQuote(family$family), " family not supported yet")
         },
         ## else :
         stop(sQuote(family$family), " family not recognised")
         )# end switch(.)

  distHessian <- dmapply(calc_error, Xi = parts(X), Wi = parts(W),
                         MoreArgs = list(theta = object$coefficients, linkinv = object$family$linkiv,npredictors=npredictors),
                         output.type = "darray", combine = "cbind", nparts = c(1,totalParts(X)))

                                        #Fetch all hessians, sum them up, and convert to a (npredictors+1 x npredictors+1) matrix
  hessian <- matrix(rowSums(distHessian),nrow=npredictors+1)
  covmat <- solve(hessian)
}

## prediction given a model and newdata
## object: A built model of type dglm.
## newdata: A darray or a matrix containing predictors of new samples
## type: The type of prediction required which can be "link" or "response"
## na.action: A function to determine what should be done with missing values (reserved for the future improvement)  
## mask: A darray with a single column, and 0 or 1 as the value of its elements.
##      It indicates which samples (rows) should be considered in the calculation
predict.dglm <-
  function(object, newdata, type = c("link", "response"), na.action = na.pass, mask=NULL, trace=FALSE, ...)
{
  if(missing(newdata) || is.null(newdata)) {
    stop("newdata argument is required!")
  }
  if(!is.darray(newdata) && !is.matrix(newdata))
    stop("newdata should be of type darray or matrix")
  type <- match.arg(type)
  coef <- object$coefficients
  if(ncol(newdata) != length(coef)-1)
    stop("the number of predictors in newdata should fit the number of coefficients in the model")

  if(!is.null(mask))
    {
      if(class(newdata) != class(mask))
        stop("newdata and mask should be of the same type")
      if( nrow(mask) != NROW(newdata) || ncol(mask) != 1 )
        stop("'mask' must have a single column and the same number of rows as newdata")

    }


  ## newdata can be either a darray or a normal array
  if(is.darray(newdata)){  # newdata is a darray
    if (trace) message("Calculating link prediction")
    if(!is.null(mask)) {     # no mask
      if( nrow(mask) != nrow(newdata) || ncol(mask) != 1 || any(psize(mask)[,1] != psize(newdata)[,1]))
        stop("'mask' must have the same partitioning pattern as newdata")
    }
    calc_link <- function(newdatai, coef, maski)
      {                
        if(!is.null(maski))
          {
            predi <- matrix(0,nrow = nrow(newdatai), ncol = ncol(coef))
            good <- maski > 0
            if(sum(good) == 1)
              predi[good,] <- cbind(1,matrix(newdatai[good,],1,)) %*% coef
            else
              predi[good,] <- cbind(1,newdatai[good,]) %*% coef
          }
        else
          predi <- cbind(1,newdatai) %*% coef
        return(predi)
      }
    pred <- dmapply(calc_link, newdatai = parts(newdata), 
                    maski = if(is.null(mask)) rep(list(NULL),totalParts(newdata)) else parts(mask),
                    MoreArgs = list(coef = coef), 
                    output.type = "darray", combine ="rbind", nparts = c(totalParts(newdata),1))

    switch(type,
           response = {
             if (trace) message("Calculating response prediction")

             calc_prediction <- function(predi, func, maski)
               {
                 if(is.null(maski))
                   predi = func(predi)
                 else
                   predi[maski > 0,] <- func(predi[maski > 0,])
                 return(predi)
               }

             pred <- dmapply(calc_prediction, predi = parts(pred), 
                             maski = if(is.null(mask)) rep(list(NULL),totalParts(newdata)) else parts(mask),
                             MoreArgs =list(func =  family(object)$linkinv),
                             output.type = "darray", combine ="rbind", nparts = c(totalParts(newdata),1))

           },
           link = )

  } else {                # newdata is a normal array
    if(is.null(mask)) { # no mask
      if (trace) message("Calculating link prediction")
      pred <- cbind(1,newdata) %*% coef
      switch(type,
             response = {
               if (trace) message("Calculating response prediction")
               pred <- family(object)$linkinv(pred)
             },
             link = )
    } else {            # with mask
      if(class(newdata) != class(mask))
        stop("newdata and mask should be of the same type")
      if( nrow(mask) != NROW(newdata) || ncol(mask) != 1 )
        stop("'mask' must have a single column and the same number of rows as newdata")
      pred <- matrix(NA, NROW(newdata), 1)
      if (trace) message("Calculating link prediction")
      pred[mask > 0,] <- cbind(1,newdata[mask > 0,]) %*% coef
      switch(type,
             response = {
               if (trace) message("Calculating response prediction")
               pred[mask > 0,] <- family(object)$linkinv(pred[mask > 0,])
             },
             link = )
    }
  }
  pred
}

## it overwrites generic fitted function and will be used in validation
fitted.dglm <- function (object, ...) {
  object$d.fitted.values
}


                                        # .naCheck checks any missed value (NA, NaN, or Inf) in X, and Y
                                        # This function may alter weights
.naCheck <- function(X, Y, weights, trace) {
  if (trace) {
    message("Checking for missed values")
    starttime<-proc.time()
  }
  check_na <- function(xi,yi)
    {
      yTest <- !is.finite(yi)
      xTest <- matrix(rowSums(!is.finite(xi)),nrow(yi),ncol(yi)) != 0
      test <- yTest | xTest
      return(as.numeric(sum(test)))
    }
  nOmits <- dmapply(check_na, xi = parts(X), yi = parts(Y),
                    output.type = "darray", combine = "rbind", nparts = c(totalParts(X),1))
  nOmits <- sum(nOmits) # number of excluded samples


  set_weights <- function(xi, yi, wi)
    {
      yTest <- !is.finite(yi)
      xTest <- matrix(rowSums(!is.finite(xi)),nrow(yi),ncol(yi)) != 0
      wi[yTest | xTest] <- 0
      return(wi)
    }
  weights <- dmapply(set_weights, xi = parts(X), yi = parts(Y), wi = parts(weights),
                     output.type = "darray", combine = "rbind", nparts = c(totalParts(X),1))


  testArray <- dmapply(function(weightsi) as.numeric(sum(weightsi > 0)),
                       weightsi = parts(weights),
                       output.type="darray", combine = "cbind", nparts = c(totalParts(weights),1)) 
  ngoodObs <- sum(testArray)

  if (trace) {    # end of timing step
    endtime <- proc.time()
    spentTime <- endtime-starttime
    message("Spent time: ",(spentTime)[3]," sec")
  }
  return(list(nOmits = nOmits,weights = weights, ngoodObs = ngoodObs))
}
Any scripts or data that you put into this service are public.
glm.ddR documentation built on May 29, 2017, 6:49 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
glm.ddR
Distributed 'glm' for Big Data using 'ddR' API

R/dglm.R
In glm.ddR: Distributed 'glm' for Big Data using 'ddR' API

Defines functions dglm dglm.control dglm.fit print.dglm summary.dglm print.summary.dglm residuals.dglm weights.dglm SPSolution_Newton covariantMatrix predict.dglm fitted.dglm naCheck

Documented in dglm dglm.control dglm.fit fitted.dglm predict.dglm print.dglm print.summary.dglm residuals.dglm summary.dglm weights.dglm

Try the glm.ddR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

glm.ddR Distributed 'glm' for Big Data using 'ddR' API

R/dglm.R In glm.ddR: Distributed 'glm' for Big Data using 'ddR' API

Defines functions dglm dglm.control dglm.fit print.dglm summary.dglm print.summary.dglm residuals.dglm weights.dglm SPSolution_Newton covariantMatrix predict.dglm fitted.dglm naCheck

Documented in dglm dglm.control dglm.fit fitted.dglm predict.dglm print.dglm print.summary.dglm residuals.dglm summary.dglm weights.dglm

Try the glm.ddR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

glm.ddR
Distributed 'glm' for Big Data using 'ddR' API

R/dglm.R
In glm.ddR: Distributed 'glm' for Big Data using 'ddR' API