FactorAnalytics: factor analysis

#' fit fundamental factor model by classic OLS or Robust regression technique
#' 
#' fit fundamental factor model or cross-sectional factor model by
#' classic OLS or Robust regression.  Fundamental factor models use
#' observable asset specific characteristics (fundamentals) like industry
#' classification, market capitalization, style classification (value, growth)
#' etc. to calculate the common risk factors. The function creates the class
#' "FundamentalFactorModel".
#' 
#' @details
#' If style factor exposure is standardized to regression-weighted mean zero, this makes
#' style factors orthogonal to the world factor (intercept term), which in turn facilitted 
#' interpretation of the style factor returns. See Menchero 2010.    
#' 
#' The original function was designed by Doug Martin and originally implemented
#' in S-PLUS by a number of UW Ph.D. students: Christopher Green, Eric Aldrich,
#' and Yindeng Jiang. Guy Yullen re-implemented the function in R. Yi-An Chen from 
#' University of Washington re-writes the codes and finalizes the function.  
#'  
#'
#' @param data data.frame, data must have \emph{assetvar}, \emph{returnvar}, \emph{datevar}
#' , and exposure.names. Generally, data has to look like panel data. It needs firm variabales 
#' and time variables. Data has to be a balanced panel. 
#' @param exposure.names a character vector of exposure names for the factor model
#' @param wls logical flag, TRUE for weighted least squares, FALSE for ordinary
#' least squares
#' @param regression A character string, "robust" for regression via lmRob,
#' "classic" for regression with lm()
#' @param covariance A character string, "robust" for covariance matrix
#' computed with covRob(), "classic" for covariance matrix with covClassic() in 
#' robust package. 
#' @param full.resid.cov logical flag, TRUE for full residual covariance matrix
#' calculation, FALSE for diagonal residual covarinace matrix
#' @param robust.scale logical flag, TRUE for exposure scaling via robust scale
#' and location, FALSE for scaling via mean and sd
#' @param returnsvar A character string giving the name of the return variable
#' in the data.
#' @param datevar A character string gives the name of the date variable in
#' the data.
#' @param assetvar A character string gives the name of the asset variable in
#' the data.
#' @param standardized.factor.exposure logical flag. Factor exposure will be standardized 
#' to regression weighted mean 0 and standardized deviation to 1 if \code{TRUE}. 
#' Default is \code{FALSE}. See Detail. 
#' @param weight.var A character strping gives the name of the weight used for standarizing factor exposures. 
#' @return an S3 object containing
#' \itemize{
#' \item returns.cov A "list" object contains covariance information for
#' asset returns, includes covariance, mean and eigenvalus. Beta of taken as latest
#' date input. 
#' \item factor.cov An object of class "cov" or "covRob" which
#' contains the covariance matrix of the factor returns (including intercept).
#' \item resids.cov An object of class "cov" or "covRob" which contains
#' the covariance matrix of the residuals, if "full.resid.cov" is TRUE.  NULL
#' if "full.resid.cov" is FALSE.
#' \item returns.corr Correlation matrix of assets returns. 
#' \item factor.corr  An object of class "cov" or "covRob" which
#' contains the correlation matrix of the factor returns (including intercept). 
#' \item resids.corr Correlation matrix of returns returns.
#' \item resid.variance A vector of variances estimated from the OLS
#' residuals for each asset. If "wls" is TRUE, these are the weights used in
#' the weighted least squares regressions.  If "cov = robust" these values are
#' computed with "scale.tau".  Otherwise they are computed with "var".
#' \item factor.returns A "xts" object containing the times series of
#' estimated factor returns and intercepts.
#' \item residuals A "xts" object containing the time series of residuals
#' for each asset.
#' \item tstats A "xts" object containing the time series of t-statistics
#' for each exposure.
#' \item call function call
#' \item exposure.names A character string giving the name of the exposure variable in
#' the data.
#' }
#' @author Guy Yullen and Yi-An Chen
#' @references
#' \itemize{
#' \item "The Characteristics of Factor Portfolios", Fall 2010, MENCHERO Jose, 
#' Journal of Performance Measurement. 
#' \item Grinold,R and Kahn R, \emph{Active Portfolio Management}.
#' }
#' 
#' @export
#' @examples
#' 
#' # BARRA type factor model
#' data(Stock.df)
#' # there are 447 assets  
#' exposure.names <- c("BOOK2MARKET", "LOG.MARKETCAP") 
#' test.fit <- fitFundamentalFactorModel(data=stock,exposure.names=exposure.names,
#'                                        datevar = "DATE", returnsvar = "RETURN",
#'                                        assetvar = "TICKER", wls = TRUE, 
#'                                        regression = "classic", 
#'                                        covariance = "classic", full.resid.cov = TRUE, 
#'                                        robust.scale = TRUE)
#' 
#' names(test.fit) 
#' test.fit$returns.cov 
#' test.fit$resids.cov  
#' names(test.fit$cov.factor)  
#' test.fit$factor.cov$cov  
#' test.fit$factor  
#' test.fit$resid.variance  
#' test.fit$resids 
#' test.fit$tstats 
#' test.fit$call
#' 
#' # BARRA type Industry Factor Model
#' exposure.names <- c("GICS.SECTOR")  
#' # the rest keep the same
#' test.fit2 <- fitFundamentalFactorModel(data=stock,exposure.names=exposure.names,
#'                                        datevar = "DATE", returnsvar = "RETURN",
#'                                        assetvar = "TICKER", wls = TRUE, 
#'                                        regression = "classic", 
#'                                        covariance = "classic", full.resid.cov = TRUE, 
#'                                        robust.scale = TRUE)
#' 
#' names(test.fit2) 
#' test.fit2$cov.returns 
#' test.fit2$cov.resids  
#' names(test.fit2$cov.factor)  
#' test.fit2$cov.factor$cov  
#' test.fit2$factor  
#' test.fit2$resid.variance  
#' test.fit2$resids 
#' test.fit2$tstats 
#' test.fit2$call
#' 
#' 
#' 
#' 



fitFundamentalFactorModel <-
  function(data,exposure.names, datevar, returnsvar, assetvar,
           wls = TRUE, regression = "classic", 
           covariance = "classic", full.resid.cov = FALSE, robust.scale = FALSE,
           standardized.factor.exposure = FALSE, weight.var) {
    
    require(xts)
    require(robust)
    
    
    assets = unique(data[[assetvar]])
    timedates = as.Date(unique(data[[datevar]]))    
    data[[datevar]] <- as.Date(data[[datevar]])
    
    if (length(timedates) < 2) 
      stop("At least two time points, t and t-1, are needed for fitting the factor model.")
     if (!is(exposure.names, "vector") || !is.character(exposure.names)) 
       stop("exposure argument invalid---must be character vector.")
     if (!is(assets, "vector") || !is.character(assets)) 
       stop("assets argument invalid---must be character vector.")
    
    wls <- as.logical(wls)
    full.resid.cov <- as.logical(full.resid.cov)
    robust.scale <- as.logical(robust.scale)
    standardized.factor.exposure <- as.logical(standardized.factor.exposure)
    
    if (!match(regression, c("robust", "classic"), FALSE)) 
      stop("regression must one of 'robust', 'classic'.")
    if (!match(covariance, c("robust", "classic"), FALSE)) 
      stop("covariance must one of 'robust', 'classic'.")
    this.call <- match.call()
    
    if (match(returnsvar, exposure.names, FALSE)) 
      stop(paste(returnsvar, "cannot be used as an exposure."))
    
    
    numTimePoints <- length(timedates)
    numExposures <- length(exposure.names)
    numAssets <- length(assets)
    
    
    
    
    # check if exposure.names are numeric, if not, create exposures. factors by dummy variables
    which.numeric <- sapply(data[, exposure.names, drop = FALSE],is.numeric)
    exposures.numeric <- exposure.names[which.numeric]
    # industry factor model
    exposures.factor <- exposure.names[!which.numeric]
    if (length(exposures.factor) > 1) {
      stop("Only one nonnumeric variable can be used at this time.")
    }
    
    if (standardized.factor.exposure == TRUE) {
    if (is.na(weight.var)) {
      stop("Need to assign weight variable")
    }
      weight = by(data = data, INDICES = as.numeric(data[[datevar]]), 
                  function(x) x[[weight.var]]/sum(x[[weight.var]]))
      data[[weight.var]] <- unlist(weight)
        
      for (i in exposures.numeric) {
    standardized.exposure <- by(data = data, INDICES = as.numeric(data[[datevar]]),
                                function(x) ((x[[i]] - mean(x[[weight.var]]*x[[i]]) )*1/sd(x[[weight.var]]*x[[i]]) )) 
    data[[i]] <- unlist(standardized.exposure)
    }
    }
    
      
    regression.formula <- paste("~", paste(exposure.names, collapse = "+"))
    #  "~ BOOK2MARKET"
    if (length(exposures.factor)) {
      regression.formula <- paste(regression.formula, "- 1")
      data[, exposures.factor] <- as.factor(data[,exposures.factor])
      exposuresToRecode <- names(data[, exposure.names, drop = FALSE])[!which.numeric]
      contrasts.list <- lapply(seq(length(exposuresToRecode)), 
                               function(i) function(n, m) contr.treatment(n, contrasts = FALSE))
      names(contrasts.list) <- exposuresToRecode
    }    else {
      contrasts.list <- NULL
    }
    # turn characters into formula 
    regression.formula <- eval(parse(text = paste(returnsvar,regression.formula)))
    # RETURN ~ BOOK2MARKET 
    
    ols.robust <- function(xdf, modelterms, conlist) {
      if (length(exposures.factor)) {
        zz <- xdf[[exposures.factor]]
        xdf[[exposures.factor]] <- if (is.ordered(zz)) 
          ordered(zz, levels = sort(unique.default(zz)))
        else factor(zz)
      }
      model <- lmRob(modelterms, data = xdf, contrasts = conlist, 
                     control = lmRob.control(mxr = 200, mxf = 200, mxs = 200))
      sdest <- sqrt(diag(model$cov))
      names(sdest) <- names(model$coef)
      coefnames <- names(model$coef)
      alphaord <- order(coefnames)
      model$coef <- model$coef[alphaord]
      sdest <- sdest[alphaord]
      c(length(model$coef), model$coef, model$coef/sdest, model$resid)
    }
    ols.classic <- function(xdf, modelterms, conlist) {
      model <- try(lm(formula = modelterms, data = xdf, contrasts = conlist, 
                      singular.ok = FALSE))
      if (is(model, "Error")) {
        mess <- geterrmessage()
        nn <- regexpr("computed fit is singular", mess)
        if (nn > 0) {
          cat("At time:", substring(mess, nn), "\n")
          model <- lm(formula = modelterms, data = xdf, 
                      contrasts = conlist, singular.ok = TRUE)
        }  else stop(mess)
      }
      tstat <- rep(NA, length(model$coef))
      tstat[!is.na(model$coef)] <- summary(model, cor = FALSE)$coef[,3]
      alphaord <- order(names(model$coef))
      c(length(model$coef), model$coef[alphaord], tstat[alphaord], 
        model$resid)
    }
    wls.robust <- function(xdf, modelterms, conlist, w) {
      assign("w", w, pos = 1)
      if (length(exposures.factor)) {
        zz <- xdf[[exposures.factor]]
        xdf[[exposures.factor]] <- if (is.ordered(zz)) 
          ordered(zz, levels = sort(unique.default(zz)))
        else factor(zz)
      }
      model <- lmRob(modelterms, data = xdf, weights = w, contrasts = conlist, 
                     control = lmRob.control(mxr = 200, mxf = 200, mxs = 200))
      sdest <- sqrt(diag(model$cov))
      names(sdest) <- names(model$coef)
      coefnames <- names(model$coef)
      alphaord <- order(coefnames)
      model$coef <- model$coef[alphaord]
      sdest <- sdest[alphaord]
      c(length(model$coef), model$coef, model$coef/sdest, model$resid)
    }
    wls.classic <- function(xdf, modelterms, conlist, w) {
      assign("w", w, pos = 1)
      model <- try(lm(formula = modelterms, data = xdf, contrasts = conlist, 
                      weights = w, singular.ok = FALSE))
      if (is(model, "Error")) {
        mess <- geterrmessage()
        nn <- regexpr("computed fit is singular", mess)
        if (nn > 0) {
          cat("At time:", substring(mess, nn), "\n")
          model <- lm(formula = modelterms, data = xdf, 
                      contrasts = conlist, weights = w)
        }
        else stop(mess)
      }
      tstat <- rep(NA, length(model$coef))
      tstat[!is.na(model$coef)] <- summary(model, cor = FALSE)$coef[,3]
      alphaord <- order(names(model$coef))
      c(length(model$coef), model$coef[alphaord], tstat[alphaord], 
        model$resid)
    }
    # FE.hat has T  elements
    # every element t contains 
    # 1. number of factors (intercept incl.) 
    # 2. estimated factors at time t 
    # 3. t value of estimated factors 
    # 4. residuals  at time t       
    if (!wls) {
      if (regression == "robust") {
        # ols.robust    
        FE.hat <- by(data = data, INDICES = as.numeric(data[[datevar]]), 
                     FUN = ols.robust, modelterms = regression.formula, 
                     conlist = contrasts.list)
      } else {
        # ols.classic
        FE.hat <- by(data = data, INDICES = as.numeric(data[[datevar]]), 
                     FUN = ols.classic, modelterms = regression.formula, 
                     conlist = contrasts.list)
      }
    } else {
      if (regression == "robust") {
        # wls.robust
        resids <- by(data = data, INDICES = as.numeric(data[[datevar]]), 
                     FUN = function(xdf, modelterms, conlist) {
                       lmRob(modelterms, data = xdf, contrasts = conlist, 
                             control = lmRob.control(mxr = 200, mxf = 200, 
                                                     mxs = 200))$resid
                     }, modelterms = regression.formula, conlist = contrasts.list)
        resids <- apply(resids, 1, unlist)
        weights <- if (covariance == "robust") 
          apply(resids, 1, scaleTau2)^2
        else apply(resids, 1, var)
        weights <- weights^-1
        FE.hat <- by(data = data, INDICES = as.numeric(data[[datevar]]), 
                     FUN = wls.robust, modelterms = regression.formula, 
                     conlist = contrasts.list, w = weights)
      } 
      else { 
        # wls.classic
        resids <- by(data = data, INDICES = as.numeric(data[[datevar]]), 
                     FUN = function(xdf, modelterms, conlist) {
                       lm(formula = modelterms, data = xdf, contrasts = conlist, 
                          singular.ok = TRUE)$resid
                     },
                     modelterms = regression.formula, conlist = contrasts.list)
        resids <- apply(resids, 1, unlist)
        weights <- if (covariance == "robust") 
          apply(resids, 1, scaleTau2)^2
        else apply(resids, 1, var)
        weights <- weights^-1
        FE.hat <- by(data = data, INDICES = as.numeric(data[[datevar]]), 
                     FUN = wls.classic, modelterms = regression.formula, 
                     conlist = contrasts.list, w = weights)
      }
    }
    # if there is industry dummy variables
    if (length(exposures.factor)>0) {
      numCoefs <- length(exposures.numeric) + length(levels(data[,exposures.factor]))
      ncols <- 1 + 2 * numCoefs + numAssets
      fnames <- c(exposures.numeric, paste(exposures.factor, 
                                           levels(data[, exposures.factor]), sep = ""))
      cnames <- c("numCoefs", fnames, paste("t", fnames, sep = "."), 
                  assets)
    } else {
      numCoefs <- 1 + length(exposures.numeric)
      ncols <- 1 + 2 * numCoefs + numAssets
      cnames <- c("numCoefs", "(Intercept)", exposures.numeric, 
                  paste("t", c("(Intercept)", exposures.numeric), sep = "."), 
                  assets)
    }
    
    # create matrix for fit
    FE.hat.mat <- matrix(NA, ncol = ncols, nrow = numTimePoints, 
                         dimnames = list(as.character(timedates), cnames))
    # give each element t names 
    for (i in 1:length(FE.hat)) {
      names(FE.hat[[i]])[1] <- "numCoefs"
      nc <- FE.hat[[i]][1]
      names(FE.hat[[i]])[(2 + nc):(1 + 2 * nc)] <- paste("t", 
                                                         names(FE.hat[[i]])[2:(1 + nc)], sep = ".")
      if (length(FE.hat[[i]]) != (1 + 2 * nc + numAssets)) 
        stop(paste("bad count in row", i, "of FE.hat"))
      names(FE.hat[[i]])[(2 + 2 * nc):(1 + 2 * nc + numAssets)] <- assets
      idx <- match(names(FE.hat[[i]]), colnames(FE.hat.mat))
      FE.hat.mat[i, idx] <- FE.hat[[i]]
    }
    # give back the names of timedates
#     timedates <- as.Date(as.numeric(dimnames(FE.hat)[[1]]), origin = "1970-01-01")
    coefs.names <- colnames(FE.hat.mat)[2:(1 + numCoefs)]
    # estimated factors returns ordered by time
    f.hat <- xts(x = FE.hat.mat[, 2:(1 + numCoefs)], order.by = timedates)
    # check for outlier
    gomat <- apply(coredata(f.hat), 2, function(x) abs(x - median(x, 
                                                                  na.rm = TRUE)) > 4 * mad(x, na.rm = TRUE))
    if (any(gomat, na.rm = TRUE) ) {
      cat("\n\n*** Possible outliers found in the factor returns:\n\n")
      for (i in which(apply(gomat, 1, any, na.rm = TRUE))) print(f.hat[i, 
                                                                       gomat[i, ], drop = FALSE])
    }
    tstats <- xts(x = FE.hat.mat[, (2 + nc):(1 + 2 * nc)], order.by = timedates)
    # residuals for every asset ordered by time
    resids <- xts(x = FE.hat.mat[, (2 + 2 * numCoefs):(1 + 2 * 
                                                         numCoefs + numAssets)], order.by = timedates)
    
    if (covariance == "robust") {
      if (kappa(na.exclude(coredata(f.hat))) < 1e+10) {
        Cov.factors <- covRob(coredata(f.hat), estim = "pairwiseGK", 
                              distance = FALSE, na.action = na.omit)
      } else {
        cat("Covariance matrix of factor returns is singular.\n")
        Cov.factors <- covRob(coredata(f.hat), distance = FALSE, 
                              na.action = na.omit)
      }
      resid.vars <- apply(coredata(resids), 2, scaleTau2, na.rm = T)^2
      D.hat <- if (full.resid.cov) 
        covOGK(coredata(resids), sigmamu = scaleTau2, n.iter = 1)
      else 
        diag(resid.vars)
    }   else {
      Cov.factors <- covClassic(coredata(f.hat), distance = FALSE,na.action = na.omit)
      resid.vars <- apply(coredata(resids), 2, var, na.rm = TRUE)
      D.hat <- if (full.resid.cov) {
        covClassic(coredata(resids), distance = FALSE, na.action = na.omit)
      }  else { diag(resid.vars) }
    }
    # create betas from origial database  
    B.final <- matrix(0, nrow = numAssets, ncol = numCoefs)
    colnames <-  coefs.names
    B.final[, match("(Intercept)", colnames, 0)] <- 1
    numeric.columns <- match(exposures.numeric, colnames, 0)
    # only take the latest beta to compute FM covariance
    B.final[, numeric.columns] <- as.matrix(data[ (data[[datevar]] == timedates[numTimePoints]), exposures.numeric])
    rownames(B.final) = assets
    colnames(B.final) = colnames(f.hat)
    
    if (length(exposures.factor)>0) {
      B.final[, grep(exposures.factor, x = colnames)][cbind(seq(numAssets), 
                                                            (data[ data[[datevar]] == timedates[numTimePoints], 
                                                                            exposures.factor]))] <- 1
    }
    
    cov.returns <- B.final %*% Cov.factors$cov %*% t(B.final) + 
      if (full.resid.cov) { D.hat$cov
      }  else { D.hat  }
    mean.cov.returns = tapply(data[[returnsvar]],data[[assetvar]], mean)
Corr.returns = cov2cor(cov.returns)
    Cov.returns <- list(cov = cov.returns, mean=mean.cov.returns, eigenvalues = eigen(cov.returns, 
                                                                                      only.values = TRUE, symmetric = TRUE)$values)
    
    # 
    # # r-square for each asset = 1 - SSE/SST
    #    SSE <-  apply(fit.fund$residuals^2,2,sum) 
    #    SST <- tapply(data[,returnsvar],data[,assetvar],function(x) sum((x-mean(x))^2))
    #   r2 <- 1- SSE/SST                                  
    
    # change names for intercept
    if (!(length(exposures.factor)>0)) {
    colnames(f.hat)[1] <- "Intercept"
    }
    
# report corr matrix

if (covariance == "robust") {
  if (kappa(na.exclude(coredata(f.hat))) < 1e+10) {
    Corr.factors <- covRob(coredata(f.hat), estim = "pairwiseGK", 
                           distance = FALSE, na.action = na.omit,corr=TRUE)
  } else {
    cat("Covariance matrix of factor returns is singular.\n")
    Corr.factors <- covRob(coredata(f.hat), distance = FALSE, 
                           na.action = na.omit,corr=TRUE)
  }
  Corr.D <-  if (full.resid.cov) { cov2cor(D.hat$cov)
  }  else { NULL  }
  
}   else {
  Corr.factors <- covClassic(coredata(f.hat), distance = FALSE,na.action = na.omit,corr=TRUE)
  Corr.D <-  if (full.resid.cov) { cov2cor(D.hat$cov)
  }  else { NULL  }
}



    output <- list(returns.cov = Cov.returns, 
                   factor.cov = Cov.factors, 
                   resids.cov = D.hat,
                   returns.corr = Corr.returns,
                   factor.corr = Corr.factors,
                   resids.corr = Corr.D,
                   resid.variance = resid.vars, 
                   factor.returns = f.hat, 
                   residuals = resids, 
                   tstats = tstats,                   
                   call = this.call,
                   data = data,
                   asset.names = assets,
                   beta = B.final,
                   datevar = datevar,
                   returnsvar = returnsvar,
                   assetvar = assetvar,
                   exposure.names = exposure.names)
    class(output) <- "FundamentalFactorModel"
    return(output)
  }