R/wcPCA.R

wcPCA <- function (X, class, scale.unit = TRUE, ncp = 5, ind.sup = NULL, quanti.sup = NULL, 
          quali.sup = NULL, row.w = NULL, col.w = NULL, graph = FALSE, 
          axes = c(1, 2)) {
  
  if(!is.factor(class)) stop("class should be a factor")
  
  moy.ptab <- function(V, poids) {
    as.vector(crossprod(poids/sum(poids), as.matrix(V)))
  }
  ec.tab <- function(V, poids) {
    ecart.type <- sqrt(as.vector(crossprod(poids/sum(poids), 
                                           as.matrix(V^2))))
    ecart.type[ecart.type <= 0.0000000000000001] <- 1
    return(ecart.type)
  }
  fct.eta2 <- function(vec, x, weights) {
    VB <- function(xx) {
      return(sum((colSums((tt * xx) * weights)^2)/ni))
    }
    tt <- FactoMineR::tab.disjonctif(vec)
    ni <- colSums(tt * weights)
    unlist(lapply(as.data.frame(x), VB))/colSums(x * x * 
                                                   weights)
  }
  X <- as.data.frame(X)
  is.quali <- which(!unlist(lapply(X, is.numeric)))
  if (length(is.quali) > 0) {
    X[, is.quali] <- lapply(X[, is.quali, drop = FALSE], 
                            as.factor)
    niveau <- unlist(lapply(X[, is.quali, drop = FALSE], 
                            levels))
    if (sum(duplicated(niveau)) > 0 | any(niveau %in% (1:nrow(X)))) {
      for (j in 1:ncol(X[, is.quali, drop = FALSE])) {
        if ((sum(niveau %in% levels(X[, is.quali[j]])) != 
             nlevels(X[, is.quali[j]])) | any(levels(X[, 
                                                       is.quali[j]]) %in% (1:nrow(X)))) 
          levels(X[, is.quali[j]]) <- paste(attributes(X[, 
                                                         is.quali, drop = FALSE])$names[j], levels(X[, 
                                                                                                     is.quali, drop = FALSE][, j]), sep = "_")
      }
    }
  }
  X <- droplevels(X)
  if (!is.null(quali.sup) & !is.numeric(quali.sup)) 
    quali.sup <- which(colnames(X) %in% quali.sup)
  if (!is.null(quanti.sup) & !is.numeric(quanti.sup)) 
    quanti.sup <- which(colnames(X) %in% quanti.sup)
  if (any(is.na(X))) {
    warning("Missing values are imputed by the mean of the variable: you should use the imputePCA function of the missMDA package")
    if (is.null(quali.sup)) 
      X[is.na(X)] <- matrix(colMeans(X, na.rm = TRUE), 
                            ncol = ncol(X), nrow = nrow(X), byrow = TRUE)[is.na(X)]
    else for (j in (1:ncol(X))[-quali.sup]) X[, j] <- replace(X[, 
                                                                j], is.na(X[, j]), mean(X[, j], na.rm = TRUE))
  }
  Xtot <- X
  if (!is.null(quali.sup)) 
    X <- X[, -quali.sup, drop = FALSE]
  auxi <- colnames(X)[!sapply(X, is.numeric)]
  if (length(auxi) > 0) 
    stop(paste("\nThe following variables are not quantitative: ", 
               auxi))
  todelete <- c(quali.sup, quanti.sup)
  if (!is.null(todelete)) 
    X <- Xtot[, -todelete, drop = FALSE]
  if (!is.null(ind.sup)) {
    X.ind.sup <- X[ind.sup, , drop = FALSE]
    X <- X[-ind.sup, , drop = FALSE]
  }
  ncp <- min(ncp, nrow(X) - 1, ncol(X))
  if (is.null(row.w)) 
    row.w <- rep(1, nrow(X))
  row.w.init <- row.w
  row.w <- row.w/sum(row.w)
  if (is.null(col.w)) 
    col.w <- rep(1, ncol(X))

  data <- X
  # centre <- moy.ptab(X, row.w)
  nk <- tapply(row.w, class, sum)
  centre <- sweep(t(as.matrix(dichotom(class)))%*%diag(row.w)%*%as.matrix(X), 1, nk, "/")
  rownames(centre) <- levels(class)
  X <- X - centre[class, ]

  if (is.null(attributes(X)$row.names)) rownames(X) <- rownames(data)
  if (is.null(attributes(X)$names)) colnames(X) <- colnames(data)
  if (scale.unit) {
    ecart.type <- ec.tab(X, row.w)
    X <- t(t(X)/ecart.type)
  } else {
    ecart.type <- rep(1, length(centre))
  }
  dist2.ind <- rowSums(t(t(X^2) * col.w))
  dist2.var <- as.vector(crossprod(rep(1, nrow(X)), as.matrix(X^2 * row.w)))
  res.call <- list(row.w = (row.w/sum(row.w)), col.w = col.w, 
                   scale.unit = scale.unit, ncp = ncp, centre = centre, 
                   ecart.type = ecart.type, X = Xtot, row.w.init = row.w.init, 
                   call = match.call())
  tmp <- FactoMineR::svd.triplet(X, row.w = row.w, col.w = col.w, ncp = ncp)
  eig <- tmp$vs^2
  vp <- matrix(NA, length(eig), 3)
  rownames(vp) <- paste("comp", 1:length(eig))
  colnames(vp) <- c("eigenvalue", "percentage of variance", 
                    "cumulative percentage of variance")
  vp[, "eigenvalue"] <- eig
  vp[, "percentage of variance"] <- (eig/sum(eig)) * 100
  vp[, "cumulative percentage of variance"] <- cumsum(vp[, 
                                                         "percentage of variance"])
  V <- tmp$V
  U <- tmp$U
  eig <- eig[1:ncp]
  coord.ind <- t(t(as.matrix(U)) * sqrt(eig))
  coord.var <- t(t(as.matrix(V)) * sqrt(eig))
  contrib.var <- t(t(coord.var^2)/eig) * col.w
  dist2 <- dist2.var
  cor.var <- coord.var/sqrt(dist2)
  cos2.var <- cor.var^2
  rownames(coord.var) <- rownames(cos2.var) <- rownames(cor.var) <- rownames(contrib.var) <- colnames(X)
  colnames(coord.var) <- colnames(cos2.var) <- colnames(cor.var) <- colnames(contrib.var) <- paste("Dim", 
                                                                                                   c(1:ncol(V)), sep = ".")
  res.var <- list(coord = coord.var[, 1:ncp, drop = FALSE], 
                  cor = cor.var[, 1:ncp, drop = FALSE], cos2 = cos2.var[, 
                                                                        1:ncp, drop = FALSE], contrib = contrib.var[, 1:ncp, 
                                                                                                                    drop = FALSE] * 100)
  dist2 <- dist2.ind
  cos2.ind <- coord.ind^2/dist2
  contrib.ind <- t(t(coord.ind^2 * row.w/sum(row.w))/eig)
  rownames(coord.ind) <- rownames(cos2.ind) <- rownames(contrib.ind) <- names(dist2) <- rownames(X)
  colnames(coord.ind) <- colnames(cos2.ind) <- colnames(contrib.ind) <- paste("Dim", 
                                                                              c(1:ncol(U)), sep = ".")
  res.ind <- list(coord = coord.ind[, 1:ncp, drop = FALSE], 
                  cos2 = cos2.ind[, 1:ncp, drop = FALSE], contrib = contrib.ind[, 
                                                                                1:ncp, drop = FALSE] * 100, dist = sqrt(dist2))
  res <- list(eig = vp, var = res.var, ind = res.ind, svd = tmp)
  if (!is.null(ind.sup)) {
    if (is.null(ecart.type)) ecart.type <- rep(1, length(centre))
    # nk <- tapply(row.w, class, sum)
    # centre <- sweep(t(as.matrix(dichotom(class)))%*%diag(row.w)%*%as.matrix(X.ind.sup), 1, nk, "/")
    # rownames(centre) <- levels(class)
    X.ind.sup <- X.ind.sup - centre[class[ind.sup], ]
    # X.ind.sup <- t(t(as.matrix(X.ind.sup)) - centre)
    X.ind.sup <- t(t(X.ind.sup)/ecart.type)
    coord.ind.sup <- t(t(X.ind.sup) * col.w)
    coord.ind.sup <- crossprod(t(coord.ind.sup), tmp$V)
    dist2 <- rowSums(t(t(X.ind.sup^2) * col.w))
    cos2.ind.sup <- coord.ind.sup^2/dist2
    coord.ind.sup <- coord.ind.sup[, 1:ncp, drop = F]
    cos2.ind.sup <- cos2.ind.sup[, 1:ncp, drop = F]
    colnames(coord.ind.sup) <- colnames(cos2.ind.sup) <- paste("Dim", 
                                                               c(1:ncp), sep = ".")
    rownames(coord.ind.sup) <- rownames(cos2.ind.sup) <- names(dist2) <- rownames(X.ind.sup)
    res.ind.sup <- list(coord = coord.ind.sup, cos2 = cos2.ind.sup, 
                        dist = sqrt(dist2))
    res$ind.sup <- res.ind.sup
    res.call$ind.sup <- ind.sup
  }
  if (!is.null(quanti.sup)) {
    X.quanti.sup <- as.data.frame(Xtot[, quanti.sup, drop = FALSE])
    if (!is.null(ind.sup)) 
      X.quanti.sup <- as.data.frame(X.quanti.sup[-ind.sup, 
                                                 , drop = FALSE])
    colnames(X.quanti.sup) <- colnames(Xtot)[quanti.sup]
    res.call$quanti.sup <- X.quanti.sup
    # nk <- tapply(row.w, class, sum)
    centre.sup <- sweep(t(as.matrix(dichotom(class)))%*%diag(row.w)%*%as.matrix(X.quanti.sup), 1, nk, "/")
    rownames(centre.sup) <- levels(class)
    X.quanti.sup <- X.quanti.sup - centre.sup[class, ]
    # centre.sup <- moy.ptab(X.quanti.sup, row.w)
    # X.quanti.sup <- t(t(as.matrix(X.quanti.sup)) - centre.sup)
    if (scale.unit) {
      ecart.type.sup <- ec.tab(X.quanti.sup, row.w)
      X.quanti.sup <- t(t(X.quanti.sup)/ecart.type.sup)
    }
    coord.vcs <- t(X.quanti.sup * row.w)
    coord.vcs <- crossprod(t(coord.vcs), tmp$U)
    col.w.vcs <- rep(1, ncol(coord.vcs))
    cor.vcs <- matrix(NA, ncol(X.quanti.sup), ncol(tmp$U))
    dist2 <- as.vector(crossprod(rep(1, nrow(X.quanti.sup)), 
                                 as.matrix(X.quanti.sup^2 * row.w)))
    cor.vcs <- coord.vcs/sqrt(dist2)
    cos2.vcs <- cor.vcs^2
    colnames(coord.vcs) <- colnames(cor.vcs) <- colnames(cos2.vcs) <- paste("Dim", 
                                                                            c(1:ncol(cor.vcs)), sep = ".")
    rownames(coord.vcs) <- rownames(cor.vcs) <- rownames(cos2.vcs) <- colnames(Xtot)[quanti.sup]
    res.quanti.sup <- list(coord = coord.vcs[, 1:ncp, drop = FALSE], 
                           cor = cor.vcs[, 1:ncp, drop = FALSE], cos2 = cos2.vcs[, 
                                                                                 1:ncp, drop = FALSE])
    res$quanti.sup <- res.quanti.sup
  }
  if (!is.null(quali.sup)) {
    X.quali.sup <- as.data.frame(Xtot[, quali.sup, drop = FALSE])
    if (!is.null(ind.sup)) 
      X.quali.sup <- as.data.frame(X.quali.sup[-ind.sup, 
                                               , drop = FALSE])
    colnames(X.quali.sup) <- colnames(Xtot)[quali.sup]
    nombre <- modalite <- NULL
    if (ncp > 1) 
      eta2 <- t(sapply(X.quali.sup, fct.eta2, res$ind$coord, 
                       weights = row.w))
    else {
      eta2 <- as.matrix(sapply(X.quali.sup, fct.eta2, 
                               res$ind$coord, weights = row.w), ncol = ncp)
      colnames(eta2) <- paste("Dim", 1:ncp)
      rownames(eta2) <- colnames(X.quali.sup)
    }
    for (i in 1:ncol(X.quali.sup)) {
      var <- as.factor(X.quali.sup[, i])
      n.mod <- nlevels(var)
      modalite <- c(modalite, n.mod)
      bary <- matrix(NA, n.mod, ncol(X))
      for (j in 1:n.mod) {
        ind <- levels(var)[j]
        bary[j, ] <- moy.ptab(data[which(var == ind), ], row.w[which(var == ind)])
        nombre <- c(nombre, sum(row.w.init[which(var == ind)]))
      }
      colnames(bary) <- colnames(X)
      if ((levels(var)[1] %in% (1:nrow(X))) | (levels(var)[1] %in% c("y", "Y", "n", "N"))) {
        row.names(bary) <- paste(colnames(X.quali.sup)[i], as.character(levels(var)), sep = ".")
      } else { row.names(bary) <- as.character(levels(var)) }
      if (i == 1) { barycentre <- bary
      } else { barycentre <- rbind(barycentre, bary) }
    }
    # bary <- t(t(barycentre) - centre)
    bary <- t(t(barycentre) - moy.ptab(X, row.w))
    if (!is.null(ecart.type)) 
      bary <- t(t(bary)/ecart.type)
    dist2 <- rowSums(t(t(bary^2) * col.w))
    coord.barycentre <- t(t(bary) * col.w)
    coord.barycentre <- crossprod(t(coord.barycentre), tmp$V)
    colnames(coord.barycentre) <- paste("Dim", 1:ncol(coord.barycentre), 
                                        sep = ".")
    cos2.bary.sup <- coord.barycentre^2/dist2
    vtest <- t(t(coord.barycentre)/sqrt(eig))
    if (sum(row.w.init) > 1) 
      vtest <- vtest * sqrt(nombre/((sum(row.w.init) - 
                                       nombre)/(sum(row.w.init) - 1)))
    else vtest <- vtest * sqrt(nombre)
    cos2.bary.sup <- cos2.bary.sup[, 1:ncp, drop = FALSE]
    coord.barycentre <- coord.barycentre[, 1:ncp, drop = FALSE]
    vtest <- vtest[, 1:ncp, drop = FALSE]
    dimnames(cos2.bary.sup) <- dimnames(vtest) <- dimnames(coord.barycentre)
    names(dist2) <- rownames(coord.barycentre)
    res.quali.sup <- list(coord = coord.barycentre, cos2 = cos2.bary.sup, 
                          v.test = vtest, dist = sqrt(dist2), eta2 = eta2)
    call.quali.sup <- list(quali.sup = X.quali.sup, modalite = modalite, 
                           nombre = nombre, barycentre = as.data.frame(barycentre), 
                           numero = quali.sup)
    res$quali.sup <- res.quali.sup
    res.call$quali.sup <- call.quali.sup
  }
  res$call <- res.call
  class(res) <- c("PCA", "list")
  if (graph & (ncp > 1)) {
    print(FactoMineR::plot.PCA(res, choix = "ind", axes = axes))
    print(FactoMineR::plot.PCA(res, choix = "var", axes = axes, shadowtext = TRUE, 
                   new.plot = TRUE))
  }
  
  pca <- FactoMineR::PCA(X = X, scale.unit = scale.unit, ncp = ncp, ind.sup = ind.sup, quanti.sup = quanti.sup, 
                         quali.sup = quali.sup, row.w = row.w, col.w = col.w, graph = FALSE, 
                         axes = axes)
  res$ratio <- sum(res$eig[,"eigenvalue"]) / sum(pca$eig[,"eigenvalue"])
  return(res)
}

Try the GDAtools package in your browser

Any scripts or data that you put into this service are public.

GDAtools documentation built on Oct. 6, 2023, 5:07 p.m.