R/mixture_generator.R
In CorReg: Linear Regression Based on Linear Structure Between Variables

Documented in mixture_generator

#' Gaussian mixtures dataset generator with regression between the covariates
#' 
#' @description Generates a dataset (with an additional validation sample) made of Gaussian mixtures with some of them generated by sub-regressions on others. A response variable is then added by linear regression. This function is used to generate datasets for simulations using CorReg, or just with Gaussian Mitures.
# utilise Matrix (matrices creuses) et generateurZ
#'
#' @param n the number of individuals in the learning dataset
#' @param p the number of covariates (without the response)
#' @param ratio the ratio of covariates generated by sub-regressions on others
#' @param max_compl the number of covariates in each sub-regression
#' @param valid the number of individuals in the validation sample
#' @param positive the ratio of positive coefficients in both the regression and the sub-regressions
#' @param sigma_Y the standard deviation for the noise of the regression
#' @param sigma_X the standard deviation for the noise of the sub-regressions (all). ignored if \code{gamma=TRUE} or if \code{R2} is not NULL
#' @param R2 the strength of the sub-regressions (coefficients will be chosen to obtain this value).
#' @param R2Y the strength of the main regression (coefficients will be chosen to obtain this value).
#' @param gamma (boolean) to generate a p-sized vector \code{sigma_X} gamma-distributed
#' @param gammashape shape parameter of the gamma distribution (if needed)
#' @param gammascale scale parameter of the gamma distribution (if needed)
#' @param meanvar vector of means for the covariates.
#' @param sigmavar standard deviation of the covariates.
#' @param lambda parameter of the Poisson's law that defines the number of components in Gaussian Mixture models
#' @param Amax the maximum number of covariates with non-zero coefficients in the regression
#' @param tp1 the ratio of right-side (explicative) covariates allowed to have a non-zero coefficient in the regression
#' @param tp2 the ratio of left-side (redundant) covariates allowed to have a non-zero coefficient in the regression
#' @param tp3 the ratio of strictly independent covariates allowed to have a non-zero coefficient in the regression
#' @param lambdapois parameter used to generate the coefficient in the subregressions. Poisson's distribution.
# ' @param pb generates Y in an heuristic way that will give some issues with correlations.
#' @param nonlin to use non linear structure (squared or log). If not null, it is the proba to use power pnonlin instead of log. The type is drawn for each link between covariates
#' @param pnonlin the power used if non linear structure
#' @param scale (boolean) to scale X before computing Y
#' @param Z the binary squared adjacency matrix (size p) to obtain. If NULL it is randomly generated, based on \code{ratio} and \code{max_compl} parameters.
#' 
#' @return a list that contains:
#' \item{X_appr}{matrix of the learning set. \code{p} covariates following Gaussian Mixtures with some of them generated by sub-regressions on others.}
#' \item{Y_appr}{Response variable vector (size \code{n}) generated by linear regression on \code{X_appr} with coefficients \code{A} and residual standard deviation \code{sigma_Y}.}
#' \item{A}{vector of the of the regression generating \code{Y_appr}}
#' \item{B}{Matrix of the coefficients of sub-regressions (first line: the intercepts) then \code{B[i-1,j]} is the coefficient associated to \code{X_appr[,i]} in the sub-regression that generates \code{X_appr[,j]}}
#' \item{Z}{Binary squared adjacency matrix of size \code{p} that describes the structure of sub-regressions. \code{Z[i,j]}=1 if \code{X_appr[,i]} explains \code{X_appr[,j]}}
#' \item{X_test}{validation sample generated the same way as \code{X_appr}, with \code{valid} individuals.}
#' \item{Y_test}{Response vector associated to the validation sample}
#' \item{sigma_X}{Vector of the standard deviations of the residuals of the sub-regressions (one value for each sub-regression)}
#' \item{sigma_Y}{Standard deviation of the residual of the regression that generates \code{Y_appr} and \code{Y_test}.}
#' \item{nbcomp}{vector of the number of components for covariates that are not explained by others.}
#' 
#' @examples
#' # dataset generation
#' base = mixture_generator(n = 250, p = 4, valid = 0)
#' X_appr = base$X_appr # learning sample
#' Y_appr = base$Y_appr # response variable
#' for (i in 1:ncol(X_appr)) {
#'   hist(X_appr[, i])
#' }
#' 
#' @export
mixture_generator <- function(n = 130,
                              p = 100,
                              ratio = 0.4,
                              max_compl = 1,
                              valid = 1000,
                              positive = 0.6,
                              sigma_Y = 10,
                              sigma_X = NULL,
                              R2 = NULL,
                              R2Y = 0.4,
                              meanvar = NULL,
                              sigmavar = NULL,
                              lambda = 3, # pour l enombre de composantes des m?langes gaussiens
                              Amax = NULL,
                              lambdapois = 10, # pour les valeurs des coefs et des moyennes des variables
                              gamma = FALSE,
                              gammashape = 1,
                              gammascale = 0.5, tp1 = 1, tp2 = 1, tp3 = 1,
                              #                             pb=0,
                              nonlin = 0, pnonlin = 2, scale = TRUE, Z = NULL) {
   pb = 0
   if (is.null(Z)) {
      max_compl = min(max_compl, p - 1)
      max_compl = min(max_compl, floor(1 + (p - ratio * p)))
      
   } else {
      max_compl = max(colSums(Z))
   }
   if (is.null(R2)) {
      if (is.null(sigma_X) & !gamma) { # si R2 est nul et qu'on n'a pas de sigma X alors on le definit
         R2 = 0.99
      }
   }
   if (is.null(Amax)) {Amax = 2 * p}
   Amax = min(p + 1, Amax) # min entre p+1 et Amax  why?
   if (is.null(Z)) {
      R = round(ratio * p) # R : entier nombre de personne a gauche
   } else {
      R = sum(colSums(Z) != 0)
   }
   if (R == 0) {pb = 0}
   if (is.null(Amax) | Amax > p) {Amax = p + 1}
   # lmabda parametre le nombre de composantes des melanges gaussiens
   valid = max(valid, n)
   B = matrix(0, nrow = (p + 1), ncol = (p + 1)) # B matrice creuse
   taille = n + valid
   qui = 2:(p + 1) # vecteur de taille p qui contient les valeurs allant de 2 a p+1 avec un pas de 1
   if (R > 0) {
      if (is.null(Z)) {
         list_X2 = sample(qui, R) # melange R individus pri au hasard
      } else {
         list_X2 = which(colSums(Z) != 0)
      }
      B[1, list_X2] = rpois(R, lambdapois) * (rep(-1, R) + 2 * rbinom(R, 1, positive)) # 1ere ligne de B = ?????
      for (j in list_X2) { # remplissage aleatoire de max_compl elements de B sur chaque colonne a gauche
         if (is.null(Z)) {
            loc = (1 / max_compl) * rpois(max_compl, lambdapois) * (rep(-1, max_compl) + 2 * rbinom(max_compl, 1, positive))
            loc[loc == 0] = 1
            B[sample(qui[-c(list_X2 - 1)], size = max_compl), j] = loc
         } else {
            compl_loc = colSums(Z)[j]
            loc = (1 / compl_loc) * rpois(compl_loc, lambdapois) * (rep(-1, compl_loc) + 2 * rbinom(compl_loc, 1, positive))
            loc[loc == 0] = 1
            B[sample(qui[-c(list_X2 - 1)], size = compl_loc), j] = loc
         }
      }
      # ajout de G
      G = diag(p + 1)
      G[, list_X2] = 0
      B = B + G
   }
   vraiZ = B
   vraiZ = as.matrix(vraiZ)
   vraiZ[vraiZ != 0] = 1
   vraiZ = vraiZ - diag(diag(vraiZ))
   vraiZ = vraiZ[-1, -1]
   if (sum(c(tp1, tp2, tp3)) == 0) {
      A = rpois(p + 1, lambdapois) * (rep(-1, p + 1) + 2 * rbinom(p + 1, 1, positive))
      # on vient maintenant tailler dans A en fonction des param?tres
      A[-sample(p + 1, size = Amax)] = 0
   } else {
      A = generateurA_ou(Z = vraiZ, tp1 = tp1, tp2 = tp2, tp3 = tp3, positive = positive, lambdapois = lambdapois, pb = pb, Amax = Amax, B = B)
   }
   composantes = rpois(p - R, lambda = lambda)
   composantes[composantes > n] = n # pas plus de composantes que de points
   composantes[composantes == 0] = 1 # au moins une composante
   ploc = sum(composantes)
   
   if (is.null(meanvar)) {
      meanvar = rpois(ploc, lambdapois) * (rep(-1, ploc) + 2 * rbinom(ploc, 1, positive))
   }
   if (is.null(sigmavar)) {
      sigmavar = rpois(ploc, 5)
      sigmavar[sigmavar == 0] = 1
   }
   prop = runif(ploc)
   comp_cum = cumsum(composantes)
   comp_cum = c(0, comp_cum)
   X = matrix(0, ncol = p + 1, nrow = taille)
   epsX = X # matrice nulle
   X1g = matrix(rnorm(taille * ploc, mean = meanvar, sd = sigmavar), ncol = ploc, nrow = taille, byrow = T)
   dim(X1g)
   X1 = cbind(rep(1, times = taille), matrix(0, ncol = p - R, nrow = taille))
   for (i in 1:(p - R)) {
      prop[(comp_cum[i] + 1):comp_cum[i + 1]] = prop[(comp_cum[i] + 1):comp_cum[i + 1]] / sum(prop[(comp_cum[i] + 1):comp_cum[i + 1]])
      qui = sample(n)
      quiv = sample((n + 1):taille)
      combien = rep(1, times = composantes[i]) + floor((n - composantes[i]) * prop[(comp_cum[i] + 1):comp_cum[i + 1]])
      if (sum(combien) < n) {
         quiloc = sample(composantes[i], size = (n - sum(combien)))
         combien[quiloc] = combien[quiloc] + 1
      }
      combien = c(0, cumsum(combien))
      # idem pour la validation
      combienv = rep(1, times = composantes[i]) + floor((valid - composantes[i]) * prop[(comp_cum[i] + 1):comp_cum[i + 1]])
      if (sum(combienv) < valid) {
         quiloc = sample(composantes[i], size = (valid - sum(combienv)))
         combienv[quiloc] = combienv[quiloc] + 1
      }
      combienv = c(0, cumsum(combienv))
      for (j in 1:composantes[i]) {
         X1[qui[(combien[j] + 1):combien[j + 1]], 1 + i] = X1g[(combien[j] + 1):combien[j + 1], comp_cum[i] + j]
         X1[quiv[(combienv[j] + 1):combienv[j + 1]], 1 + i] = X1g[(combien[j + 1] + combienv[j] + 1):(combien[j + 1] + combienv[j + 1]), comp_cum[i] + j]
      }
   }
   rm(X1g)
   if (R > 0) {
      if (scale) {
         X1[, -1] = scale(X1[, -1])
      }
      X[, -list_X2] = X1
      if (scale) {
         for (j in list_X2) {
            B[, j] = B[, j] / sqrt(sum((B[, j])^2))
         }
      }
      if (!is.null(R2)) {
         sigma_X = sqrt(((1 - R2) * apply(X1 %*% B[-list_X2, list_X2], 2, var)) / R2)
      } else if (gamma) {
         sigma_X = rgamma(R, shape = gammashape, scale = gammascale)
      }
      epsX[, list_X2] = matrix(rnorm(taille * R, mean = rep(0, times = R), sd = sigma_X), ncol = R, nrow = taille, byrow = T)
      if (nonlin == 0) {
         X = X %*% B + epsX
      } else if (nonlin > 0) {
         for (i in list_X2) {
            X[, i] = B[1, i] # on prend la constante
            for (j in which(vraiZ[, i - 1] != 0)) { # pour chaque variable a droite
               j = j + 1
               if (runif(1) < nonlin) {
                  X[, i] = X[, i] + B[j, i] * X[, j]^pnonlin
               } else {
                  X[, i] = X[, i] + B[j, i] * log(abs(X[, j]))
               }
            }
         }
         X = X + epsX
      } else {
         for (i in list_X2) {
            X[, i] = B[1, i] # on prend la constante
            for (j in which(vraiZ[, i - 1] != 0)) { # pour chaque variable a droite
               j = j + 1
               X[, i] = X[, i] * B[j, i] * X[, j]
            }
         }
         X = X + epsX
      }
   } else {
      X = X1
   }
   X = as.matrix(X)
   
   if (!is.null(R2Y)) { # le calcul tient donc compte du scaling
      R2Y = min(R2Y, 1)
      R2Y = max(R2Y, 0)
      sigma_Y = sqrt(((1 - R2Y) * apply(X %*% A, 2, var)) / R2Y)
   }
   
   Y = X %*% A + rnorm(taille, mean = 0, sd = sigma_Y) # pas de scale sur Y, sinon on perd le vrai A
   
   X_appr = as.matrix(X[1:n, -1])
   X_test = as.matrix(X[(n + 1):taille, -1])
   Y_appr = as.matrix(Y[1:n])
   Y_test = as.matrix(Y[(n + 1):taille])
   return(list(X_appr = X_appr, Y_appr = Y_appr, A = A, B = B[, -1], Z = vraiZ,
               X_test = data.frame(X_test), Y_test = Y_test, sigma_X = sigma_X, sigma_Y = sigma_Y, nbcomp = composantes))
}