Nothing
#' Gaussian mixtures dataset generator with regression between the covariates
#' @description Generates a dataset (with an additional validation sample) made of Gaussian mixtures with some of them generated by sub-regressions on others. A response variable is then added by linear regression. This function is used to generate datasets for simulations using CorReg, or just with Gaussian Mitures.
#utilise Matrix (matrices creuses) et generateurZ
#' @param n the number of individuals in the learning dataset
#' @param p the number of covariates (without the response)
#' @param ratio the ratio of covariates generated by sub-regressions on others
#' @param max_compl the number of covariates in each sub-regression
#' @param valid the number of individuals in the validation sample
#' @param positive the ratio of positive coefficients in both the regression and the sub-regressions
#' @param sigma_Y the standard deviation for the noise of the regression
#' @param sigma_X the standard deviation for the noise of the sub-regressions (all). ignored if \code{gamma=TRUE} or if \code{R2} is not NULL
#' @param R2 the strength of the sub-regressions (coefficients will be chosen to obtain this value).
#' @param R2Y the strength of the main regression (coefficients will be chosen to obtain this value).
#' @param gamma (boolean) to generate a p-sized vector \code{sigma_X} gamma-distributed
#' @param gammashape shape parameter of the gamma distribution (if needed)
#' @param gammascale scale parameter of the gamma distribution (if needed)
#' @param meanvar vector of means for the covariates.
#' @param sigmavar standard deviation of the covariates.
#' @param lambda parameter of the Poisson's law that defines the number of components in Gaussian Mixture models
#' @param Amax the maximum number of covariates with non-zero coefficients in the regression
#' @param tp1 the ratio of right-side (explicative) covariates allowed to have a non-zero coefficient in the regression
#' @param tp2 the ratio of left-side (redundant) covariates allowed to have a non-zero coefficient in the regression
#' @param tp3 the ratio of strictly independent covariates allowed to have a non-zero coefficient in the regression
#' @param lambdapois parameter used to generate the coefficient in the subregressions. Poisson's distribution.
# ' @param pb generates Y in an heuristic way that will give some issues with correlations.
#' @param nonlin to use non linear structure (squared or log). If not null, it is the proba to use power pnonlin instead of log. The type is drawn for each link between covariates
#' @param pnonlin the power used if non linear structure
#' @param scale (boolean) to scale X before computing Y
#' @param Z the binary squared adjacency matrix (size p) to obtain. If NULL it is randomly generated, based on \code{ratio} and \code{max_compl} parameters.
#' @export
#'
#' @return a list that contains:
#' \item{X_appr}{matrix of the learning set. \code{p} covariates following Gaussian Mixtures with some of them generated by sub-regressions on others.}
#' \item{Y_appr}{Response variable vector (size \code{n}) generated by linear regression on \code{X_appr} with coefficients \code{A} and residual standard deviation \code{sigma_Y}.}
#' \item{A}{vector of the of the regression generating \code{Y_appr}}
#' \item{B}{Matrix of the coefficients of sub-regressions (first line : the intercepts) then \code{B[i-1,j]} is the coefficient associated to \code{X_appr[,i]} in the sub-regression that generates \code{X_appr[,j]}}
#' \item{Z}{Binary squared adjacency matrix of size \code{p} that describes the structure of sub-regressions. \code{Z[i,j]}=1 if \code{X_appr[,i]} explains \code{X_appr[,j]}}
#' \item{X_test}{validation sample generated the same way as \code{X_appr}, with \code{valid} individuals.}
#' \item{Y_test}{Response vector associated to the validation sample}
#' \item{sigma_X}{Vector of the standard deviations of the residuals of the sub-regressions (one value for each sub-regression)}
#' \item{sigma_Y}{Standard deviation of the residual of the regression that generates \code{Y_appr} and \code{Y_test}.}
#' \item{nbcomp}{vector of the number of components for covariates that are not explained by others.}
#' @examples
#' require(CorReg)
#' #dataset generation
#' base=mixture_generator(n=250,p=4,valid=0)
#' X_appr=base$X_appr #learning sample
#' Y_appr=base$Y_appr#response variable
#' for(i in 1:ncol(X_appr)){
#' hist(X_appr[,i])
#' }
#'
mixture_generator<-function(n=130,
p=100,
ratio=0.4,
max_compl=1,
valid=1000,
positive=0.6,
sigma_Y=10,
sigma_X=NULL,
R2=NULL,
R2Y=0.4,
meanvar=NULL,
sigmavar=NULL,
lambda=3,#pour l enombre de composantes des m?langes gaussiens
Amax=NULL,
lambdapois=10,#pour les valeurs des coefs et des moyennes des variables
gamma=FALSE,
gammashape=1,
gammascale=0.5,tp1=1,tp2=1,tp3=1,
# pb=0,
nonlin=0,pnonlin=2,scale=TRUE,Z=NULL
){
pb=0
if(is.null(Z)){
max_compl=min(max_compl,p-1)
max_compl=min(max_compl,floor(1+(p-ratio*p)))
}else{
max_compl=max(colSums(Z))
}
if(is.null(R2)){
if(is.null(sigma_X) & !gamma){#si R2 est nul et qu'on n'a pas de sigma X alors on le definit
R2=0.99
}
}
if(is.null(Amax)){Amax=2*p}
Amax=min(p+1,Amax) # min entre p+1 et Amax why?
if(is.null(Z)){
R=round(ratio*p) # R : entier nombre de personne a gauche
}else{
R=sum(colSums(Z)!=0)
}
if(R==0){pb=0}
if(is.null(Amax) | Amax>p){Amax=p+1}
#lmabda parametre le nombre de composantes des melanges gaussiens
valid=max(valid,n)
B=matrix(0,nrow=(p+1),ncol=(p+1)) #B matrice creuse
taille=n+valid
qui=2:(p+1) # vecteur de taille p qui contient les valeurs allant de 2 a p+1 avec un pas de 1
if(R>0){
if(is.null(Z)){
list_X2=sample(qui,R) # melange R individus pri au hasard
}else{
list_X2=which(colSums(Z)!=0)
}
B[1,list_X2]=rpois(R,lambdapois)*(rep(-1,R)+2*rbinom(R,1,positive)) # 1ere ligne de B = ?????
for(j in list_X2){#remplissage aleatoire de max_compl elements de B sur chaque colonne a gauche
if(is.null(Z)){
loc=(1/max_compl)*rpois(max_compl,lambdapois)*(rep(-1,max_compl)+2*rbinom(max_compl,1,positive))
loc[loc==0]=1
B[sample(qui[-c(list_X2-1)],size=max_compl),j]=loc
}else{
compl_loc=colSums(Z)[j]
loc=(1/compl_loc)*rpois(compl_loc,lambdapois)*(rep(-1,compl_loc)+2*rbinom(compl_loc,1,positive))
loc[loc==0]=1
B[sample(qui[-c(list_X2-1)],size=compl_loc),j]=loc
}
}
#ajout de G
G=diag(p+1)
G[,list_X2]=0
B=B+G
}
vraiZ=B
vraiZ=as.matrix(vraiZ)
vraiZ[vraiZ!=0]=1
vraiZ=vraiZ-diag(diag(vraiZ))
vraiZ=vraiZ[-1,-1]
if(sum(c(tp1,tp2,tp3))==0){
A=rpois(p+1,lambdapois)*(rep(-1,p+1)+2*rbinom(p+1,1,positive))
#on vient maintenant tailler dans A en fonction des param?tres
A[-sample(p+1,size=Amax)]=0
}else{
A=generateurA_ou(Z=vraiZ,tp1=tp1,tp2=tp2,tp3=tp3,positive=positive,lambdapois=lambdapois,pb=pb,Amax=Amax,B=B)
}
composantes=rpois(p-R,lambda=lambda)
composantes[composantes>n]=n #pas plus de composantes que de points
composantes[composantes==0]=1#au moins une composante
ploc=sum(composantes)
if(is.null(meanvar)){
meanvar=rpois(ploc,lambdapois)*(rep(-1,ploc)+2*rbinom(ploc,1,positive))
}
if(is.null(sigmavar)){
sigmavar=rpois(ploc,5)
sigmavar[sigmavar==0]=1
}
prop=runif(ploc)
comp_cum=cumsum(composantes)
comp_cum=c(0,comp_cum)
X=matrix(0,ncol=p+1,nrow=taille)
epsX=X#matrice nulle
X1g=matrix(rnorm(taille*ploc,mean=meanvar,sd=sigmavar),ncol=ploc,nrow=taille,byrow=T)
dim(X1g)
X1=cbind(rep(1,times=taille),matrix(0,ncol=p-R,nrow=taille))
for(i in 1:(p-R)){
prop[(comp_cum[i]+1):comp_cum[i+1]]=prop[(comp_cum[i]+1):comp_cum[i+1]]/sum(prop[(comp_cum[i]+1):comp_cum[i+1]])
qui=sample(n)
quiv=sample((n+1):taille)
combien=rep(1,times=composantes[i])+floor((n-composantes[i])*prop[(comp_cum[i]+1):comp_cum[i+1]])
if(sum(combien)<n){
quiloc=sample(composantes[i],size=(n-sum(combien)))
combien[quiloc]=combien[quiloc]+1
}
combien=c(0,cumsum(combien))
#idem pour la validation
combienv=rep(1,times=composantes[i])+floor((valid-composantes[i])*prop[(comp_cum[i]+1):comp_cum[i+1]])
if(sum(combienv)<valid){
quiloc=sample(composantes[i],size=(valid-sum(combienv)))
combienv[quiloc]=combienv[quiloc]+1
}
combienv=c(0,cumsum(combienv))
for (j in 1:composantes[i]){
X1[qui[(combien[j]+1):combien[j+1]],1+i]=X1g[(combien[j]+1):combien[j+1],comp_cum[i]+j]
X1[quiv[(combienv[j]+1):combienv[j+1]],1+i]=X1g[(combien[j+1]+combienv[j]+1):(combien[j+1]+combienv[j+1]),comp_cum[i]+j]
}
}
rm(X1g)
if(R>0){
if (scale){
X1[,-1]=scale(X1[,-1])
}
X[,-list_X2]=X1
if(scale){
for(j in list_X2){
B[,j]=B[,j]/sqrt(sum((B[,j])^2))
}
}
if(!is.null(R2)){
sigma_X=sqrt(((1-R2)*apply(X1%*%B[-list_X2,list_X2],2,var))/R2)
}else if(gamma){
sigma_X=rgamma(R,shape=gammashape,scale=gammascale)
}
epsX[,list_X2]=matrix(rnorm(taille*R,mean=rep(0,times=R),sd=sigma_X),ncol=R,nrow=taille,byrow=T)
if(nonlin==0){
X=X%*%B+epsX
}else if (nonlin>0){
for (i in list_X2){
X[,i]=B[1,i]#on prend la constante
for (j in which(vraiZ[,i-1]!=0)){#pour chaque variable a droite
j=j+1
if(runif(1)<nonlin){
X[,i]= X[,i]+B[j,i]*X[,j]^pnonlin
}else{
X[,i]= X[,i]+B[j,i]*log(abs(X[,j]))
}
}
}
X=X+epsX
}else{
for (i in list_X2){
X[,i]=B[1,i]#on prend la constante
for (j in which(vraiZ[,i-1]!=0)){#pour chaque variable a droite
j=j+1
X[,i]= X[,i]*B[j,i]*X[,j]
}
}
X=X+epsX
}
}else{
X=X1
}
X=as.matrix(X)
if(!is.null(R2Y)){# le calcul tient donc compte du scaling
R2Y=min(R2Y,1)
R2Y=max(R2Y,0)
sigma_Y=sqrt(((1-R2Y)*apply(X%*%A,2,var))/R2Y)
}
Y=X%*%A+rnorm(taille,mean=0,sd=sigma_Y)#pas de scale sur Y, sinon on perd le vrai A
X_appr=as.matrix(X[1:n,-1])
X_test=as.matrix(X[(n+1):taille,-1])
Y_appr=as.matrix(Y[1:n])
Y_test=as.matrix(Y[(n+1):taille])
return(list(X_appr=X_appr,Y_appr=Y_appr,A=A,B=B[,-1],Z=vraiZ,
X_test=data.frame(X_test),Y_test=Y_test,sigma_X=sigma_X,sigma_Y=sigma_Y,nbcomp=composantes))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.