R/generate.rsdata.R
In ccrs: Correct and Cluster Response Style Biased Data

Documented in generate.rsdata

#' Simulate preference data to apply CCRS
#'
#' @description Simulates artificial preference data containing content-based (and response-style-based) clusters.
#' @usage generate.rsdata(n=n,m=m,q=q,K.true=K.true,H.true=NULL,clustered.rs=FALSE,
#'               cls.cont.vec=NULL,cls.rs.vec=NULL,savedata=FALSE)
#' @param n An integer indicating the number of respondents.
#' @param m An integer indicating the number of items.
#' @param q An integer indicating the maximum rating.
#' @param K.true An integer indicating the true number of content-based clusters for n respondents.
#' @param H.true An integer indicating the true number of response-style-based clusters for n respondents. This is needed when \code{clustered.rs=TRUE}.
#' @param clustered.rs A logical value indicating whether response-style-based cluster structure exists in generated data. If \code{TRUE}, coefficients of I-spline are generated by response-style-based clusters. The default is \code{clustered.rs=FALSE}.
#' @param cls.cont.vec A vector of integers (from 1:K.true) of length n indicating the content-based cluster to which each respondent is allocated in artificial data. If it's \code{NULL}, it is generated automatically.
#' @param cls.rs.vec A vector of integers (from 1:H.true) of length n indicating the response-style-based clusters. If it's \code{NULL} and clustered.rs==T, it is generated randomly.
#' @param savedata A logical value indicating whether artificial data are saved as csv files. The default is \code{savedata=FALSE}.
#' @return A list with the following elements:
#' \item{\code{X}}{An n by m matrix of categorical variables.}
#' \item{\code{X.star}}{An n by m matrix of true preference data \code{X^*}.}
#' \item{\code{X.nors}}{An n by m matrix of categorical variables transformed by reference boundaries.}
#' \item{\code{cls.cont.vec}}{A vector of integers (from 1:H.true) indicating content-based clusters used to generate artificial data.}
#' \item{\code{cls.rs.vec}}{A vector of integers (from 1:H.true) indicating response-style-based clusters used to generate artificial data.}
#' @seealso \code{\link{create.ccrsdata}}
#' @references Takagishi, M., Velden, M. van de & Yadohisa, H. (2019). Clustering preference data in the presence of response style bias, to appear in British Journal of Mathematical and Statistical Psychology.
#' @importFrom msm rtnorm
#' @importFrom cds ispline
#' @importFrom stats runif rnorm
#' @importFrom utils write.csv
#' @examples
#' #data setting
#' n <- 30 ; m <- 10 ; H.true <- 2 ; K.true <- 2 ; q <- 5
#' datagene <- generate.rsdata(n=n,m=m,K.true=K.true,H.true=H.true,q=q,clustered.rs = TRUE)
#' #obtain n x m data matrix
#' X <- datagene$X
#' @export



generate.rsdata <- function(n=n,m=m,q=q,K.true=K.true,H.true=NULL,clustered.rs=FALSE,
                          cls.cont.vec=NULL,cls.rs.vec=NULL,savedata=FALSE) {

  ######setting that's been fixed
  sig.Xstar <- 0.1
  sig.xi <- 0.01
  sig <- 0.1
  beta.order<-c(1,2,4,5,3)
  Km <- 5 ; noisem <- 2
  q4 <- 1/4

  if(is.null(cls.cont.vec)){
    cls.cont.vec <- rep(1:K.true,(n/K.true))
  }

  if(is.null(cls.rs.vec)){
    if(clustered.rs){
      cls.tr.b4narabi <- rep(1:H.true,rep((n/H.true),H.true))
      clsnarabi <- sample(x=n,size=n,replace=FALSE)
      cls.rs.vec <- cls.tr.b4narabi[clsnarabi]
    }else{
      cls.rs.vec <- seq(1,n)
    }
  }

  if(clustered.rs){
    Beta.tr <- matrix(runif((n*3),0,1),n,3)
    Beta.tr <- t(apply(Beta.tr,1,function(x){(x/sum(x))}))
  }else{####start clustered r.s.######
    #1:no rs, 2:ac, 3:dis, 4:ext, 5:mid
    rscls<-seq(1,5)[1:H.true]
    Beta.tr <- matrix(0,H.true,3)

    #####specify beta#######
    Betasp.all <- matrix(0,5,2)
    Betasp.all[beta.order[2],] <- c(3.9,0.1) #ac
    Betasp.all[beta.order[3],] <- c(0.1,3.9) #dis
    Betasp.all[beta.order[4],] <- c(0.1,0.1) #ext
    Betasp.all[beta.order[5],] <- c(98,98) #mid

    #####function generating beta#####
    beta.gene<-function(x){
      x1 <- x[1]
      x2 <- x[2]
      a2 <- solve((1/x1)+1+(1/x2))
      a1 <- a2/x1
      a3 <- a2/x2
      return(c(a1,a2,a3))
    }

    ###beta except no r.s.
    Betasp<-matrix(0,(H.true-1),2)
    Betasp<-Betasp.all[rscls[rscls!=1],]

    if(any(rscls==1)){
      if(H.true!=1){
        if(H.true==2){
          Beta.tr<-matrix(beta.gene(Betasp),(H.true-1),3,byrow=TRUE)
        }else{
          Beta.tr<-matrix(apply(Betasp,1,beta.gene),(H.true-1),3,byrow=TRUE)
        }
        Beta.tr<-rbind(q4*c(1,2,1),Beta.tr)
      }else{
        Beta.tr<-q4*c(1,2,1)
      }
    }else{
      Beta.tr <- matrix(apply(Betasp,1,beta.gene),(H.true),3,byrow=TRUE)
    }
  }#####end clustered beta#####

  ###setting rs for middle and extreme
  b2.hand <- matrix(0,2,q)
  if(q==5){
    b2.hand[1,]<-c(0.35,0.49,0.51,0.65,1)  #rs extreme
    b2.hand[2,]<-c(0.01,0.21,0.79,0.99,1)  #rs middle
  }else if(q==7){
    b2.hand[1,]<-c(0.26,0.46,0.49,0.51,0.54,0.74,1)  #rs extreme
    b2.hand[2,]<-c(0.01,0.04,0.28,0.72,0.96,0.99,1)  #rs middle
  }

  #################### make cluster structure ##########################
  Xi <- matrix(0,K.true,m)

  ###grouping variables
  nm <- 0 ; nom <- m-nm
  if(nm==0){
    mcls<-c(rep((nom/Km),Km))
    mvar.grp<-rep(seq(1,Km),mcls)
  }else{
    mcls<-c(rep((nom/Km),Km),nm)
    mvar.grp<-rep(seq(1,(Km+1)),mcls)
  }

  ####upper and lower at each level#####
  UL.m<-matrix(0,Km,2)

  ###divide [0,1] interval by Km
  intbegin <- 1/Km
  intvec <- seq(0,1,by=intbegin)

  for(k in 1:Km){
    #add sig
    if(k==1){
      UL.m[k,]<-c((intvec[k]),(intvec[k+1]+sig))
    }else if(k==Km){
      UL.m[k,]<-c((intvec[k]-sig),(intvec[k+1]))
    }else if(k==(Km+1)){
      UL.m[k,]<-c(0,1)
      }else{
      UL.m[k,]<-c((intvec[k]-sig),(intvec[k+1]+sig))
    }
  }

  #########including noise variables##########
  #"6 (Km+1)" is allocated to noise variables (which variable is noise is random)
  noisevari.g <- sample(Km,noisem,replace=FALSE) #decide variable group by sample
  noisevari <- which(mvar.grp %in% noisevari.g) #take out each number of variables in noise group
  mvar.grp[mvar.grp %in% noisevari.g] <- Km+1

  #######make Level (K.true x Km)##########
  ######kth row elements indicate the interval (level) corresponding to kth cluster
  #(1:low, 2:middle, 3:high)

  ######function to make Level
  createlevel <- function(Level) {
    Level[1,] <- seq(1,Km)
    if(K.true>1){
      levelvec<-sample((Km-1),(K.true-1),replace=FALSE)
      llee<-1
      for(llee in 1:length(levelvec)){
        rlev<-levelvec[llee]+1
        Level[(llee+1),]<-seq(1,Km)[c(seq(rlev,Km),seq(1,(rlev-1)))]
      }}
    return(Level)
  }

  #####make Level. In K.true=2, if both 1&5 coexist in the same column (variable group), do it again.
  Level <- matrix(0,K.true,Km)
  Level <- createlevel(Level)
  Level[,noisevari.g] <- (Km+1) ##insert noise number indicating which vari is noise

  for(k in 1:K.true){
    for(le in 1:(Km)){
      levv <- Level[k,le]
      v.k <- m/Km
      if(levv!=(Km+1)){ #mean of the interval is mean of normal
      Xi[k,which(mvar.grp==le)] <- rnorm(v.k,mean=mean(UL.m[levv,]),sd=sig.xi)
      }
    }
  }

  ###trunced normal
  X.star <- matrix(0,n,m)
  for(i in 1:n){
    k<-cls.cont.vec[i]
    X.star[i,-noisevari] <- Xi[k,-noisevari]+rtnorm((m-length(noisevari)),mean=0,sd=sig.Xstar,lower=-1+sig,upper=1-sig)
    X.star[i,noisevari] <- runif(length(noisevari),0,1)
  }


  #############################################
  ####create response style#####
  #############################################
  scales<-seq(1,q)
  x.bounds<-c(0:q)/q
  tvec<-c(0, 0.5, 1)
  Mmat.01 <- ispline(x.bounds, tvec = tvec, intercept = FALSE)

  r.s.func<-function(x){
    b2<-Mmat.01%*%(x/sum(x))
    return(b2)
  }

  if(H.true!=1){
    b2<-apply(t(Beta.tr),2,r.s.func)
  }else{
    b2<-r.s.func(Beta.tr)
  }

  used.rs.num<-seq(1:5)[which(beta.order %in% c(1:H.true))]

  handrs <- c(4,5)
  if(any(used.rs.num %in% handrs)){
    used.hand.beta<-which(used.rs.num %in% handrs)
    which.hand<-which(handrs %in% used.rs.num)
    b2[-1,used.hand.beta]<-t(b2.hand[which.hand,])
  }

  b2[1,] <- -Inf ; b2[(nrow(b2)),] <- Inf

  ######transform data to r.s. biased data########
  X <- matrix(NA,n,m)

  for(k in 1:H.true){
    ind.cls <- which(cls.rs.vec==k)
    X[ind.cls,] <- as.numeric(cut(X.star[ind.cls,], breaks = b2[,k], labels = seq(1,q)))
  }

  b.nors<-r.s.func(c(q4,(q4*2),q4))
  b.nors[1,] <- -Inf
  b.nors[(nrow(b.nors)),] <- Inf

  X.nors <- matrix(as.numeric(cut(X.star, breaks =b.nors , labels = seq(1,q))),n,m)

  #----------------------------------------------------------------------------------------
  #######save data#############
  #----------------------------------------------------------------------------------------

  if(savedata) {

    colnames(X) <- lapply(c(1:m),function(x){paste("X",x,sep="")})
    colnames(X.star) <- lapply(c(1:m),function(x){paste("X.star",x,sep="")})
    #colnames(Level) <- lapply(c(1:Km),function(x){paste("Level",x,sep="")})
    #colnames(Xi) <- lapply(c(1:m),function(x){paste("Xi",x,sep="")})

    location.data <- "crssdata.csv"
    datalist.n <- cbind(X,X.star,cls.rs.vec,cls.cont.vec)
    write.csv(datalist.n,location.data)

    #others
    #location.data2 <- "crssdata.others.csv"
    #datalist.nigai <- cbind(Level,Xi)
    #write.csv(datalist.nigai, location.data2)

  }

  return(list(X=X,X.star=X.star,X.nors=X.nors,#,Level=Level
              cls.cont.vec=cls.cont.vec,cls.rs.vec=cls.rs.vec))
}