R/genData.R

Defines functions genData

Documented in genData

#' @title genData
#' @description Generating a example dataset (age estimation)
#' @details The function generates a dataset based on normal distribution where age (uniformly distributed) is a underlying explanatory variable
#' 
#' @param ntot total number of observations
#' @param nsites number of sites
#' @param nbatches number of batches
#' @param propAgeAs #proportion of sites with assosiation to age
#' @param sd_batch standard dev of batch effects (normal distr effects)
#' @param sd_signal standard dev of signal (norm distr noise)
#' @param sd_bparam sd of age coefficient 
#' @param ageRange Age range of considered ages
#' @param seed The user can set seed if wanted
#' @param verbose boolean, show progress Default: FALSE
#' @return A list with variables y (response), X (design matrix belonging to fixed effects coefficients), batch (factor with batch effect names), age (true ages)
#' @export
#' @examples
#' \dontrun{ 
#' dat = genData(seed=1)
#' }
genData = function(ntot = 1000, nsites = 1000, nbatches = 5, propAgeAs=0.05, sd_batch = 0.5, sd_signal = 0.1, sd_bparam = 0.03 , ageRange = c(10,35), seed=NULL,verbose=FALSE) {
  if(!is.null(seed)) set.seed(seed) #set seed if provided

   #GENERATE AGES
  ageMid=sum(ageRange)/2 #get mid age (to centralize age when generating data)
  agei = runif(ntot,ageRange[1],ageRange[2]) #get age vector for each inds

  #GENERATE Batch levels and random effects
  batchInd = sort(sample( 1:nbatches,ntot,replace=TRUE)) #get batch index for each inds
  batchRE = rnorm(nbatches,0,sd_batch) #simulate random batch effect (per batch)
  
  ##Simulate age effect (site specific)
  nAgeAs = ceiling(propAgeAs*nsites)    #number of sites with assosiation to age
  AgeAsind = sort(sample(1:nsites,size=nAgeAs,replace=FALSE)) #get index of selected sites (to be assosiated with age)
  bj = rnorm(nAgeAs,0,sd_bparam) #simulate effect on age #site specific
  
  #Generate noise (all sites and observations)
  if(verbose) print("Generating noise...")
  epsij = Matrix::Matrix(rnorm(ntot*nsites,0,sd_signal ),nrow=ntot,ncol=nsites) #random noise per individal per marker

  #generate data on each site (X covariate)
  if(verbose) print("Including batch effect...")
  X = batchRE[batchInd] + epsij #include batch effect and noise (include batch effect on all sites)
  
  #include age effects on selected sites 
  if(verbose) print("Including age effect...")
  tmp = t(bj%*%t(agei-ageMid))
  X[,AgeAsind] = X[,AgeAsind] + tmp
#  for(j in AgeAsind)  X[,j] = X[,j] + bj[j==AgeAsind]*(agei-ageMid) #add age effect (SLOW FOR LARGE nAgeAs)
  colnames(X) = paste0("Site ",1:nsites)
  rownames(X) = paste0("ID",1:ntot) 
  
# max(abs(X[,AgeAsind[1]] - (bj[1]*(agei-ageMid) + batchRE[batchInd] +epsij[,AgeAsind[1]])))

  #SCALE AND STORE DATA:
  if(verbose) print("Standardizing data...")
  #X2 = scale(X,center=TRUE,scale=TRUE) #normalize coeffs
  
  mean = Matrix::colMeans(X)
  sumsq = Matrix::colSums(X^2) 
  sd = sqrt((sumsq - ntot*mean^2)/(ntot-1)) #get scaling
  X = Matrix::t( (Matrix::t(X)-mean)/sd ) #transpose back
  attr(X, "scaled:center") <- mean
  attr(X, "scaled:scale") <- sd
  
  agei = scale(agei,center=TRUE,scale=TRUE) #scale age
  coefAgeAs = bj #extract age related coeffs
  names(coefAgeAs) = paste0("Site ",AgeAsind)
  
  dat = list(y=agei,X=X,batch=as.factor(batchInd), coefEffect=coefAgeAs) #store data in dataframe
  #dat = data.frame(age=agej,batch=as.factor(batchInd),beta=beta) #store data in dataframe

  return(dat)
}
  
oyvble/lme1en documentation built on April 30, 2020, 2:41 p.m.