R/saMinCV.R

Defines functions saMinCV

Documented in saMinCV

saMinCV <- function(
  x,
  label,
  targetCV,
  sampleSize,
  weightMatrix,            # missing handled
  iterations=1000,
  sampleSizeIterations=3,
  recalcIterations=10000,
  locationAdjustment,
  scaleAdjustment,
  p = 2,                   # l2 norm of the penalty function
  penalty = -1,            # negative penalties are ignored
  cooling = 0,
  preserveSatisfied=TRUE,
  fpc=TRUE
  ) {

  tolSize <- 1 # not used


  # check if X is a matrix
  if( !is.matrix(x) ) {
    stop("x must be a matrix") 
  }
 

  Cprog <- proc.time()
  
  # transform label  
  unique.label <- sort(unique(label))
  rlabel <- sapply(label, function(x){ which(x == unique.label) } ) 
  rlabel <- rlabel - 1


  # get rows etc... 
  N <- length(label)            # number of observations
  K <- ncol(x)                  # number of distinct characteristics  (independent subsets of the covariance matrix)
  R <- length(c(x)) / (N * K)   # replications of each of the distinct characteristics 
  H <- length(unique.label)     # number of strata
  
  # get colnames for x 
  if( is.null(colnames(x)) ) {
    x.colnames <- sprintf("%d",1:K)
  } else {
    x.colnames <- colnames(x)
  }
  
  #################### PROBABILITY ######################################
 
  # get prob
  if( missing(weightMatrix) ) {
    weightMatrix <- matrix( 1/N, nrow=N, ncol=H)
  }

  # check prob
  if(( nrow(weightMatrix) != N) | (ncol(weightMatrix) != H )) {
    stop( sprintf("probMatrixability matrix (weightMatrix) does not have correct dimensions\n Needed (nrow = %d, ncol = %d), provided (nrow = %d, ncol = %d)\n",
                 N, K, nrow(weightMatrix), ncol(weightMatrix) ) ) 
  }

  # get max prob
  prob <- apply(cbind(rlabel,weightMatrix), 1, function(x) max(x[-1* c(1, x[1]+2)]) ) 

  totalProb <- sum(prob)
  
  # create row major matrix for input to C program
  weightMatrix <- c(t(weightMatrix))
  
  #################### LOCATION ADJUSTMENT ######################################
 
  # location adjustment is N x K  
  if( missing( locationAdjustment ) ) {
    C_locationAdjustment = -1
  } else {
    C_locationAdjustment = c(locationAdjustment) 
  }
 

  #################### SCALE ADJUSTMENT #########################################
 
  # location adjustment is N x K  
  if( missing( scaleAdjustment ) ) {
    C_scaleAdjustment = -1
  } else {
    C_scaleAdjustment = c(scaleAdjustment) 
  }
  
  
  #################### DOMAIN MATRIX ######################################
  # it is assumed that the domain matrix is a list of matricies of dim K x J of length H
  # where K is the number of coi, 
  # J is the number of domains 
  # and H is the number of strata
  # if it is missing this list will be substituted with a list of identity matricies.

  #if( missing(domainMatrixList) ) {
    J <- K
    domainMatrixList.names <- x.colnames
    domainMatrix <- rep(c(diag(K)),H)
#  } else {
#    domainMatrixList.names <- names(domainMatrixList) 
#    J <- ncol(domainMatrixList[[1]])
#    if( J < K) stop("Error J < K")
#    if( J == K) {
#      domainMatrix <- rep(c(diag(K)),H)
#    } else {
#      domainMatrixTmp <- c()
#      for( h in 1:H ) domainMatrixTmp <- c(domainMatrixTmp, c( t(domainMatrixList[[h]]) ))
#      domainMatrix <- domainMatrixTmp 
#    }
#
#  }

  #################### PENALTY ######################################

  if( sum( penalty < 0) >= 1 ) {
    penalty <- rep(0,J)
  } else {

    if( length(penalty) == 1 )  {
      penalty <- rep(penalty,J)
    } else {
      if( length(penalty) != J ) {
        stop("penalty is not of length 1 or J")
      } 
    }

  }
  
  #################### INITIAL SAMPLE SIZE ######################################
  
  # handle sample size 
  if( length(sampleSize) > 1 ) {

    # stop if the sampleSize length does not equal the number of strata 
    if( H != length(sampleSize) ) {
      stop( "H != length(sampleSize)" )  
    # warning if there are no names 
    } else if( length(names(sampleSize)) == 0 ) {
      #warning( "sampleSize has no names, assuming that the sample sizes are in asscending order with respect to the strata in label." ) 
    # error if there are names but don't match what we find in unique.label
    } else if( !identical( sort(as.numeric(names(sampleSize))), unique.label)  ) {
      stop( "sampleSize names do match elements in label" ) 
    # at this point everthing seems ok, so reorder the sample size
    } else {
      sampleSize <- sampleSize[ as.character( unique.label ) ] 
    }


  } else if(length(sampleSize) == 1) {

    sampleSize.n <- sampleSize # make a copy of sampleSize

    sampleSize <- rep(floor(sampleSize/H) , H)

    if( sum(sampleSize) < sampleSize.n ) {
      sampleSizeAddTo <- sample(1:H,size=sampleSize.n - sum(sampleSize))
      sampleSize[ sampleSizeAddTo ] <- sampleSize[ sampleSizeAddTo ] + 1  
    }
    sampleSize <- sapply(sampleSize, function(x) max(2,x) )

  } else {
      stop( "0 == length(sampleSize)" ) 
  }
  # final sample size check 
  if( sum(sampleSize < 2) > 0 ) stop("Each element of sampleSize must be of at least 2")
  
  
  #################### ITERATIONS ######################################
  
  if(iterations < 0 ) stop("Iterations is not positive") 
  if( round(iterations) != iterations ) stop("Iterations is not an integer ") 
 

  #################### SAMPLE ITERATIONS ######################################
  
  if(sampleSizeIterations < 0 ) stop("sampleSizeIterations is not positive") 
  if( round(sampleSizeIterations) != sampleSizeIterations ) stop("sampleSizeIterations is not an integer ") 
 

  #################### SAMPLE SIZE ######################################
  
  if(recalcIterations < 0 ) stop("recalcIterations is not positive") 
  if( round(recalcIterations) != recalcIterations ) stop("recalcIterations is not an integer ") 
  
  #################### TARGET CV ######################################
  # handle sample size 

  # stop if the sampleSize lenght does not equal the number of distinct elements
  if( J != length(targetCV) ) {
    stop( "ncol(x) != length(targetCV)" )  
  # warning if there are no names 
  } else if( length(names(targetCV)) == 0 ) {
    #warning( "targetCV has no names, assuming that the targetCV are in the same order as x." ) 
  # error if there are names but don't match what we find in unique.label
  #} else if( !identical( sort(names(targetCV)), sort(domainMatrixList.names))  ) {
  #  stop( "targetCV names do match column names of domainMatrix" ) 
  } else if( anyDuplicated( names(targetCV) ) != 0  ) {
    stop( "targetCV names do exist but are not unique" ) 
  # at this point everthing seems ok, so reorder the sample size
  } else {
    #targetCV <- targetCV[ domainMatrixList.names ] 
  }
  
  costChangeSize <- 6 + H + J  
  
  # need to re-label
  acceptRate <- rep(0,costChangeSize*(iterations+1)) 

#  print(paste( "N", N))
#  print(paste( "l(x)", length(c(x))))
#  print(paste( "K", K))
#  print(paste( "R", R))
#  print(paste( "H", H))
  print(paste( "penalty: ", penalty))
  print(paste( "targetCV: ", targetCV))


  #################### RUN C FUNCTION ######################################
  r.result <- .C("R_minCV",
    as.double(c(x)),                  # 1 
    as.integer(N),                    # 2 
    as.integer(J),                    # 2
    as.integer(K),                    # 4 
    as.integer(H),                    # 5
    as.integer(R),                    # 6
    as.integer(sampleSizeIterations), # 7     
    as.integer(rlabel),               # 8 
    as.integer(domainMatrix),         # 9
    as.double(prob),                  #10
    as.double(weightMatrix),          #11
    as.double(targetCV),              #12
    as.double(C_locationAdjustment),    #13
    as.double(C_scaleAdjustment),       #14
    as.double(p),                     #15
    as.double(penalty),               #16
    as.double(sampleSize),            #17
    as.double(acceptRate),            #18
    as.double(cooling),                  #19
    as.integer(recalcIterations),     #20  /* how often to update the counter */
    as.integer(costChangeSize),       #21
    as.integer(iterations),           #22     
    as.integer(preserveSatisfied),    #23
    as.integer(fpc)                   #24
  )

  runTime <- proc.time() - Cprog
 

  #################### RETURN DATA ######################################

  ## change and other accept data
  a <- matrix(unlist(r.result[18]),ncol=costChangeSize ,byrow=T)
  colnames(a) <- c( 'change', 'U', 'T','selected','from','to', 
                   sprintf("n_%d",unique.label), 
                   x.colnames
                   )

  
  a.names <- c(colnames(a),'accepted')
  a <- cbind(a, as.numeric(a[,'U'] <= a[,'T']))
  colnames(a) <- a.names


  ## label
  newRlabel <- sapply(unlist(r.result[8]), function(x) unique.label[x+1] ) 
   
  ## sample size 
  newSampleSize <- unlist(r.result[17]) 
  names(sampleSize) <- sprintf("n_%d",unique.label)
  names(newSampleSize) <- sprintf("n_%d",unique.label)

  ## final and initial CVs
  if( missing(locationAdjustment) & missing(scaleAdjustment) ) { 
    print("no adjustment")
    CVStart  <- .cv2( x, rlabel, sampleSize, average=(R!=1), fpc=fpc)
    CV  <- .cv2( x, newRlabel, newSampleSize, average=(R!=1),fpc=fpc )
  } else if( missing(locationAdjustment) )  { 
    print("just scale adjustment")
    CVStart  <- .cv2( x, rlabel, sampleSize, average=(R!=1), scaleAdjustment=scaleAdjustment,fpc=fpc)
    CV  <- .cv2( x, newRlabel, newSampleSize, average=(R!=1), scaleAdjustment=scaleAdjustment,fpc=fpc)
  } else if( missing(scaleAdjustment) )  { 
    print("just location adjustment")
    CVStart  <- .cv2( x, rlabel, sampleSize, average=(R!=1), locationAdjustment=locationAdjustment,fpc=fpc)
    CV  <- .cv2( x, newRlabel, newSampleSize, average=(R!=1), locationAdjustment=locationAdjustment,fpc=fpc)
  } else {
    print("both adjustments")
    CVStart  <- .cv2( x, rlabel, sampleSize, average=(R!=1), locationAdjustment=locationAdjustment, scaleAdjustment=scaleAdjustment,fpc=fpc)
    CV  <- .cv2( x, newRlabel, newSampleSize, average=(R!=1), locationAdjustment=locationAdjustment, scaleAdjustment=scaleAdjustment,fpc=fpc)
  }


  ## strata Size
  strataSizeStart <- stats::aggregate(rlabel, by=list(rlabel), length)
  rownames(strataSizeStart) <- strataSizeStart$Group.1
  strataSizeStart <- strataSizeStart[,2,drop=FALSE] 

  strataSize      <- stats::aggregate(newRlabel, by=list(newRlabel), length)
  rownames(strataSize) <- strataSize$Group.1
  strataSize <- strataSize[,2,drop=FALSE] 


  myList <- list(
    "accept"=a, 
#    "cost"=unlist(r.result[7]), 
    "label"=newRlabel, 
    "sampleSize"=newSampleSize,
    "sampleSizeStart"=sampleSize,
    "CV"=CV,
    "CVStart"=CVStart,
    "targetCV"=targetCV,
    "strataSize"=strataSize,
    "strataSizeStart"=strataSizeStart,
    "runTime"=runTime,
    "variables"=x.colnames,
    "fpc"=fpc
  )


  class(myList) <- "saAlloc"

  return(myList)
}
jlisic/saAlloc documentation built on May 19, 2019, 12:47 p.m.