R/multistage.R

Defines functions svyratio.survey.design2 svymean.survey.design2 svytotal.survey.design2 is.pps as.svydesign2 .svycheck print.summary.survey.design2 summary.survey.design2 print.survey.design2 as.fpc multistage_rcpp multistage svyrecvar onestage onestrat svydesign.default detibble svydesign

Documented in as.fpc as.svydesign2 multistage multistage_rcpp onestage onestrat .svycheck svydesign svydesign.default svymean.survey.design2 svyratio.survey.design2 svyrecvar svytotal.survey.design2

##
##  Recursive estimation of linearisation variances
##  in multistage samples.
##

## S3 generic for building a survey design object.
## Dispatch is on the class of `data` (not on the first argument), so the
## method that runs is chosen by the kind of data source supplied.
## All arguments are handed unchanged to the selected method.
svydesign <- function(ids, probs = NULL, strata = NULL, variables = NULL,
                      fpc = NULL, data = NULL, nest = FALSE,
                      check.strata = !nest, weights = NULL, pps = FALSE, ...) {
  UseMethod("svydesign", data)
}

## Convert a tibble ("tbl_df") to a plain data frame; any other object is
## returned untouched.
detibble <- function(data) {
    if (!inherits(data, "tbl_df")) {
        return(data)
    }
    as.data.frame(data)
}

## Default svydesign method: assembles a "survey.design2" object from cluster
## ids, strata, sampling probabilities (or weights) and finite population
## corrections.
##
## Arguments given as formulas (ids, probs, weights, strata, variables, fpc)
## are turned into model frames built from `data`; the model.frame() call is
## constructed with substitute() and evaluated with eval.parent().
##
##   ids               formula or data-frame-like object of cluster ids (required)
##   probs, weights    sampling probabilities or weights; supplying both is an error
##   strata            formula or object giving strata, or NULL
##   variables         formula, list of variables, or NULL to use `data` itself
##   fpc               formula, numeric or data frame passed on to as.fpc(), or NULL
##   nest              if TRUE, cluster ids are relabelled so they nest in strata
##   check.strata      if TRUE, stop when first-stage clusters span several strata
##   pps               FALSE, a character string, or another object (see below)
##   calibrate.formula if non-NULL the finished design is passed to recalibrate()
##   variance          "HT" or "YG"; only "HT" is accepted unless pps is "other"
##
## Returns a list of class c("survey.design2","survey.design") with components
## cluster, strata, has.strata, prob, allprob, call, variables, fpc and pps.
svydesign.default<-function(ids,probs=NULL,strata=NULL,variables=NULL, fpc=NULL,
                    data=NULL, nest=FALSE, check.strata=!nest,weights=NULL,pps=FALSE,
                            calibrate.formula=NULL,variance=c("HT","YG"), ...){

  data<-detibble(data)
    
  variance<-match.arg(variance)
  ## A character `pps` is matched against the known method names. Unless the
  ## string as supplied is exactly "brewer" or "other", construction is
  ## delegated to pps_design() with the matched name. A `pps` that is neither
  ## character nor logical is also handed to pps_design() as the method.
  if(is.character(pps)){
    a<-match.arg(pps,c("brewer","overton","other"))
    if (!(pps %in% c("brewer","other")))
      return(pps_design(ids=ids,probs=probs, strata=strata,variables=variables, fpc=fpc,
                 data=data,method=a,call=sys.call(-1),variance=variance,...))
  } else if (!is.logical(pps)){
    return(pps_design(ids=ids,probs=probs, strata=strata,variables=variables, fpc=fpc,
                      data=data,method=pps,call=sys.call(-1),variance=variance,...))
  }

  ## From here on a variance other than "HT" is only allowed when pps=="other"
  if (!is.character(pps) || pps!="other"){
    if (variance!="HT")
      stop("Only variance='HT' supported for this design")
  }
  
  ## less memory-hungry version for sparse tables
  ## (local replacement for interaction(): pastes the labels with "." and
  ## makes a factor of the result, so only observed combinations become
  ## levels. `drop` is accepted but not used, and `ls` is computed but not used.)
    interaction<-function (..., drop = TRUE) {
        args <- list(...)
        narg <- length(args)
        if (narg == 1 && is.list(args[[1]])) {
            args <- args[[1]]
            narg <- length(args)
        }
        
        ls<-sapply(args,function(a) length(levels(a)))
        ans<-do.call("paste",c(lapply(args,as.character),sep="."))
        ans<-factor(ans)
        return(ans)
        
    }

    ## Factory for na.action functions: the returned function passes
    ## zero-column or complete objects through and stops with `message`
    ## if any row has a missing value.
    na.failsafe<-function(message="missing values in object"){
      function(object,...){
        if (NCOL(object)==0)
          object
        else {
          ok <- complete.cases(object)
          if (all(ok)) 
            object
          else stop(message)
        }
      }
    }

     ## Cluster ids: formula -> model frame; anything else -> data frame.
     na.id<-na.failsafe("missing values in `id'")
     if(inherits(ids,"formula")) {
	 mf<-substitute(model.frame(ids,data=data, na.action=na.id))
	 ids<-eval.parent(mf)
         if (ncol(ids)==0) ## formula was ~1
           ids<-data.frame(id=1:nrow(ids))
       } else{
         if (is.null(ids))
           stop("Must provide ids= argument")
         else
           ids<-na.id(data.frame(ids))
       }

    ## make ids factor if they are character
    for(i in 1:ncol(ids)){
        if (is.character(ids[[i]]))
            ids[[i]]<-factor(ids[[i]])
    }
    
    na.prob<-na.failsafe("missing values in `prob'")
    if(inherits(probs,"formula")){
      mf<-substitute(model.frame(probs,data=data,na.action=na.prob))
      probs<-eval.parent(mf)
    }

    ## Weights are converted to probabilities (1/weight); weights and
    ## probabilities may not both be given.
    na.weight<-na.failsafe("missing values in `weights'")
    if(inherits(weights,"formula")){
      mf<-substitute(model.frame(weights,data=data,na.action=na.weight))
      weights<-eval.parent(mf)
     } else if (!is.null(weights))
         weights<-na.weight(data.frame(weights))
    if(!is.null(weights)){
      if (!is.null(probs))
         stop("Can't specify both sampling weights and probabilities")
       else
         probs<-as.data.frame(1/as.matrix(weights))
     }

      

    ## Strata: when none are supplied a constant stratum column is created
    ## for every column of ids and has.strata is set to FALSE.
    na.strata<-na.failsafe("missing values in `strata'")
    if (!is.null(strata)){
      if(inherits(strata,"formula")){
        mf<-substitute(model.frame(strata,data=data, na.action=na.strata))
        strata<-eval.parent(mf)
      }
      if (!is.list(strata))
        strata<-data.frame(strata=strata)
      has.strata<-TRUE
      for(i in 1:NCOL(strata)){ ##drop empty strata
          if (is.factor(strata[[i]])) {
              strata[[i]]<-as.factor(as.character(strata[[i]]))
          } else if (is.character(strata[[i]])){ ##coerce string to factor
              strata[[i]]<-as.factor(strata[[i]])
              }
          }
    } else {
      has.strata <-FALSE
      strata<-na.strata(as.data.frame(matrix(1, nrow=NROW(ids), ncol=NCOL(ids))))
    }

    
    ## Analysis variables: missing values are allowed here (na.pass).
    if (inherits(variables,"formula")){
        mf<-substitute(model.frame(variables,data=data,na.action=na.pass))
        variables <- eval.parent(mf)
    } else if (is.null(variables)){
        variables<-data
    } else
        variables<-do.call("data.frame",variables)


    na.fpc<-na.failsafe("missing values in `fpc'")
    if (inherits(fpc,"formula")){
      mf<-substitute(model.frame(fpc,data=data,na.action=na.fpc))
      fpc<-eval.parent(mf)
    }
      
  ## check for only one PSU: probably a typo
  ## (not an error when nest=TRUE with strata, because ids are relabelled
  ## within strata below)
  if ((length(unique(ids[,1]))==1) && !(nest && has.strata)){
    stop("Design has only one primary sampling unit")
  }
  
      ## force subclusters nested in clusters
      if (NCOL(ids)>1){
        N<-ncol(ids)
        for(i in 2:N){
          ids[,i]<-do.call("interaction", ids[,1:i,drop=FALSE])
        }
      }
      ## force clusters nested in strata
      if (nest && has.strata && NCOL(ids)){
        N<-NCOL(ids)
        NS<-NCOL(strata)
        for(i in 1:N)
          ids[,i]<-do.call("interaction",
                           c(strata[,1:min(i,NS),drop=FALSE], ids[,i,drop=FALSE]))
      }
      
    ## check if clusters nested in strata 
     if (check.strata && nest)
      warning("No point in check.strata=TRUE if nest=TRUE")
    ## `strata` is never NULL at this point (a constant column was created
    ## above when no strata were supplied), so the check runs whenever
    ## check.strata is TRUE and ids has at least one column.
    if(check.strata && !is.null(strata) && NCOL(ids)){
       sc<-(rowSums(table(ids[,1],strata[,1])>0))
       if(any(sc>1)) stop("Clusters not nested in strata at top level; you may want nest=TRUE.")
    }

      ## force substrata nested in clusters
      ## (stage-i strata are crossed with the stage-(i-1) cluster ids; when
      ## fewer strata columns than stages exist the last one is reused)
      N<-ncol(ids)
      NS<-ncol(strata)
      if (N>1){
        for(i in 2:N)
          strata[,i]<-interaction(strata[,min(i,NS)], ids[,i-1])
      }

    ## PPS: valid choices currently are FALSE and "brewer"
    ## (any character value that reaches this point is recorded as TRUE)
    if (is.logical(pps) && pps) stop("'pps' must be FALSE or a character string")
    if (is.character(pps)) {
      pps<-TRUE
    }
    
    ## Finite population correction: specified per observation
    ## Also incorporates design sample sizes formerly in nPSU
    
      if (!is.null(fpc) && !is.numeric(fpc) && !is.data.frame(fpc))
        stop("fpc must be a matrix or dataframe or NULL")

      fpc<-as.fpc(fpc,strata, ids, pps=pps)

      ## if FPC specified, but no weights, use it for weights
    if (is.null(probs) && is.null(weights)){
      if (is.null(fpc$popsize)){
        ## warn only when the arguments were omitted, not when they were
        ## explicitly passed as NULL
        if (missing(probs) && missing(weights))
          warning("No weights or probabilities supplied, assuming equal probability")
        probs<-rep(1,nrow(ids))
      } else {
        probs<-1/weights(fpc, final=FALSE)
      }
    }

  
    ## A single probability is recycled to every row of `variables`
    if (is.numeric(probs) && length(probs)==1)
      probs<-rep(probs, NROW(variables))
    
    if (length(probs)==0) probs<-rep(1,NROW(variables))
    
    if (NCOL(probs)==1) probs<-data.frame(probs)

    rval<-list(cluster=ids)
    rval$strata<-strata
    rval$has.strata<-has.strata
    ## overall probability per row: product across the columns of probs
    rval$prob<- apply(probs,1,prod) 
    rval$allprob<-probs
    ## NOTE(review): this value is overwritten by sys.call(-1) a few lines below
    rval$call<-match.call()
    rval$variables<-variables
    rval$fpc<-fpc
    rval$call<-sys.call(-1)
    rval$pps<-pps
    class(rval)<-c("survey.design2","survey.design")
    if (!is.null(calibrate.formula)){
        rval<-recalibrate(rval, calibrate.formula)
        rval$call<-sys.call(-1)
    }
    rval
}

## Variance contribution of a single stratum at a single stage.
##
##   x          matrix of observations in the stratum; an optional attribute
##              "recentering" supplies the centre used when the stratum mean
##              is not recomputed (defaults to 0)
##   cluster    sampling-unit id for each row of x
##   nPSU       design number of sampling units in the stratum
##   fpc        population sizes for each row (Inf or NULL: no correction)
##   lonely.psu one of "certainty", "remove", "adjust", "average", "fail"
##   stratum, stage   used only in warning and error messages
##   cal        accepted for interface compatibility; not used in this body
##
## Returns a NCOL(x) by NCOL(x) matrix.
onestrat<-function(x,cluster,nPSU,fpc, lonely.psu,stratum=NULL,stage=1,cal=cal){

  p <- NCOL(x)

  center <- attr(x, "recentering")
  if (is.null(center)) center <- 0

  ## finite population correction factor, one value per row
  fpc_factor <- if (is.null(fpc)) {
    rep(1, NROW(x))
  } else {
    ifelse(fpc == Inf, 1, (fpc - nPSU) / fpc)
  }

  psu_scale <- if (nPSU > 1) fpc_factor * nPSU / (nPSU - 1) else fpc_factor

  ## self-representing stratum: contributes no variance
  if (all(fpc_factor < 0.0000001))
    return(matrix(0, p, p))

  ## one scale value and one row of totals per sampling unit
  psu_scale <- psu_scale[!duplicated(cluster)]
  totals <- rowsum(x, cluster)
  n_present <- nrow(totals)

  if (n_present < nPSU) {
    ## can't be PPS, so scale must be a constant;
    ## sampling units absent from x are added as rows of zeros
    totals <- rbind(totals,
                    matrix(0, ncol = ncol(totals), nrow = nPSU - n_present))
    psu_scale <- rep(psu_scale[1], NROW(totals))
  }

  ## centre on the stratum mean unless the "adjust" rule applies to a
  ## stratum with a single unit present
  if (lonely.psu != "adjust" || n_present > 1 ||
      (nPSU > 1 & !getOption("survey.adjust.domain.lonely"))) {
    center <- colMeans(totals)
  }
  totals <- sweep(x = totals, MARGIN = 2, STATS = center, FUN = "-")

  if (n_present == 1 && nPSU > 1 && getOption("survey.adjust.domain.lonely")) {
    warning("Stratum (",stratum,") has only one PSU at stage ",stage)
    if (lonely.psu == "average")
      psu_scale <- NA
  }

  if (nPSU > 1)
    return(crossprod(totals * sqrt(psu_scale)))

  ## stratum designed with a single sampling unit
  switch(lonely.psu,
         certainty = ,
         remove = ,
         adjust = crossprod(totals * sqrt(psu_scale)),
         average = NA * crossprod(totals),
         fail = stop("Stratum (",stratum,") has only one PSU at stage ",stage),
         stop("Can't handle lonely.psu=",lonely.psu))
}


## Variance for one stage of sampling: calls onestrat() on each stratum and
## adds the resulting matrices.
##
##   x         matrix of observations (rows) by variables (columns)
##   strata, clusters, nPSU, fpc   per-row stratum id, sampling-unit id,
##             design sample size and population size for this stage
##   lonely.psu, stage, cal        passed through to onestrat()
##
## Strata whose matrix contains NA are left out of the sum, and the sum is
## scaled up by (number of strata)/(number of strata without NA).
onestage<-function(x, strata, clusters, nPSU, fpc, lonely.psu=getOption("survey.lonely.psu"),stage=0, cal){
   p <- NCOL(x)
   if (NROW(x) == 0)
       return(matrix(0, p, p))

   ## For the 'adjust' option for lonely PSUs,
   ## recenter around mean from all PSUs in all strata
   recentering <- 0
   if (!is.null(lonely.psu) && lonely.psu == "adjust") {
     psu_total <- sum(tapply(X = nPSU, INDEX = as.numeric(strata), FUN = head, 1))
     recentering <- colSums(x) / psu_total
   }

   ## variance matrix for the rows of one stratum
   one_stratum <- function(index) {
       block <- x[index, , drop = FALSE]
       attr(block, "recentering") <- recentering
       onestrat(block, clusters[index],
                nPSU[index][1], fpc[index], ## whole fpc vector, to allow pps(brewer)
                lonely.psu = lonely.psu, stratum = strata[index][1],
                stage = stage, cal = cal)
   }
   stratvars <- tapply(1:NROW(x), list(factor(strata)), one_stratum)

   nstrat <- length(unique(strata))
   nokstrat <- sum(sapply(stratvars, function(m) !any(is.na(m))))
   stacked <- array(unlist(stratvars), c(p, p, length(stratvars)))
   apply(stacked, 1:2, sum, na.rm = TRUE) * nstrat / nokstrat
}


## Linearisation variance for a multistage design.
##
##   x           matrix (or object coercible to one) of weighted contributions
##   clusters    cluster ids, one column per stage
##   stratas     strata, one column per stage
##   fpcs        "survey_fpc"-style list with components sampsize, popsize
##               and (when present) pps
##   postStrata  list of post-stratification / raking / calibration records
##               whose effect is removed from x before the variance is computed
##   lonely.psu  rule for strata with a single PSU; passed on to the
##               variance engine
##   one.stage   passed on to the variance engine
##
## Returns the matrix produced by multistage() or multistage_rcpp().
svyrecvar<-function(x, clusters,  stratas, fpcs, postStrata=NULL,
                    lonely.psu=getOption("survey.lonely.psu"),
                    one.stage=getOption("survey.ultimate.cluster")){

  x<-as.matrix(x)
    cal<-NULL

  ## The C++ engine is considered only when fpcs$pps is exactly FALSE
  if(isFALSE(fpcs$pps))
      use_rcpp<-getOption("survey.use_rcpp")
  else
      use_rcpp<-FALSE  ## Brewer's method, doesn't work with the current rcpp version
  
  ## Remove post-stratum means, which may cut across clusters
  ## Also center the data using any "g-calibration" models
  if(!is.null(postStrata)){
    for (psvar in postStrata){
      if (inherits(psvar, "greg_calibration")) {
        if (psvar$stage==0){
          ## G-calibration at population level
          x<-as.matrix(qr.resid(psvar$qr,x/psvar$w)*psvar$w)
        } else {
          ## G-calibration within clusters: handled inside multistage(),
          ## so the C++ engine is not used
            cal<-c(cal, list(psvar))
            use_rcpp<-FALSE
        }
      } else if (inherits(psvar, "raking")){
        ## raking by iterative proportional fitting
        for(iterations in 1:10){
          for(margin in psvar){
            psw<-attr(margin, "weights")
            x<- x - psw*apply(x/psw, 2, ave, margin)
          }
        }
      } else {
        ## ordinary post-stratification
        psw<-attr(psvar, "weights")
        oldw<-attr(psvar, "oldweights")
        if (is.null(oldw)) oldw<-rep(1,length(psw))
        ## avoid 0/0 for rows whose old and new weights are both zero
        zeroes<-which(psw==0 & oldw==0)
        if (length(zeroes)) psw[zeroes]<-1
        psvar<-as.factor(psvar)
        psmeans<-rowsum(x*oldw/psw,psvar,reorder=TRUE)/as.vector(by(oldw,psvar,sum))
        x<- x-psmeans[match(psvar,sort(unique(psvar))),]*psw
      }
    }
  }
    ## Pass the caller's `lonely.psu` on to the engine. Its default is the
    ## "survey.lonely.psu" option, so callers relying on the option are
    ## unaffected, while an explicitly supplied value is now honoured.
    if (use_rcpp) {
        multistage_rcpp(x, clusters,stratas,fpcs$sampsize, fpcs$popsize,
                        lonely.psu=lonely.psu,
                        one.stage=one.stage,stage=1,cal=cal)
    } else {
        multistage(x, clusters,stratas,fpcs$sampsize, fpcs$popsize,
                   lonely.psu=lonely.psu,
                   one.stage=one.stage,stage=1,cal=cal)
    }
}

## Recursive variance computation in R.
##
## The variance at the current stage comes from onestage() applied to the
## first column of stratas/clusters/nPSUs/fpcs. Unless one.stage is TRUE,
## fpcs is NULL or only one column of clusters remains, the function then
## recurses into each first-stage cluster with the first column removed and
## adds each sub-variance multiplied by nPSU/fpc for that cluster.
##
##   cal  list of within-cluster calibration records; records whose `stage`
##        equals the current stage are applied to the rows of each cluster
##        before recursing
##
## Returns a matrix with row and column names taken from colnames(x).
multistage<-function(x, clusters,  stratas, nPSUs, fpcs,
                    lonely.psu=getOption("survey.lonely.psu"),
                     one.stage=FALSE,stage,cal){

  n_obs <- NROW(x)

  total_var <- onestage(x, stratas[,1], clusters[,1], nPSUs[,1],
                        fpcs[,1], lonely.psu=lonely.psu, stage=stage, cal=cal)

  descend <- one.stage != TRUE && !is.null(fpcs) && NCOL(clusters) > 1
  if (descend) {
    ## contribution of the later stages within one first-stage cluster
    within_cluster <- function(index) {
      ## residuals for G-calibration using population information
      ## only on clusters at this stage.
      for (cali in cal) {
        if (cali$stage == stage) {
          j <- match(clusters[index,1], cali$index)
          if (length(unique(j)) != 1)
            stop("Internal problem in g-calibration data: stage",stage,
                 ", cluster", j)
          j <- j[[1]]
          x[index,] <- as.matrix(qr.resid(cali$qr[[j]], x[index,,drop=FALSE]/cali$w[[j]])*cali$w[[j]])
        }
      }
      sampling_fraction_inputs <- index[1]
      multistage(x[index,,drop=FALSE], clusters[index,-1,drop=FALSE],
                 stratas[index,-1,drop=FALSE], nPSUs[index,-1,drop=FALSE],
                 fpcs[index,-1,drop=FALSE],
                 lonely.psu=lonely.psu, one.stage=one.stage-1,
                 stage=stage+1, cal=cal) *
        nPSUs[sampling_fraction_inputs,1] / fpcs[sampling_fraction_inputs,1]
    }
    v.sub <- by(1:n_obs, list(as.numeric(clusters[,1])), within_cluster)

    for (k in seq_along(v.sub))
      total_var <- total_var + v.sub[[k]]
  }

  labels <- colnames(x)
  dimnames(total_var) <- list(labels, labels)
  total_var
}


## Ben Schneider's code to use C++. Should *agree* with multistage() except
## for one known bug in lonely.psu
##
## Wrapper that prepares the inputs and calls the compiled routine
## arma_multistage().
##
##   x          matrix of weighted contributions
##   clusters   cluster ids per stage; non-numeric data frame columns are
##              converted to integer codes via as.factor()
##   stratas    strata per stage; converted the same way
##   nPSUs      design sample sizes per stage
##   fpcs       population sizes per stage, or NULL (treated as Inf)
##   lonely.psu one of "certainty", "remove", "adjust", "average", "fail";
##              anything else is an error
##   one.stage, stage   passed to arma_multistage()
##   cal        accepted for signature compatibility with multistage();
##              not used in this function
##
## Returns the matrix from arma_multistage() with dimnames from colnames(x).
multistage_rcpp <- function(x, clusters,  stratas, nPSUs, fpcs,
                            lonely.psu=getOption("survey.lonely.psu"),
                            one.stage=FALSE, stage, cal){

  ## validate the rule name before handing it to compiled code
  lonely.psu <- switch(lonely.psu, 
                       certainty = 'certainty',
                       remove    = 'remove',
                       adjust    = 'adjust',
                       average   = 'average',
                       fail      = 'fail',
                       stop("Can't handle lonely.psu=",lonely.psu)
  )
  ## Always a single TRUE/FALSE, even when the option is unset (NULL)
  use_singleton_method_for_domains <- isTRUE(getOption("survey.adjust.domain.lonely"))

  # Prepare the inputs to pass to Rcpp function
  if (is.data.frame(clusters)) {
    for (j in seq_len(ncol(clusters))) {
      if (!is.numeric(clusters[[j]])) {
        clusters[[j]] <- as.numeric(as.factor(clusters[[j]]))
      }
    }
  }
  clusters <- as.matrix(clusters)

  if (is.data.frame(stratas)) {
    for (j in seq_len(ncol(stratas))) {
      if (!is.numeric(stratas[[j]])) {
        stratas[[j]] <- as.numeric(as.factor(stratas[[j]]))
      }
    }
  }
  stratas <- as.matrix(stratas)

  ## no fpc supplied: every population size is taken to be infinite
  if (is.null(fpcs)) {
    strata_pop_sizes <- matrix(Inf,
                               nrow = nrow(nPSUs),
                               ncol = ncol(nPSUs))
  } else {
    strata_pop_sizes <- as.matrix(fpcs)
  }

  strata_samp_sizes <- as.matrix(nPSUs)

  # Call the Rcpp function
  # (the sanitised flag computed above is passed, not the raw option value)
  v <- arma_multistage(Y = as.matrix(x),
                       samp_unit_ids = clusters,
                       strata_ids = stratas,
                       strata_samp_sizes = strata_samp_sizes,
                       strata_pop_sizes = strata_pop_sizes,
                       singleton_method = lonely.psu,
                       use_singleton_method_for_domains = use_singleton_method_for_domains,
                       use_only_first_stage = one.stage,
                       stage = stage)

  dimnames(v) <- list(colnames(x),colnames(x))
  v
}



## fpc not given are zero: full sampling.
## Build a "survey_fpc" object holding per-observation sample sizes and
## (when an fpc is supplied) population sizes for every stage.
##
##   df      fpc values, one column per stage, or NULL. Values must be all
##           >= 1 (population sizes) or all <= 1 (sampling fractions).
##   strata  strata, one column per stage
##   ids     cluster ids, one column per stage
##   pps     TRUE for PPS designs: the fpc must then be given as sampling
##           fractions and is checked for constancy within clusters rather
##           than within strata
##
## Returns list(popsize, sampsize, pps) of class "survey_fpc"; when df is
## NULL the list is list(popsize=NULL, sampsize) with no pps component.
as.fpc<-function(df,strata,ids,pps=FALSE){

  count<-function(x) sum(!duplicated(x))
  
  ## design sample size: number of distinct ids within each stratum, per stage
  sampsize<-matrix(ncol=ncol(ids),nrow=nrow(ids))
  for(i in seq_len(ncol(ids)))
    split(sampsize[,i],strata[,i])<-lapply(split(ids[,i],strata[,i]),count)
  
  if (is.null(df)){
    ## No fpc
    rval<-list(popsize=NULL, sampsize=sampsize)
    class(rval)<-"survey_fpc"
    return(rval)
  }
  
  fpc<-as.matrix(df)
  ## ispopsize is TRUE when any value exceeds 1; a mixture of values above
  ## and below 1 is rejected after printing one example of each
  if (xor(ispopsize<-any(df>1), all(df>=1))){
    big<-which(fpc>=1,arr.ind=TRUE)
    small<-which(fpc<1,arr.ind=TRUE)
    cat("record",big[1,1]," stage",big[1,2],": fpc=", fpc[big[1,,drop=FALSE]],"\n")
    cat("record",small[1,1]," stage ",small[1,2],": fpc=", fpc[small[1,,drop=FALSE]],"\n")      
    stop("Must have all fpc>=1 or all fpc<=1")
  }
  
  if (ispopsize){
    if(pps) stop("fpc must be specified as sampling fraction for PPS sampling")
    popsize<-fpc
  } else {
    ## sampling fractions: convert to implied population sizes
    popsize<-sampsize/(fpc)
  }
  if (any(popsize<sampsize)){
    toobig<-which(popsize<sampsize,arr.ind=TRUE)
    cat("record",toobig[1,1],"stage",toobig[1,2],": popsize=",popsize[toobig[1,,drop=FALSE]],
        " sampsize=", sampsize[toobig[1,,drop=FALSE]],"\n")
    stop("FPC implies >100% sampling in some strata")
  }
  if (!ispopsize && any(is.finite(popsize) & (popsize>1e10))){
    big<-which(popsize>1e10 & is.finite(popsize),arr.ind=TRUE)
    warning("FPC implies population larger than ten billion (record",big[1,1]," stage ",big[1,2],")")
  }
  if(!pps){
    ## check that fpc is constant within strata.
    for(i in seq_len(ncol(popsize))){
      diff<-by(popsize[,i], list(strata[,i]), count)
      if (any(as.vector(diff)>1)){
        j<-which(as.vector(diff)>1)[1]
        warning("`fpc' varies within strata: stratum ",names(diff)[j], " at stage ",i)
      }
    }
  } else{
    ## check that fpc is constant within clusters, at every stage
    ## (loop explicitly rather than reusing the index left over from the
    ## sample-size loop above, which only covered the last stage)
    for(i in seq_len(ncol(popsize))){
      diff<-by(popsize[,i], list(ids[,i]), count)
      if (any(as.vector(diff)>1)){
        j<-which(as.vector(diff)>1)[1]
        warning("`fpc' varies within cluster: cluster ",names(diff)[j], " at stage ",i)
      }
    }
  }
  
  
  rval<-list(popsize=popsize, sampsize=sampsize,pps=pps)
  class(rval)<-"survey_fpc"
  rval
}

"weights.survey_fpc"<-function(object,final=TRUE,...){
  if (is.null(object$popsize) || any(object$popsize>1e12))
    stop("Weights not supplied and can't be computed from fpc.")
  if (final) {
    pop<-apply(object$popsize,1,prod)
    samp<-apply(object$sampsize,1,prod)
    pop/samp
  } else {
    object$popsize/object$sampsize
  }
}


    

## Print method for "survey.design2" objects.
##
## Prints a one-line description of the design (stratified or not,
## independent or cluster sampling, with replacement when no population
## sizes are stored), the number of clusters per stage for cluster designs,
## and the stored call.
##   design.summaries=TRUE  also prints a summary of the probabilities and
##                          stratum / population sizes
##   varnames=TRUE          also prints colnames(x)
## Returns x invisibly.
print.survey.design2<-function(x,varnames=FALSE,design.summaries=FALSE,...){
  n_obs <- NROW(x$cluster)
  if (x$has.strata) cat("Stratified ")
  n_psu <- length(unique(x$cluster[,1]))

  ## a design without stored population sizes is labelled "with replacement"
  tail_text <- if (is.null(x$fpc$popsize)) " (with replacement)\n" else "\n"

  if (n_obs == n_psu) {
    cat("Independent Sampling design")
    cat(tail_text)
  } else {
    cat(NCOL(x$cluster),"- level Cluster Sampling design")
    cat(tail_text)
    per_stage <- lapply(x$cluster, function(i) length(unique(i)))
    cat(paste("With (",paste(unlist(per_stage),collapse=", "),") clusters.\n",sep=""))
  }

  print(x$call)

  if (design.summaries) {
    cat("Probabilities:\n")
    print(summary(x$prob))

    if (x$has.strata) {
      if (NCOL(x$cluster) > 1)
        cat("First-level ")
      cat("Stratum Sizes: \n")
      ## observations, design PSUs and PSUs actually present, per stratum
      oo <- order(unique(x$strata[,1]))
      size_table <- rbind(obs=table(x$strata[,1]),
                          design.PSU=x$fpc$sampsize[!duplicated(x$strata[,1]),1][oo],
                          actual.PSU=table(x$strata[!duplicated(x$cluster[,1]),1]))
      print(size_table)
    }

    if (!is.null(x$fpc$popsize)) {
      if (x$has.strata) {
        cat("Population stratum sizes (PSUs): \n")
        first_in_stratum <- !duplicated(x$strata[,1])
        pop_sizes <- x$fpc$popsize[first_in_stratum,1]
        names(pop_sizes) <- x$strata[first_in_stratum,1]
        pop_sizes <- pop_sizes[order(names(pop_sizes))]
        print(pop_sizes)
      } else {
        cat("Population size (PSUs):",x$fpc$popsize[1,1],"\n")
      }
    }
  }

  if (varnames) {
    cat("Data variables:\n")
    print(colnames(x))
  }
  invisible(x)
}

    
## Summary method: returns the design unchanged apart from having
## "summary.survey.design2" prepended to its class vector.
summary.survey.design2<-function(object,...){
  structure(object, class = c("summary.survey.design2", class(object)))
}

## Print method for summaries: puts "survey.design2" at the front of the
## class vector and prints with variable names and design summaries on.
print.summary.survey.design2<-function(x,...){
  as_design <- structure(x, class = c("survey.design2", class(x)))
  print(as_design, varnames = TRUE, design.summaries = TRUE, ...)
}
     

## Warn when handed an old-style design: a "survey.design" object that still
## carries an nPSU component.
.svycheck<-function(object){
  is_old_style <- inherits(object,"survey.design") && !is.null(object$nPSU)
  if (is_old_style)
    warning("This is an old-style design object. Please use as.svydesign2 to update it.")
}

## Convert an old-style "survey.design" object to "survey.design2".
##
## Objects already of class "survey.design2" are returned as they are; any
## other non-"survey.design" object is an error. For later stages the
## previous stage's cluster ids are used as strata. Sample sizes for stage 1
## come from object$nPSU, and for later stages are the number of distinct
## cluster ids within those strata. When the old object has an fpc, stage 1
## population sizes come from object$fpc$N and later stages are set equal to
## the sample sizes (with a warning).
as.svydesign2<-function(object){
  if (inherits(object,"survey.design2"))
    return(object)
  if (!inherits(object,"survey.design"))
    stop("This function is for updating old-style survey.design objects")

  n_distinct <- function(x) length(unique(x))

  nc <- ncol(object$cluster)
  multi_stage <- nc > 1
  has_fpc <- !is.null(object$fpc)

  ## stage 1 strata, then one column per later stage
  strata <- data.frame(one=object$strata)
  if (multi_stage) {
    for (i in 2:nc)
      strata<-cbind(strata,object$cluster[,i-1])
  }

  sampsize <- matrix(ncol=nc, nrow=nrow(object$cluster))
  sampsize[,1] <- object$nPSU[match(object$strata, names(object$nPSU))]
  if (multi_stage) {
    for (i in 2:nc)
      split(sampsize[,i],strata[,i]) <- lapply(split(object$cluster[,i],strata[,i]), n_distinct)
  }

  popsize <- NULL
  if (has_fpc) {
    popsize <- sampsize
    popsize[,1] <- object$fpc$N[match(object$strata,object$fpc$strata)]
  }
  if (multi_stage && has_fpc)
    warning("Assuming complete sampling at stages 2 -",nc)

  new_fpc <- list(popsize=popsize, sampsize=sampsize)
  class(new_fpc) <- "survey_fpc"

  object$fpc <- new_fpc
  object$strata <- strata
  object$nPSU <- NULL
  class(object) <- c("survey.design2","survey.design")
  object
}

## Does the design record PPS sampling? FALSE when no `pps` component is
## stored; otherwise the result of comparing that component with FALSE.
is.pps<-function(x) {
  flag <- x$pps
  if (is.null(flag))
    return(FALSE)
  flag != FALSE
}
    
"[.survey.design2"<-function (x,i, ..., drop=TRUE){
  if (!missing(i)){ 
      if (is.calibrated(x) || is.pps(x) || !drop){
          ## Set weights to zero: no memory saving possible
          ## There should be an easier way to complement a subscript..
          if (is.logical(i))
              x$prob[!i]<-Inf
          else if (is.numeric(i) && length(i))
              x$prob[-i]<-Inf
          else {
              tmp<-x$prob[i,]
              x$prob<-rep(Inf, length(x$prob))
              x$prob[i,]<-tmp
          }
          index<-is.finite(x$prob)
          psu<-!duplicated(x$cluster[index,1])
          tt<-table(x$strata[index,1][psu])
          if(any(tt==1) && getOption("survey.adjust.domain.lonely")){
              warning(sum(tt==1)," strata have only one PSU in this subset.")
          }
      } else {
          ## subset everything.
          if (!is.null(x$variables)) ## phase 2 of twophase design
              x$variables<-"[.data.frame"(x$variables,i,..1,drop=FALSE)
          x$cluster<-x$cluster[i,,drop=FALSE]
          x$prob<-x$prob[i]
          x$allprob<-x$allprob[i,,drop=FALSE]
          x$strata<-x$strata[i,,drop=FALSE]
          x$fpc$sampsize<-x$fpc$sampsize[i,,drop=FALSE]
          x$fpc$popsize<-x$fpc$popsize[i,,drop=FALSE]
      }
      
  } else {
      if(!is.null(x$variables))
          x$variables<-x$variables[,..1,drop=FALSE]
  }
  
  x
}

## Estimated population totals for a "survey.design2" object.
##
##   x          formula, expression/symbol evaluated in design$variables,
##              data frame (factors are expanded to indicator columns), or
##              anything as.matrix() accepts
##   design     the survey design
##   na.rm      drop (or zero out) rows with missing values
##   deff       TRUE or "replace" to attach a design effect
##   influence  TRUE to attach the weighted contributions as "influence"
##
## Returns a "svystat" vector of totals with attributes "var" (from
## svyrecvar()), "statistic" and, when requested, "influence" and "deff".
svytotal.survey.design2<-function(x,design, na.rm=FALSE, deff=FALSE,influence=FALSE,...){

    if (inherits(x,"formula")){
        ## do the right thing with factors:
        ## one model matrix (without intercept) per variable in the formula
        mf <- model.frame(x,design$variables,na.action=na.pass)
        blocks <- lapply(attr(terms(x),"variables")[-1],
                         function(tt) model.matrix(eval(bquote(~0+.(tt))),mf))
        widths <- sapply(blocks,NCOL)
        x <- matrix(nrow=NROW(blocks[[1]]),ncol=sum(widths))
        offsets <- c(0,cumsum(widths))
        for(k in 1:length(blocks)){
            x[,offsets[k]+1:widths[k]] <- blocks[[k]]
        }
        colnames(x) <- do.call("c",lapply(blocks,colnames))
    } else if (typeof(x) %in% c("expression","symbol")) {
        x <- eval(x, design$variables)
    } else if (is.data.frame(x) && any(sapply(x,is.factor))) {
        ## expand each factor into 0/1 indicator columns, one per level
        blocks <- lapply(x, function(xi) {if (is.factor(xi)) 0+(outer(xi,levels(xi),"==")) else xi})
        widths <- sapply(blocks,NCOL)
        offsets <- c(0,cumsum(widths))
        labels <- character(sum(widths))
        for(k in 1:length(blocks))
            labels[offsets[k]+1:widths[k]] <- paste(names(x)[k],levels(x[[k]]),sep="")
        x <- matrix(nrow=NROW(blocks[[1]]),ncol=sum(widths))
        for(k in 1:length(blocks)){
            x[,offsets[k]+1:widths[k]] <- blocks[[k]]
        }
        colnames(x) <- labels
    }
    x <- as.matrix(x)

    ## TRUE only if na.rm subsetting actually removed rows from the design
    rows_dropped <- FALSE
    if (na.rm){
        n_missing <- rowSums(is.na(x))
        design <- design[n_missing==0,]
        rows_dropped <- length(n_missing)>length(design$prob)
        if (rows_dropped)
            x <- x[n_missing==0,,drop=FALSE]
        else
            x[n_missing>0,] <- 0
    }

    total <- colSums(x/as.vector(design$prob),na.rm=na.rm)
    class(total) <- "svystat"
    weighted <- x/design$prob
    v <- svyrecvar(weighted,design$cluster,
                   design$strata, design$fpc,
                   postStrata=design$postStrata)
    attr(total, "var") <- v
    attr(total,"statistic") <- "total"

    if (influence){
        if (na.rm && rows_dropped) {
            ## pad back to the original number of rows with zeros
            infl <- matrix(0,ncol=NCOL(x),nrow=length(n_missing))
            infl[n_missing==0,] <- weighted
            attr(total, "influence") <- infl
        } else {
            attr(total, "influence") <- weighted
        }
    }

    if (is.character(deff) || deff){
        ## variance under simple random sampling, with or without replacement
        N <- sum(1/design$prob)
        nobs <- sum(weights(design)!=0)
        srs_base <- svyvar(x,design,na.rm=na.rm)*sum(weights(design))^2
        vsrs <- if (deff=="replace") srs_base/nobs else srs_base*(N-nobs)/(N*nobs)
        attr(total, "deff") <- v/vsrs
    }

    return(total)
}


## Estimated population means for a "survey.design2" object.
##
##   x          formula, expression/symbol evaluated in design$variables,
##              data frame (factors are expanded to indicator columns), or
##              anything as.matrix() accepts
##   design     the survey design
##   na.rm      drop (or zero out) rows with missing values
##   deff       TRUE or "replace" to attach a design effect
##   influence  TRUE to attach the linearised contributions as "influence"
##
## Returns a "svystat" vector of means with attributes "var" (from
## svyrecvar()), "statistic" and, when requested, "influence" and "deff".
svymean.survey.design2<-function(x,design, na.rm=FALSE,deff=FALSE,influence=FALSE,...){

  if (inherits(x,"formula")){
    ## do the right thing with factors:
    ## one model matrix (without intercept) per variable in the formula
    mf <- model.frame(x,design$variables,na.action=na.pass)
    blocks <- lapply(attr(terms(x),"variables")[-1],
                     function(tt) model.matrix(eval(bquote(~0+.(tt))),mf))
    widths <- sapply(blocks,NCOL)
    x <- matrix(nrow=NROW(blocks[[1]]),ncol=sum(widths))
    offsets <- c(0,cumsum(widths))
    for(k in 1:length(blocks)){
      x[,offsets[k]+1:widths[k]] <- blocks[[k]]
    }
    colnames(x) <- do.call("c",lapply(blocks,colnames))
  } else if (typeof(x) %in% c("expression","symbol")) {
    x <- eval(x, design$variables)
  } else if (is.data.frame(x) && any(sapply(x,is.factor))) {
    ## expand each factor into 0/1 indicator columns, one per level
    blocks <- lapply(x, function(xi) {if (is.factor(xi)) 0+(outer(xi,levels(xi),"==")) else xi})
    widths <- sapply(blocks,NCOL)
    offsets <- c(0,cumsum(widths))
    labels <- character(sum(widths))
    for(k in 1:length(blocks))
      labels[offsets[k]+1:widths[k]] <- paste(names(x)[k],levels(x[[k]]),sep="")
    x <- matrix(nrow=NROW(blocks[[1]]),ncol=sum(widths))
    for(k in 1:length(blocks)){
      x[,offsets[k]+1:widths[k]] <- blocks[[k]]
    }
    colnames(x) <- labels
  }
  x <- as.matrix(x)

  ## TRUE only if na.rm subsetting actually removed rows from the design
  rows_dropped <- FALSE
  if (na.rm){
    n_missing <- rowSums(is.na(x))
    design <- design[n_missing==0,]
    rows_dropped <- length(n_missing)>length(design$prob)
    if (rows_dropped)
      x <- x[n_missing==0,,drop=FALSE]
    else
      x[n_missing>0,] <- 0
  }

  pweights <- 1/design$prob
  psum <- sum(pweights)
  average <- colSums(x*pweights/psum)

  ## centre on the estimated means; the weighted, centred values are the
  ## linearised contributions whose variance is the variance of the mean
  x <- sweep(x,2,average)
  linearised <- x*pweights/psum
  v <- svyrecvar(linearised,design$cluster,design$strata, design$fpc,
                 postStrata=design$postStrata)
  attr(average,"var") <- v
  attr(average,"statistic") <- "mean"

  if (influence){
    if (na.rm && rows_dropped) {
      ## pad back to the original number of rows with zeros
      infl <- matrix(0,ncol=NCOL(x),nrow=length(n_missing))
      infl[n_missing==0,] <- linearised
    } else {
      infl <- linearised
    }
    attr(average,"influence") <- infl
  }
  class(average) <- "svystat"

  if (is.character(deff) || deff){
    nobs <- sum(weights(design)!=0)
    if (deff=="replace") {
      vsrs <- svyvar(x,design,na.rm=na.rm)/(nobs)
    } else if (psum<nobs) {
      vsrs <- NA*v
      warning("Sample size greater than population size: are weights correctly scaled?")
    } else {
      vsrs <- svyvar(x,design,na.rm=na.rm)*(psum-nobs)/(psum*nobs)
    }
    attr(average, "deff") <- v/vsrs
  }

  return(average)
}

## Ratio estimation for a "survey.design2" object.
##
##   numerator, denominator  formulas, expressions/symbols evaluated in
##               design$variables, or objects as.matrix() accepts. The
##               default for `numerator` is the `formula` argument.
##   design      the survey design
##   separate    if TRUE, call svyratio() on each first-stage stratum and
##               return a "svyratio_separate" list of the results
##   na.rm       remove rows where any numerator or denominator is missing
##   covmat      if TRUE, also compute a joint covariance matrix ($vcov)
##   deff        TRUE or "replace" to attach design effects
##   influence   if TRUE, attach an "influence" attribute
##
## Returns (combined estimator) a "svyratio" list with components ratio
## (numerators in rows, denominators in columns), var and optionally vcov.
svyratio.survey.design2<-function(numerator=formula, denominator, design, separate=FALSE,na.rm=FALSE,
                                  formula,covmat=FALSE,deff=FALSE,influence=FALSE,...){

    if (separate){
      ## separate ratio estimator: one combined estimator per stratum
      strats<-sort(unique(design$strata[,1]))
      if (!design$has.strata)
          warning("Separate and combined ratio estimators are the same for unstratified designs")
      if(influence)
          warning("influence functions not available for separate ratio estimators")
      rval<-list(ratios=lapply(strats,
                   function(s) {
                     tmp<-svyratio(numerator, denominator,
                                   subset(design, design$strata[,1] %in% s),
                                   separate=FALSE,...)
                     attr(tmp,"call")<-bquote(Stratum==.(s))
                     tmp}))
      names(rval$ratios)<-strats
   
      class(rval)<-c("svyratio_separate")
      rval$call<-sys.call()
      rval$strata<-strats
      return(rval)
    }
  
    ## Resolve numerator and denominator to matrices; missing values are
    ## kept at this point (na.pass) and dealt with below.
    if (inherits(numerator,"formula"))
        numerator<-model.frame(numerator,design$variables,na.action=na.pass)
    else if(typeof(numerator) %in% c("expression","symbol"))
        numerator<-eval(numerator, design$variables)
    if (inherits(denominator,"formula"))
        denominator<-model.frame(denominator,design$variables,na.action=na.pass)
    else if(typeof(denominator) %in% c("expression","symbol"))
        denominator<-eval(denominator, design$variables)

    numerator<-as.matrix(numerator)
    denominator<-as.matrix(denominator)
    nn<-NCOL(numerator)
    nd<-NCOL(denominator)

    all<-cbind(numerator,denominator)
    nas<-!complete.cases(all)
    if ((na.rm==TRUE) && any(nas)){
      design<-design[!nas,]
      ## Subsetting the design either kept every row (the row count is
      ## unchanged) or actually removed the incomplete rows.
      if (NROW(design$cluster) == NROW(all)){
        ## subset by zero weights
        ## (incomplete rows get numerator 0 and denominator 1)
        all[nas,]<-1
        numerator[nas,]<-0
        denominator[nas,]<-1
      } else {
        ## subset by actually dropping rows
        all<-all[!nas,,drop=FALSE]
        numerator<-numerator[!nas,,drop=FALSE]
        denominator<-denominator[!nas,,drop=FALSE]
      }
    }
    ## Totals of every column; ratios are all numerator/denominator pairs.
    allstats<-svytotal(all,design) 
    rval<-list(ratio=outer(allstats[1:nn],allstats[nn+1:nd],"/"))

    
    vars<-matrix(ncol=nd,nrow=nn)

    if (deff=="replace" || deff) deffs<-matrix(ncol=nd,nrow=nn)
    
    ## Variance of each ratio from its linearised residual
    ## (numerator - ratio*denominator) / estimated denominator total.
    for(i in 1:nn){
      for(j in 1:nd){
        r<-(numerator[,i]-rval$ratio[i,j]*denominator[,j])/sum(denominator[,j]/design$prob)
        vars[i,j]<-svyrecvar(r*1/design$prob, design$cluster, design$strata, design$fpc,
                            postStrata=design$postStrata)
        if (deff=="replace" || deff){
          deffs[i,j]<-deff(svytotal(r,design,deff=deff))
        }
      }
    }
    if (covmat){
        ## Joint covariance of all ratios: ii/jj enumerate the
        ## (numerator, denominator) pairs with the numerator varying fastest.
        ii<-rep(1:nn,nd)
        jj<-rep(1:nd,each=nn)
        allr<-sweep(numerator[,ii]-t(as.vector(rval$ratio)*t(denominator[,jj,drop=FALSE])),
                    2, colSums(denominator[,jj,drop=FALSE]/design$prob),"/")
        vcovmat<-svyrecvar(allr*1/design$prob, design$cluster, design$strata, design$fpc,
                           postStrata=design$postStrata)
        ## NOTE(review): denominator column names are indexed by the numerator
        ## index ii here -- confirm this labelling is intended
        colnames(vcovmat)<-colnames(denominator)[ii]
        rval$vcov<-vcovmat
    }
    colnames(vars)<-colnames(denominator)
    rownames(vars)<-colnames(numerator)
    rval$var<-vars
    if (deff=="replace" || deff)
        attr(rval,"deff")<-deffs
    
    attr(rval,"call")<-sys.call()

    if (influence){
        ## `r` is whatever the double loop above computed last, i.e. the
        ## residual for the final (numerator, denominator) pair only.
        if (na.rm && (length(nas)>length(design$prob))) {
            infl<-matrix(0,ncol=NCOL(r),nrow=length(nas))
            infl[nas==0,]<-r/design$prob
        } else {
                infl<-r/design$prob
            }
        attr(rval,"influence") <- infl
    }
    class(rval)<-"svyratio"
    rval
    
  }

Try the survey package in your browser

Any scripts or data that you put into this service are public.

survey documentation built on April 9, 2024, 3:01 a.m.