R/pps.R

Defines functions svyratio.pps svymean.pps svytotal.pps ppsvar print.summary.pps summary.pps print.pps pps_design.HR print.HR HR poisson_sampling pps_design.ppsdelta pps_design.ppsdcheck pps_design.ppsmat print.ppsmat print.ppsdelta print.ppsdcheck ppscov ppsmat pps_design.character pps_design image.pps make_pps_covmat combine_stages multi.HR1Dcheck multi.HRDcheck HRDcheck multi.overton2Dcheck overton2Dcheck pi2Dcheck

Documented in HR poisson_sampling ppscov ppsmat

##
## Constructing cov(R_i,R_j)/pi^*_ij, or \check{\check{\Delta}}_ij in Sarndal's notation
## We use this form because it can be sparse and because it is easy to combine
## multistage and multiphase sampling.
##
## The routines to compute the variances are in ht.R
##

pi2Dcheck<-function(pmat,tolerance=min(pmat)/1e4){
    rval<-(pmat-outer(diag(pmat),diag(pmat)))/pmat
    rval[abs(rval)<tolerance]<-0
    as(rval,"sparseMatrix")
}


## Overton's approximation for PPS
overton2Dcheck<-function(prob,strat=rep(1,length(prob))){
    fbar<-outer(prob,prob,"+")/2
    n<-ave(strat,strat,FUN=length)
    rval<- 1- (n-fbar)/(n-1) 
    rval[!outer(strat,strat,"==") | fbar==1]<-0
    diag(rval)<-(1-diag(fbar))
    as(rval,"sparseMatrix")
}

multi.overton2Dcheck<-function(id,strata,prob){
    nstage<-ncol(id)
    rval<-vector("list",nstage)
    for(stage in 1:nstage){
        uid<-!duplicated(id[,stage])
        rval[[stage]]<-list(id=id[,stage],
                            dcheck=overton2Dcheck(prob[uid,stage],
                              strata[uid,stage])
                           )
      }
    rval
}

## truncated Hartley-Rao approximation
HRDcheck<-function(prob,strat=rep(1,length(prob)),p2bar){
    fbar<-outer(prob,prob,"+")
    n<-ave(strat,strat,FUN=length)
    rval<- 1- (n-fbar+p2bar)/(n-1) 
    rval[!outer(strat,strat,"==") | fbar==1]<-0
    diag(rval)<-(1-prob)
    as(rval,"sparseMatrix")
}

multi.HRDcheck<-function(id,strata,prob,p2bar){
    nstage<-ncol(id)
    rval<-vector("list",nstage)
    for(stage in 1:nstage){
        uid<-!duplicated(id[,stage])
        rval[[stage]]<-list(id=id[,stage],
                            dcheck=HRDcheck(prob[uid,stage],
                              strata[uid,stage],p2bar[[stage]][strata[uid,stage]])
                           )
      }
    rval
}

## truncated Hartley-Rao approximation, using sample estimate mean(p) for sum(p^2/n)

multi.HR1Dcheck<-function(id,strata,prob){
    nstage<-ncol(id)
    rval<-vector("list",nstage)
    for(stage in 1:nstage){
        uid<-!duplicated(id[,stage])
        rval[[stage]]<-list(id=id[,stage],
                            dcheck=HRDcheck(prob[uid,stage],
                              strata[uid,stage],
                              ave(prob[uid,stage], strata[uid,stage])
                              )
                            )
      }
    rval
  }

##not used yet
combine_stages<-function(Dcheck1,Dcheck2){
  as(-Dcheck1*Dcheck2+Dcheck1+Dcheck2,"sparseMatrix")
}

make_pps_covmat<-function(design,method){##FIXME
  if (method=="overton")
    multi.overton2Dcheck(design$cluster, design$strata, design$allprob)
  else stop("method",method,"not recognized")

}

image.pps<-function(x,...){
  Matrix::image(x$dcheck[[1]]$dcheck,...)
}

##
pps_design<-function(method, ids,strata=NULL, probs=NULL, fpc=NULL,variables=NULL,
                     subset, data,call=sys.call(),variance="HT",...){
  UseMethod("pps_design")
}
pps_design.character<-function(method,ids,strata=NULL, probs=NULL, fpc=NULL, variables=NULL,
                   subset, data,call=sys.call(),variance="HT",...){

  if (length(ids[[2]])>1 && method!="brewer") stop("Multistage PPS sampling not supported with this method")
  rval<-svydesign(ids=ids,strata=strata,weights=NULL,
                probs=probs,fpc=fpc,data=data,pps="other")

  deltacheck<-make_pps_covmat(rval, method)

  rval$dcheck=deltacheck
  rval$variance<-variance

  rval$call<-call
  class(rval) <- c("pps","survey.design")
  rval
}

ppsmat<-function(jointprob, tolerance=0.0001){
  if ((!is.matrix(jointprob)) || !(NROW(jointprob)==NCOL(jointprob)))
    stop("jointprob must be a square matrix")
  rval<-list(pij=jointprob, tolerance=tolerance,call=sys.call())
  class(rval)<-"ppsmat"
  rval
}

ppscov<-function(probcov, weighted=FALSE){
  if ((!is.matrix(probcov)) && (!is(probcov,"Matrix")) || !(NROW(probcov)==NCOL(probcov)))
      stop("jointprob must be a square matrix")
  if (weighted){
      rval<-list(dcheck=probcov,call=sys.call())
      class(rval)<-"ppsdcheck"
  } else {
      rval<-list(delta=probcov,call=sys.call())
      class(rval)<-"ppsdelta"
  }
  rval

}


print.ppsdcheck<-function(x,...) {
  cat("PPS: Weighted covariance matrix: ")
  print(x$call)
  invisible(x)
}


print.ppsdelta<-function(x,...) {
  cat("PPS: Sampling covariance matrix: ")
  print(x$call)
  invisible(x)
}


print.ppsmat<-function(x,...) {
  cat("PPS: Joint probability matrix: ")
  print(x$call)
  invisible(x)
}

pps_design.ppsmat<-function(method,ids,strata=NULL, probs=NULL, fpc=NULL,variables=NULL,
                   subset, data,call=sys.call(),variance="HT",...){

  if (length(ids[[2]])>1) stop("Multistage PPS sampling not supported")
  rval<-svydesign(ids=ids,strata=strata,weights=NULL,variables=variables,
                probs=probs,fpc=fpc,data=data,pps="other")

  deltacheck<-pi2Dcheck(method$pij,method$tolerance)
  rval$variance<-variance

  rval$dcheck<-list(list(id=1:nrow(method$pij), dcheck=deltacheck))

  rval$call<-call
  class(rval) <- c("pps","survey.design")
  rval
}


pps_design.ppsdcheck<-function(method,ids,strata=NULL, probs=NULL, fpc=NULL,variables=NULL,
                   subset, data,call=sys.call(),variance="HT",...){

  if (length(ids[[2]])>1) stop("Multistage PPS sampling not supported")
  rval<-svydesign(ids=ids,strata=strata,weights=NULL,variables=variables,
                probs=probs,fpc=fpc,data=data,pps="other")

  deltacheck<-method$dcheck
  rval$variance<-variance

  rval$dcheck<-list(list(id=1:nrow(deltacheck), dcheck=deltacheck))

  rval$call<-call
  class(rval) <- c("pps","survey.design")
  rval
}



pps_design.ppsdelta<-function(method,ids,strata=NULL, probs=NULL, fpc=NULL,variables=NULL,
                   subset, data,call=sys.call(),variance="HT",...){

  if (length(ids[[2]])>1) stop("Multistage PPS sampling not supported")
  rval<-svydesign(ids=ids,strata=strata,weights=NULL,variables=variables,
                probs=probs,fpc=fpc,data=data,pps="other")

  w<-weights(rval)
  deltacheck<-method$delta*tcrossprod(w)
  diag(deltacheck)<-diag(method$delta)*w
  method$deltacheck<-deltacheck
  rval$variance<-variance

  rval$dcheck<-list(list(id=1:nrow(method$deltacheck), dcheck=deltacheck))

  rval$call<-call
  class(rval) <- c("pps","survey.design")
  rval
}

poisson_sampling<-function(p){
    ppscov(Diagonal(x=1-p),weighted=TRUE)
}



HR<-function(psum=NULL, strata=NULL){
  if (is.null(psum)) { ## estimate
    rval<-list(pbar=NULL,call=sys.call())
  } else if (is.data.frame(strata) || is.matrix(strata)){ #redundant
    pbar<-lapply(1:NCOL(strata), function(i){
      psum[!duplicated(strata[,i]),i]})
    strata<-lapply(1:NCOL(strata), function(i){
      strata[!duplicated(strata[,i]),i]})
    rval<-list(pbar=pbar, strata=strata,call=sys.call())
  } else if (is.null(strata) && is.numeric(psum) && length(psum)==1){
    ## single number
    rval<-list(pbar=list(psum),strata=list(1), call=sys.call())
  } else{ ## non-redundant list
    rval<-list(pbar=psum, strata=strata,call=sys.call())
  }
  class(rval)<-"HR"
  rval
}

print.HR<-function(x,...) {
  cat("PPS: Hartley-Rao correction: ")
  print(x$call)
  invisible(x)
}
pps_design.HR<-function(method,ids,strata=NULL, probs=NULL, fpc=NULL,variables=NULL,
                   subset, data,call=sys.call(),variance="HT",...){

  if (length(ids[[2]])>1) stop("Multistage PPS sampling not supported with this method")
  rval<-svydesign(ids=ids,strata=strata,weights=NULL,
                probs=probs,fpc=fpc,data=data,pps="other")

  if (is.null(method$pbar))    ## sample estimate of sum(p^2/n)
    deltacheck<-multi.HR1Dcheck(rval$cluster,rval$strata,rval$allprob)
  else 
    deltacheck<-multi.HRDcheck(rval$cluster,rval$strata,rval$allprob, method$pbar)

  rval$dcheck=deltacheck
  rval$variance<-variance

  rval$call<-call
  class(rval) <- c("pps","survey.design")
  rval
}
print.pps<-function(x,...){
  cat("Sparse-matrix design object:\n ")
  print(x$call)
}

summary.pps<-function(object,...){
  class(object)<-"summary.pps"
  object
}

print.summary.pps<-function(x,...,varnames=TRUE){
  cat("Two-phase sparse-matrix design:\n ")
  print(x$call)
  cat("Sampling probabilities:\n")
  print(summary(x$prob))
  if (varnames){
    cat("Data variables:\n")
    print(names(x$variables))
  }
  invisible(x)
}




ppsvar<-function(x,design){
  postStrata<-design$postStrata
  est<-design$variance ##Yates-Grundy or Horvitz-Thompson
  if (!is.null(postStrata)){
    for (psvar in postStrata){
      if (inherits(psvar, "greg_calibration")) {
        if (psvar$stage==0){
          ## G-calibration at population level
          x<-qr.resid(psvar$qr,x/psvar$w)*psvar$w
        } else {
          ## G-calibration within clusters
          stop("calibration within clusters not yet available for PPS designs")
        }
      } else {
        ## ordinary post-stratification
        psw<-attr(psvar, "weights")
        postStrata<-as.factor(psvar)
        psmeans<-rowsum(x/psw,psvar,reorder=TRUE)/as.vector(table(factor(psvar)))
        x<- x-psmeans[match(psvar,sort(unique(psvar))),]*psw
      }
    }
  }
  dcheck<-design$dcheck
  if (length(dcheck)!=1) stop("Multistage not implemented yet")
  rval<-switch(est,HT=htvar.matrix(rowsum(x,dcheck[[1]]$id,reorder=FALSE),dcheck[[1]]$dcheck),
               YG=ygvar.matrix(rowsum(x,dcheck[[1]]$id,reorder=FALSE),dcheck[[1]]$dcheck),
               stop("can't happen"))
  rval
}

svytotal.pps<-function(x,design, na.rm=FALSE, deff=FALSE,...){
    
    
    if (inherits(x,"formula")){
        ## do the right thing with factors
        mf<-model.frame(x,model.frame(design), na.action=na.pass)
        xx<-lapply(attr(terms(x),"variables")[-1],
                   function(tt) model.matrix(eval(bquote(~0+.(tt))),mf))
        cols<-sapply(xx,NCOL)
        x<-matrix(nrow=NROW(xx[[1]]),ncol=sum(cols))
        scols<-c(0,cumsum(cols))
        for(i in 1:length(xx)){
          x[,scols[i]+1:cols[i]]<-xx[[i]]
        }
        colnames(x)<-do.call("c",lapply(xx,colnames))
      } else {
        if(typeof(x) %in% c("expression","symbol"))
          x<-eval(x, design$variables)
        else {
            if(is.data.frame(x) && any(sapply(x,is.factor))){
              xx<-lapply(x, function(xi) {if (is.factor(xi)) 0+(outer(xi,levels(xi),"==")) else xi})
              cols<-sapply(xx,NCOL)
              scols<-c(0,cumsum(cols))
              cn<-character(sum(cols))
              for(i in 1:length(xx))
                cn[scols[i]+1:cols[i]]<-paste(names(x)[i],levels(x[[i]]),sep="")
              x<-matrix(nrow=NROW(xx[[1]]),ncol=sum(cols))
                for(i in 1:length(xx)){
                  x[,scols[i]+1:cols[i]]<-xx[[i]]
                }
              colnames(x)<-cn
            }
          }
      }
  x<-as.matrix(x)
    
  if (na.rm){
    nas<-rowSums(is.na(x))
    design<-design[nas==0,]
    if(length(nas)>length(design$prob))
      x<-x[nas==0,,drop=FALSE]
    else
      x[nas>0,]<-0
  }
    
    N<-sum(1/design$prob)
    total <- colSums(x/as.vector(design$prob),na.rm=na.rm)
    class(total)<-"svystat"
    attr(total, "var")<-v<-ppsvar(x/design$prob,design)
    attr(total,"statistic")<-"total"
    
    if (is.character(deff) || deff){
      nobs<-NROW(design$cluster)
      if (deff=="replace")
        vsrs<-svyvar(x,design,na.rm=na.rm)*sum(weights(design))^2*(N-nobs)/N
      else
        vsrs<-svyvar(x,design,na.rm=na.rm)*sum(weights(design))^2
    attr(total, "deff")<-v/vsrs
    }
    
    
  return(total)
  }

svymean.pps<-function(x,design, na.rm=FALSE,deff=FALSE,...){
  
  if (inherits(x,"formula")){
    ## do the right thing with factors
    mf<-model.frame(x,model.frame(design) ,na.action=na.pass)
    xx<-lapply(attr(terms(x),"variables")[-1],
               function(tt) model.matrix(eval(bquote(~0+.(tt))),mf))
    cols<-sapply(xx,NCOL)
    x<-matrix(nrow=NROW(xx[[1]]),ncol=sum(cols))
    scols<-c(0,cumsum(cols))
    for(i in 1:length(xx)){
      x[,scols[i]+1:cols[i]]<-xx[[i]]
    }
    colnames(x)<-do.call("c",lapply(xx,colnames))
  }
  else {
      if(typeof(x) %in% c("expression","symbol"))
          x<-eval(x, design$variables)
      else {
          if(is.data.frame(x) && any(sapply(x,is.factor))){
              xx<-lapply(x, function(xi) {if (is.factor(xi)) 0+(outer(xi,levels(xi),"==")) else xi})
              cols<-sapply(xx,NCOL)
              scols<-c(0,cumsum(cols))
              cn<-character(sum(cols))
              for(i in 1:length(xx))
                  cn[scols[i]+1:cols[i]]<-paste(names(x)[i],levels(x[[i]]),sep="")
              x<-matrix(nrow=NROW(xx[[1]]),ncol=sum(cols))
              for(i in 1:length(xx)){
                  x[,scols[i]+1:cols[i]]<-xx[[i]]
              }
              colnames(x)<-cn
          }
      }
  }
  
  
  x<-as.matrix(x)
  
  if (na.rm){
    nas<-rowSums(is.na(x))
    if (any(nas>0))
      design<-design[nas==0,]
    x[nas>0,]<-0
  }
  
  pweights<-1/design$prob
  psum<-sum(pweights)
  average<-colSums(x*pweights/psum)
  x<-sweep(x,2,average)
  v<-ppsvar(x*pweights/psum,design)
  attr(average,"var")<-v
  attr(average,"statistic")<-"mean"
  class(average)<-"svystat"
  if (is.character(deff) || deff){
      nobs<-nrow(design)
      if(deff=="replace"){
        vsrs<-svyvar(x,design,na.rm=na.rm)/(nobs)
      } else {
        if(psum<nobs) {
          vsrs<-NA*v
          warning("Sample size greater than population size: are weights correctly scaled?")
        } else{
          vsrs<-svyvar(x,design,na.rm=na.rm)*(psum-nobs)/(psum*nobs)
        }
      }
      attr(average, "deff")<-v/vsrs
  }
  
  return(average)
}



svyratio.pps<-function(numerator=formula, denominator, design, separate=FALSE,na.rm=FALSE,formula,...){

    if (separate){
      strats<-sort(unique(design$strata[,1]))
      rval<-list(ratios=lapply(strats,
                   function(s) {
                     tmp<-svyratio(numerator, denominator,
                                   subset(design, design$phase2$strata[,1] %in% s),
                                   separate=FALSE,...)
                     attr(tmp,"call")<-bquote(Stratum==.(s))
                     tmp}))
      names(rval$ratios)<-strats
   
      class(rval)<-c("svyratio_separate")
      rval$call<-sys.call()
      rval$strata<-strats
      return(rval)
    }
  
    if (inherits(numerator,"formula"))
        numerator<-model.frame(numerator,model.frame(design),na.action=na.pass)
    else if(typeof(numerator) %in% c("expression","symbol"))
        numerator<-eval(numerator, design$variables)
    if (inherits(denominator,"formula"))
        denominator<-model.frame(denominator,model.frame(design),na.action=na.pass)
    else if(typeof(denominator) %in% c("expression","symbol"))
        denominator<-eval(denominator, model.frame(design))

    nn<-NCOL(numerator)
    nd<-NCOL(denominator)

    all<-cbind(numerator,denominator)
    nas<-!complete.cases(all)
    if (na.rm){
        design<-design[!nas,]
        all<-all[!nas,,drop=FALSE]
        numerator<-numerator[!nas,,drop=FALSE]
        denominator<-denominator[!nas,,drop=FALSE]
    }
    allstats<-svytotal(all, design) 
    rval<-list(ratio=outer(allstats[1:nn],allstats[nn+1:nd],"/"))


    vars<-matrix(ncol=nd,nrow=nn)
    for(i in 1:nn){
      for(j in 1:nd){
        r<-(numerator[,i]-rval$ratio[i,j]*denominator[,j])/sum(denominator[,j]/design$prob)
        vars[i,j]<-ppsvar(r*1/design$prob, design)
      }
    }
    colnames(vars)<-names(denominator)
    rownames(vars)<-names(numerator)
    rval$var<-vars
    attr(rval,"call")<-sys.call()
    class(rval)<-"svyratio"
    rval
    
  }


"[.pps"<-function (x,i, ..., drop=TRUE){
  if (!missing(i)){ 
    ## Set weights to zero:  don't try to save memory
    ## There should be an easier way to complement a subscript..
    if (is.logical(i) && any(!i)){
      ## logical indexing: use !
      x$prob[!i]<-Inf
      x$dcheck<-lapply(x$dcheck, function(m) {m$dcheck[!i,!i]<-0; m})
    } else if (is.numeric(i) && length(i)){
      ## numeric indexing: use -
      x$prob[-i]<-Inf
      x$dcheck<-lapply(x$dcheck, function(m) {m$dcheck[-i,-i]<-0;m})
    } else if (is.character(i)) {
      ##character indexing: use brute force and ignorance
      tmp<-x$prob[i,]
      x$prob<-rep(Inf, length(x$prob))
      x$prob[i,]<-tmp
      x$dcheck<-lapply(x$dcheck, function(m) {n<-Matrix(ncol(m$dcheck),ncol(m$dcheck)); n[i,i]<-m$dcheck[i,i]; m$dcheck<-n;m})
    }
    index<-is.finite(x$prob)
    psu<-!duplicated(x$cluster[index,1])
    tt<-table(x$strata[index,1][psu])
    if(any(tt==1)){
      warning(sum(tt==1)," strata have only one PSU in this subset.")
    }
  } else {
    
  }
  x
}

degf.pps<-function(design,...) {
    inset <- weights(design, "sampling") != 0
    length(unique(design$cluster[inset, 1])) - length(unique(design$strata[inset,1]))
}





postStratify.pps<-function(design, ...) {
	stop("postStratify not yet implemented for these pps designs. Use calibrate()")
}
bschneidr/fastsurvey documentation built on March 13, 2024, 11:12 a.m.