#' @title Take sample(s) from population data frame
#' @description
#' This function generates a single sample (of class 'sample') or a multiple sample (of class 'samples.population')
#' containing all possible samples without replacement, from a given population.
#'
#' @usage take.sample(dat,y.name,n=0,m=0,type="srs",aux.name=NULL,take.all=FALSE)
#' @param dat Data frame containing the population to be sampled.
#' @param y.name Name of the response variable (must be a column name of 'dat')
#' @param n Primary sample unit sample size. If it is a vector, it is assumed to be the sample sizes of
#' within each stratum. For example, to take a stratified sample with 5 primary sample units from stratum
#' 1, 3 primary sample units from stratum 2 and 6 primary sample units from stratum 3, you would have
#' 'n=c(5,3,6). Has no effect in the case of cluster sampling.
#' @param m Secondary sample unit size. In cluster sampling, it is the number of clusters to be sampled.
#' Has no effect in the case of stratified sampling.
#' @param type Type of sampling. Valid types are
#' \itemize{
#' \item{"srs"}{for simple random sampling,}
#' \item{"strat"}{for stratified simple random sampling,}
#' \item{"clust"}{for cluster sampling.}
#' }
#' @param aux.name Name of the auxiliary variable, if this is to be included in the sample (must be a
#' column name of 'dat'). Not used in cluster sampling.
#' @param take.all if FALSE, a single sample is generated; if TRUE all possible samples without
#' replacement are generated
#' @details
#' Units are sampled without replacement. In the case of cluster sampling, secondary units are
#' sampled with equal probability; in the case of simple random sampling, primary units are sampled with
#' equal probability, and in the case of stratified simple random sampling, primary units are sampled with
#' equal probability within strata.
#'
#' @return
#' The function returns an object of class `sample? when take.all==FALSE and of class
#' 'samples.population' when take.all==TRUE. The object has the following components:
#' \itemize{
#' \item{M}{Number of secondary units in population}
#' \item{m}{Number of secondary units sampled}
#' \item{N}{Number of primary units in population; if stratified, this is a vector of length M.}
#' \item{n}{Number of primary units in sample; if stratified, this is a vector of length M.}
#' \item{y.value}{For cluster sampling, this contains a vector with the sum of the y values in the sampled
#' subunits. For srs and stratified rs, it is NULL.}
#' \item{x.value}{For cluster sampling, this contains a vector with the sizes (Nj) of the sampled
#' subunits. For srs and stratified rs, it is NULL.}
#' \item{subunit1 ... subunitm}{Objects of class 'subsample' or 'subsamples.population' containing the primary
#' unit sampled data. In the case of cluster sampling, this is NULL. Each subunit contains the
#' following components:}
#' \itemize{
#' \item{N}{Number of primary units in the subunit}
#' \item{n}{Number of primary units sampled in the subunit}
#' \item{mu.x}{Population mean of the auxiliary variable in the subunit. If there is no auxiliary
#' variable, it is NULL.}
#' \item{x.value}{Sampled auxiliary variable (x) values in the subunit. This is a matrix with one row per
#' sample. If there is no auxiliary variable, it is NULL.}
#' \item{y.value}{Sampled response (y) values in the subunit. This is a matrix with one row per sample.}
#' \item{units}{Indices of the primary units in the sample. This is a character vector, with one element
#' per sample. Each element is of the form "(i1,i2,...,in)", where i1 is the index of the first unit, i2
#' the index of the second unit, and so on.}
#' }
#' }
#'
#' @seealso \code{\link{define.subunit}}
#' @examples
#' data(ackroyd) # get ackroyd data
#'
#' # simple random sample of sales, of size 4
#' samp<-take.sample(ackroyd,y.name="sales",n=4)
#' summary(samp) # summarize sample data
#'
#' # All simple random samples of sales, of size 4
#' all.samp<-take.sample(ackroyd,y.name="sales",n=4,take.all=TRUE)
#' summary(all.samp) # summarize sample data
#'
#' # All simple random samples of sales, of size 4, with auxiliary variable "mplyees"
#' all.samp<-take.sample(ackroyd,y.name="sales",n=4,aux.name="mplyees",take.all=TRUE)
#' summary(all.samp) # summarize sample data
#'
#' # All stratified random samples of sales, of size 4, with auxiliary variable "mplyees"
#' # First need to define strata:
#' strat.ackroyd<-define.subunit(ackroyd,aux.name="nature",type="strat")
#' # Then take samples:
#' all.strat.samp<-take.sample(strat.ackroyd,type="strat", y.name="sales",n=c(2,2),aux.name="mplyees",take.all=TRUE)
#' summary(all.strat.samp) # summarize sample data
#'
#' @export
take.sample<-function(dat,y.name,n=0,m=0,type="srs",aux.name=NULL,take.all=FALSE)
{
# local function:
get.samp<-function(index,vec) vec[index]
nsamp<-1
# some checks:
valid.type=c("srs","strat","clust")
if(!is.element(type,valid.type)) {
cat("\nInvalid 'type'; valid types are: ",valid.type,"\n")
stop("'type' must be one of these")
}
valid.y.name<-names(dat)
if(!is.element(y.name,valid.y.name)) {
cat("\nInvalid 'y.name'; valid names are: ",valid.y.name,"\n")
stop("'y.name' must be one of these")
}
valid.x.name<-names(dat)[-which(names(dat)==y.name)]
if(!is.null(aux.name)) {
if(!is.element(aux.name,valid.x.name)) {
cat("\nInvalid 'aux.name'; valid names are: ",valid.x.name,"\n")
stop("'aux.name' must be one of these")
}}
# response:
y.col<-which(names(dat)==y.name)
Y<-dat[[y.col]]
N<-length(Y)
# stratified sampling:
if(type=="strat") {
if(any(n==0)) stop("You must specify samples 'n', all of size greater than zero")
if(!is.element("subunit",names(dat)[-which(names(dat)==y.name)]))
stop("No stratification variable - you can add one using function define.subunit()")
if(!is.null(aux.name)) {
x.col<-which(names(dat)==aux.name)
X<-dat[[x.col]]
if(length(X)!=N) stop("Variables 'y.name. and 'aux.name' have different lengths")
}
s.name<-sort(unique(dat$subunit))
M<-length(s.name)
if(length(n)!=M) {
cat("\nThere are M=",M," strata, but 'n' is of length ",length(n),".\n")
stop("'n' must be a vector of length M")
}
N<-rep(0,M)
for(i in 1:M) N[i]<-sum(dat$subunit==s.name[i])
if(any(N<1)) stop("At least one subunit is empty - you should re-stratify.")
if(sum(n>N)) stop("Sample size (n) is bigger than population size (N) in at lest one subunit")
if(take.all) nsamp<-exp(lgamma(N+1)-lgamma(n+1)-lgamma(N-n+1))
samp<-as.list(NULL)
for(i in 1:M) {
if(take.all) comb<-t(combinations(N[i],n[i]))
else comb<-matrix(sample(1:N[i],size=n[i]),nrow=1)
sample.index<-apply(comb,1,vec2char)
samp[[s.name[i]]]<-list(N=0,n=0,mu.x=NULL,units="",x.value=NULL,y.value=NULL)
samp[[s.name[i]]]$N<-N[i]
samp[[s.name[i]]]$n<-n[i]
samp[[s.name[i]]]$units<-sample.index
ys<-t(apply(comb,1,get.samp,Y[dat$subunit==s.name[i]]))
if(dim(comb)[2]==1) ys<-matrix(ys,ncol=1) # because 1-D matrix becomes vector above!
colnames(ys)<-paste(y.name,as.character(1:n[i]),sep=".")
samp[[s.name[i]]]$y.value<-ys
if(is.null(aux.name)) {
aux<-NULL
mean.aux<-NULL
} else {
aux<-t(apply(comb,1,get.samp,X[dat$subunit==s.name[i]]))
if(dim(comb)[2]==1) aux<-matrix(aux,ncol=1) # because 1-D matrix becomes vector above!
colnames(aux)<-paste(aux.name,as.character(1:n[i]),sep=".")
samp[[s.name[i]]]$x.value<-aux
samp[[s.name[i]]]$mu.x<-mean(samp[[s.name[i]]]$x.value)
}
class(samp[[s.name[i]]])<-c("subsamples.population", "subsample")
}
samp$M<-M
samp$m<-M
samp$N<-N
samp$n<-n
samp$nsamp<-nsamp
names(samp)[1:M]<-paste("subunit",as.character(c(1:M)),sep="")
}
# cluster sampling:
if(type=="clust") {
if(m==0) stop("You must specify 'm'")
if(!is.null(aux.name)) Warning("Variable 'aux.name' is not used in cluster sampling")
if(!is.element("subunit",names(dat)[-which(names(dat)==y.name)]))
stop("No cluster variable defined - you can add one using function define.subunit()")
X<-dat$subunit
if(length(X)!=N) stop("Variables 'y.name'. and subunit have different lengths")
s.name<-sort(unique(dat$subunit))
M<-length(s.name)
if(length(m)!=1) stop("'m' must be a scalar<",M,"\n")
samp<-as.list(NULL)
Y.tot<-rep(0,M)
# get totals in each cluster and store them
N<-rep(0,M)
for(i in 1:M) {
Y.tot[i]<-sum(Y[dat$subunit==s.name[i]])
N[i]<-sum(dat$subunit==s.name[i])
}
if(any(N<1)) stop("At least one subunit is empty - you should re-define clusters.")
if(sum(m>M)) stop("Cluster sample size (m) is bigger than population size (M)")
if(take.all) {
nsamp<-exp(lgamma(M+1)-lgamma(m+1)-lgamma(M-m+1))
comb<-t(combinations(M,m))
} else {
nsamp<-1
comb<-matrix(sample(1:M,size=m),nrow=1)
}
sample.index<-apply(comb,1,vec2char)
ys<-t(apply(comb,1,get.samp,Y.tot))
if(dim(comb)[2]==1) ys<-matrix(ys,ncol=1) # because 1-D matrix becomes vector above!
colnames(ys)<-paste(y.name,as.character(1:m),sep=".")
xs<-t(apply(comb,1,get.samp,N))
if(dim(comb)[2]==1) ys<-matrix(ys,ncol=1) # because 1-D matrix becomes vector above!
colnames(xs)<-paste("N",as.character(1:m),sep=".")
samp$units<-sample.index
samp$mu.x<-mean(N)
samp$y.value<-ys
samp$x.value<-xs
class(samp)<-c("samples.population", "sample")
samp$M<-M
samp$m<-m
samp$N<-N
samp$n<-N
samp$nsamp<-nsamp
}
# simple random sampling:
if(type=="srs") {
if(length(n)>1) stop("With srs, 'n' must contain only one value.")
if(take.all) {
nsamp<-exp(lgamma(N+1)-lgamma(n+1)-lgamma(N-n+1))
comb<-t(combinations(N,n))
} else {
nsamp<-1
comb<-matrix(sample(1:N,size=n),nrow=1)
}
sample.index<-apply(comb,1,vec2char)
ys<-t(apply(comb,1,get.samp,Y))
if(dim(comb)[2]==1) ys<-matrix(ys,ncol=1) # because 1-D matrix becomes vector above!
colnames(ys)<-paste(y.name,as.character(1:n),sep=".")
if(is.null(aux.name)) {
aux<-NULL
mean.aux<-NULL
} else {
x.col<-which(names(dat)==aux.name)
X<-dat[[x.col]]
if(length(X)!=N) stop("Variables 'y.name. and 'aux.name' have different lengths")
mean.aux<-mean(X)
aux<-t(apply(comb,1,get.samp,X))
if(dim(comb)[2]==1) aux<-matrix(aux,ncol=1) # because 1-D matrix becomes vector above!
colnames(aux)<-paste(aux.name,as.character(1:n),sep=".")
}
stratum<-list(mu.x=mean.aux,units=sample.index,y.value=ys,x.value=aux,N=N,n=n)
class(stratum)<-c("subsamples.population", "subsample")
samp<-list(subunit1=stratum,M=1,m=1,N=N,n=n,nsamp=nsamp)
}
cat("\nA total of ",vec2char(round(nsamp),"*")," samples was generated\n")
if(take.all) class(samp)<-c("samples.population", "sample")
else class(samp)<-c("sample")
samp
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.