Nothing
GeoDistMOS <- function(lat, ## Latitude variable. Must be in decimal
long, ## Longitude variable. Must be in decimal
psuID, ## PSU Cluster ID
n, ## Sample size
MOS.var, ## Variable used for PPS sampling
MOS.takeall = 1, ## Threshold value for MOS certainties
Input.ID = NULL ## ID variable from input file
) {
## Confirm latitude and longitude are numeric and have no missing values
if(is.numeric(lat) == FALSE)
stop("Latitude must be numeric.\n")
if(any(is.na(lat)) == TRUE)
stop("Latitude has missing values, which are not allowed.\n")
if(is.numeric(long) == FALSE)
stop("Longiitude must be numeric.\n")
if(any(is.na(long)) == TRUE)
stop("Longitude has missing values, which are not allowed.\n")
## Confirm sample size is numeric and positive
if(is.numeric(n) == FALSE)
stop("Sample size must be numeric.\n")
if((n > 0) == FALSE)
stop("Sample size must be greater than zero.\n")
## Confirm MOS certainty threshold is numeric, positive, and less than 1
if(is.numeric(MOS.takeall) == FALSE)
stop("MOS certainty threshold must be numeric.\n")
if((MOS.takeall > 0) == FALSE)
stop("MOS certainty threshold must be greater than zero.\n")
if((MOS.takeall <= 1) == FALSE)
stop("MOS certainty threshold must be less than or equal to one.\n")
## Convert psuID to character
psuID <- as.character(psuID)
## Convert Input.ID to character
Input.ID <- as.character(Input.ID)
## Step 1: Identify certainty PSUs
## Calculate MOS for each PSU from MOS variable
psuID.MOS <- (tapply(MOS.var, psuID, sum) * n) / sum(MOS.var)
## Compute inclusion probabilities based on PSU MOS and add PSU name
psuID.pik <- sampling::inclusionprobabilities(psuID.MOS, n)
names(psuID.pik) <- names(psuID.MOS)
## Identify PSUs above MOS certainty threshold for splitting
Cert <- (psuID.pik >= MOS.takeall)
psuID.cert <- names(Cert)[Cert]
## Step 2: Identify certainty sampling units within certainty PSU
## Note: psuID.pik/psuID.MOS gives adjustment factor from MOS to pik
psu.MOS <- list()
psu.MOS.adj <- psuID.pik/psuID.MOS
psu.pik <- list()
psu.cert <- list()
for (i in psuID.cert){
psu.MOS[[i]] <- MOS.var[psuID == i] * n / sum(MOS.var)
psu.pik[[i]] <- psu.MOS[[i]] * psu.MOS.adj[i]
psu.cert[[i]] <- (psu.pik[[i]] > MOS.takeall)
}
## Step 3: Create hierarchical cluster based on distance for within PSU
## See notes for GeoDistPSU
psu.split <- list()
cert.ID <- list()
for (i in psuID.cert){
## Select psuID sampling units and create within psuID clusters
geodf <- data.frame(long, lat)[psuID == i, ]
cert.ID[[i]] <- Input.ID[as.numeric(rownames(geodf))]
## Note: geodf has the row numbers of the original data frame.
## Input ID can be compared with that
d <- geosphere::distm(geodf, fun = geosphere::distHaversine)
dist <- as.dist(d)
hc <- hclust(dist, method = "complete")
## Create psu.cert for each certainty sampling unit
psu.cert[[i]] <- psu.cert[[i]] * cumsum(psu.cert[[i]])
## Create psuID.split for clustering
psu.split[[i]] <- psu.cert[[i]]
## If the sum of the non-certainties are less than MOS.takeall, put in separate psuID
if(sum(cutree(hc, k=1)[psu.cert[[i]] == 0] *
psu.pik[[i]][psu.cert[[i]] == 0]) < MOS.takeall) {
psu.split[[i]][psu.cert[[i]] == 0] <- max(psu.cert[[i]]) + 1
}
else {
## Identify optimum k for MOS.takeall where non-certainties need to be split
k <- 1
while(max(tapply(psu.pik[[i]][psu.cert[[i]] == 0],
cutree(hc, k = k)[psu.cert[[i]] == 0],
sum)) >= MOS.takeall){
k <- k + 1
if(max(tapply(psu.pik[[i]][psu.split[[i]] == 0],
cutree(hc, k = k)[psu.split[[i]] == 0],
sum)) < MOS.takeall){
psu.split[[i]][psu.split[[i]] == 0] <- cutree(hc, k = k) + max(psu.split[[i]])
}
}
}
}
## Create variables for psuID, psuID.split
## Use sprintf type thing over psuID.cert, or length(psu.split)
ps <- list()
for (i in psuID.cert){
ps[[i]]$psuID <- psuID[psuID == i]
ps[[i]]$psuID.max.MOS <- psu.split[which(i == psuID.cert)]
ps[[i]]$Input.ID <- cert.ID[i]
}
for (i in 1:length(psuID.cert)){
ps[[i]] <- as.data.frame(ps[[i]])
colnames(ps[[i]]) <- c("psuID.orig", "psuID.new", "Input.ID")
}
## Create data frame of split PSU observations
ps <- do.call(rbind.data.frame, ps)
ps$psuID.new <- paste0(ps$psuID.orig, ".", ps$psuID.new)
row.names(ps) <- NULL
## Merge with un-split PSUs
orig.psu <- cbind.data.frame(Input.ID, psuID, MOS.var, lat, long)
ps <- full_join(ps, orig.psu, by=c("Input.ID"))
## Add psu.ID.orig and psu.ID.new to ps data frame
ps$psuID.orig <- ifelse(is.na(ps$psuID.orig), ps$psuID, ps$psuID.orig)
ps$psuID.new <- ifelse(is.na(ps$psuID.new), paste0(ps$psuID.orig, ".", "0"), ps$psuID.new)
## Create MOS data frame
MOS.new <- tapply(ps$MOS.var, ps$psuID.new, sum)
MOS <- as.data.frame(MOS.new)
MOS$psuID.new <- rownames(MOS)
rownames(MOS) <- NULL
MOS$psuID.prob <- sampling::inclusionprobabilities(MOS$MOS, n)
## Calculate number of SSUs in each PSU
MOS$Number.SSUs <- table(ps$psuID.new)
MOS$PSU.Mean.Latitude <- tapply(ps$lat, ps$psuID.new, mean)
MOS$PSU.Mean.Longitude <- tapply(ps$long, ps$psuID.new, mean)
## Output data frame of PSUs split, split psuIDs, and matching information
## Remove un-needed variables and re-order columns
ps <- ps[, c(3, 1, 2)]
MOS <- MOS[, c(2, 3, 1, 4:6)]
## Output final MOS distribution
out <- list(ps,
MOS)
## Name data frames in list
names(out) <- c("PSU.ID.Max.MOS", "PSU.Max.MOS.Info")
## Return output and class
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.