Nothing
##' Pairwise Standard Normal Homogeneity Test
##'
##' This function performs a pairwise standard normal homogeneity test on the
##' data supplied, as described in Menne & Williams (2009).
##'
##' @usage pairwiseSNHT(data, dist, k, period, crit=100, returnStat=FALSE, ...)
##' @param data The data to be analyzed for changepoints. It must be a
##' data.frame and contain either two or three columns. The mandatory columns
##' are data and location, named as such. The option column is time, and this
##' argument will be passed to snht.
##' @param dist A distance matrix which provides the distance between location
##' i and location j. Rows and columns must be named with the locations in
##' data. Note that non-symmetric distances may be used. In that case,
##' neighbors for station i will be determined by the smallest values in the
##' row of dist corresponding to i.
##' @param k How many of the nearest neighbors should be used to construct
##' pairwise difference time series? Note that more than k neighbors may be
##' used if there are ties in the distances between locations.
##' @param period The SNHT works by calculating the mean of the data on the
##' previous period observations and the following period observations. Thus,
##' this argument controls the window size for the test statistics.
##' @param crit The critical value such that if the snht statistic is larger
##' than crit, a changepoint is assumed to have occured. Defaults to 100, as
##' recommended in Haimberger (see references).
##' @param returnStat See return value. If TRUE, the snht statistics for each
##' time point and for each difference pair are returned.
##' @param ... Additional arguments to pass to the snht function (such as
##' robust, time, or estimator).
##'
##' @details The pairwise snht works with a set of time series. For each time
##' series, it's closest k neighbors are determined, and a time series of the
##' difference between each of those time series is created. The snht is then
##' applied to each of these difference time series. Changepoints in one time
##' series can be detected by searching for large values of the test statistic
##' across all difference time series for a particular location.
##'
##' The usefulness of the pairwise snht is that it removes any patterns in the
##' data that could affect the basic snht. For example, seasonal and linear
##' trends that exist globally will be removed from the difference series, and
##' thus changepoints are more easily detected.
##'
##' @return If returnStat is TRUE, the snht statistics for each time point and
##' for each difference pair are returned.
##'
##' Otherwise, a named list is returned. The first element, data, contains the
##' homogenized data in the same format as the supplied data. The second
##' element, breaks, contains a data.frame where the first column is the
##' location where a break occured, the second column is the time of the break,
##' and the third column is the amount that data after the break was shifted
##' by.
##'
##' @references L. Haimberger. Homogenization of radiosonde temperature time
##' series using innovation statistics. Journal of Climate, 20(7): 1377-1403,
##' 2007.
##'
##' Menne, M. J., & Williams Jr, C. N. (2009). Homogenization of temperature
##' series via pairwise comparisons. Journal of Climate, 22(7), 1700-1717.
##'
##' @author Josh Browning (jbrownin@@mines.edu)
##' keyword ~snht ~homogeneity ~pairwise
##'
##' @export
##'
##' @importFrom plyr ddply
##' @importFrom reshape2 dcast
##' @importFrom reshape2 melt
##' @importFrom methods is
##'
##' @examples
##' dist = matrix(1, nrow=2, ncol=2)
##' colnames(dist) = c(1, 2)
##' rownames(dist) = c(1, 2)
##' dist = dist - diag(2)
##' pairwiseSNHT(data=data.frame(data=c(rep(1, 10), rep(2, 10), rep(3, 10), rep(4, 10)),
##' location=rep(1:2, each=10)), dist=dist, k=1, period=2)
pairwiseSNHT <- function(data, dist, k, period, crit=100, returnStat=FALSE,
...){
#data quality checks
stopifnot(is(data,"data.frame"))
if(is(data, "data.table")){
stop("data must be a data.frame, not a data.table")
}
if(ncol(data)==2){
stopifnot(colnames(data) %in% c("data","location"))
# Reorder columns
data = data[, c("data", "location")]
}
if(ncol(data)==3){
stopifnot(colnames(data) %in% c("data","location","time"))
# Reorder columns
data = data[, c("data", "location", "time")]
}
stopifnot(ncol(data) %in% c(2,3))
locs = as.character(unique(data$location))
stopifnot(rownames(dist) == colnames(dist))
stopifnot(all(rownames(dist) %in% locs))
stopifnot(all(locs %in% rownames(dist)))
stopifnot(k >= 1) #Must have at least one neighbor
stopifnot(k <= length(locs)-1) #Can have at most length(locs)-1 neighbor, since self can't be used
stopifnot(diag(dist) == 0)
if(any(dist[row(dist) != col(dist)]<=0))
stop("Off diagonal elements of dist must be >0")
pairs = getPairs(dist, k=k)
uniquePairs = getUniquePairs(pairs)
#Add times if they don't already exist (just 1:nrow()).
if(!"time" %in% colnames(data)){
if(length( unique( table( data$location ) ) ) != 1){
stop("All locations must have the same number of obs if time is not provided!
May need to remove unused levels in data.")
}
data$order = 1:nrow(data) #ensure original ordering is preserved
data = plyr::ddply(data, "location", function(df){
df = df[order(df$order),]
df$time = 1:nrow(df)
return(df)
} )
data$order = NULL
}
#Restructure data
data = reshape2::dcast(data, formula = time ~ location, value.var = "data")
diffs = data.frame(time = data$time)
for(i in 1:nrow(uniquePairs)){
diffs = cbind(diffs, data[,uniquePairs[i,1]] - data[,uniquePairs[i,2]])
colnames(diffs)[ncol(diffs)] = paste0(uniquePairs[i,1],"-",uniquePairs[i,2])
}
#Compute snht statistics
statistics = apply(diffs[, -1, drop=FALSE], 2, snht, period=period, time=diffs[,1], ...)
avgDiff = do.call("cbind", lapply(statistics, function(x) x$rightMean-x$leftMean ) )
statistics = do.call("cbind", lapply(statistics, function(x) x$score))
if(returnStat)
return(statistics)
candidate = createCandidateMatrix(data, statistics = statistics,
pairs = pairs, crit = crit)
out = unconfoundCandidateMatrix(candidate = candidate, pairs = pairs,
statistics = statistics, data = data, period = period, avgDiff = avgDiff)
out$data = reshape2::melt(data = out$data, id.vars = "time")
rownames(out$data) = NULL
colnames(out$data)[colnames(out$data) == "value"] = "data"
colnames(out$data)[colnames(out$data) == "variable"] = "location"
out$data = out$data[,c("data", "location", "time")]
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.