#' scankd: A package for implementing the approach of Kulldorff (2001)
#'
#' The package scankd provides all the functions required to implement
#' the scan statistics developed in Kulldorff (2001). The main help
#' page of interest is \code{ngkd}, which describes how to use this
#' algorithm.
#'
#'
#' @docType package
#' @name scankd
NULL
#' @title Raster map of France.
#'
#' @details
#' Raster map of France.
#'
#'
#' @format An object of class \code{SpatialPixelsDataFrame} (package
#' sp).
#'
"francekd"
#' @title Implements The Approach of Kulldorff (2001)
#' @export
#'
#' @details \code{ngkd} prepares the data for the scan (identification
#' of neighbourhood for each pixel. \code{Kulldorff2001} implements the
#' scan statistics, and calculates the most likely cluster (size,
#' duration and likelihood ratio) associated to each pixel of the map.
#' \code{identifyClusters} identifies and plots the most likely
#' non-overlapping clusters on the map. \code{circleskd} can be used
#' to plot the clusters to an already existing image of the area.
#'
#' @param pts an object inheriting the class SpatialPixels of the
#' package sp, containing the grid of pixels used in the scan (each
#' pixel being the possible center of a circular cluster).
#' @param radius a numeric vector of positive and increasing values
#' corresponding to the possible cluster radiuses (ordered
#' increasingly) tested in the scan. The units should be the same as
#' those used in \code{pts}.
#' @param time a numeric vector of positive and increading values
#' corresponding to the possible cluster durations tested in the scan
#' (ordered increasingly). The time units are not important, as these
#' values are not used in the calculations (only for display
#' purposes).
#' @param x for \code{prind.ngkd}, an object of class "ngkd" returned
#' by the function \code{ngkd}. For \code{circleskd}, an object of
#' class \code{SpatialPointsDataFrame} indicating the location of the
#' cluster centers, returned by \code{identifyClusters}.
#' @param observed a data.frame with \code{Nt} columns and \code{Np}
#' rows (where \code{Nt} is the number of time units considered in the
#' scan, and \code{Np} the number of pixels in \code{pts}), containing
#' the number of cases observed in each pixel during each time unit.
#' @param theoretical a data.frame with \code{Nt} columns and
#' \code{Np} rows (where \code{Nt} is the number of time units
#' considered in the scan, and \code{Np} the number of pixels in
#' \code{pts}), containing the number of cases expected in each pixel
#' during each time unit under "normal" conditions.
#' @param ngkd an object of class "ngkd" returned
#' by the function \code{ngkd}.
#' @param loglambda logical. Should the actual value of the likelihood
#' ratio be returned, or its logarithm (the latter often results in
#' clearer maps).
#' @param spixdf an object of class \code{SpatialPixelsDataFrame}
#' returned by the function \code{Kulldorff2001}.
#' @param selection a character string indicating how to identify the clusters:
#' if \code{"nclust"}, a fixed number of clusters is identified (this number corresponds
#' to the argument \code{nclustMax}. If \code{"lambda"}, all the clusters characterized
#' by a likelihood ratio (or log-likelihood ratio, according to what is stored in
#' \code{spixdf}) greater than the parameter \code{lambda} are identified.
#' @param nclustMax the maximum number of clusters that should be
#' returned by the function, if \code{selection="nclust"}.
#' @param lambda the threshold of the (log-) likelihood ratio, above which the
#' clusters are considered significant and are returned by the function,
#' if \code{selection="lambda"}.
#' @param plot logical. If \code{TRUE}, the function plots the
#' location of the most likely clusters on a map of the
#' (log-)likelihood ratios for each pixel.
#' @param text Either a missing value, in which case the function
#' indicates the order of the clusters (first, second, etc.) with
#' their duration at the center of the cluster; or a character string
#' indicating the variable in \code{x} that should be used to plot the
#' text at the center of the cluster (e.g. "radius", "time", "index",
#' "loglambda"); or a character vector with the same number of
#' elements as \code{nrow(x)} indicating for each cluster what text to
#' write at the center of the cluster.
#' @param \dots additional arguments to be passed to or from other
#' functions.
#'
#' @return \code{ngkd} returns a list of class \code{"ngkd"} required
#' by all the functions of the packages (stores all the spatial and
#' temporal information about the scan). \code{Kulldorff2001}
#' implements the scan statistics of Kulldorff (2001) and return, for
#' each pixel of the map, the (log-)likelihood ratio associated to the
#' best cluster centered on this pixel, as well as the diameter and
#' duration of the pixel, and the observed and expected number of cases in the best
#' cluster (these variables are returned as a
#' \code{SpatialPixelsDataFrame}). \code{identifyClusters} returns a
#' \code{SpatialPointsDataFrame} containing the most likely
#' nonoverlapping clusters. Each row corresponds to the center of a
#' cluster, and the data slot contains the pixel index in the map used
#' to create the \code{ngkd} object corresponding to this point, the
#' size and duration of the cluster, as well as the (log-) likelihood
#' ratio corresponding to this pixel.
#'
#' @author Clement Calenge, \email{clement.calenge@@oncfs.gouv.fr}
#' @references Kulldorff, M. (2001) Prospective time periodic
#' geographical disease surveillance using a scan
#' statistic. \emph{Journal of the Royal Statistical Society: Series
#' A}, \bold{164}, 61-72.
#' @importFrom plotrix draw.circle
#' @import graphics
#' @import stats
#' @import methods
#' @import sp
#' @useDynLib scankd
#'
#' @examples
#' ## We have a map of France
#' image(francekd)
#'
#'
#' ##############################################
#' ##
#' ## Simulation of a dataset. Beginning of the simulations
#' ##
#' ## We simulate the following situation:
#' ## (i) a "normal noise" of cases during 52 weeks (in average, 0.001
#' ## cases expected per pixel):
#' set.seed(9809)
#' cases <- as.data.frame(matrix(as.double(rpois(nrow(francekd)*52,0.001)),
#' nrow=nrow(francekd)))
#'
#' ## (ii) we add a cluster of "unusual" cases in the North-east of
#' ## France for the last 8 weeks
#' df <- structure(list(gg = c(160L, 123L, 230L, 368L, 333L, 231L,
#' 271L, 307L, 161L, 159L, 126L, 157L, 270L, 235L, 336L, 333L, 193L,
#' 303L, 307L), gi = c(3L, 1L, 6L, 2L, 8L, 4L, 4L, 7L, 4L, 6L, 2L, 6L,
#' 2L, 2L, 2L, 1L, 6L, 8L, 8L)), .Names = c("gg", "gi"), row.names =
#' c(NA, -19L), class = "data.frame")
#' for (i in 1:nrow(df)) {
#' cases[df[i,1], df[i,2]] <- cases[df[i,1], df[i,2]]+1
#' }
#'
#' ## End of the simulations
#' ##############################################
#'
#' ## Show these data (sum over the last year)
#' su <- apply(cases,1,sum)
#' francekd$totalCases <- su
#' image(francekd[,"totalCases"])
#' ## Note the cluster in North-eastern France
#'
#' ## The object cases contains the number of cases recorded in each pixel
#' ## (rows) for each week (column) with the first column corresponding to
#' ## the most recent week.
#'
#' ## Suppose, for example, that we expect in theory 0.001 cases per
#' ## pixel for each week (the model might be more complex, but is
#' ## good enough for this demonstration). Therefore, the expected
#' ## number of cases is:
#' expect <- as.data.frame(matrix(rep(0.001,nrow(francekd)*52),
#' nrow=nrow(francekd)))
#'
#' ## Now, we scan with the following radiuses (in meters)
#' (rad <- c(1:6)*50000)
#'
#' ## And every week, for one year, i.e.
#' (weekno <- 1:52)
#' ## each element of weekno corresponds to a column in cases or in expect
#' ## the most recent week is stored first in this data.frame
#'
#' ## Prepare the data:
#' ng <- ngkd(francekd, rad, weekno)
#'
#' ## Scan:
#' kd <- Kulldorff2001(cases, expect, ng)
#'
#' ## identify Clusters
#' clust <- identifyClusters(kd, ng)
#'
#' ## Note that the function identifies correctly both the size and
#' ## the duration on he cluster.
#'
#' ## or, to show these clusters on the map of all cases:
#' image(francekd[,"totalCases"])
#' circleskd(clust, c("Best", "Second", "Third"))
#'
#' ## One remark: look at the values of loglambda, Nobs and Nth in clust:
#' clust
#'
#' ## The second cluster is characterized by a Nobs=6, whereas the expectation
#' ## for this number is 0.764. The observed value seems much higher that
#' ## the expectation. But remember that this cluster corresponds to a maximum
#' ## value of log-lambda, calculated over clusters with 901 different possible
#' ## centres (number of pixels in francekd), 6 different radiuses and 52
#' ## possible duration. Look at the maximum number generated by a Poisson
#' ## distribution with mean 0.764 when we draw 901*6*52 numbers.
#' ## We simulate 100 times this maximum:
#' hist(sapply(1:100,function(r) max(rpois(900*6*52, 0.764))),
#' xlab="Expected maximum number of obs. in a cluster", col="red",
#' main="")
#' ## 8 obs can be reasonably expected.
#'
#'
#' ## Therefore: alternative way to identify the clusters: from simulations
#' ## Imagine that some simulations indicate that all clusters with
#' ## a log likelihood ratio greater than 20 are significant (see
#' ## Kulldorf 2001 for a possible way to perform these simulations).
#'
#' ## We keep the clusters above 20
#' clust2 <- identifyClusters(kd, selection="lambda", lambda=20)
#' ## There is one cluster identified:
#' clust2
#'
#' ## plot this cluster
#' image(francekd[,"totalCases"])
#' circleskd(clust2, "Best")
#'
ngkd <- function(pts, radius, time)
{
if (!inherits(pts, "SpatialPixels"))
stop("pts should inherit the class \"SpatialPixels\"")
if (any(diff(radius)<=0))
stop("The radiuses should be ordered by increasing size")
if (any(diff(time)<=0))
stop("The time lags should be ordered by increasing size")
## Distance matrix
coo <- coordinates(pts)
di <- as.matrix(dist(coo))
## neigh
ngkd <- lapply(1:nrow(pts), function(i) {
vois <- lapply(radius, function(j) {
idv <- c(1:ncol(di))[di[i,]<j]
})
for (j in 2:length(vois)) {
deja <- unlist(vois[c(1:(j-1))])
if (is.null(vois[[j]][!(vois[[j]]%in%deja)]))
stop("two virtually identical radiuses (difference between two successive radiuses smaller than cellsize).")
vois[[j]] <- vois[[j]][!(vois[[j]]%in%deja)]
}
vec <- unlist(vois)
j1 <- 0
j2 <- 0
cur <- 1
for (i in 1:length(radius)) {
j1[i] <- (cur)-1 ## le -1 est pour l'indice en C
j2[i] <- (cur+length(vois[[i]])-1)-1 ## idem
vec[(j1[i]+1):(j2[i]+1)] <- vois[[i]]
cur <- cur+length(vois[[i]])
}
df <- data.frame(as.integer(j1), as.integer(j2))
return(list(as.integer(vec-1), df)) ## idem, le - 1
})
class(ngkd) <- "ngkd"
attr(ngkd, "map") <- pts
attr(ngkd, "radius") <- radius
attr(ngkd, "time") <- time
return(ngkd)
}
#' @rdname ngkd
#' @export
print.ngkd <- function(x, ...)
{
if (!inherits(x, "ngkd"))
stop("x should inherit the class ngkd")
cat("**************************\n",
"* Object of class ngkd\n",
"* Neighborhood of each pixel for each radius\n",
"* required by the function Kulldorff2001\n\n",
"Number of pixels in the map: ", length(x),"\n",
"Number of radiuses considered in the analysis: ",
nrow(x[[1]][[2]]),"\n",
"Number of time units considered in the analysis: ",
length(attr(x,"time")),"\n",
sep="")
}
#' @rdname ngkd
#' @export
Kulldorff2001 <- function(observed, theoretical, ngkd, loglambda=TRUE)
{
if (!inherits(observed, "data.frame"))
stop("observed should be of class data.frame")
if (!inherits(theoretical, "data.frame"))
stop("theoretical should be of class data.frame")
if (ncol(observed)!=length(attr(ngkd,"time")))
stop("the number of columns in observed does not correspond to the number of time units")
if (nrow(observed)!=length(ngkd))
stop("the number of units in observed does not correspond to the number of pixels")
if (nrow(observed)!=nrow(theoretical))
stop("the numbers of units in observed and theoretical do not match")
if (ncol(observed)!=ncol(theoretical))
stop("the numbers of column in observed and theoretical do not match")
NparT <- as.double(cumsum(unlist(apply(observed,2,sum))))
if (!is.double(observed[1,1]))
observed <- lapply(observed, as.double)
resu <- .Call("trouveCluster", ngkd, observed, theoretical, NparT, PACKAGE="scankd")
resu <- as.data.frame(resu)
resu[,2] <- attr(ngkd, "radius")[resu[,2]]
resu[,3] <- attr(ngkd, "time")[resu[,3]]
if (loglambda) {
resu[,1] <- log(resu[,1])
names(resu) <- c("loglambda", "radius","time","Nobs","Nth")
} else {
names(resu) <- c("lambda", "radius","time","Nobs","Nth")
}
coordinates(resu) <- coordinates(attr(ngkd,"map"))
gridded(resu) <- TRUE
return(resu)
}
#' @rdname ngkd
#' @export
identifyClusters <- function(spixdf, ngkd, selection=c("nclust","lambda"),
nclustMax=nrow(spixdf), lambda=NULL, plot=TRUE)
{
if (!inherits(spixdf, "SpatialPixelsDataFrame"))
stop("spixdf should have been generated by Kulldorff2001")
if (ncol(spixdf)!=5)
stop("spixdf should have been generated by Kulldorff2001")
if (!names(spixdf)[1]%in%c("loglambda","lambda"))
stop("spixdf should have been generated by Kulldorff2001")
if (!all(names(spixdf)[2:3]==c("radius","time")))
stop("spixdf should have been generated by Kulldorff2001")
selection <- match.arg(selection)
if (selection=="lambda")
nclustMax=nrow(spixdf)
index <- 1:nrow(spixdf)
ma <- which.max(spixdf[[1]])
lam <- spixdf[[1]][ma]
dia <- spixdf[[2]][ma]
tim <- spixdf[[3]][ma]
nob <- spixdf[[4]][ma]
nth <- spixdf[[5]][ma]
ind <- ma
coo <- coordinates(spixdf)
cons <- rep(0, length(index))
if (nclustMax>1) {
di <- as.matrix(dist(coo))
for (i in 1:(nclustMax-1)) {
## which pixels are already in a cluster?
cons <- cons + as.numeric(di[ind[i],]<=dia[i])
cons <- as.numeric(cons>0)
if (sum(cons) == length(cons)) {
break
}
## Identify all the clusters with none of these pixels:
cons[cons==0] <- sapply(c(1:nrow(di))[cons==0], function(j) {
## for each focus pixel outside the present clusters,
## consider if the cluster associated with the focus
## includes pixels in already identified clusters
bad <- any((di[j,]<spixdf[[2]][j])&(cons==1))
## return
return(bad)
})
if (sum(cons) == length(cons)) {
break
}
##
spixdf2 <- spixdf[cons==0,]
index2 <- index[cons==0]
ma <- which.max(spixdf2[[1]])
lam[i+1] <- spixdf2[[1]][ma]
dia[i+1] <- spixdf2[[2]][ma]
tim[i+1] <- spixdf2[[3]][ma]
nob[i+1] <- spixdf2[[4]][ma]
nth[i+1] <- spixdf2[[5]][ma]
ind[i+1] <- index2[ma]
}
}
df <- data.frame(index=ind, radius=dia, time=tim, lam=lam, Nobs=nob, Nth=nth)
names(df)[4] <- names(spixdf)[1]
coordinates(df) <- coo[ind,,drop=FALSE]
if (selection=="lambda") {
df <- df[df[[4]]>lambda,]
}
if (plot&nrow(df)>0) {
co <- coordinates(df)
image(spixdf)
for (i in 1:nrow(df)) {
text(co[i,1], co[i,2], paste0(i, " (",df[[3]][i],")"))
draw.circle(co[i,1], co[i,2], df[[2]][i])
}
}
attr(df, "type") <- "clusterskd"
return(df)
}
#' @rdname ngkd
#' @export
circleskd <- function(x, text=NA)
{
if (!inherits(x,"SpatialPointsDataFrame"))
stop("x should inherits the class SpatialPixelsDataFrame")
if (is.null(attr(x,"type")))
stop("x should have been generated by identifyClusters")
if (attr(x,"type")!="clusterskd")
stop("x should have been generated by identifyClusters")
if (length(text)>1) {
if (length(text)!=nrow(x))
stop("text should be either of length 1 or the same as nrow(x)")
}
co <- coordinates(x)
for (i in 1:nrow(x)) {
if (length(text)==1) {
if (is.na(text)) {
text2 <- paste0(i, " (",x[[3]][i],")")
} else {
if (any(names(x)==text)) {
text2 <- x[[text]][i]
} else {
text2 <- text
}
}
} else {
text2 <- text[i]
}
text(co[i,1], co[i,2], text2)
draw.circle(co[i,1], co[i,2], x[[2]][i])
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.