R/krige.R
In geostatsp: Geostatistical Modelling with Likelihood and Bayes

Documented in krigeLgm

# y is gaussian, w is positive
# y = (w^lambda - 1)/lambda
# lambda * y + 1 = w^lambda = exp(lambda * log(w))

meanBoxCox=function(pred, sd, boxcox, Nbc = 100) {

  # if boxcox is negative, don't let values get closer
  # than this to zero
	epsX = exp(12*boxcox)

	epsBC = 0.001 # check box-cox within this distance
  # of zero or one
	if(abs(boxcox)<epsBC){
		return(
			exp(pred  + sd^2/2)
		)
	} else if (abs(boxcox-1)<epsBC){
		return(pred)
	}

  # normal probabilities for numerical integration
	SXboxcox = seq(-7,7,len=Nbc)
	SXboxcox = unique(signif(
		c(
			SXboxcox,
			SXboxcox/SXboxcox[1]
		),6)
)
	SXboxcox = sort(SXboxcox)
	PXboxcox = pnorm(SXboxcox)#, log=TRUE)
  # probability of normal being in a bin
	DXboxcox = diff(PXboxcox)/2
	NDX = length(DXboxcox)
	DXboxcox = c(
		DXboxcox[1],
		DXboxcox[-1] + DXboxcox[-NDX],
		DXboxcox[NDX]
	)
	IXboxcox = log(DXboxcox)  

	x = boxcox * (outer(sd, SXboxcox) +  pred)+1

  # negatives to zero
	xneg= which( as.vector(x < 0))

	if(boxcox<0){
    # get rid of values very close to zero
		xneg = c(xneg,
			which(abs(as.vector(x)) < epsX)
		)
	}

	x[xneg] = NA

	logx = log(x)/boxcox 

	IXmat = matrix(IXboxcox, nrow(x), ncol(x), byrow=TRUE)

	result =  rowSums(exp(logx + IXmat),na.rm=TRUE)
	allNA = rowSums(!is.na(x))==0

	if(length(xneg)) {
		IXmat[xneg] = NA
		result = cbind(
			predict=result,
			probComplex.boxcox = 
			1-rowSums(exp(IXmat),na.rm=TRUE)
		)
		result[allNA,] = NA        
	} else {
		result[allNA] = NA
	}


	result
}


krigeOneRowPar = function(
	Drow, yFromRowDrow, 
	locations,
	param,coordinates,Ny,
	cholVarDataInv,
	cholVarDatInvData,
	xminl,xresl,ncoll,
	lengthc){

		  # covariance of cells in row Drow with data points
	resC =  .C(C_maternArasterBpoints, 
		as.double(xminl), 
		as.double(xresl), 
		as.integer(ncoll), 
		as.double(yFromRowDrow), 
		as.double(0), as.integer(1),
		as.double(crds(coordinates)[,1]), 
		as.double(crds(coordinates)[,2]), 
		N=as.integer(Ny), 
		result=as.double(matrix(0, ncoll, 
			lengthc)),
		as.double(param["range"]),
		as.double(param["shape"]),
		as.double(param["variance"]),
		as.double(param["anisoRatio"]),
		as.double(param["anisoAngleRadians"])
	) 
	covDataPred = matrix(resC$result, nrow=ncoll, ncol=Ny)


	cholVarDataInvCovDataPred = tcrossprod(cholVarDataInv, covDataPred)

	x= cbind( # the conditional expectation
		forExp=as.vector(crossprod(cholVarDataInvCovDataPred, 
			cholVarDatInvData)),
				  # part of the conditional variance
		forVar=apply(cholVarDataInvCovDataPred^2, 2, sum)
	) 
	x

}


krigeLgm = function(
	formula, data, 
	grid,
	covariates=NULL, 
	param,   
	expPred=FALSE,
	nuggetInPrediction=TRUE, 
	mc.cores=getOption("mc.cores", 1L)) {

	 # this function really needs some tidying!

	trend = formula
	locations = grid
	coordinates=data
	theVars = NULL

	haveBoxCox = any(names(param)=="boxcox")
	NsimBoxCox=50
	if(haveBoxCox) {
		haveBoxCox = abs(param["boxcox"]-1) > 0.001
		if(param['boxcox']<0) NsimBoxCox=100
		if(param['boxcox']< -0.2) NsimBoxCox=200
		if(param['boxcox']< -0.5) NsimBoxCox=400
	}

	haveNugget = any(names(param)=="nugget")
	if(haveNugget) { 
		haveNugget = param["nugget"] > 0
	} 
	if(!haveNugget) {
		nuggetInPrediction=FALSE
	}

	if(is.numeric(locations)){
		locations = squareRaster(data, locations)
	}
	if(nrow(locations) * ncol(locations) > 10^7) warning("there are lots of cells in the prediction raster,\n this might take a very long time")


		observations = meanRaster = NULL

	noCovariates = length(names(covariates)) == 0
	if(is.data.frame(covariates)) {
		if(!nrow(covariates)) noCovariates = TRUE
	}

	if(noCovariates) {
		  # no coariates, mean is intercept
		if(any(names(param)=='(Intercept)')) {
			meanForRaster = param['(Intercept)']		
		} else {
			meanForRaster = 0
		}
		meanFixedEffects = 
		rep(meanForRaster, ncell(locations))
		meanRaster = terra::rast(locations)
		terra::values(meanRaster) = meanFixedEffects	
	}

	# if covariates is a df
	if( 
		is.data.frame(covariates) & any(class(formula)=="formula") & !noCovariates
	)  {

		    # put zeros for covariates not included in the data frame
		notInCov = setdiff(all.vars(formula), names(covariates))
		for(D in notInCov)
			covariates[[D]] = 0

		modelMatrixForRaster = model.matrix(formula, covariates)
		if(nrow(modelMatrixForRaster) < nrow(covariates)) {
		    	# some cells missing covariates
			toAdd = setdiff(rownames(covariates), rownames(modelMatrixForRaster))
			toAdd = matrix(NA, length(toAdd), ncol(modelMatrixForRaster),
				dimnames = list(toAdd, colnames(modelMatrixForRaster)))
			modelMatrixForRaster = rbind(
				modelMatrixForRaster, toAdd
			)[rownames(covariates), ]
		}

		theParams = intersect(colnames(modelMatrixForRaster), names(param))

		meanForRaster = drop(
			tcrossprod( param[theParams], modelMatrixForRaster[,theParams] )
		)
		meanFixedEffects = rep(NA, ncell(locations))
		if('space' %in% names(covariates)  & 'space' %in% names(locations)) {
			meanFixedEffects[match(covariates$space, terra::values(locations)[,'space'])] = meanForRaster
		} else {
			meanFixedEffects[as.integer(names(meanForRaster))] = meanForRaster
		}
		meanRaster = rast(locations)
		terra::values(meanRaster) = meanFixedEffects


	} # end covariates is DF

# if data is spdf
	if(any(class(data)=="SpatVector")&
		any(class(formula)=="formula")) {

# if all covaraites in data
		if(
			all(setdiff(names(covariates), 'space') %in% names(data))
		) {

			modelMatrixForData = model.matrix(formula, values(data))

			theParams = intersect(colnames(modelMatrixForData), names(param))

			meanForData =	 as.vector(tcrossprod(
				param[theParams],
				modelMatrixForData[,theParams])
		)
			names(meanForData) = rownames(modelMatrixForData)

			haveData = match(names(meanForData), 
				rownames(values(data)))

			data = data[haveData,]
			coordinates=data


			observations = drop(values(data)[,
				all.vars(formula)[1] ] )

			if(haveBoxCox) {
				if(abs(param["boxcox"]) < 0.001) {
					observations = log(observations)
					expPred = TRUE
					haveBoxCox = FALSE
				} else {
					observations = ((observations^param["boxcox"]) - 1)/
					param["boxcox"]
				}

			} # have boxcox
			observations = observations - meanForData		
		} # end all covariates in data

	} # end data is spdf	


	  # old code, not called from lgm
	if(!length(observations) | is.null(meanRaster)) {
		warning("this is old code, unmaintained")


		  # the above didn't create observations and meanRaster
		  # use the old code, probably not being called from lgm

	   # find factors, so we reproject rasters using
	   # the correct method.
	   # search for factors in the data supplied



	   # look for factors in the model formula
		if(any(class(trend)=="formula")){

			trendFormula = update.formula(trend, junk ~ . )


			covariatesForData = values(data)

			if(is.vector(data)) {
				observations = data
			} else {			
				observations = all.vars(trend)[1]
				observations = covariatesForData[,observations]
			}

			theVars = all.vars(trendFormula)[-1]

			if(length(theVars)) {
				factorsInData = unlist(lapply(
					covariatesForData[,theVars,drop=FALSE],
					is.factor))
				factorsInData = names(factorsInData)[factorsInData]
			} else {
				factorsInData=NULL
			} 


			allterms = rownames(attributes(terms(trend))$factors)

			factorsInFormula = grep("^factor", allterms, value=TRUE)
			factorsInFormula = gsub("^factor\\(", "", factorsInFormula)
			factorsInFormula = gsub("\\)$", "", factorsInFormula)

			factorsInTrend=NULL

			allterms = gsub("^[[:alnum:]]+\\(", "", allterms)
			allterms = gsub("\\)$", "", allterms)

			if(!all(allterms %in% names(data)))
				warning("some covariates don't appear in data")
# trend not formula
		} else { 
		    # trend is a data frame of covariates
		    # look for factors in it
			covariatesForData = as.data.frame(trend)

			observations = as.data.frame(data)[,1]

			factorsInTrend = unlist(lapply(
				covariatesForData, is.factor
			))
			factorsInTrend = names(factorsInTrend)[factorsInTrend]
			factorsInFormula = factorsInData = NULL

		    # guess at the formula
			trendFormula = as.formula(paste(
				"junk ~ ",
				paste(c('1', names(covariatesForData)), collapse="+")
			)
		)
		} # end trend not a formula


	   # we know which variables factors
		theFactors = unique(c(factorsInFormula, factorsInData, factorsInTrend))
		theFactors = theFactors[theFactors %in% names(covariates) ]

		if(length(grep("SpatRaster|^list", class(covariates)))) { 
 	    # if there's only variable in the model assign it's name to covariates
			covariateNames = all.vars(
				update.formula(trendFormula, junk~ . )
			)[-1]
			if(length(covariateNames)==1){
		      # so far only one variable
				names(covariates)= covariateNames
			} 
	     # loop through factors
	     # and make sure integer values in rasters get converted
	     # to things with parameter values!
			for(D in theFactors) {
		      # is this variable in param with  a factor around it? 
		      # for instance factor(x)1 and factor(x)2 ?
				paramWithFactor = grep(
					paste("factor\\(", D, "\\)[[:digit:]]+$", sep=""),
					names(param), value=TRUE)
				paramStartWithD = grep(
					paste("^", D, ".+$", sep=""),
					names(param), value=TRUE)
				paramFactorCharacter = grep(
					paste("factor\\(", D, "\\).+$", sep=""),
					names(param), value=TRUE)
				if(length(paramWithFactor)) {
			       # formula will convert to factor, don't 
			       # create factor beforehand
					theLevels = gsub(
						paste("^factor\\(",D,"\\)",sep=""),
						"",paramWithFactor)
					theLevels = as.integer(theLevels)
					allValues = unique(covariates[[D]])[,1]
					dontHave = allValues[!allValues %in% theLevels]
			       # make values with no data all equal to the lowest value
		        # so it's the baseline when turning into a factor.
					forRecla = cbind(dontHave, min(allValues)-1)

					covariates[[D]] = 
					classify(covariates[[D]], forRecla)

				} else if( length(paramStartWithD) ) {
			       # not a bunch of digits, 
			       # stuff like xTrees and xGrassland for covariate x and levels Trees and Grassland
			       # see if these line up with 
					theLevels = gsub(paste("^", D, sep=""),"",paramStartWithD)

					levelsTable = levels(covariates[[D]])[[1]]

					inId = theLevels %in% as.character(levelsTable[,1])
					inLabel = theLevels %in% levelsTable[,2]

					if(mean(inId) > mean(inLabel)){
						levelsTable$levelsInParams =  
						as.character(levelsTable[,1])
						labelCol = ncol(levelsTable)
						levelsInTable = levelsTable[,1] %in% 
						theLevels
					} else {
						levelsInTable = levelsTable[,2]%in% theLevels
						labelCol=2
					}

					if(mean(theLevels %in% levelsTable[,labelCol]) < 0.4)
						warning("many levels appear missing in covariate", D)
					valuesInParams = levelsTable[levelsInTable,1]

					allValues = unlist(unique(covariates[[D]]))
					allValues = levelsTable[levelsTable[,2] %in% allValues, 1]
					dontHave = allValues[!allValues %in% valuesInParams]
					forRecla = cbind(dontHave, min(as.vector(minmax(covariates[[D]])))-1)
					covariates[[D]] = classify(covariates[[D]], forRecla)


					levelsTable = 
					levelsTable[c(1, 1:nrow(levelsTable)),c(1,labelCol)]
					levelsTable[1,1]= min(allValues)-1
					levelsTable[1,2] = ''
					colnames(levelsTable)[2] = "levels"
					levels(covariates[[D]]) = levelsTable

				} else if (length(paramFactorCharacter)) {
			       # stuff like factor(x)Trees and factor(x)Grassland for covariate x and levels Trees and Grassland
					theLevels = gsub(paste("^factor\\(", D,"\\)", sep=""),"",
						paramFactorCharacter)
					levelsTable = levels(covariates[[D]])[[1]]
					levelsInTable = levelsTable[,2]%in% theLevels
					if(mean(theLevels %in% levelsTable[,2]) < 0.4)
						warning("many levels appear missing in covariate", D)
					valuesInParams = as.numeric(levelsTable[levelsInTable,1])

#			       allValues = unlist(unique(covariates[[D]]))
#			       dontHave = allValues[!allValues %in% valuesInParams]
					forRecla = cbind(levelsTable[!levelsInTable,'ID'], min(levelsTable[,'ID'])-1)
					covariates[[D]] = classify(covariates[[D]], forRecla)


					levelsTable = 
					levelsTable[c(1, 1:nrow(levelsTable)),]
					levelsTable[1,1]= min(levelsTable[,'ID'])-1
					levelsTable[1,2] = "0"
					colnames(levelsTable)[2]="levels"
					levels(covariates[[D]])[[1]] =  levelsTable			


				} else {
					warning("don't know what to do with covariate", D, 
						"\n can't assign parameters to levels of this factor")			
				}


			} # end Dloop through factors

			if( # if here
				length(grep("SpatRaster|^list", class(covariates))) & length(theVars)
			) {
		      # method for resampling covariate rasters

				method = resampleMethods(formula, covariates)

				covariates = stackRasterList(covariates, template=locations, method=method)

				theVars = do.call('intersect',
					dimnames(attributes(terms(trendFormula))$factors))

				if(nlyr(covariates)==1 & length(theVars)==1) {
					names(covariates) = theVars
				}

		      # construct the fixed effects component
				covariatesDF = cbind(values(covariates, dataframe=TRUE), 
					crds(covariates, df=TRUE, na.rm=FALSE))
		      # get rid of trailing _ created by as.data.frame
				names(covariatesDF) = gsub("_levels$", "", names(covariatesDF))
			} else {
				covariatesDF = as.data.frame(matrix(NA, ncol=0, nrow=ncell(locations)))
			} # done if here

		 # end covariates is raster old code, assume it's a data frame
		} else {
			covariatesDF=covariates
		} 

	   # get rid of response variable in trend formula
		meanRaster = rast(locations)
		names(meanRaster) = "fixed"

	# if have covariates
		if(length(all.vars(trendFormula)) ){ 
			missingVars = all.vars(trendFormula)[-1] %in% names(covariatesDF)
			missingVars = all.vars(trendFormula)[-1][!missingVars]

	     # check if all variables are in covariates
			if(length(missingVars)) {
				cat("cant find covariates ",
					paste(missingVars, collapse=","),
					" for prediction, imputing zeros\n")		

				covariatesDF[,missingVars]=0	
			}

			modelMatrixForRaster = model.matrix(trendFormula, cbind(covariatesDF,junk=0))

			if(!all(colnames(modelMatrixForRaster)%in% names(param))){
				warning("cant find coefficients",
					paste(setdiff(colnames(modelMatrixForRaster), names(param)), collapse=", "),
					"in param\n")
			}


			meanFixedEffects = 
			modelMatrixForRaster %*% param[colnames(modelMatrixForRaster)]

			anyNA = apply(covariatesDF, 1, function(qq) any(is.na(qq)))
			if(any(anyNA)) {
				oldmm = rep(NA, ncell(meanRaster))
				oldmm[!anyNA] = meanFixedEffects
				terra::values(meanRaster) = oldmm
			} else {
				terra::values(meanRaster) = meanFixedEffects
			}



			modelMatrixForData = model.matrix(trendFormula, 
				cbind(covariatesForData,junk=0))
			haveData = match(rownames(modelMatrixForData), 
				rownames(covariatesForData))
			observations = observations[haveData]
			coordinates = coordinates[haveData,]

			meanForData = 
			modelMatrixForData %*% param[colnames(modelMatrixForData)]
#no covariates	

		} else { 
			if(any(names(param)=='(Intercept)')) {
				terra::values(meanRaster) = param['(Intercept)'] 
			} else {
				terra::values(meanRaster) = 0
			}
			meanForData = rep(values(meanRaster)[1], length(observations))
		}



# subtract mean from data

		theNAdata =  
		is.na(observations)

		if(all(theNAdata)) {
			warning(
				'it appears there are no observations without at least one covariate missing')
		}



		if(any(theNAdata)) {
			noNAdata = !theNAdata
			if(length(grep("^SpatVector", class(coordinates)))) {
				coordinates = coordinates[noNAdata,]	
			} else if(any(class(coordinates)=="dist")){
				coordinates = as.matrix(coordinates)
				coordinates = coordinates[noNAdata,noNAdata]
				coordinates = as.dist(coordinates)
			} else {
				warning("missing vlaues in data but unclear how to remove them from coordinates")
			}
			observations = observations[noNAdata]
		}




		if(haveBoxCox) {
			if(abs(param["boxcox"]) < 0.001) {
				observations = log(observations)
				expPred = TRUE
				haveBoxCox = FALSE
			} else {
				observations = ((observations^param["boxcox"]) - 1)/
				param["boxcox"]
			}

		} # end have box cox

		observations = observations - meanForData

 # end old code not called from LGM
	}

	cholVarDataInv = geostatsp::matern(
		coordinates, 
		param=param, type='inverseCholesky')


	cholVarDatInvData = cholVarDataInv %*% observations

	Ny = length(observations)
	param = fillParam(param)


	datForK = list(
		locations=locations,param=param,
		coordinates=coordinates,Ny=Ny,
		cholVarDataInv=cholVarDataInv,
		cholVarDatInvData = cholVarDatInvData,
		xminl=xmin(locations),
		xresl = xres(locations),
		ncoll=ncol(locations),
		lengthc=length(coordinates)

	)
	Srow = 1:nrow(locations)


	if( (mc.cores ==1) | !requireNamespace("parallel", quietly=TRUE)) {
		sums=mapply(#geostatsp:::
			krigeOneRowPar, 
			Drow = Srow, 
			yFromRowDrow = yFromRow(locations,Srow),
			MoreArgs=datForK,
			SIMPLIFY=FALSE)
	} else {
		sums=parallel::mcmapply(krigeOneRowPar, Srow, 
			yFromRow(locations,Srow),
			MoreArgs=datForK,SIMPLIFY=FALSE,mc.cores=mc.cores)
	}


	sums <- simplify2array(sums)
	 # row sums of cholVarDataInvCovDataPred
	forExpected = sums[,'forExp',]
	 # row sums of squares
	forVar = sums[,'forVar',]

	randomRaster = rast(meanRaster)
	names(randomRaster) = "random"
	terra::values(randomRaster) = t(forExpected)


	predRaster = meanRaster + randomRaster
	names(predRaster) = "predict"

	if(any(forVar > param["variance"])){
		themax = max(forVar - param["variance"],na.rm=TRUE)
		if(themax > 1e-6)
			message("possible numerical instability in kriging, conditional variances higher than marginal")	
		forVar = pmin(forVar, param["variance"])	
	}

	krigeSd = rast(meanRaster)
	names(krigeSd) = "krigeSd"

	if(nuggetInPrediction) {
		terra::values(krigeSd) = sqrt(sum(param[c("nugget","variance")]) - 
			as.vector(forVar))
	} else {
		terra::values(krigeSd) = sqrt(param["variance"] - as.vector(forVar))
	}

	names(meanRaster) = "fixed"


	result = c(meanRaster, randomRaster, predRaster,
		krigeSd)

	 # box-cox
	if(haveBoxCox){ 
		names(result)[names(result)=="predict"] = "predict.boxcox"

		bcpred = meanBoxCox(
			pred=values(result[['predict.boxcox']], mat=FALSE, dataframe=FALSE), 
			sd= values(result[['krigeSd']], mat=FALSE, dataframe=FALSE),
			boxcox=param['boxcox']
		)

		newraster=rast(result[["predict.boxcox"]])
		names(newraster) = "predict"
		if(is.matrix(bcpred)){
			terra::values(newraster) = bcpred[,'predict']
			add(result) = newraster
#      names(newraster) = 'probComplex.boxcox'
#      values(newraster) = bcpred[,'probComplex.boxcox']
#      result = addLayer(result, 
#          newraster)
		} else {
			terra::values(newraster) = bcpred
			add(result) = newraster
		}
	} # end have box cox


	if(expPred){
		names(result)[names(result)=="predict"] = "predict.log"
		newLayer = exp(result[["predict.log"]]+ 0.5*result[["krigeSd"]]^2 )
		names(newLayer) = "predict"
		add(result) = newLayer
	} # end expPred

	result
}