R/mtsdi.r

Defines functions print.mtsdi print.summary.mtsdi summary.mtsdi plot.mtsdi predict.mtsdi getmean mnimput em.recursion em.nofilter em.filter em.gam em.spline em.arima em.contribt2 em.putvalue em.contribt1 em.correctmat em.splitvec em.partsigma em.partmu em.rearrangemat em.arrangemat em.rearrangevec em.arrangevec em.dispersion em.replacewmean em.countmnear em.countmvec em.extractcoord em.existna em.trace em.det em.mean elapsedtime mkjnw edaprep mstats

Documented in edaprep elapsedtime em.arima em.arrangemat em.arrangevec em.contribt1 em.contribt2 em.correctmat em.countmnear em.countmvec em.det em.dispersion em.existna em.extractcoord em.filter em.gam em.mean em.nofilter em.partmu em.partsigma em.putvalue em.rearrangemat em.rearrangevec em.recursion em.replacewmean em.spline em.splitvec em.trace getmean mkjnw mnimput mstats plot.mtsdi predict.mtsdi print.mtsdi print.summary.mtsdi summary.mtsdi

# R package for air multivariate time series data imputation based o EM algorithm
# IMS - UERJ - Brasil
# written by Washington Junger <wjunger@ims.uerj.br>
# restarted on 01/09/2004
# original work: EM Library for R
# Last change: 10/07/2002; 30/05/2003 (R version)
#
#---------------------------------
# last changes list: (date first)
# 01/09/2004 - first beta release - v. 0.1.0
# 21/10/2004 - added smooth spline method for filtering - v. 0.1.1
# 04/05/2005 - new interface, new features, C code for fastening, code rewriting etc - v. 0.2.0
# 01/12/2005 - variance window, now it returns some measure of uncertainty, added plot function - v. 0.2.1
# 30/06/2006 - fixed bug where data were not time series
# 25/07/2006 - documentation fixed - v. 0.2.2
# 25/10/2006 - documentation and some minor bugs fixed - v. 0.2.3
# 04/01/2007 - some improvements and bug fixes - v. 0.2.4
# 14/06/2007 - some improvements and bug fixes - v. 0.2.5
# 20/06/2007 - some improvements and bug fixes - v. 0.2.6
# 08/11/2012 - fix namespace, startup functions, and warnings in em.spline - v. 0.3.3
# 02/01/2018 - minor bug fixes

#---------------------------------
# To do list:
# 
#
#



mstats <- function(dataset)
# Carries some missing data statistics out
# dataset - data matrix
{
n <- dim(dataset)[1]
p <- dim(dataset)[2]

pollnames <- names(dataset)
colnames <- c("missing", "missing(%)")
rownames <- c(1:n)

# getting status of columns
colmissing <- matrix(0,p,2)
dimnames(colmissing) <- list(pollnames,colnames)

for (j in 1:p)
    {
	colmissing[j,1] <- sum(is.na(dataset[,j]))
	colmissing[j,2] <- 100*sum(is.na(dataset[,j]))/n
	}

# getting status of rows
rowmissing <- matrix(NA,n,2)
dimnames(rowmissing) <- list(rownames,colnames)

rowcounts <- em.extractcoord(dataset)[[2]][,2]
relrowcounts <- 100*rowcounts/p

rowmissing[,1:2] <- cbind(rowcounts,relrowcounts)
# a little bit of information
maxmissing <- max(rowmissing[,1],na.rm=TRUE)
counts <- matrix(0,p+1,2)
dimnames(counts) <- list(as.character(c(0:p)),colnames)

for (k in 0:p)
	{
	counts[(k+1),1] <- sum(ifelse(rowcounts[1:n]==k,1,0))
	counts[(k+1),2] <- 100*counts[(k+1),1]/n
	}

retval <- list(rows=rowmissing,columns=colmissing,pattern=counts)
return(retval)
}


edaprep <- function(dataset)
# Prepares a dataset for exploratory data analysis
# ds - data set
{
mean <- em.mean(dataset)	# estimates Xi mean
coord <- em.extractcoord(dataset)	# extracts NA coordinates
retval <- em.replacewmean(dataset,mean,coord,TRUE)	# replaces NA with mean
return(retval)	 # returning value
}


mkjnw <- function()
# Creates Johnson & Wichern example matrix
{
retval <- as.data.frame(matrix(c(NA,0,3,7,2,6,5,1,2,NA,NA,5),ncol=3,byrow=TRUE))	# returning value
colnames(retval) <- c("X1","X2","X3")
return(retval)
}
	
	
elapsedtime <- function(st,et)
# Computes the elapsed time between t1 and t2
# t1 - start time
# t2- end time
{
time <- et[3]-st[3]		# gets time from the third position of the vectors
h <- trunc(time/3600)
if (h<10)
	hs <- paste("0",h,sep="",collapse=" ")
else
	hs <- h
time <- time-h*3600
min <- trunc(time/60)
if (min<10)
	mins <- paste("0",min,sep="",collapse=" ")
else
	mins <- min
time <- time-min*60
sec <- trunc(time)
if (sec<10)
	secs <- paste("0",sec,sep="",collapse=" ")
else
	secs <- sec

retval <- paste(hs,":",mins,":",secs,sep="",collapse=" ")
return(retval)
}


em.mean <- function(m)
# Calculates mean vector regardless NA
# m - data matrix 
{
retval <- t(apply(m,2,mean,na.rm=TRUE))	# returns the mean function applied to all series
return(retval)
}


em.det <- function(x)
# Calculates the determinant of a matrix
# x - matrix to calculate the determinant
{
retval <- prod(eigen(x)$values)		# returns the product of the eigenvalues
return(retval)
}	


em.trace <- function(x)
# Calculates the trace of a matrix
# x - matrix to calculate the trace
{
retval <- sum(diag(x))	# returns the summation of main diagonal elements
return(retval)
}	


em.existna <- function(v)
# Checks existence and counts missing values in a given vector
# v - vector to be checked
{
na.count <- sum(is.na(v))
		
retval <- list(naexist=na.count>0,nacount=na.count)		# returning values
return(retval)
}


em.extractcoord <- function(m)
# Extracts NA coordinates
# m - data matrix	
{
r <- dim(m)[1]		# internal function variables initialization
c <- dim(m)[2]      
n <- 0

for (i in 1:r)		# checks if the ith element is NA and counts
	{
	for (j in 1:c)
		if (is.na(m[i,j]))
			n <- n+1
	}
mm.1 <- matrix(NA,nrow=n,ncol=2,byrow=TRUE)	 # internal function variables initialization
mm.2 <- matrix(0,nrow=r,ncol=2,byrow=TRUE)
n <- 0  # internal function variables initialization
		
for (i in 1:r)	# extracts NA coordinates
	{
	k <- 0
	for (j in 1:c)
		if (is.na(m[i,j]))
			{
			n <- n+1
			k <- k+1
			mm.1[n,1] <- i
			mm.1[n,2] <- j
			}
	mm.2[i,1] <- i
	mm.2[i,2] <- k
	}
retval <- list(nacoord=mm.1,nainrow=mm.2)  # returning values
return(retval)			
}
	
	
em.countmvec <- function(x)
# Counts missing vector in the given data matrix
# x - data matrix
{
mr <- apply(x,1,em.existna)	 # count missing values per row
	
retval <- sum(mr$nacount>=dim(x)[2])	 # if all the elements in the vector are missing then counts it as missing vector
return(retval)
}


em.countmnear <- function(x)
# counts consecutive missing values in columns
{
rows <- nrow(x)
cols <- ncol(x)
mc <- matrix(0,nrow=rows,ncol=cols)
for(j in 1:cols)
	{
	for(i in 1:rows)
		{                                                                                
		count <- 0                                                                                    
		k <- i                                                                                        
		while(is.na(x[k,j])) 
			{                                                         
			count <- count+1                                                                            
			k <- k-1
			if(k==0)
				break
			}                                                                               
		mc[i,j] <- count
		}                                                                       
	} 
return(mc)
}


em.replacewmean <- function(m,mu,c,diag=FALSE)
# Replaces NA with mean
# m - data matrix
# mu - mean vector
# c - coordinates list
# diag - diagnostics (used for arima diagnostics)
{
n <- dim(c$nacoord)[1]	# internal function variable initialization
for (i in 1:n)
	# if ((c$nainrow[c$nacoord[i,1],2] != length(mu)) || (diag == TRUE))	 # if not all the elements are 	missing or diagnostics call (deprecated)
		# then fill in the missing elements with series mean
		m[c$nacoord[i,1],c$nacoord[i,2]] <- mu[c$nacoord[i,2]]
	
return(m)
}


em.dispersion <- function(m,v=TRUE)
# Estimates variance or correlation
# m - data matrix
# v - TRUE:variance FALSE:correlation
{
if (v == TRUE)
	retval <- cov(m,use="complete.obs")		# returns variance matrix
else
	retval <- cor(m,use="complete.obs")		# 	returns correlation matrix
return(retval)
}


em.arrangevec <- function(x,sv)
# Arranges a vector based in the given order
# x - vector to be arranged
# sv - split vector
{
lna <- dim(sv$part1)[1]		# internal function variables initialization
lknown <- dim(sv$part2)[1]
newx <- numeric(length(x))

for (i in 1:lna)	# stores missing elements at the beginning of the vector
	newx[i] <- x[sv$part1[i,1]]	
for (i in 1:lknown)		# stores known elements at the end of the vector
	newx[i+lna] <- x[sv$part2[i,1]]

return(newx)
}
	
	
em.rearrangevec <- function(x,sv)
# Rearranges a vector to its original order
# x - vector to be rearranged
# sv - split vector
{
lna <- dim(sv$part1)[1]		# internal function variables initialization
lknown <- dim(sv$part2)[1]
originalx <- numeric(length(x))
	
for (i in 1:lna)	# restores estimated elements to the original missing location
	originalx[sv$part1[i,1]] <- x[i]	
for (i in 1:lknown)		# restores known elements to their original location
	originalx[sv$part2[(i),1]] <- x[i+lna]

return(originalx)
}


em.arrangemat <- function(x,sv)
# Arranges a matrix based on a given order
# x - matrix to be arranged
# sv - split vector
{
lna <- dim(sv$part1)[1]		# internal function variables initialization
lknown <- dim(sv$part2)[1]
newx.t <- matrix(NA,nrow=dim(x)[1],ncol=dim(x)[2],byrow=TRUE)
newx <- matrix(NA,nrow=dim(x)[1],ncol=dim(x)[2],byrow=TRUE)
	
for (i in 1:lna)	# moves rows related to the missing elements to the beginning of the matrix
	newx.t[i,] <- x[sv$part1[i,1],]	
for (i in 1:lknown)		# moves rows related to the known elements to the end of the matrix
	newx.t[(i+lna),] <- x[sv$part2[i,1],]
for (j in 1:lna)	# moves columns related to the missing elements to the beginning of the matrix
	newx[,j] <- newx.t[,sv$part1[j,1]]	
for (j in 1:lknown)		# moves columns related to the missing elements to the end of the matrix
	newx[,(j+lna)] <- newx.t[,sv$part2[j,1]]
		
return(newx)
}
	
	
em.rearrangemat <- function(x,sv)
# Rearranges a matrix to its original order
# x - matrix to be rearranged
# sv - split vector
{
lna <- dim(sv$part1)[1]		# internal function variables initialization
lknown <- dim(sv$part2)[1]
newx.t <- matrix(NA,nrow=dim(x)[1],ncol=dim(x)[2],byrow=TRUE)
newx <- matrix(NA,nrow=dim(x)[1],ncol=dim(x)[2],byrow=TRUE)

for (i in 1:lna)	# restores rows related to estimated values to the original missing location
	newx.t[sv$part1[i,1],] <- x[i,]	
for (i in 1:lknown)		# restores rows related to known values to the original location
	newx.t[sv$part2[i,1],] <- x[(i+lna),]
for (j in 1:lna)	# restores columns related to estimated values to the original missing location
	newx[,sv$part1[j,1]] <- newx.t[,j]	
for (j in 1:lknown)		# restores columns related to known values to the original location
	newx[,sv$part2[j,1]] <- newx.t[,(j+lna)]

return(newx)
}


em.partmu <- function(mu,x)
# Divides mean vector in 2 partitions
# mu - mean vector
# x - partitioning position
{
mu.1 <- mu[1:x]	 # stores first partition of the vector
mu.2 <- mu[(x+1):length(mu)]	 # stores second partition of the vector

retval <- list(mu1=mu.1,mu2=mu.2)
return(retval)
}
	

em.partsigma <- function(sigma,x)
# Divides covariance matrix in 4 partitions
# sigma - covariance matrix
# x - partitioning position
{
c <- dim(sigma)[2]		# sub-matrices related to the partitions initialization

sigma.11 <- sigma[1:x,1:x]	 # stores values to the partition S11
sigma.22 <- sigma[(x+1):c,(x+1):c]	 # stores values to the partition S22
sigma.12 <- sigma[1:x,(x+1):c]	 # stores values to the partition S12
sigma.21 <- sigma[(x+1):c,1:x]	 # stores values to the partition S21

retval <- list(sigma11=sigma.11,sigma12=sigma.12,sigma21=sigma.21,sigma22=sigma.22)
return(retval)
}
	

em.splitvec <- function(v)
# Splits a vector in 2 parts: NA part and non-NA one
# v - vector to be split
{
l <- length(v)		# internal function variables initialization
n <- 0

for (j in 1:l)		# counts missing elements
	if (is.na(v[j]))
		n <- n+1
	
v.1 <- matrix(NA,nrow=n,ncol=2,byrow=TRUE)		# internal function variables initialization
v.2 <- matrix(NA,nrow=l-n,ncol=2,byrow=TRUE)
jv1 <- 1	# internal function variables initialization
jv2 <- 1

for (j in 1:l)
	if (is.na(v[j]))	# stores missing 	elements in the first part
		{
		v.1[jv1,1] <- j
		v.1[jv1,2] <- v[j]
		jv1 <- jv1+1
		}
	else	# stores known 	elements in the second part
		{
		v.2[jv2,1] <- j
		v.2[jv2,2] <- v[j]
		jv2 <- jv2+1
		}

retval <- list(part1=v.1,part2=v.2)
return(retval)		
}
	

em.correctmat <- function(m,l)
# Substitutes original values in the matrix by those estimated by EM algorithm
# m - matrix to be corrected
# l - list with the estimated values
{
r.l.1 <- dim(l$cont2na)[1]		# internal function variables initialization
c.l.1 <- dim(l$cont2na)[2]
r.l.2 <- dim(l$cont2obs)[1]
c.l.2 <- dim(l$cont2obs)[2]
for (i in 1:r.l.1)		# overwrite with the missing X missing contribution to T2
	{
	for (j in 1:c.l.1)
	m[i,j] <- l$cont2na[i,j]
	}
for (i in 1:r.l.2)		# overwrite with the missing X known contribution to T2
	{
	for (j in 1:c.l.2)
		{
		m[i,(c.l.1+j)] <- l$cont2obs[i,j]
		m[(c.l.1+j),i] <- l$cont2obs[i,j]
		}
	}

return(m)
}
	

em.contribt1 <- function(mp,sp,sv)
# Estimates the NA vector (contribution to T1)
# mp - mean vector
# sp - partitioned covariance matrix
# sv - NA/non <- NA split vector
{
retval <- mp$mu1+sp$sigma12%*%solve(sp$sigma22)%*%(sv$part2[,2]-mp$mu2)		 # returns the estimated vector
return(retval)
}
	

em.putvalue <- function(pv,sv)
# Replaces missing values with predicted values
# pv - predicted vector
# sv - split vector
{
sv$part1[,2] <- pv		# internal function variable initialization

retval <- c(sv$part1[,2],sv$part2[,2])		# returns the complete vector
return(retval)
}
	
	
em.contribt2 <- function(sp,pv,sv)
# Estimates the contribution of NA vector to T2
# sp - partitioned covariance matrix
# pv - predicted vector
# sv - split vector
{
c1 <- sp$sigma11-sp$sigma12%*%solve(sp$sigma22)%*%sp$sigma21+pv%*%t(pv)		 # estimates missing X missing contribution to T2
c2 <- pv%*%t(sv$part2[,2])		# estimates missing X known contribution to T2

retval <- list(cont2na=c1,cont2obs=c2)		# returning values
return(retval)
}
	

em.arima <- function(xn,o,s,eps,maxit)
# Estimates one-step ahead ARIMA predictions for each time series of a given multivariate time series
#  09/07/2002 - Modified to support seasonal series 
# xn - data matrix
# o - order matrix
# s - seasonal period
# mit - optimizer max iterations
# eps - optimizer relative tolerance
{
rows <- dim(xn)[1]
cols <- dim(xn)[2]
models <- as.list(rep(NA,cols))

ar.pred <- matrix(NA,nrow=rows,ncol=cols)		 # internal function variable initialization
for (j in 1:cols)		# applies models to all series
	{
	if (is.null(s))
		{
		order <- o[1:3,j]	# assigns jth order vector to jth series - without seasonality
		seasonal <- list(order=c(0,0,0),period=NA)
		}
	else
		{
		order <- o[1:3,j]
		seasonal <- list(order=o[4:6,j],period=s)		# assigns jth order vector to jth series - with 		seasonality
		}
			
	models[[j]] <- arima(xn[,j],order=order,seasonal=seasonal,xreg=NULL,optim.control=list(maxit=maxit,reltol=eps))	# fits 	arima model to jth series
	#ar.filter <- predict(models[[j]],xreg=NULL)		# applies filter to jth series
	#ar.pred[,j] <- ar.filter$pred		# stores filtered series to internal variable
	ar.pred[,j] <- xn[,j] - residuals(models[[j]]) # compute the one-step ahead prediction using the inovation
	}

retval <- list(ar.pred=ar.pred,models=models)	 # returning value
return(retval)
}


em.spline <- function(xn,df,w)
# Estimates smooth spline for each time series of a given multivariate time series
# xn - data matrix
# df - degrees of freedom for the splines. If NULL, cross-validation is used 
# w - weights matrix
{
rows <- dim(xn)[1]
cols <- dim(xn)[2]

t <- seq(1:rows)	# time index for smoothing
if (length(df) == 1)
df <- rep(df,cols)	# extend df vector if length=1
	
sp.pred <- matrix(NA,nrow=rows,ncol=cols)	 # internal function variable initialization
if (is.null(w))
	w <- matrix(1,nrow=rows,ncol=cols)	# creates null weights matrix
		
models <- as.list(rep(NA,cols))
if (is.null(df))
	for (j in 1:cols)		# applies models to all series
		{
		models[[j]] <- smooth.spline(x=t,y=xn[,j],w=w[,j],cv=TRUE)
		sp.pred[,j] <- predict(models[[j]],t)$y
		}
else
	for (j in 1:cols)		# applies models to all series
		{
		models[[j]] <- smooth.spline(x=t,y=xn[,j],w=w[,j],df=df[j])
		sp.pred[,j] <- predict(models[[j]],t)$y
		}
			
retval <- list(sp.pred=sp.pred,models=models)	 # returning value
return(retval)
}

	
em.gam <- function(formula,xn,names,dataset,w,eps,maxit,bf.eps,bf.maxit)
# Estimates a gam model for each time series of a given multivariate time series
# formula - gam formula
# xn - dependent variables matrix
# names - variable names
# dataset - dataset for covariates
# w - weights matrix
{
rows <- dim(xn)[1]
cols <- dim(xn)[2]

t <- seq(1:rows)	# time index for smoothing
ga.pred <- matrix(NA,nrow=rows,ncol=cols)	 # internal function variable initialization
if (is.null(w))
	w <- rep(1,rows)	# creates null weights 	vector
	
models <- as.list(rep(NA,cols))
XN <- as.data.frame(xn)
dimnames(XN) <- names
dataset <- cbind.data.frame(xn,dataset)
formula <- paste("XN$",formula,sep="")
for (j in 1:cols)		# applies models to all series
	{
	models[[j]] <- 	gam(as.formula(formula[j]),family=gaussian(),data=dataset,weights=w,na.action=na.exclude,epsilon=eps,maxit=maxit,bf.epsilon=bf.eps,bf.maxit=bf.maxit)
	ga.pred[,j] <- fitted(models[[j]])
	}

retval <- list(ga.pred=ga.pred,models=models)	# returning value
return(retval)
}


em.filter <- function(XN,names,dataset,method,ar.control,sp.control,ga.control,f.eps,f.maxit,ga.bf.eps,ga.bf.maxit)
# Performs filtering on the time series
{
if (method == "arima")
	{
	m.obj <- em.arima(xn=XN,o=ar.control$order,s=ar.control$period,eps=f.eps,maxit=f.maxit)		# then predicts one-step ahead arima estimates
	MA <- m.obj$ar.pred
	}
else if (method == "spline")
	{
	m.obj <- em.spline(xn=XN,df=sp.control$df,w=sp.control$weights)		# then predicts smooth spline estimates
	MA <- m.obj$sp.pred
	}
else if (method == "gam")
	{
	m.obj <- em.gam(formula=ga.control$formula,xn=XN,names=names,dataset=dataset,w=ga.control$weights,eps=f.eps,maxit=f.maxit,bf.eps=ga.bf.eps,bf.maxit=ga.bf.maxit)	# gam estimates
	MA <- m.obj$ga.pred
	}
else
	stop("Filtering method not implemented.")

retval <- list(MA=MA,models=m.obj$models)
return(retval)
}


em.nofilter <- function(M,by,rows,cols)
# Create mean matrix when it is not time series
{
nbylevels <- nlevels(by)
MA <- matrix(NA,nrow=rows,ncol=cols)
for (i in 1:rows)
	MA[i,] <- M[,,by[i]]
	
retval <- list(MA=MA)
return(retval)
}


em.recursion <- function(X,XN,MA,S,C,rows,cols,by,nbylevels,n)
# Performs algorithm recursion over the observations
{
weights <- matrix(1.0,rows,cols)
SCT1 <- array(0.0,dim=c(1,cols,nbylevels))	# internal function variables initialization
SCT2 <- array(0.0,dim=c(cols,cols,nbylevels))
		
for (i in 1:rows)		# applies to all vectors
	{
	if((C$nainrow[i,2] != 0) &&  (C$nainrow[i,2] != cols))	# if there is at least one missing 	value but not intire vector is missing (mincol is deprecated)
		{     # then runs E step
		SV <- em.splitvec(X[C$nainrow[i,1],])	# splits i-th vector at position nainrow
		MP <- em.partmu(em.arrangevec(MA[i,],SV),C$nainrow[i,2])	# assigns predicted vector
		SP <- em.partsigma(em.arrangemat(S[,,by[i]],SV),C$nainrow[i,2])		# assigns partitioned covariance matrix
		CT1 <- em.contribt1(MP,SP,SV)	# estimates contribution to T1
		TV <- em.putvalue(CT1,SV)	# puts estimated values in a temporary vector
		XN[C$nainrow[i,1],] <- em.rearrangevec(TV,SV)	# replaces existing vector in XN with the original order vector
		CT2 <- em.contribt2(SP,CT1,SV)		# estimates contributions to T2
		SCT1[,,by[i]] <- SCT1[,,by[i]]+XN[i,]	# increments summation vector of T1 in order to update mean vector
		SCT2INC <- em.rearrangemat(em.correctmat(TV%*%t(TV),CT2),SV)  # increment in SCT2, but matrix is ok
		SCT2[,,by[i]] <- SCT2[,,by[i]]+SCT2INC	# increment summation matrix of T2 in order to update covariance matrix
        }
	else if((C$nainrow[i,2] != 0) &&  (C$nainrow[i,2] == cols))	# if whole vector is missing (mincol is deprecated)
		{
		XN[C$nainrow[i,1],] <- MA[i,]
		SCT1[,,by[i]] <- SCT1[,,by[i]]+XN[i,] 	# \tilde{x}=\tilde{\mu}
		SCT2INC <- S[,,by[i]]+XN[i,]%*%t(XN[i,])	# \tilde{x}_{j}\tilde{x}_{j}^{'}=\tilde{\Sigma}+\tilde{\mu}_{j}\tilde{\mu}_{j}^{'}
		SCT2[,,by[i]] <- SCT2[,,by[i]]+SCT2INC
		}
	else	# else (no missing values)
		{
		SCT1[,,by[i]] <- SCT1[,,by[i]]+XN[i,]	# increments summation vector of T1 in order to update mean vector
		SCT2INC <- XN[i,]%*%t(XN[i,])
		SCT2[,,by[i]] <- SCT2[,,by[i]]+SCT2INC	# increments summation matrix of T2 in order to update covariance matrix
		}
    weights[i,] <- (cols-0.5*C$nainrow[i,2])/cols   # crude measure of uncertanty
	}
retval <- list(XN=XN,SCT1=SCT1,SCT2=SCT2,weights=weights)
return(retval)
}


mnimput <- function(formula,dataset,by=NULL,log=FALSE,log.offset=1,eps=1e-3,maxit=1e2,ts=TRUE,method="spline",sp.control=list(df=NULL,weights=NULL),ar.control=list(order=NULL,period=NULL),ga.control=list(formula,weights=NULL),f.eps=1e-6,f.maxit=1e3,ga.bf.eps=1e-6,ga.bf.maxit=1e3,verbose=FALSE,digits=getOption("digits"))
# EM algorithm with time series option
#  09/07/2002 - Modified to support multiplicative seasonal series
# dataset - input data matrix
# eps - stop criterion
# maxit - maximum iterations
# ts - is time series
# method - filtering method for time series imputation. it can be "smooth", "gam" or "arima"
# sp.control()
# df - degrees of freedom for splines
# weights - matrix holding weights for smooth spline
# ar.control()
# order - order matrix (required if time series is true)
# period - seasonal period (required if time series is true)
# ar.maxit - ARIMA optimizer max iterations (required if time series is true)
# ar.eps - ARIMA optimizer relative tolerance (required if time series is true)
# gam.control()
#
{
if (missing(formula))
	stop("Model formula is missing")
if (missing(dataset))
	stop("Dataset is missing")

t1 <- proc.time()	# start time
call <- match.call()	# stores call sintaxe
if (verbose)
	cat("Expectation-Maximization Algorithm\n")
mdframe <- model.frame(formula,dataset,na.action=na.pass)  # get missing data frame
names <- dimnames(mdframe)		# keeps dataset variables names

rows <- dim(mdframe)[1]		# internal function variables initialization
cols <- dim(mdframe)[2]
if (is.null(by))
	{
	by <- as.factor(rep(1,rows))
	nbylevels <- nlevels(by)
	}
else if (sum(is.na(by))>0)
		stop("BY indicator must not have missing values")
else
	{
	by <- as.factor(by)
	nbylevels <- nlevels(by)
	}

X <- matrix(unlist(mdframe),nrow=rows,ncol=cols)	 #copies input data matrix into X matrix to avoid conflicts with variables names

if (log)
   if (!is.null(log.offset))
      X <- log(X+log.offset)
   else
      stop("Log offset is set to NULL")

M <- array(sapply(by(X,by,em.mean),as.matrix),dim=c(cols,1,nbylevels))	 # assigns mean vector
dimnames(M) <- list(names[[2]],"mean",levels(as.factor(by)))
C <- em.extractcoord(X)		# stores missing values coordinates
XN <- em.replacewmean(X,M,C,FALSE)		# replaces jth column missing data in X matrix with jth column mean
#S <- em.dispersion(XN,TRUE)	# assigns the covariance matrix of XN
S <- array(sapply(by(XN,by,em.dispersion),as.matrix),dim=c(cols,cols,nbylevels))  # creates an array of covariance matrices
dimnames(S) <- list(names[[2]],names[[2]],levels(as.factor(by)))
cc <- 1e35		# initializes convergence criterion variable
#det <- em.det(S)		# assigns determinant of S
det <- 0
for (i in 1:nbylevels) 
	det <- det+em.det(S[,,i])  # sum determinants of all covariance matrices
k <- 0	
n <- c(by(by,by,length))
#m <- c(by(X,by,em.countmvec))		# assigns missing vector counts
models <- NULL

if ((method=="spline") && (length(sp.control$df)==1))
	sp.control$df <- rep(sp.control$df,cols)
	
if (verbose)
	cat("Iteration   Covergence Criterion   Elapsed Time (hh:mm:ss)\n")
	
while ((cc > eps) && (k < maxit))	# repeat until the convergence criterion or maximum iterations is reached
	{
	olddet <- det	# reassigns determinant of S
	if (ts)
		{		# if is time series
		filtered <- em.filter(XN,names,dataset,method,ar.control,sp.control,ga.control,f.eps,f.maxit,ga.bf.eps,ga.bf.maxit)
		MA <- filtered$MA
		models <- filtered$models
		}
	else
		{
		MA <- em.nofilter(M,by,rows,cols)$MA		# if it is not a time series problem, use the actual estimate of column means
		models <- NULL
		}
	recursed <- em.recursion(X,XN,MA,S,C,rows,cols,by,nbylevels,n)
	XN <- recursed$XN
	for (j in 1:nbylevels)
		M[,,j] <- recursed$SCT1[,,j]/n[j]	# estimates sufficient statistic T1 and computes the mean vector
	for (j in 1:nbylevels)
		S[,,j] <- recursed$SCT2[,,j]/n[j]-M[,,j]%*%t(M[,,j])	# estimates sufficient statistic T2 and computes the covariance matrix
	weights <- recursed$weights
	
	det <- 0
    for (i in 1:nbylevels) 
        det <- det+em.det(S[,,i])  # sum determinants of all covariance matrices
	cc <- abs((det-olddet)/olddet)		# updates covergence criterion
	k <- k+1	# increments iterations counter
	t2 <- proc.time()	# end time
	
	if (verbose)
		cat(paste(k,paste("         ",format(round(cc,digits),nsmall=digits),sep=""),elapsedtime(t1,t2),"\n", 		sep="          "))	# updates log file
	}
	
converged <- ifelse(cc<=eps,TRUE,FALSE)	
if (verbose)
	{
	if (converged)
		cat("\nConverged\n")	# updates log file
	else
		cat("\nDid not converged\n")	# updates log file
	}	
dimnames(MA) <- names	# gives variables names back
dimnames(XN) <- names	# gives variables names back
dimnames(weights) <- names
for (j in 1:nbylevels)
	names(M[,,j]) <- names[[2]]
for (j in 1:nbylevels)
	dimnames(S[,,j]) <- list(names[[2]],names[[2]])
	 
XN <- as.data.frame(XN)		# converts imputed dataset in dataframe
etime <- elapsedtime(t1,proc.time())	# getting elapsed time
missings <- C$nainrow[,2]
	
retval <- list(call=call,filled.dataset=XN,dataset=mdframe,muhat=M,sigmahat=S,level=MA,weights=weights,missings=missings,iterations=k,convergence=cc,converged=converged,time=etime,models=models,log=log,log.offset=log.offset)	 # assigns return values to temporary object
class(retval) <- "mtsdi"
return(retval)		# returning value
}


getmean <- function(object,weighted=TRUE,mincol=1,maxconsec=3)
# gets mean of row when mincol sastifies
{
if (!inherits(object,"mtsdi"))
    stop("Object class is not mtsdi")
     
if (object$log)
    filled.dataset <- exp(as.matrix(object$filled.dataset))-object$log.offset
else
    filled.dataset <- as.matrix(object$filled.dataset)
weights <- as.matrix(object$weights)
nm <- as.vector(object$missings)
n <- nrow(filled.dataset)
p <- ncol(filled.dataset)

misscount <- em.countmnear(object$dataset)
for(j in 1:p)
	for(i in 1:n)
		filled.dataset[i,j] <- ifelse(misscount[i,j]<=maxconsec,filled.dataset[i,j],NA)

cmean <- double(n)

for (t in 1:n)
    {
	if ((p-nm[t]) >= mincol)
		{
     	if (weighted)
        	cmean[t] <- weighted.mean(filled.dataset[t,],weights[t,],na.rm=TRUE)
     	else
        	cmean[t] <- mean(filled.dataset[t,],na.rm=TRUE)
     	}
	else
		cmean[t] <- NA
	}
return(cmean)
}


predict.mtsdi <- function(object,...)
# puts values on its original scale if log is used
{
if (!inherits(object,"mtsdi"))
    stop("Object class is not mtsdi")
     
if (object$log)
    filled.dataset <- exp(as.matrix(object$filled.dataset))-object$log.offset
else
    filled.dataset <- as.matrix(object$filled.dataset)
return(as.data.frame(filled.dataset))
}


plot.mtsdi <- function(x,vars="all",overlay=TRUE,level=TRUE,points=FALSE,leg.loc="topright",horiz=FALSE,at.once=FALSE,...)
# plot method for imputed data plotting
{
if (!inherits(x,"mtsdi"))
    stop("The class of this object is not mtsdi.")
if(x$log)
	{
	ylab <- "Axis on the log scale"
	dataset <- log(x$dataset+x$log.offset)
	}
else
	{
	ylab <- "Axis on the original scale"
	dataset <- x$dataset
	}
filled.dataset <- as.matrix(x$filled.dataset)
llevel <- x$level

rows <- dim(dataset)[[2]]

if (vars[1]=="all")
	vars <- seq(1:rows)
else if ((max(vars)>rows) || (length(vars)>rows))
	stop("Argument vars must be \"all\" or a valid vector")

#if(typeof(leg.loc)!="list")	
#	if (leg.loc=="auto") 
#		{ 
#		leg.loc <- list(x=18.5,y=16.1)
#		horiz <- FALSE
#		}

for (i in vars)
	{
	if (at.once)
		get(getOption("device"))()
	else
		if (interactive() && (i!=vars[1]) && (length(vars)!=1))
			{
			prompt <- readline("Press ENTER for next page or X to exit and keep this page... ")
			if((prompt == "x") || (prompt == "X"))
				break
			}
	plot(filled.dataset[,i],ylab=ylab,type="l",col="red",...)
	if (overlay)
		lines(dataset[,i],col="black",...)
	if (level)
		lines(llevel[,i],col="blue",...)
	if (points)
		points(filled.dataset[,i],col="red",...)
	legend(x=leg.loc,legend=c("observed","imputed","level"),col=c("black","red","blue"),lty=1,horiz=horiz,bty="n") # alternate location c(18.5,16.1)
	title(main=paste("Imputed data for series",colnames(filled.dataset)[i]))
	}
}


summary.mtsdi <- function(object,...)
# summary
{
call <- object$call
muhat <- object$muhat
sigmahat <- object$sigmahat
iterations <- object$iterations
convergence <- object$convergence
converged <- object$converged
time <- object$time
models <- object$models
log <- object$log
log.offset <- object$log.offset

retval <- list(call=call,muhat=muhat,sigmahat=sigmahat,iterations=iterations,convergence=convergence,converged=converged,time=time,models=models,log=log,log.offset=log.offset)	 # assigns return values to temporary object
class(retval) <- "summary.mtsdi"
return(retval)
}


print.summary.mtsdi <- function(x,digits=getOption("digits"),print.models=TRUE,...)
# summary print method
{
cat("\nCall:\n")
print(x$call)
cat("\nEstimated mean vector:\n")
for (i in 1:dim(x$muhat)[3])
	{
	if(dim(x$muhat)[3]!=1)
		cat("\nCovariance window id: ",dimnames(x$muhat)[[3]][i],"\n",sep="")
	print(x$muhat[,,i])
	}
cat("\nEstimated covariance matrix:\n")
for (i in 1:dim(x$muhat)[3])
	{
	if(dim(x$muhat)[3]!=1)
		cat("\nBY factor level: ",dimnames(x$muhat)[[3]][i],"\n",sep="")
	print(x$sigmahat[,,i])
	}

if (x$log)
	cat("\nData are on the log scale with an offset of ",x$log.offset,".\n",sep="")
else
	cat("\nData are on the original scale.\n")

conv <- ifelse(x$converged==TRUE,"converged","did not converge")
cat("\nThe algorithm ",conv," after ",x$iterations," iterations with relative diference in covariance matrix equal to ",round(x$convergence,digits),".\n",sep="")
cat("The process took ",x$time,".\n",sep="")

if ((print.models)&&(!is.null(x$models)))
	{
	for (i in length(x$models))
		x$models[[i]]$call <- NULL
	cat("\nTime filtering models:")
	for (i in 1:length(x$models))
		{
		cat("\n\nFilter model for variate: ",dimnames(x$muhat)[[1]][i],"\n",sep="")
		print(x$models[[i]])
		}
	}
}


print.mtsdi <- function(x,digits=getOption("digits"),...)
# print method for mtsdi class
{
cat("\nCall:\n")
print(x$call)
if (x$log)
	cat("\nData are on the log scale with an offset of ",x$log.offset,".\n",sep="")
else
	cat("\nData are on the original scale.\n")

conv <- ifelse(x$converged==TRUE,"converged","not converged")
cat("\nThe algorithm ",conv," after ",x$iterations," iterations with relative diference in covariance matrix equal to ",round(x$convergence,digits),".\n",sep="")
cat("The process took ",x$time,".\n",sep="")
}

Try the mtsdi package in your browser

Any scripts or data that you put into this service are public.

mtsdi documentation built on April 12, 2025, 1:15 a.m.