knitr::opts_chunk$set(echo = TRUE, comment = NA, error = TRUE)

Source: http://thomasleeper.com/Rcourse/Tutorials/mi.html

n <- 10
bad <- 3
good <- n - bad

set.seed(10)
x <- c(sample(1:n, good, replace = TRUE), rep(NA, bad))
x
mean(x, na.rm = TRUE)

Standard Error of only the elements that are not NA:

$$SE = \frac {\sigma_x} {\sqrt {\sum x_i}}$$

sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))

Generate 15 samples of new data replacing the NA elements with values extracted from the GOOD values.

na.count <- length(x[is.na(x) == TRUE])    # calculate the # of NAs

imp <- replicate(15, 
                 c(x[!is.na(x)],                           # get the 7 non-NA elements
                   sample(x[!is.na(x)], na.count, TRUE)),  # get 3 elements from non-NA vec
                 simplify = FALSE)                         # output to be a vector
imp

Applying the concept to the activity dataset

The dataset has to be transformed and put the intervals as variables and the steps as values. The date is the ID or index.

library(reshape)
library(RepDataPeerAssessment1)

# data(activity)
str(activity)
activity

Renaming steps as values

This is not really necessary because we can specify values in the function.

Keeping original variable names

# this data frame has the following form:
# date  interval_1  interval_2  interval_3 ... interval_n
#         step_11                                 step_1n
#         step_21                                 step_2n
#
actbycol <- cast(activity, date ~ interval, value = "steps")
# saving actbycol with another name
intervals <- actbycol
save(intervals, file = paste(project.data, "intervals.RData"), sep = "/")
cat("# of variables:", (ncol(actbycol)), "\t")
cat("# of intervals:", (ncol(actbycol)-1))
cat("\n")
# take a sample of the original data frame
mysample <- actbycol[sample(1:nrow(actbycol), 
                            10, 
                            replace=FALSE), 
                     c(1:7, 283:289)]
mysample
splice <- list(rows = c(1:5, 57:61), cols = c(1:5, 286:289))
mysample <- actbycol[splice$rows, splice$cols]
mysample

Starting with interval 0 of activity by intervals

In our case, the vector to investigate is any of the interval columns. Let's start with interval 0 assigning it as a vector.

x0 <- actbycol$`0`
x0

The mean is:

mean(x0, na.rm = TRUE)

The standard error:

sd(x0, na.rm = TRUE) / sqrt(sum(!is.na(x0)))
x <- x0
na.count <- length(x[is.na(x) == TRUE])    # calculate the # of NAs

imp <- replicate(15, 
                 c(x[!is.na(x)],                           # get the 7 non-NA elements
                   sample(x[!is.na(x)], na.count, TRUE)),  # get 3 elements from non-NA vec
                 simplify = FALSE)                         # output to be a vector
imp
calcAfter <- function(x, samples = 15) {
  na.count <- length(x[is.na(x) == TRUE])    # calculate the # of NAs

  imp <- replicate(samples, c(x[!is.na(x)],
                     sample(x[!is.na(x)], na.count, TRUE)),  
                   simplify = FALSE)                  

  means <- sapply(imp, mean)
  grandm <- mean(means)

  ses <- sapply(imp, sd)/sqrt(10)
  within <- mean(ses)
  between <- sum((means - grandm)^2)/(length(imp) - 1)

  grandvar <- within + ((1 + (1/length(imp))) * between)
  grandse <- sqrt(grandvar)

  list(grandMeans = grandm, grandSE = grandse)
}
calcAfter(x0, 15)

Now, let's see if something changed:

means <- sapply(imp, mean)
means
grandm <- mean(means)
grandm
ses <- sapply(imp, sd)/sqrt(10)
within <- mean(ses)
between <- sum((means - grandm)^2)/(length(imp) - 1)

grandvar <- within + ((1 + (1/length(imp))) * between)
grandse <- sqrt(grandvar)
grandse

Applying on interval 5 but with a function

calcBefore <- function(x) {
  na.count <- length(x[is.na(x) == TRUE])    # calculate the # of NAs
  notna.count <- length(x[is.na(x) == FALSE]) 
  mean <- mean(x, na.rm = TRUE)
  serror <- sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))
  list(notna = notna.count, na = na.count, mean = mean, stdError = serror)
}
calcBefore(x0)
x5 <- actbycol$`5`
calcBefore(x5)
calcAfter(x5)
x10 <- actbycol$`10`
calcBefore(x10)
calcAfter(x10)
x25 <- actbycol$`25`
calcBefore(x25)
calcAfter(x25)

Vectorizing functions

lapply example

y <- list(a = 1:5, b = rnorm(10))
y
lapply(y, mean)

sapply example

y <- list(a = 1:5, b = rnorm(10))
y
sapply(y, mean)
# subset of only the numerical data, no date
actbycol2 <- actbycol[, 2:289]

Applying a function to all the dataframe, excluding the date""

sapply(actbycol2, calcBefore)

Calculating the Standard Error

sapply(actbycol2, calcAfter)
hist(activity$value)

histogram of activity without zeros

activity.nonzero <- subset(activity, activity$value > 0)
hist(activity.nonzero$value)

Histogram of activity restored from cast()

actbycol.melt <- melt(actbycol, id = "date")
rownames(actbycol.melt) <- NULL
hist(subset(actbycol.melt, actbycol.melt$value > 0)$value)

Function to imputting NA with random values

random.imp <- function (a){
  missing <- is.na(a)        # logical vector. TRUE for NAs
  n.missing <- sum(missing)  # number of missing values
  a.obs <- a[!missing]       # vector of GOOD values
  imputed <- a               # copy of original vector

  # Take a sample from GOOD part of the vector and replace missing elements
  imputed[missing] <- sample (a.obs, n.missing, replace=TRUE)
  return (imputed)           # return complete vector
}
x
indNA <- which(is.na(x))   # which rows are NA
indNA
missing <- is.na(x)
missing
n.missing <- sum(missing)
n.missing
x.obs <- x[!missing]
x.obs
imputed <- x
imputed
imputed[missing] <- sample (x.obs, n.missing, replace=TRUE)
imputed

Running function on a vector

imp <- random.imp(x)
plot(density(imp[indNA]), main = "Observed and imputed values", xlab = "x")
lines(density(imp[which(!((1:nrow(array(x))) %in% indNA))]), col = "red")
legend("topleft",text.col = c("black","red"),legend = c(" Imputation","Observed values"))

Running function on 1st interval of actbycol2

x0
imp <- random.imp(x0)      # impute random values
indNA <- which(is.na(x0))   # which rows are NA

plot(density(imp[indNA]), 
     main = "Observed and imputed values", 
     xlab = "x0")
lines(density(imp[which(!((1:nrow(array(x0))) %in% indNA))]), col = "red")
legend("topright",text.col = c("black","red"),legend = c(" Imputation","Observed values"))
actbycol2[splice$rows, splice$cols-1]

replaced by random values

actbycol.rep <- random.imp(actbycol2[, 1:10])
actbycol.rep[splice$rows, 1:10]

numerical dataframe with replaced random values

summary(actbycol2)
actbycol.imp <- random.imp(actbycol2)    
actbycol.imp[splice$rows, splice$cols-1]
date <- actbycol[, 1]
actbycol.bind <- cbind(date, actbycol.imp)
actbycol.imp.melt <- melt(actbycol.bind, id = "date")
names(actbycol.imp.melt) <- c("date", "interval", "steps")
# rownames(actbycol.melt) <- NULL
hist(subset(actbycol.imp.melt, actbycol.imp.melt$steps > 0)$steps)
#hist(actbycol.imp.melt$steps)
dim(activity)
dim(actbycol.imp.melt)

Imputing random values. Density plot

library(reshape)
library(lattice)
library(mice)

# load the data
data("activity")

# convert interval variable to multiple columns
actbycol <- cast(activity, date ~ interval, value = "steps")

# imputting random values
actbycol.imp <- random.imp(actbycol[2:289]) 

# rebuild to interval to one column
date <- actbycol[, 1]                       # get the date
actbycol.bind <- cbind(date, actbycol.imp)  # merge 288 cols with date
actbycol.imp.melt <- melt(actbycol.bind, id = "date")  # go back to long df
names(actbycol.imp.melt) <- c("date", "interval", "steps")  # rename vars

# order by date so it is the same as original
actbycol.imp.melt <- actbycol.imp.melt[order(actbycol.imp.melt$date), ]

# convert interval to numeric
actbycol.imp.melt$interval <- as.numeric(as.character(actbycol.imp.melt$interval))

# prepare for plotting
imp.df <- actbycol.imp.melt                 # the imputted data frame
indVar <- "steps"                           # variable for plotting
indNA <- which(is.na(activity[, indVar]))   # which rows are NA
indNA.not <- which(!((1:nrow(activity)) %in% indNA))  # 

plot(density(imp.df[indNA, indVar]), 
     main = "Observed and imputed values", 
     xlab = "steps")
lines(density(imp.df[indNA.not, indVar]), 
      col = "red")
legend("topright",text.col = c("black","red"),legend = c(" Imputation","Observed values"))
lwd <- 1.5
data <- imp.df[, c("steps", "interval")]
tp <- xyplot(steps~interval, data = imp.df,
             na.groups = ici(imp.df),
             ylab ="steps", xlab="interval",
       cex = 0.75, 
       xlim = c(0, 2355),

       lex=lwd
       #ylim = c(0, 900), 
       #xlim = c(0, 500))
)
#print(tp, newpage = FALSE, position = c(0.48,0.08,1,0.92))
tp

Summary. Random imputing

library(RepDataPeerAssessment1)

# get information before imputing data
df.before <- info.imp(activity, indVar, "Before", 2)

# get information after imputing data
df.random <- info.imp(imp.df, indVar, "Random", 2)


df.random <- cbind(df.before, df.random)
df.random
(df.random["ses", "Before"] - df.random["ses", "Random"]) / df.random["ses", "Before"] * 100
library(fBasics)
df <- activity[, c("interval", "steps")]
basicStats(df)

Normalized variables steps and row-index

y <- imp.df$steps
y.sig <- sd(y)
y.mu <- mean(y)
y.norm <- (y - y.mu) / y.sig
min(y.norm)
max(y.norm)

x <- seq(1:nrow(imp.df))
x.sig <- sd(x)
x.mu <- mean(x)
x.norm <- (x - x.mu) / x.sig
min(x.norm)
max(x.norm)

Normalized interval (optional) DO NOT RUN

This has the disadvantage of showing discontinuities along the x-axis since the interval is no n contiguous jumping in steps of 5.

x <- imp$interval
x.sig <- sd(x)
x.mu <- mean(x)
x.norm <- (x - x.mu) / x.sig
min(x.norm)
max(x.norm)

Normalized plot x vs y of random imputting

Normalizing the row-index should be the prefferred option to avoid showing gap along the x-axis.

# find NA and not NA row-indexes
indVar <- "steps"
indNA <- which(is.na(activity[, indVar]))   # which rows are NA
indNA.not <- which(!((1:nrow(activity)) %in% indNA))

# plot row-index vs steps
col <- c(mdc(4),mdc(5))

plot(x.norm[indNA.not], y.norm[indNA.not], 
     main = "index vs steps",
     xlab = "x",
     ylab = "y",
     col = col[1]
     )
points(x.norm[indNA], y.norm[indNA], col = col[2])

Mean Imputation

library(RepDataPeerAssessment1)
library(lattice)
library(mice)

par(mfrow = c(1,2))
data(activity)

activity.nodate <- activity[, c(1, 3)]

### impute the mean
#imp <- mice(activity.nodate, method="mean", m=1, maxit=1)
imp <- mice(activity.nodate, method="mean", m=1, maxit=1, print = FALSE)

### Figure 1.1
lwd <- 1.5
# par(mfrow=c(1,2))

max <- max(nas.complete(activity$steps))

breaks <- seq(0, max*1.1, 20)
nudge <- 1

x <- matrix(c(breaks-nudge, breaks+nudge), ncol=2)

obs <- activity[, "steps"]
mis  <- imp$imp$steps[,1]

fobs <- c(hist(obs, breaks, plot=FALSE)$counts, 0)
fmis <- c(hist(mis, breaks, plot=FALSE)$counts, 0)

y <- matrix(c(fobs, fmis), ncol=2)

matplot(x, y, type="s",
        col=c(mdc(4),mdc(5)), lwd=2, lty=1,
        #xlim = c(0, 170), 
        ylim = c(-5, 200), 
        yaxs = "i",
        xlab="Steps",
        ylab="Frequency")
box()

tp <- xyplot(imp, steps~interval, 
             na.groups = ici(imp),
             ylab ="steps", xlab="interval",
       cex = 0.75, 

       lex=lwd
       #ylim = c(0, 900), 
       #xlim = c(0, 500))
)
print(tp, newpage = FALSE, position = c(0.48,0.08,1,0.92))
# create summary table
indVar <- "steps"     # variable to analyze

imp.df <- get.imp.df(activity, imp, indVar)  # merge imputed values in original

# get information before imputing data
df.before <- info.imp(activity, indVar, "Before", 2)    # summary before

# get information after imputing data
df.mean <- info.imp(imp.df, indVar, "Mean", 2)          # summary after

res.mean <- cbind(df.before, df.mean)       # merge the two data frames
res.mean
(res.mean["ses", "Before"] - res.mean["ses", "Mean"]) / res.mean["ses", "Before"] * 100

Verify that data frame merged well

# x<-cbind(rnorm(10),rnorm(10))
indVar <- "steps"
indNA <- which(is.na(activity[, indVar]))            # which rows are NA
indNA.not <- which(!((1:nrow(activity)) %in% indNA)) # rows that are not NA

plot1<-xyplot(imp.df[indNA.not, indVar]~imp.df[indNA.not, "interval"],
              col="blue", cex = 0.75)
plot2<-xyplot(imp.df[indNA, indVar]~imp.df[indNA, "interval"], 
              col = "red", cex = 0.75)

library(latticeExtra)
plot1+plot2

Regression Imputation

fit <- lm(steps ~ interval, data = activity)   # fit the variables
pred <- predict(fit, newdata = ic(activity))   # build predictor

### alternative using mice
imp <- mice(activity[,c(1,3)], method="norm.predict", 
            m=1, maxit=3, seed=1, print = FALSE)

### Figure 1.2
par(mfrow = c(1,2))

fmis <- c(hist(pred, breaks, plot=FALSE)$counts, 0)
y <- matrix(c(fobs, fmis), ncol=2)

matplot(x, y, type="s",
        col=c(mdc(4),mdc(5)), lwd=2, lty=1,
        ylim = c(0, 200),
        yaxs = "i",
        xlab="steps",
        ylab="Frequency")
box()

tp <- xyplot(imp, steps~interval,
       ylab="steps", xlab="interval",
       cex = 0.75, lex=lwd
)
print(tp, newpage = FALSE, position = c(0.48,0.08,1,0.92))
# create summary table
indVar <- "steps"     # variable to analyze

imp.df <- get.imp.df(activity, imp, indVar)  # merge imputed values in original

# get information before imputing data
df.before <- info.imp(activity, indVar, "Before", 2)    # summary before

# get information after imputing data
df.regression <- info.imp(imp.df, indVar, "Regression", 2)          # summary after

res.regre <-cbind(df.before, df.regression)       # merge the two data frames
res.regre
(res.regre["ses", "Before"] - res.regre["ses", "Regression"]) / res.regre["ses", "Before"] * 100

Stochastic regression imputation

library(lattice)
library(mice)

par(mfrow = c(1,2))

imp <- mice(activity[,c(1,3)], method="pmm", m=1, maxit=3, seed=1, print = FALSE)

lwd <- 2
nudge <- 1

max <- max(nas.complete(activity$steps))
# breaks <- seq(-100, 900, 10)
breaks <- seq(0, max*1.1, 20)

obs <- activity[, "steps"]
mis  <- imp$imp$steps[,1]

fobs <- c(hist(obs, breaks, plot=FALSE)$counts, 0)
fmis <- c(hist(mis, breaks, plot=FALSE)$counts, 0)

x <- matrix(c(breaks-nudge, breaks+nudge), ncol=2)
y <- matrix(c(fobs, fmis), ncol=2)

matplot(x, y, type="s",
        col=c(mdc(4),mdc(5)), lwd=2, lty=1,
        ylim = c(0, 300),
        yaxs = "i",
        xlab="steps",
        ylab="Frequency")

box()

tp <- xyplot(imp, steps~interval, na.groups=ici(imp),
       ylab="steps", xlab="interval",
       cex = 0.75, lex=lwd
       )

print(tp, newpage = FALSE, position = c(0.48,0.08,1,0.92))

# print(stripplot(imp), newpage = FALSE, position = c(0.2, 0.02, 2,0.7))
# stripplot(imp)
# create summary table
indVar <- "steps"     # variable to analyze

imp.df <- get.imp.df(activity, imp, indVar)  # merge imputed values in original

# get information before imputing data
df.before <- info.imp(activity, indVar, "Before", 2)    # summary before

# get information after imputing data
df.pmm <- info.imp(imp.df, indVar, "pmm", 2)          # summary after

res.pmm <- cbind(df.before, df.pmm)       # merge the two data frames
res.pmm
(res.pmm["ses", "Before"] - res.pmm["ses", "pmm"]) / res.pmm["ses", "Before"]  * 100

LOCF with pkg zoo

### Figure 1.4

library(zoo)

vec <- activity$steps

bz <- zoo(vec)
length(bz)
imp <- zoo::na.locf(bz)
length(bz) - length(imp)

nas.count(imp)
limRow <- length(imp)
limRow

colvec <- ifelse(is.na(vec), mdc(2), mdc(1))    # missing data coloring (Mdc)

imp.df <- data.frame(imp)
names(imp.df) <- "steps"
imp.ind <- as.numeric(rownames(imp.df))
range(imp.ind)

plot(activity$interval[imp.ind], imp.df$steps, 
     col = colvec, 
     # type="l", 
     xlab="Observation", 
     ylab="Steps")

points(imp.df[,], col = colvec, pch=20, cex=1)
zoo.get.stats <- function(x, uniVar, colName, decs = 3) {
  tmp <- data.frame(nrows = nrow(x),
                    median = median(x[, uniVar]),
                    mean = mean(x[, uniVar]),
                    sd = sd(x[, uniVar]),
                    ses = sd(x[, uniVar]) / sqrt(sum(!is.na(x[, uniVar])))
  )
  tmp <- data.frame(round(t(tmp), decs))
  names(tmp) <- colName
  return(tmp)
}  
# create summary table
indVar <- "steps"     # variable to analyze


# get information before imputing data
df.before <- info.imp(activity, indVar, "Before", 2)    # summary before

# get information after imputing data
df.locf <- zoo.get.stats(imp.df, indVar, "locf", 2)           # summary after

res.locf <- cbind(df.before, df.locf)       # merge the two data frames
res.locf
(res.locf["ses", "Before"] - res.locf["ses", "locf"]) / res.locf["ses", "Before"] * 100

LOCF

library(lattice)
library(mice)


St <- activity$steps

locf <- function(x) {
  # last observation carried forward
  NonNAindex <- which(!is.na(x))  # find the index of all non-NAs
  firstNonNA <- min(NonNAindex)   # get the lower index or 1st non-NA
  a <- x[firstNonNA]              # assign 1st non-NA observation to pivot

  for (i in 1:length(x)) {        # count from the 1st to the end of the vector
    if (is.na(x[i])) x[i] <- a    # if current element is NA, then make it as pivot
    else a <- x[i]                # else make pivot same as ith
  }
  return(x)
}

Sti <- locf(St)
length(Sti)
nas.count(Sti)
colvec <- ifelse(is.na(St), mdc(2), mdc(1))

### Figure 1.4

limRow <- 12000

plot(Sti[1:limRow], 
     col = colvec, 
     type="l", 
     xlab="Observation", 
     ylab="Steps")

points(Sti[1:limRow], col = colvec, pch=20, cex=1)
LOCF <- function(x) {
    # Last Observation Carried Forward (for a left to right series)
    LOCF <- max(which(!is.na(x))) # the location of the Last Observation to Carry Forward
    x[LOCF:length(x)] <- x[LOCF]
    return(x)
}

Sti <- LOCF(St)
length(Sti)
nas.count(Sti)

par(mfrow=c(1,1))
limRow <- 17568

colvec <- ifelse(is.na(St), mdc(2), mdc(1))    # missing data coloring (Mdc)

plot(Sti[1:limRow], 
     col = colvec, 
     type="l", 
     xlab="Observation", 
     ylab="Steps")

points(Sti[1:limRow], col = colvec, pch=20, cex=1)
na.locf <- function(x) {
  v <- !is.na(x)
  c(NA, x[v])[cumsum(v)+1]
}

Sti <- na.locf(St)
length(Sti)

nas.count(Sti)
par(mfrow=c(1,1))
limRow <- 12000

colvec <- ifelse(is.na(St), mdc(2), mdc(1))    # missing data coloring (Mdc)

plot(Sti[1:limRow], 
     col = colvec, 
     type="l", 
     xlab="Observation", 
     ylab="Steps")

points(Sti[1:limRow], col = colvec, pch=20, cex=1)
mean(St, na.rm = TRUE)
# Standard Error of only the elements that are not NA
sum(!is.na(St))
sd(St, na.rm = TRUE) / sqrt(sum(!is.na(St)))
mean(Sti, na.rm = TRUE)
# Standard Error of only the elements that are not NA
sum(!is.na(Sti))
sd(Sti, na.rm = TRUE) / sqrt(sum(!is.na(Sti)))

Applying LOCF to a transposed data.frame

library(reshape)
library(RepDataPeerAssessment1)

data(activity)

#activity$interval <- as.factor(activity$interval)
names(activity) <- c("value", "date", "interval")
actbycol <- cast(activity, date ~ interval)

actbycol.nodate <- actbycol[, 2:289]    # remove the date column for now
act <- sapply(actbycol.nodate, zoo::na.locf)    # apply LOCF to numerical intervals
df <- data.frame(act)       # create data frame
names(df) <- colnames(act)  # rename

# rebuild to old activity structure
actbycol.shifted <- actbycol[2:61, 1]       # get only the date column but not row #1
actbycol.new <- cbind(actbycol.shifted, df)

#rownames(actbycol.new) <- NULL
names(actbycol.new)[1] <- "date"
activity.new <- melt(actbycol.new, id = "date")    # melt DF as original
names(activity.new) <- c("date", "interval", "steps")  # rename vars
mean(activity.new$steps)
sd(activity.new$steps) / sqrt(sum(!is.na(activity.new$steps)))

This number is not better than the one we obtained by doing the LOFC replacement in the whole steps column above.



AlfonsoRReyes/RepDataPeerAssessment1 documentation built on May 5, 2019, 4:53 a.m.