knitr::opts_chunk$set(echo = TRUE, comment = NA, error = TRUE)
Source: http://thomasleeper.com/Rcourse/Tutorials/mi.html
n <- 10 bad <- 3 good <- n - bad set.seed(10) x <- c(sample(1:n, good, replace = TRUE), rep(NA, bad)) x
mean(x, na.rm = TRUE)
Standard Error of only the elements that are not NA:
$$SE = \frac {\sigma_x} {\sqrt {\sum x_i}}$$
sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))
Generate 15 samples of new data replacing the NA elements with values extracted from the GOOD values.
na.count <- length(x[is.na(x) == TRUE]) # calculate the # of NAs imp <- replicate(15, c(x[!is.na(x)], # get the 7 non-NA elements sample(x[!is.na(x)], na.count, TRUE)), # get 3 elements from non-NA vec simplify = FALSE) # output to be a vector imp
activity
datasetThe dataset has to be transformed and put the intervals as variables and the steps as values. The date is the ID or index.
library(reshape) library(RepDataPeerAssessment1) # data(activity) str(activity) activity
steps
as values
This is not really necessary because we can specify values
in the function.
# this data frame has the following form: # date interval_1 interval_2 interval_3 ... interval_n # step_11 step_1n # step_21 step_2n # actbycol <- cast(activity, date ~ interval, value = "steps")
# saving actbycol with another name intervals <- actbycol save(intervals, file = paste(project.data, "intervals.RData"), sep = "/")
cat("# of variables:", (ncol(actbycol)), "\t") cat("# of intervals:", (ncol(actbycol)-1)) cat("\n")
# take a sample of the original data frame mysample <- actbycol[sample(1:nrow(actbycol), 10, replace=FALSE), c(1:7, 283:289)] mysample
splice <- list(rows = c(1:5, 57:61), cols = c(1:5, 286:289)) mysample <- actbycol[splice$rows, splice$cols] mysample
0
of activity by intervalsIn our case, the vector to investigate is any of the interval columns. Let's start with interval 0 assigning it as a vector.
x0 <- actbycol$`0` x0
The mean is:
mean(x0, na.rm = TRUE)
The standard error:
sd(x0, na.rm = TRUE) / sqrt(sum(!is.na(x0)))
x <- x0 na.count <- length(x[is.na(x) == TRUE]) # calculate the # of NAs imp <- replicate(15, c(x[!is.na(x)], # get the 7 non-NA elements sample(x[!is.na(x)], na.count, TRUE)), # get 3 elements from non-NA vec simplify = FALSE) # output to be a vector imp
calcAfter <- function(x, samples = 15) { na.count <- length(x[is.na(x) == TRUE]) # calculate the # of NAs imp <- replicate(samples, c(x[!is.na(x)], sample(x[!is.na(x)], na.count, TRUE)), simplify = FALSE) means <- sapply(imp, mean) grandm <- mean(means) ses <- sapply(imp, sd)/sqrt(10) within <- mean(ses) between <- sum((means - grandm)^2)/(length(imp) - 1) grandvar <- within + ((1 + (1/length(imp))) * between) grandse <- sqrt(grandvar) list(grandMeans = grandm, grandSE = grandse) }
calcAfter(x0, 15)
Now, let's see if something changed:
means <- sapply(imp, mean) means
grandm <- mean(means) grandm
ses <- sapply(imp, sd)/sqrt(10) within <- mean(ses) between <- sum((means - grandm)^2)/(length(imp) - 1) grandvar <- within + ((1 + (1/length(imp))) * between) grandse <- sqrt(grandvar) grandse
5
but with a functioncalcBefore <- function(x) { na.count <- length(x[is.na(x) == TRUE]) # calculate the # of NAs notna.count <- length(x[is.na(x) == FALSE]) mean <- mean(x, na.rm = TRUE) serror <- sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x))) list(notna = notna.count, na = na.count, mean = mean, stdError = serror) }
calcBefore(x0)
x5 <- actbycol$`5` calcBefore(x5) calcAfter(x5)
x10 <- actbycol$`10` calcBefore(x10) calcAfter(x10)
x25 <- actbycol$`25` calcBefore(x25) calcAfter(x25)
lapply
exampley <- list(a = 1:5, b = rnorm(10)) y lapply(y, mean)
sapply
exampley <- list(a = 1:5, b = rnorm(10)) y sapply(y, mean)
# subset of only the numerical data, no date actbycol2 <- actbycol[, 2:289]
Applying a function to all the dataframe, excluding the date""
sapply(actbycol2, calcBefore)
sapply(actbycol2, calcAfter)
hist(activity$value)
activity.nonzero <- subset(activity, activity$value > 0) hist(activity.nonzero$value)
actbycol.melt <- melt(actbycol, id = "date") rownames(actbycol.melt) <- NULL hist(subset(actbycol.melt, actbycol.melt$value > 0)$value)
random.imp <- function (a){ missing <- is.na(a) # logical vector. TRUE for NAs n.missing <- sum(missing) # number of missing values a.obs <- a[!missing] # vector of GOOD values imputed <- a # copy of original vector # Take a sample from GOOD part of the vector and replace missing elements imputed[missing] <- sample (a.obs, n.missing, replace=TRUE) return (imputed) # return complete vector }
x indNA <- which(is.na(x)) # which rows are NA indNA
missing <- is.na(x) missing
n.missing <- sum(missing) n.missing
x.obs <- x[!missing] x.obs
imputed <- x
imputed
imputed[missing] <- sample (x.obs, n.missing, replace=TRUE) imputed
imp <- random.imp(x) plot(density(imp[indNA]), main = "Observed and imputed values", xlab = "x") lines(density(imp[which(!((1:nrow(array(x))) %in% indNA))]), col = "red") legend("topleft",text.col = c("black","red"),legend = c(" Imputation","Observed values"))
actbycol2
x0 imp <- random.imp(x0) # impute random values indNA <- which(is.na(x0)) # which rows are NA plot(density(imp[indNA]), main = "Observed and imputed values", xlab = "x0") lines(density(imp[which(!((1:nrow(array(x0))) %in% indNA))]), col = "red") legend("topright",text.col = c("black","red"),legend = c(" Imputation","Observed values"))
actbycol2[splice$rows, splice$cols-1]
actbycol.rep <- random.imp(actbycol2[, 1:10])
actbycol.rep[splice$rows, 1:10]
summary(actbycol2) actbycol.imp <- random.imp(actbycol2) actbycol.imp[splice$rows, splice$cols-1]
date <- actbycol[, 1] actbycol.bind <- cbind(date, actbycol.imp)
actbycol.imp.melt <- melt(actbycol.bind, id = "date") names(actbycol.imp.melt) <- c("date", "interval", "steps") # rownames(actbycol.melt) <- NULL hist(subset(actbycol.imp.melt, actbycol.imp.melt$steps > 0)$steps) #hist(actbycol.imp.melt$steps)
dim(activity) dim(actbycol.imp.melt)
library(reshape) library(lattice) library(mice) # load the data data("activity") # convert interval variable to multiple columns actbycol <- cast(activity, date ~ interval, value = "steps") # imputting random values actbycol.imp <- random.imp(actbycol[2:289]) # rebuild to interval to one column date <- actbycol[, 1] # get the date actbycol.bind <- cbind(date, actbycol.imp) # merge 288 cols with date actbycol.imp.melt <- melt(actbycol.bind, id = "date") # go back to long df names(actbycol.imp.melt) <- c("date", "interval", "steps") # rename vars # order by date so it is the same as original actbycol.imp.melt <- actbycol.imp.melt[order(actbycol.imp.melt$date), ] # convert interval to numeric actbycol.imp.melt$interval <- as.numeric(as.character(actbycol.imp.melt$interval)) # prepare for plotting imp.df <- actbycol.imp.melt # the imputted data frame indVar <- "steps" # variable for plotting indNA <- which(is.na(activity[, indVar])) # which rows are NA indNA.not <- which(!((1:nrow(activity)) %in% indNA)) # plot(density(imp.df[indNA, indVar]), main = "Observed and imputed values", xlab = "steps") lines(density(imp.df[indNA.not, indVar]), col = "red") legend("topright",text.col = c("black","red"),legend = c(" Imputation","Observed values"))
lwd <- 1.5 data <- imp.df[, c("steps", "interval")] tp <- xyplot(steps~interval, data = imp.df, na.groups = ici(imp.df), ylab ="steps", xlab="interval", cex = 0.75, xlim = c(0, 2355), lex=lwd #ylim = c(0, 900), #xlim = c(0, 500)) ) #print(tp, newpage = FALSE, position = c(0.48,0.08,1,0.92)) tp
library(RepDataPeerAssessment1) # get information before imputing data df.before <- info.imp(activity, indVar, "Before", 2) # get information after imputing data df.random <- info.imp(imp.df, indVar, "Random", 2) df.random <- cbind(df.before, df.random) df.random
(df.random["ses", "Before"] - df.random["ses", "Random"]) / df.random["ses", "Before"] * 100
library(fBasics) df <- activity[, c("interval", "steps")] basicStats(df)
steps
and row-index
y <- imp.df$steps y.sig <- sd(y) y.mu <- mean(y) y.norm <- (y - y.mu) / y.sig min(y.norm) max(y.norm) x <- seq(1:nrow(imp.df)) x.sig <- sd(x) x.mu <- mean(x) x.norm <- (x - x.mu) / x.sig min(x.norm) max(x.norm)
This has the disadvantage of showing discontinuities along the x-axis since the interval is no n contiguous jumping in steps of 5.
x <- imp$interval x.sig <- sd(x) x.mu <- mean(x) x.norm <- (x - x.mu) / x.sig min(x.norm) max(x.norm)
x
vs y
of random imputtingNormalizing the row-index should be the prefferred option to avoid showing gap along the x-axis.
# find NA and not NA row-indexes indVar <- "steps" indNA <- which(is.na(activity[, indVar])) # which rows are NA indNA.not <- which(!((1:nrow(activity)) %in% indNA)) # plot row-index vs steps col <- c(mdc(4),mdc(5)) plot(x.norm[indNA.not], y.norm[indNA.not], main = "index vs steps", xlab = "x", ylab = "y", col = col[1] ) points(x.norm[indNA], y.norm[indNA], col = col[2])
library(RepDataPeerAssessment1) library(lattice) library(mice) par(mfrow = c(1,2)) data(activity) activity.nodate <- activity[, c(1, 3)] ### impute the mean #imp <- mice(activity.nodate, method="mean", m=1, maxit=1) imp <- mice(activity.nodate, method="mean", m=1, maxit=1, print = FALSE) ### Figure 1.1 lwd <- 1.5 # par(mfrow=c(1,2)) max <- max(nas.complete(activity$steps)) breaks <- seq(0, max*1.1, 20) nudge <- 1 x <- matrix(c(breaks-nudge, breaks+nudge), ncol=2) obs <- activity[, "steps"] mis <- imp$imp$steps[,1] fobs <- c(hist(obs, breaks, plot=FALSE)$counts, 0) fmis <- c(hist(mis, breaks, plot=FALSE)$counts, 0) y <- matrix(c(fobs, fmis), ncol=2) matplot(x, y, type="s", col=c(mdc(4),mdc(5)), lwd=2, lty=1, #xlim = c(0, 170), ylim = c(-5, 200), yaxs = "i", xlab="Steps", ylab="Frequency") box() tp <- xyplot(imp, steps~interval, na.groups = ici(imp), ylab ="steps", xlab="interval", cex = 0.75, lex=lwd #ylim = c(0, 900), #xlim = c(0, 500)) ) print(tp, newpage = FALSE, position = c(0.48,0.08,1,0.92))
# create summary table indVar <- "steps" # variable to analyze imp.df <- get.imp.df(activity, imp, indVar) # merge imputed values in original # get information before imputing data df.before <- info.imp(activity, indVar, "Before", 2) # summary before # get information after imputing data df.mean <- info.imp(imp.df, indVar, "Mean", 2) # summary after res.mean <- cbind(df.before, df.mean) # merge the two data frames res.mean
(res.mean["ses", "Before"] - res.mean["ses", "Mean"]) / res.mean["ses", "Before"] * 100
# x<-cbind(rnorm(10),rnorm(10)) indVar <- "steps" indNA <- which(is.na(activity[, indVar])) # which rows are NA indNA.not <- which(!((1:nrow(activity)) %in% indNA)) # rows that are not NA plot1<-xyplot(imp.df[indNA.not, indVar]~imp.df[indNA.not, "interval"], col="blue", cex = 0.75) plot2<-xyplot(imp.df[indNA, indVar]~imp.df[indNA, "interval"], col = "red", cex = 0.75) library(latticeExtra) plot1+plot2
fit <- lm(steps ~ interval, data = activity) # fit the variables pred <- predict(fit, newdata = ic(activity)) # build predictor ### alternative using mice imp <- mice(activity[,c(1,3)], method="norm.predict", m=1, maxit=3, seed=1, print = FALSE) ### Figure 1.2 par(mfrow = c(1,2)) fmis <- c(hist(pred, breaks, plot=FALSE)$counts, 0) y <- matrix(c(fobs, fmis), ncol=2) matplot(x, y, type="s", col=c(mdc(4),mdc(5)), lwd=2, lty=1, ylim = c(0, 200), yaxs = "i", xlab="steps", ylab="Frequency") box() tp <- xyplot(imp, steps~interval, ylab="steps", xlab="interval", cex = 0.75, lex=lwd ) print(tp, newpage = FALSE, position = c(0.48,0.08,1,0.92))
# create summary table indVar <- "steps" # variable to analyze imp.df <- get.imp.df(activity, imp, indVar) # merge imputed values in original # get information before imputing data df.before <- info.imp(activity, indVar, "Before", 2) # summary before # get information after imputing data df.regression <- info.imp(imp.df, indVar, "Regression", 2) # summary after res.regre <-cbind(df.before, df.regression) # merge the two data frames res.regre
(res.regre["ses", "Before"] - res.regre["ses", "Regression"]) / res.regre["ses", "Before"] * 100
library(lattice) library(mice) par(mfrow = c(1,2)) imp <- mice(activity[,c(1,3)], method="pmm", m=1, maxit=3, seed=1, print = FALSE) lwd <- 2 nudge <- 1 max <- max(nas.complete(activity$steps)) # breaks <- seq(-100, 900, 10) breaks <- seq(0, max*1.1, 20) obs <- activity[, "steps"] mis <- imp$imp$steps[,1] fobs <- c(hist(obs, breaks, plot=FALSE)$counts, 0) fmis <- c(hist(mis, breaks, plot=FALSE)$counts, 0) x <- matrix(c(breaks-nudge, breaks+nudge), ncol=2) y <- matrix(c(fobs, fmis), ncol=2) matplot(x, y, type="s", col=c(mdc(4),mdc(5)), lwd=2, lty=1, ylim = c(0, 300), yaxs = "i", xlab="steps", ylab="Frequency") box() tp <- xyplot(imp, steps~interval, na.groups=ici(imp), ylab="steps", xlab="interval", cex = 0.75, lex=lwd ) print(tp, newpage = FALSE, position = c(0.48,0.08,1,0.92)) # print(stripplot(imp), newpage = FALSE, position = c(0.2, 0.02, 2,0.7)) # stripplot(imp)
# create summary table indVar <- "steps" # variable to analyze imp.df <- get.imp.df(activity, imp, indVar) # merge imputed values in original # get information before imputing data df.before <- info.imp(activity, indVar, "Before", 2) # summary before # get information after imputing data df.pmm <- info.imp(imp.df, indVar, "pmm", 2) # summary after res.pmm <- cbind(df.before, df.pmm) # merge the two data frames res.pmm
(res.pmm["ses", "Before"] - res.pmm["ses", "pmm"]) / res.pmm["ses", "Before"] * 100
### Figure 1.4 library(zoo) vec <- activity$steps bz <- zoo(vec) length(bz) imp <- zoo::na.locf(bz) length(bz) - length(imp) nas.count(imp) limRow <- length(imp) limRow colvec <- ifelse(is.na(vec), mdc(2), mdc(1)) # missing data coloring (Mdc) imp.df <- data.frame(imp) names(imp.df) <- "steps" imp.ind <- as.numeric(rownames(imp.df)) range(imp.ind) plot(activity$interval[imp.ind], imp.df$steps, col = colvec, # type="l", xlab="Observation", ylab="Steps") points(imp.df[,], col = colvec, pch=20, cex=1)
zoo.get.stats <- function(x, uniVar, colName, decs = 3) { tmp <- data.frame(nrows = nrow(x), median = median(x[, uniVar]), mean = mean(x[, uniVar]), sd = sd(x[, uniVar]), ses = sd(x[, uniVar]) / sqrt(sum(!is.na(x[, uniVar]))) ) tmp <- data.frame(round(t(tmp), decs)) names(tmp) <- colName return(tmp) }
# create summary table indVar <- "steps" # variable to analyze # get information before imputing data df.before <- info.imp(activity, indVar, "Before", 2) # summary before # get information after imputing data df.locf <- zoo.get.stats(imp.df, indVar, "locf", 2) # summary after res.locf <- cbind(df.before, df.locf) # merge the two data frames res.locf
(res.locf["ses", "Before"] - res.locf["ses", "locf"]) / res.locf["ses", "Before"] * 100
library(lattice) library(mice) St <- activity$steps locf <- function(x) { # last observation carried forward NonNAindex <- which(!is.na(x)) # find the index of all non-NAs firstNonNA <- min(NonNAindex) # get the lower index or 1st non-NA a <- x[firstNonNA] # assign 1st non-NA observation to pivot for (i in 1:length(x)) { # count from the 1st to the end of the vector if (is.na(x[i])) x[i] <- a # if current element is NA, then make it as pivot else a <- x[i] # else make pivot same as ith } return(x) } Sti <- locf(St) length(Sti) nas.count(Sti) colvec <- ifelse(is.na(St), mdc(2), mdc(1)) ### Figure 1.4 limRow <- 12000 plot(Sti[1:limRow], col = colvec, type="l", xlab="Observation", ylab="Steps") points(Sti[1:limRow], col = colvec, pch=20, cex=1)
LOCF <- function(x) { # Last Observation Carried Forward (for a left to right series) LOCF <- max(which(!is.na(x))) # the location of the Last Observation to Carry Forward x[LOCF:length(x)] <- x[LOCF] return(x) } Sti <- LOCF(St) length(Sti) nas.count(Sti) par(mfrow=c(1,1)) limRow <- 17568 colvec <- ifelse(is.na(St), mdc(2), mdc(1)) # missing data coloring (Mdc) plot(Sti[1:limRow], col = colvec, type="l", xlab="Observation", ylab="Steps") points(Sti[1:limRow], col = colvec, pch=20, cex=1)
na.locf <- function(x) { v <- !is.na(x) c(NA, x[v])[cumsum(v)+1] } Sti <- na.locf(St) length(Sti) nas.count(Sti) par(mfrow=c(1,1)) limRow <- 12000 colvec <- ifelse(is.na(St), mdc(2), mdc(1)) # missing data coloring (Mdc) plot(Sti[1:limRow], col = colvec, type="l", xlab="Observation", ylab="Steps") points(Sti[1:limRow], col = colvec, pch=20, cex=1)
mean(St, na.rm = TRUE)
# Standard Error of only the elements that are not NA sum(!is.na(St)) sd(St, na.rm = TRUE) / sqrt(sum(!is.na(St)))
mean(Sti, na.rm = TRUE)
# Standard Error of only the elements that are not NA sum(!is.na(Sti)) sd(Sti, na.rm = TRUE) / sqrt(sum(!is.na(Sti)))
library(reshape) library(RepDataPeerAssessment1) data(activity) #activity$interval <- as.factor(activity$interval) names(activity) <- c("value", "date", "interval") actbycol <- cast(activity, date ~ interval) actbycol.nodate <- actbycol[, 2:289] # remove the date column for now
act <- sapply(actbycol.nodate, zoo::na.locf) # apply LOCF to numerical intervals df <- data.frame(act) # create data frame names(df) <- colnames(act) # rename # rebuild to old activity structure actbycol.shifted <- actbycol[2:61, 1] # get only the date column but not row #1 actbycol.new <- cbind(actbycol.shifted, df) #rownames(actbycol.new) <- NULL
names(actbycol.new)[1] <- "date"
activity.new <- melt(actbycol.new, id = "date") # melt DF as original names(activity.new) <- c("date", "interval", "steps") # rename vars
mean(activity.new$steps)
sd(activity.new$steps) / sqrt(sum(!is.na(activity.new$steps)))
This number is not better than the one we obtained by doing the LOFC replacement in the whole steps column above.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.