knitr::opts_chunk$set(echo = TRUE, message = FALSE, dev="svg", out.height = "500px", strip.white=TRUE ,collapse = TRUE, dev.args = list(bg="transparent") ) library(ggplot2) library(vegalite) library(DT) library(readxl) library(data.table) vegalite <- vegalite() %>% cell_size(700, 500) %>% legend_color(orient="left") #vegalite <- vegalite() ggopts<-theme_bw(base_size=24)+theme( panel.background= element_rect(fill="transparent", colour="transparent"), plot.background= element_rect(fill="transparent", colour="transparent"), panel.border = element_rect(fill = NA, colour = NA), axis.ticks = element_line(colour = "white"), text=element_text(colour="white") ) Mode <- function(x) { ux <- unique(x) ux[which.max(tabulate(match(x, ux)))] } myFile<-"Predicting Age.xlsx"
We have the results from our survey.
basedata<-setDT(read_excel(myFile)) results<-basedata[Age<100&Age>Experience,]
Our sample has r nrow(basedata)
records but with basic data cleansing we will be working with r nrow(results)
.
DT::datatable(results)
DT::datatable(results,style="bootstrap",options=list(dom = "lftir" ))
ggplot(results)+ggopts+ aes(x = Age, y=..count..)+ geom_histogram(bins = 10, fill="white", colour="white")
vegalite %>% add_data(results) %>% encode_x("Experience","quantitative") %>% encode_y("*", "quantitative", aggregate="count") %>% bin_x(maxbins=10) %>% mark_bar()
vegalite %>% add_data(results) %>% encode_x("Field","ordinal") %>% encode_y("*", "quantitative", aggregate="count") %>% mark_bar()
vegalite %>% add_data(results) %>% encode_x("Experience","quantitative") %>% encode_y("Age", "quantitative") %>% encode_color("Field", "nominal") %>% mark_point()
testPer<-.75 inTest<-sample(1:nrow(results),nrow(results)*testPer) training<-results[inTest,] holdout<-results[!inTest,]
vegalite %>% add_data(training) %>% encode_x("Experience","quantitative") %>% encode_y("Age", "quantitative") %>% encode_color("Field", "nominal") %>% mark_point()
vegalite %>% add_data(holdout) %>% encode_x("Experience","quantitative") %>% encode_y("Age", "quantitative") %>% encode_color("Field", "nominal") %>% mark_point()
We could take some measure of central tendency to predict the age of attendees.
averages<-training[,.(Mean=floor(mean(Age)) ,Median=floor(median(Age)) ,Mode=Mode(Age) )] knitr::kable(averages)
holdout[,colnames(averages):=averages] holdout.m<-melt(holdout, measure.vars = c("Age",colnames(averages))) vegalite %>% add_data(holdout.m) %>% encode_x("Name", "ordinal") %>% encode_y("value", "quantitative") %>% encode_color("variable", "nominal")%>% mark_point()
holdout.lse<-melt(holdout, measure.vars = colnames(averages)) holdout.lse[,Error:=(Age-value)^2] knitr::kable(holdout.lse[,.(LSE=sum(Error)), variable][order(LSE)])
$y=mx+c$
expLM<-lm(Age~Experience, training) summary(expLM)
training[,expLMres:=expLM$fitted] ggplot(training, aes(x=Experience, y=Age))+ geom_point()+ geom_line(aes(y=expLMres),colour="blue")+ theme_minimal()
holdout[,expLMres:=predict(expLM,holdout)] holdout.m<-melt(holdout, measure.vars = c("Age","expLMres")) vegalite %>% add_data(holdout.m) %>% encode_x("Name", "ordinal") %>% encode_y("value", "quantitative") %>% encode_color("variable", "nominal")%>% mark_point()
holdout.lse<-melt(holdout, measure.vars = c("expLMres",colnames(averages))) holdout.lse[,Error:=(Age-value)^2] knitr::kable(holdout.lse[,.(LSE=sum(Error)), variable][order(LSE)])
fieldLM<-lm(Age~Experience + Field, training) summary(fieldLM)
training[,fieldLMres:=fieldLM$fitted] ggplot(training, aes(x=Experience, y=Age, group=Field, colour=Field))+ geom_point()+ geom_line(aes(y=fieldLMres, group=Field),colour="blue")+ facet_wrap(~Field)+ theme_minimal()
holdout[,fieldLMres:=predict(fieldLM,holdout)] holdout.m<-melt(holdout, measure.vars = c("Age","fieldLMres")) vegalite %>% add_data(holdout.m) %>% encode_x("Name", "ordinal") %>% encode_y("value", "quantitative") %>% encode_color("variable", "nominal")%>% mark_point()
holdout.lse<-melt(holdout, measure.vars = c("fieldLMres","expLMres",colnames(averages))) holdout.lse[,Error:=(Age-value)^2] knitr::kable(holdout.lse[,.(LSE=sum(Error)), variable][order(LSE)])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.