```{r}
knitr::opts_chunk$set(echo = TRUE)
```
I completed 15/15 problems
The final grades are: A(90s) B(80s) C(60s and 70s) D(50s) F(<50) with no curving.
The final grade is based on multiple components of the course.
```{r}
ddt <- read.csv("DDT.csv")
# Two-way table of counts by river and species
myData <- with(ddt, table(RIVER, SPECIES))
# Convert MILE to a factor, then to numeric codes, for use as a color index
mile <- with(ddt, as.numeric(factor(MILE)))
# Conditioning plot of LENGTH vs WEIGHT for each RIVER/SPECIES combination,
# with points colored by mile marker
coplot(LENGTH ~ WEIGHT | RIVER * SPECIES, data = ddt, pch = 1, col = mile)
```
The coplot shows LENGTH versus WEIGHT for CCATFISH in the FCM, LCM, and SCM rivers, with points colored by mile marker.
The line `mile <- with(ddt, as.numeric(factor(MILE)))` converts the MILE column of ddt to a factor and then to its numeric codes.
`unique()` returns a vector of the unique values in `mile`, and `length()` then returns how many elements are in the vector returned by `unique()`.
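Combining the two gives the number of distinct mile markers (a one-liner using the `mile` vector created above):

```{r}
# Number of distinct mile values in the MILE column
length(unique(mile))
```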
SMBUFFALO and LMBASS are only found in the TRM river
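The river-by-species table built in the first chunk confirms this; printing it shows where each species was caught:

```{r}
# Counts of each species by river; SMBUFFALO and LMBASS appear only under TRM
myData
```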
```{r}
# Mean DDT of CCATFISH found in the FCM river
x <- subset(ddt, RIVER == "FCM" & SPECIES == "CCATFISH")
mean(x$DDT)
```
MS 1.14 - pg 8
a. quantitative
b. quantitative
c. qualitative
d. quantitative
e. qualitative
f. quantitative
g. qualitative
MS - pg 12, 13
Simple Random Sampling: Every sample of n experimental units from the population has an equal chance of being selected; in practice a random number generator or random number table is used to pick the units.
Stratified Random Sampling: Used when the experimental units of a population can be separated into strata in which the characteristics of the experimental units are more similar within their respective strata than across strata; units are then randomly sampled within each stratum.
Cluster Sampling: Used to sample natural groupings (clusters) of experimental units first, and then sample the units within each cluster.
Systematic Sampling: Selects every $n^{th}$ experimental unit from a list of all experimental units (a short R sketch of all four schemes follows).
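As a quick illustration, here is a minimal sketch of the four schemes in R. It reuses the `ddt` data frame loaded earlier; the choice of `RIVER` as the stratum/cluster variable and the values of `n` and `k` are illustrative assumptions, not part of the exercise.

```{r}
n <- 5  # illustrative sample size

# Simple random sampling: n rows chosen at random
ddt[sample(nrow(ddt), n), ]

# Stratified random sampling: sample within each RIVER stratum
do.call(rbind, lapply(split(ddt, ddt$RIVER),
                      function(s) s[sample(nrow(s), min(n, nrow(s))), ]))

# Cluster sampling: pick one RIVER cluster at random, then sample within it
cl <- sample(unique(ddt$RIVER), 1)
clus <- ddt[ddt$RIVER == cl, ]
clus[sample(nrow(clus), min(n, nrow(clus))), ]

# Systematic sampling: every k-th row from the list
k <- 10  # illustrative step size
ddt[seq(1, nrow(ddt), by = k), ]
```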
MS 1.15 - pg 15
```{r}
mtbe <- read.csv("MTBE.csv")
# Draw 5 of the 223 wells at random, without replacement
ind <- sample(1:223, 5, replace = FALSE)
mtbe[ind, ]
# Drop rows with missing values, then keep only bedrock-aquifer wells
mtbeo <- na.omit(mtbe)
depth <- mtbeo[mtbeo$Aquifier == "Bedrock", ]$Depth
paste0("The mean depth is: ", mean(depth))
```
MS 1.16 - pg 15
```{r}
eq <- read.csv("EARTHQUAKE.csv")
# Select 30 aftershock records at random
eq[sample(nrow(eq), 30), ]
```
The above code gives 30 random rows containing the aftershock data
```{r}
# Time-series plot of the aftershock magnitudes
plot(ts(eq$MAGNITUDE))
```
paste0("The median of the entire eq dataframe is: ", median(eq$MAGNITUDE))
Designed experiment
All fish in the Tennessee River and its tributaries
Species, River
MS 2.1 - pg 26
Bar graph
None, Both, Legs Only, Wheels Only
Legs only
```{r}
library(qcc)
# Frequencies of each limb configuration
freq <- c(15, 8, 63, 20)
names(freq) <- c("None", "Both", "LegsO", "WheelsO")
pareto.chart(freq, ylab = "frequency", col = heat.colors(length(freq)))
```
MS 2.4 - pg 27
```{r}
# Pie chart of security flaws by product
flaws <- c(32, 12, 6)
products <- c("Windows", "Office", "Explorer")
pie(flaws, labels = products, col = 1:3)
```
Explorer has the lowest proportion of security issues.
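A quick check of the relative frequencies from the same `flaws` vector backs this up:

```{r}
# Proportion of security flaws by product
round(flaws / sum(flaws), 2)
```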
```{r}
# Expand the category labels to one entry per observed flaw
freq <- c(6, 8, 22, 3, 11)
categories <- c("Denial of Serv", "Info Disc", "Remote exec", "Spoofing", "Priv Elev")
l <- rep(categories, freq)

pareto <- function(x, mn = "Pareto barplot", ...) {  # x is a vector of category labels
  x.tab <- table(x)
  # Sort categories from most to least frequent
  xx.tab <- sort(x.tab, decreasing = TRUE, index.return = FALSE)
  cs <- cumsum(as.vector(xx.tab))  # cumulative counts
  lenx <- length(x.tab)
  bp <- barplot(xx.tab, ylim = c(0, max(cs)), las = 2)
  lb <- seq(0, cs[lenx], l = 11)
  # Right-hand axis showing cumulative percentage
  axis(side = 4, at = lb, labels = paste(seq(0, 100, length = 11), "%", sep = ""),
       las = 1, line = -1, col = "Blue", col.axis = "Red")
  # Draw the cumulative-frequency line across the bars
  for (i in 1:(lenx - 1)) {
    segments(bp[i], cs[i], bp[i + 1], cs[i + 1], col = i, lwd = 2)
  }
  title(main = mn, ...)
}
pareto(l)
```
They should focus on remote code execution, which accounts for the largest share of the flaws.
MS 2.10 - pg 28
```{r}
swd <- read.csv("SWDEFECTS.csv", header = TRUE)
library(plotrix)
# Relative frequency of defective vs. non-defective modules
tab <- table(swd$defect)
rtab <- tab / sum(tab)
round(rtab, 2)
pie3D(rtab, labels = c("OK", "Defective"), main = "pie plot of SWD")
```
Though there are observations of defective software code, the likelihood of a piece of software being defective is relatively low.
MS 2.72 - pg 70
```{r}
volt <- read.csv("VOLTAGE.csv")
volt <- subset(volt, LOCATION == "OLD")
v <- volt$VOLTAGE
# Say we want 9 bins: what is each width?
inc <- (10.6 - 8) / 9
# Make the class breaks
cl <- seq(8.0, 10.6, by = inc)
# Sort the voltages
sort(v)
# Cut the voltages into the 9 classes
Vc <- cut(v, breaks = cl, ord = TRUE)
# Tabulate and plot relative frequencies as a histogram
tab <- table(Vc)
barplot(tab / sum(tab), space = 0, col = rainbow(3),
        main = "Histogram of voltages",
        ylab = "Rel. Frequency", xlab = "Voltage")
```
```{r}
stem(v)
```
To me, the histogram offers a quicker and more pleasant interpretation of the data.
```{r}
# Same histogram construction, now for the NEW location
volt <- read.csv("VOLTAGE.csv")
volt <- subset(volt, LOCATION == "NEW")
v <- volt$VOLTAGE
inc <- (10.6 - 8) / 9
cl <- seq(8.0, 10.6, by = inc)
sort(v)
Vc <- cut(v, breaks = cl, ord = TRUE)
tab <- table(Vc)
barplot(tab / sum(tab), space = 0, col = rainbow(3),
        main = "Histogram of voltages",
        ylab = "Rel. Frequency", xlab = "Voltage")
```
If larger readings are better, then the "OLD" location has more large readings than the "NEW" one, so it might be best to stay at the OLD location.
OLD
```{r}
# Mode(): returns the most frequently occurring value(s) in a vector
Mode <- function(x) {
  xtab <- table(x)
  modes <- xtab[max(xtab) == xtab]
  mag <- as.numeric(modes[1])  # in case of multiple modes, this is safer
  themodes <- names(modes)
  mout <- list(themodes = themodes, modeval = mag)
  return(mout)
}

volt <- read.csv("VOLTAGE.csv")
v_old <- subset(volt, LOCATION == "OLD")
v_new <- subset(volt, LOCATION == "NEW")

mean(v_old$VOLTAGE)
median(v_old$VOLTAGE)
Mode(v_old$VOLTAGE)  # Mode() expects a vector, so pass the VOLTAGE column
```
NEW
```{r}
mean(v_new$VOLTAGE)
median(v_new$VOLTAGE)
Mode(v_new$VOLTAGE)
```
Interpretation: the data at the old location appears to be skewed left, while the data at the new location appears normal, or mound-shaped. For the OLD site we should use the median, and for the NEW site we should use the mean.
```{r}
# OLD LOCATION: z-score of a 10-volt reading
z_old <- (10 - mean(v_old$VOLTAGE)) / sd(v_old$VOLTAGE)
z_old
```
```{r}
# NEW LOCATION: z-score of a 10-volt reading
z_new <- (10 - mean(v_new$VOLTAGE)) / sd(v_new$VOLTAGE)
z_new
```
A reading of 10 is more likely to occur at the OLD location, because that voltage deviates less from the OLD mean.
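That comparison can be checked directly with the two z-scores just computed:

```{r}
# TRUE when a 10-volt reading is closer, in standard deviations, to the OLD mean
abs(z_old) < abs(z_new)
```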
```{r}
boxplot(v_old$VOLTAGE)
```
It looks like there is at least one outlier.
```{r}
# z-scores for every reading at the OLD location
z <- (v_old$VOLTAGE - mean(v_old$VOLTAGE)) / sd(v_old$VOLTAGE)
z
```
Using |z| > 3 as the outlier rule, yes, there is 1 outlier.
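A one-line count makes the rule explicit:

```{r}
# Number of OLD-location z-scores exceeding 3 in absolute value
sum(abs(z) > 3)
```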
```{r}
boxplot(v_new$VOLTAGE)
```
It seems there are no outliers.
```{r}
# z-scores for every reading at the NEW location
z <- (v_new$VOLTAGE - mean(v_new$VOLTAGE)) / sd(v_new$VOLTAGE)
z
```
No outliers according to the z-scores: none exceed 3 in absolute value.
names = c("Old", "New") boxplot(v_old$VOLTAGE, v_new$VOLTAGE, names= names, horizontal = F)
MS 2.73 - pg 70
According to pg 47, under the Empirical Rule, to capture approximately 95% of the data we use $\bar{y} \pm 2s$.
```{r}
rp <- read.csv("ROUGHPIPE.csv")
ybar <- mean(rp$ROUGH)
# Empirical-rule interval: ybar +/- 2s
left <- ybar - 2 * sd(rp$ROUGH)
right <- ybar + 2 * sd(rp$ROUGH)
paste0("Interval is: (", round(left, 4), ",", round(right, 4), ")")
```
MS 2.80 - pg 72
```{r}
gobi <- read.csv("GOBIANTS.csv")
mean(gobi$AntSpecies)
median(gobi$AntSpecies)
paste("There are two modes: 4,5")
```
Interpretation
We see that the mean is 12.81818, which gives us the "center" of the distribution.
When the data is sorted in ascending order on the AntSpecies column, the middle value (since the number of entries in this case is odd) is 5.
The mode indicates the value that appears most often. In this case, 4 and 5 are tied, so we have two modes: of the 11 sites measured, the most common numbers of distinct species found were 4 and 5.
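A quick tabulation confirms the tie:

```{r}
# Frequency of each species count; the most frequent values are the modes
tab <- table(gobi$AntSpecies)
names(tab)[tab == max(tab)]
```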
```{r}
hist(gobi$AntSpecies)
```
As we can see from the histogram, the data is skewed to the right. According to the textbook, the mean is sensitive to skewness; thus, in this case we should use the median.
```{r}
# Plant coverage at Dry Steppe sites
ds <- gobi$PlantCov[gobi$Region == "Dry Steppe"]
mean(ds)
median(ds)
```
The mode is 40
```{r}
# Plant coverage at Gobi Desert sites
gd <- gobi$PlantCov[gobi$Region == "Gobi Desert"]
mean(gd)
median(gd)
```
The mode is 30
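Both mode claims can be verified with the `Mode()` helper defined earlier:

```{r}
Mode(ds)  # Dry Steppe plant coverage; expected mode 40
Mode(gd)  # Gobi Desert plant coverage; expected mode 30
```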
Yes, because there is a relatively drastic difference between the means and medians of the two regions.
MS 2.84 - pg 74
```{r}
g <- read.csv("GALAXY2.csv")
hist(x = g$VELOCITY)
```
According to the above histogram, the double-cluster theory seems valid: the velocities appear to be distributed in two distinct groups rather than a single mound. Yes.
Leftmost cluster, A1775A:
```{r}
# Mean and standard deviation of velocities in the left cluster
mean(g$VELOCITY[g$VELOCITY < 21000])
sd(g$VELOCITY[g$VELOCITY < 21000])
```
Rightmost cluster, A1775B:
```{r}
# Mean and standard deviation of velocities in the right cluster
mean(g$VELOCITY[g$VELOCITY > 21000])
sd(g$VELOCITY[g$VELOCITY > 21000])
```
It will probably belong to A1775A, because the observation lies within 2 standard deviations of the A1775A mean but more than 4 standard deviations away from the A1775B mean.
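A sketch of that comparison in code (the value of `v_obs` below is a hypothetical placeholder for the new galaxy's measured velocity, which is not reproduced here):

```{r}
v_obs <- 20000  # hypothetical placeholder velocity for the new observation
vA <- g$VELOCITY[g$VELOCITY < 21000]  # A1775A
vB <- g$VELOCITY[g$VELOCITY > 21000]  # A1775B
(v_obs - mean(vA)) / sd(vA)  # z-score relative to A1775A
(v_obs - mean(vB)) / sd(vB)  # z-score relative to A1775B
```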
```{r}
ddt <- read.csv("DDT.csv")
library(ggplot2)
# Boxplots of fish length by river, filled by species
p <- ggplot(ddt, aes(x = RIVER, y = LENGTH, fill = SPECIES)) +
  geom_boxplot() +
  ggtitle(label = "Adam Gracy")
p
```