# knitr chunk options for the rendered document: echo the R code in the output.
knitr::opts_chunk$set(echo = TRUE)

I completed 15/15 problems

Question #1

The final grades are: A(90s) B(80s) C(60s and 70s) D(50s) F(<50) with no curving.

The final grade will be a consideration of multiple parts of the course:

Question #2

(a)

# Read the DDT fish dataset (used by several later chunks as `ddt`).
ddt <- read.csv("DDT.csv")

# Cross-tabulate catch counts by river and species.
myData <- with(ddt, table(RIVER,SPECIES))


# Numeric codes for the distinct mile markers.
# NOTE(review): `mile` is not used by the coplot call below — confirm intent.
mile=with(ddt, as.numeric(factor(MILE)))
# Conditioning plot: LENGTH vs WEIGHT panelled by every RIVER x SPECIES pair.
coplot(LENGTH~WEIGHT | RIVER * SPECIES, data = ddt, pch = 1, col=1:17)

(b)

Shows CCATFISH LENGTH v WEIGHT in FCM, LCM, and SCM rivers by mile marker.

(c)

Creates a factor vector m with data from the MILE column of ddt, as numerical data.

(d)

unique() returns a vector of the unique values in m, and then length() returns how many elements are in the vector returned by unique().

(e)

SMBUFFALO and LMBASS are only found in the TRM river

(f)

# Mean DDT concentration for channel catfish caught in the FCM river.
x <- subset(ddt, RIVER == "FCM" & SPECIES == "CCATFISH")
mean(x[["DDT"]])

Question #3

MS 1.14 - pg 8

(a)

quantitative

(b)

quantitative

(c)

qualitative

(d)

quantitative

(e)

qualitative

(f)

quantitative

(g)

qualitative

Question #4

MS pg 12,13

(a)

(b)

Simple Random Sampling: Uses a random number generator to create a random number table that correlates with the data set being sampled from.

Stratified Random Sampling: Used when experimental units of a population can be separated into strata in which the characteristics of the experimental units are more similar within their respective strata than across strata.

Cluster Sampling: Used to sample natural groupings (clusters) of experimental units first, and then sample the units within each cluster.

Systematic Sampling: Selects every $n^{th}$ experimental unit from a list of all experimental units.

Question #5

MS 1.15 - pg 15

# Read the MTBE well data and display 5 randomly-sampled rows.
mtbe <- read.csv("MTBE.csv")
# Sample row indices without replacement; use nrow() instead of the
# hard-coded 223 so the code keeps working if the file changes size.
ind <- sample(seq_len(nrow(mtbe)), 5, replace = FALSE)
mtbe[ind, ]

# Drop every row that contains an NA.
mtbeo <- na.omit(mtbe)

# Depths of wells drilled into bedrock ("Aquifier" is the column name as
# spelled in the CSV file — do not "correct" it here).
depth <- mtbeo[mtbeo$Aquifier == "Bedrock", ]$Depth
paste0("The mean depth is: ",mean(depth))

Question #6

MS 1.16 - pg 15

Book question

# Read the aftershock dataset and show 30 randomly-chosen rows.
eq <- read.csv("EARTHQUAKE.csv")
eq[sample(seq_len(nrow(eq)), 30), ]

The above code gives 30 random rows containing the aftershock data

(ai)

# Time-series plot of aftershock magnitudes in recorded order.
plot(ts(eq$MAG)) 

(aii)

# NOTE(review): the plot above reads eq$MAG but this reads eq$MAGNITUDE —
# one of the two column names is likely wrong; confirm against EARTHQUAKE.csv.
paste0("The median of the entire eq dataframe is: ", median(eq$MAGNITUDE))

Question #7

(a)

Designed experiment

(b)

All fish in Tennessee river and its tributaries

(c)

Species, River

Question #8

MS 2.1 - pg 26

(a)

Bar graph

(b)

None, Both, Legs Only, Wheels Only

(c)

Legs only

( d & e)

# Pareto chart of the robotic-limb design categories (qcc package).
library(qcc)
# Frequencies of each design, as a named vector.
freq <- c(None = 15, Both = 8, LegsO = 63, WheelsO = 20)

pareto.chart(freq, ylab = "frequency", col = heat.colors(length(freq)))

Question #9

MS 2.4 - pg 27

(a)

# Reported security-flaw counts for three Microsoft products.
flaws <- c(32, 12, 6)
products <- c("Windows", "Office", "Explorer")

# Pie chart showing each product's share of the flaws.
pie(flaws, col = 1:3, labels = products)

(b)

# Security-bulletin counts per category.
freq <- c(6, 8, 22, 3, 11)
categories <- c("Denial of Serv", "Info Disc", "Remote exec", "Spoofing", "Priv Elev")
# Expand the counts into one observation per bulletin (raw categorical data).
l <- rep(categories, times = freq)


# Pareto barplot: category frequencies in decreasing order, with a
# cumulative-count line and a right-hand percentage axis.
#
# Args:
#   x:   a vector of raw categorical observations (tabulated internally).
#   mn:  main title for the plot.
#   ...: further arguments forwarded to title().
pareto <- function(x, mn = "Pareto barplot", ...) {  # x is a vector
  x.tab <- table(x)
  # Sort categories from most to least frequent.
  # (The original passed index.return = FALSE, which is the default.)
  xx.tab <- sort(x.tab, decreasing = TRUE)
  cs <- cumsum(as.vector(xx.tab))
  lenx <- length(x.tab)
  bp <- barplot(xx.tab, ylim = c(0, max(cs)), las = 2)
  # Right-hand axis: 0%..100% in 10% steps, scaled to the cumulative total.
  lb <- seq(0, cs[lenx], l = 11)
  axis(side = 4, at = lb, labels = paste0(seq(0, 100, length = 11), "%"),
       las = 1, line = -1, col = "Blue", col.axis = "Red")
  # Connect successive cumulative counts across bar midpoints.
  # seq_len() fixes the original 1:(lenx-1), which evaluated to c(1, 0)
  # (and then errored) when only one category was present.
  for (i in seq_len(lenx - 1)) {
    segments(bp[i], cs[i], bp[i + 1], cs[i + 1], col = i, lwd = 2)
  }
  title(main = mn, ...)
}

# Draw the Pareto chart of the bulletin categories built above.
pareto(l)

They should focus on remote code execution

Question #10

MS 2.10 - pg 28

# Software-defect data: tabulate the defect indicator and draw a 3-D pie.
library(plotrix)
swd <- read.csv("SWDEFECTS.csv", header = TRUE)
tab <- table(swd$defect)
# Convert counts to relative frequencies; print them to 2 decimal places.
rtab <- tab / sum(tab)
round(rtab, 2)
pie3D(rtab, labels = list("OK", "Defective"), main = "pie plot of SWD")

Though there are observations of defective software code, the likelihood of a piece of software being defective is relatively low.

Question #11

MS 2.72 - pg 70

(a) RF Histogram of Old Process

# (a) Relative-frequency histogram of OLD-process voltages, 9 classes.
volt <- read.csv("VOLTAGE.csv")

# Keep only the readings taken at the old location.
volt <- subset(volt, LOCATION == "OLD")
v <- volt$VOLTAGE
# Nine equal-width bins spanning 8.0 to 10.6 volts.
inc <- (10.6 - 8) / 9
# Class breaks for the bins.
cl <- seq(8.0, 10.6, by = inc)
# Echo the sorted voltages (printed output only; the result is not reused).
sort(v)
# Bin each voltage into its class interval. `ordered_result` spelled out in
# full: the original relied on partial argument matching via `ord=`.
Vc <- cut(v, breaks = cl, ordered_result = TRUE)

# Class frequencies.
tab <- table(Vc)

# Relative-frequency barplot drawn as a histogram (space = 0).
barplot(tab / sum(tab), space = 0, col = rainbow(3), main = "Histogram of voltages",
        ylab = "Rel. Frequency", xlab = "Voltage")

(b) Stem() of Old Process

# Stem-and-leaf display of the OLD-process voltages (v from the chunk above).
stem(v)

To me, the histogram offers quicker and more pleasant interpretation of the data presented.

(c) Make New Process histogram

# (c) Relative-frequency histogram of NEW-process voltages, same 9 classes
# as part (a) so the two plots are directly comparable.
volt <- read.csv("VOLTAGE.csv")

# Keep only the readings taken at the new location.
volt <- subset(volt, LOCATION == "NEW")
v <- volt$VOLTAGE
# Nine equal-width bins spanning 8.0 to 10.6 volts.
inc <- (10.6 - 8) / 9
# Class breaks for the bins.
cl <- seq(8.0, 10.6, by = inc)
# Echo the sorted voltages (printed output only; the result is not reused).
sort(v)
# Bin each voltage into its class interval. `ordered_result` spelled out in
# full: the original relied on partial argument matching via `ord=`.
Vc <- cut(v, breaks = cl, ordered_result = TRUE)

# Class frequencies.
tab <- table(Vc)

# Relative-frequency barplot drawn as a histogram (space = 0).
barplot(tab / sum(tab), space = 0, col = rainbow(3), main = "Histogram of voltages",
        ylab = "Rel. Frequency", xlab = "Voltage")

(d) Which is better?

If larger readings are better, then the "OLD" location has more large readings than the "NEW". So it might be best to stay at the OLD location.

(e) Mean/Median/Mode

OLD

# Compute the statistical mode(s) of a vector.
#
# Returns a list with:
#   themodes - the most frequent value(s), as the character names produced
#              by table().
#   modeval  - the frequency (count) of the mode.
Mode <- function(x) {
  counts <- table(x)
  top <- counts[counts == max(counts)]
  list(
    themodes = names(top),
    modeval = as.numeric(top[1]) # first count is safe even with ties
  )
}
# Read the voltage data and split it by manufacturing location.
volt <- read.csv("VOLTAGE.csv")
v_old <- subset(volt, LOCATION == "OLD")
v_new <- subset(volt, LOCATION == "NEW")
# Centre measures for the OLD location.
mean(v_old$VOLTAGE)
median(v_old$VOLTAGE)
# BUG FIX: pass the VOLTAGE column, not the whole data frame — table() on a
# data frame cross-tabulates all columns, which is not the mode of VOLTAGE.
Mode(v_old$VOLTAGE)

NEW

# Centre measures for the NEW location.
mean(v_new$VOLTAGE)
median(v_new$VOLTAGE)
# BUG FIX: compute the mode of the VOLTAGE column, not of the data frame.
Mode(v_new$VOLTAGE)

Interpretation: The data at the old location seems to be skewed left, while the data at the new location seems to be normal, or mound shaped. For the OLD site we should use the median, and for the NEW site we should use the mean.

(f)

#OLD LOCATION
# OLD LOCATION: z-score of a hypothetical 10 V reading.
z <- with(v_old, (10 - mean(VOLTAGE)) / sd(VOLTAGE))
z

(g)

#NEW LOCATION
# NEW LOCATION: z-score of a hypothetical 10 V reading.
z <- with(v_new, (10 - mean(VOLTAGE)) / sd(VOLTAGE))
z

(h)

More likely to occur at the OLD location because that voltage deviates less from the mean

(i)

# BUG FIX: boxplot the VOLTAGE column — v_old is a data frame, and boxplot()
# on it would also try to plot the character LOCATION column (part (m)
# already uses v_old$VOLTAGE).
boxplot(v_old$VOLTAGE)

It looks like there is one outlier.

(j)

# z-score of every OLD-location voltage reading (outlier check via |z| > 3).
z <- with(v_old, (VOLTAGE - mean(VOLTAGE)) / sd(VOLTAGE))
z

If z > 3 is an outlier, then yes there is 1 outlier

(k)

# BUG FIX: boxplot the VOLTAGE column — v_new is a data frame, and boxplot()
# on it would also try to plot the character LOCATION column.
boxplot(v_new$VOLTAGE)

No outliers it seems

(L)

# z-score of every NEW-location voltage reading (outlier check via |z| > 3).
z <- with(v_new, (VOLTAGE - mean(VOLTAGE)) / sd(VOLTAGE))
z

No outliers according to zscores

(m)

# Side-by-side boxplots of the two locations.
# Renamed the local `names` -> `box_labels`: `names` shadows base::names().
box_labels <- c("Old", "New")
boxplot(v_old$VOLTAGE, v_new$VOLTAGE, names = box_labels, horizontal = FALSE)

Question #12

MS 2.73 - pg 70

According to pg 47, under the "Empirical rule" to capture 95% of data we use ybar + or - 2s

# Empirical rule: ybar +/- 2s captures roughly 95% of mound-shaped data.
rp <- read.csv("ROUGHPIPE.csv")

ybar <- mean(rp$ROUGH)
two_s <- 2 * sd(rp$ROUGH)

left <- ybar - two_s
right <- ybar + two_s

paste0("Interval is: (", round(left, 4), ",", round(right, 4), ")")

Question #13

MS 2.80 - pg 72

(a)

# Read the Gobi ants data (gobi is reused by the chunks below) and report
# the centre measures of the ant-species counts.
gobi <- read.csv("GOBIANTS.csv")
mean(gobi$AntSpecies)
median(gobi$AntSpecies)

# Modes found by inspecting the frequency table (two values tie).
paste("There are two modes: 4,5")

Interpretation

(b)

# Histogram of ant-species counts; used to judge skewness in the answer below.
hist(gobi$AntSpecies)

As we can see by the histogram, the data is skewed to the right. According to the textbook, mean is sensitive to skewness. Thus, in this case we should use the median

(c)

# Plant cover percentages at the Dry Steppe sites.
dry_mask <- gobi$Region == "Dry Steppe"
ds <- gobi$PlantCov[dry_mask]
mean(ds)
median(ds)

The mode is 40

(d)

# Plant cover percentages at the Gobi Desert sites.
desert_mask <- gobi$Region == "Gobi Desert"
gd <- gobi$PlantCov[desert_mask]
mean(gd)
median(gd)

The mode is 30

(e)

Yes. Because there is a relatively drastic difference in the values of the mean and medians between the two regions.

Question #14

MS 2.84 - pg 74

(a)

# Read the galaxy velocity data (g is reused by part (c) below).
g <- read.csv("GALAXY2.csv")

# Histogram of velocities; bimodality here supports the double-cluster theory.
hist(x=g$VELOCITY)

(b)

According to the above histogram, it would seem that the double cluster theory is valid because the distribution appears bimodal. It looks to me as if the data is distributed in two differing ways. Yes.

(c)

Left most cluster A1775A

# Mean and SD of the left cluster (A1775A): velocities below 21,000 km/s.
# BUG FIX: index the VELOCITY column explicitly — `g[cond, ]` yields a data
# frame whenever the file has more than one column, and mean()/sd() on a
# data frame do not return the intended scalar.
a_vel <- g$VELOCITY[g$VELOCITY < 21000]
mean(a_vel)
sd(a_vel)

Right most cluster A1775B

# Mean and SD of the right cluster (A1775B): velocities above 21,000 km/s.
# BUG FIX: index the VELOCITY column explicitly — `g[cond, ]` yields a data
# frame whenever the file has more than one column, and mean()/sd() on a
# data frame do not return the intended scalar.
b_vel <- g$VELOCITY[g$VELOCITY > 21000]
mean(b_vel)
sd(b_vel)

(d)

It will probably belong to A1775A because the observation lies within 2 standard deviations. But the observation lies more than 4 standard deviations away when comparing it to A1775B.

Question #15

# Boxplots of fish LENGTH by RIVER, filled by SPECIES (ggplot2).
library(ggplot2)
ddt <- read.csv("DDT.csv")

p <- ggplot(ddt, aes(x = RIVER, y = LENGTH, fill = SPECIES)) +
  geom_boxplot() +
  ggtitle(label = "Adam Gracy")
p


agracy2246/MATH4753grac0009 documentation built on April 26, 2020, 9:39 a.m.