# visualisation: Visualisation

library(randomForest)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(caret)
library(tree)

# Load the training data and collapse the one-hot encoded soil-type and
# wilderness-area indicator columns into two integer-coded columns.
forest <- read.csv("../Data/train.csv")
forest$Id <- NULL

# With Id removed, columns 11-14 hold the wilderness-area indicators and
# columns 15-54 hold the soil-type indicators.
soil <- forest[, 15:54]
area <- forest[, 11:14]
forest <- forest[, -(11:54)]

# Each row must have exactly one active indicator per group; fail early
# otherwise instead of producing a silently wrong code.
stopifnot(
  all(rowSums(soil == 1) == 1),
  all(rowSums(area == 1) == 1)
)

# Code each row by the POSITION of its active indicator column. The levels
# are fixed to the full column range so the integer code is the indicator
# index itself, even when some categories never occur in this file.
# (Using `labels = 1:38` here compacted the codes and made them disagree
# with the 1:40 coding applied to the test file.)
fact <- factor(
  max.col(as.matrix(soil), ties.method = "first"),
  levels = seq_len(ncol(soil))
)
forest$Soil_Type <- as.integer(fact)
fact2 <- factor(
  max.col(as.matrix(area), ties.method = "first"),
  levels = seq_len(ncol(area))
)
forest$Wilderness_Area <- as.integer(fact2)

# Reorder so the response (Cover_Type, currently column 11) comes last:
# 10 numeric predictors, Soil_Type, Wilderness_Area, Cover_Type.
forest <- forest[, c(1:10, 12, 13, 11)]

# Keep an untouched copy of the full training set for the final model.
forestTrain <- forest


# Draw a reproducible random subsample (rows whose uniform draw exceeds 0.8,
# i.e. roughly 20% of the data) for the exploratory models.
set.seed(1)
forest1 <- forest[runif(nrow(forest)) > 0.8, ]
forest1$Id <- NULL

# Remove columns with zero variance.
# The check runs column by column on the data frame itself (no matrix
# coercion), and the response is always kept. The former `forest1[, -56]`
# referred to a column that does not exist in this 13-column frame, so it
# never excluded the response as intended.
sub <- vapply(forest1, function(col) var(col) != 0, logical(1))
sub[names(forest1) == "Cover_Type"] <- TRUE
forestSub <- forest1[, sub, drop = FALSE]
n <- dim(forestSub)

# Reproducible split: rows whose draw exceeds 0.2 (about 80%) go to `train`,
# the remainder to `test`.
set.seed(1)
split <- runif(nrow(forestSub)) > 0.2
train <- forestSub[split, ]
test <- forestSub[!split, ]

# Tree prediction
# Work on copies of the split with shortened column names; the 13th name is
# the response, Cover_Type.
short_names <- c(
  "Elevation", "Aspect", "Slope", "H_D_To_Hydro", "V_D_To_Hydro",
  "H_D_To_Roads", "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
  "H_D_To_Fire_Points", "Soil_Type", "Wilderness_Area", "Cover_Type"
)
train1 <- train
names(train1) <- short_names
test1 <- test
names(test1) <- short_names

# Fit a single classification tree on all predictors and draw it.
tree.forests <- tree(factor(Cover_Type) ~ ., data = train1)
plot(tree.forests)
text(tree.forests, cex = 1.3)

# Predict classes for the held-out rows (response column 13 removed) and
# pair them with the observed cover types.
held_out_predictors <- test1[, -13]
tree.prediction <- predict(tree.forests, held_out_predictors, type = "class")
sa <- data.frame(cover = test[, 13], pred = tree.prediction)

# Use randomForest for prediction on the subsample split.
# mtry = 12 equals the number of predictor columns when none were dropped
# by the zero-variance filter.
rf <- randomForest(
  factor(Cover_Type) ~ .,
  data = train,
  mtry = 12,
  ntree = 1000
)
predictions <- predict(rf, newdata = test)

# Observed vs. predicted cover type for the held-out rows, without row names.
pred <- data.frame(
  Cover_Type = test$Cover_Type,
  Prediction = predictions
)
rownames(pred) <- NULL

# Next step: after training the model, move on to the general case and
# prepare the test set for prediction.
forestTest <- read.csv("../Data/test.csv")
forestTest$Id <- NULL

# With Id removed, columns 11-14 hold the wilderness-area indicators and
# columns 15-54 hold the soil-type indicators.
soil <- forestTest[, 15:54]
area <- forestTest[, 11:14]
# NOTE: this overwrites `forest` with the 10 numeric test predictors.
forest <- forestTest[, -(11:54)]

# Each row must have exactly one active indicator per group; fail early
# otherwise instead of producing a silently wrong code.
stopifnot(
  all(rowSums(soil == 1) == 1),
  all(rowSums(area == 1) == 1)
)

# Code each row by the position of its active indicator column. Fixing the
# levels to the full column range keeps the integer code equal to the
# indicator index even if a category is absent from this file (the previous
# `labels = 1:40` failed or miscoded in that case).
Newfactor <- factor(
  max.col(as.matrix(soil), ties.method = "first"),
  levels = seq_len(ncol(soil))
)
forestTest$Soil_Type <- as.integer(Newfactor)
Newfactor2 <- factor(
  max.col(as.matrix(area), ties.method = "first"),
  levels = seq_len(ncol(area))
)
forestTest$Wilderness_Area <- as.integer(Newfactor2)

# Keep the 10 numeric predictors plus Wilderness_Area (column 56) and
# Soil_Type (column 55).
forestTest <- forestTest[, c(1:10, 56, 55)]
head(forestTest)

# Remove columns with zero variance from the test predictors.
has_variance <- function(col) all(var(col) != 0)
sub <- vapply(forestTest, has_variance, logical(1))
TestSub <- forestTest[, sub]
n <- dim(TestSub)

# Use the full training copy saved earlier (`forestTrain`) as the training
# set for the final model.
forestTest$Id <- NULL
forestTrain$Id <- NULL

# Use randomForest for prediction.
# The argument is `ntree`; the former `ntrees = 1000` was swallowed by `...`
# and ignored, so the forest was grown with the default number of trees.
# The response is converted to a factor inside the formula, which makes this
# a classification forest.
rf1 <- randomForest(
  factor(Cover_Type) ~ .,
  data = forestTrain,
  mtry = 12,
  ntree = 1000,
  importance = TRUE
)
predictions <- predict(rf1, newdata = TestSub)

# Append the predicted class to the test predictors and write the result.
Forest_Cover_Type <- data.frame(predictions)
names(Forest_Cover_Type) <- "Forest_Cover_Type"
forestTest <- cbind(forestTest, Forest_Cover_Type)
write.csv(forestTest, file = "../Data/out.csv")