# runs/makedata.R

# Session-wide number formatting: a large scipen biases output towards fixed
# (non-scientific) notation, and digits limits printing to 5 significant
# digits.
options(scipen = 99, digits = 5)

# Read the raw inputs only when the objects are not already present, so that
# re-running the script in the same session skips the file reads.
# NOTE(review): fread() is not attached anywhere in this file -- presumably
# data.table is loaded by the caller; confirm.
if (!exists("p1")) {
  p1 <- fread("zcat ./input/properties_2016.csv.zip")
}
if (!exists("t1")) {
  t1 <- fread("zcat ./input/train_2016_v2.csv.zip")
}


# Keep only the columns in which fewer than 80% of the values are NA.
# vapply() iterates over the columns themselves and guarantees exactly one
# logical per column, whereas sapply() has an input-dependent return type and
# the by-name lookup p1[[x]] is ambiguous if column names repeat.
tmp0 <- names(p1)[
  vapply(p1, function(col) mean(is.na(col)) < 0.8, logical(1))
]
p1 <- p1[, tmp0, with = FALSE]

# Remove finishedsquarefeet12.
# First show, among rows where finishedsquarefeet12 is not NA, how many rows
# have a different calculatedfinishedsquarefeet; then drop the column.
p1[, sum(
  (calculatedfinishedsquarefeet != finishedsquarefeet12)[
    !is.na(finishedsquarefeet12)
  ]
)]
p1[, "finishedsquarefeet12" := NULL]

# Count half bathrooms: the bathroom count in excess of the full baths,
# expressed in units of 0.5 (doubling is the same as dividing by 0.5).
p1[, halfbathcnt := 2 * (bathroomcnt - fullbathcnt)]

# Where the garage total square feet is 0, set the garage car count to 0.
p1[garagetotalsqft == 0, "garagecarcnt" := 0]

# Hot tub/spa and fireplace flags: recode both columns in one pass so that
# the string "true" becomes 1 and every other non-missing value becomes 0
# (missing values stay missing).
p1[,
  c("hashottuborspa", "fireplaceflag") := lapply(
    .SD,
    function(flag) ifelse(flag == "true", 1, 0)
  ),
  .SDcols = c("hashottuborspa", "fireplaceflag")
]

# Tax delinquency flag: recode to 1/0.
# NOTE(review): the Zillow data dictionary encodes this column as "Y" rather
# than "true", so comparing against "true" alone would yield 0 for every row.
# Both markers are accepted here; confirm against the raw file. Using `|`
# (not %in%) keeps missing values missing, as the original comparison did.
p1[, taxdelinquencyflag := ifelse(
  taxdelinquencyflag == "true" | taxdelinquencyflag == "Y", 1, 0
)]

# Property zoning description: keep its first character as a new column.
p1[, firstletter_propertyzoningdesc := substring(propertyzoningdesc, 1, 1)]

# Census junk (this may be a trick variable): drop the column.
p1[, "censustractandblock" := NULL]

# Make the training data by joining the training rows to the property table.
# No `by` is given, so merge() falls back on its default join columns.
# NOTE(review): the original comment mentions creating dummies for the
# categorical variables; no dummy encoding happens in this section.
dt1 <- merge(x = t1, y = p1)
rm(list = "t1")

# Save the property table as gzip-compressed CSV, then remove it from the
# session. close() runs in `finally`, so the connection is released even if
# write.csv() fails part-way through.
gz <- gzfile("./input/prop.gz", "w")
tryCatch(
  write.csv(p1, gz, row.names = FALSE),
  finally = close(gz)
)
rm(p1)

# Save the training table as gzip-compressed CSV, then remove it from the
# session. close() runs in `finally`, so the connection is released even if
# write.csv() fails part-way through.
gz <- gzfile("./input/train.gz", "w")
tryCatch(
  write.csv(dt1, gz, row.names = FALSE),
  finally = close(gz)
)
rm(dt1)
# steinarv/k1 documentation built on Oct. 19, 2017, 4:41 a.m.