R/data.R

Defines functions data.same.n data.by.stadtteil data.process load.data fetch.dataframe fetch.page load.cities fetch.cities.letter cities.to.list html.to.num html.to.txt txt.to.num

#  ==== Constans ====
DATAIMPORT.ROOT <- paste0(getwd(), "/../data/")

WOHNTYP.WG <- 0
WOHNTYP.1ZIMMER <- 1
WOHNTYP.WOHNUNG <- 2
WOHNTYP.HAUS <- 3

ANZEIGEN.PRO.SEITE <- 20

# ==== Helpers ====
txt.to.num <- function(str) {
  regmatches(str, regexpr("[[:digit:]]+", str)) %>%
    as.numeric()
}
html.to.txt <- function(html) {
  html_text(html, trim = T)
}
html.to.num <- function(html) {
  html %>% html.to.txt() %>% txt.to.num()
}

# ==== Load Cities ====
library(jsonlite)

cities.to.list <- function(cities) {
  cities <- cities %>% distinct(city_name, .keep_all = T)
  l <- as.list(cities$city_id.n)
  names(l) <- cities$city_name
  l
}

fetch.cities.letter <- function(q) {
  fromJSON(paste0("https://www.wg-gesucht.de/ajax/getCities.php?country_parameter=&query=", q))
}

load.cities <- function(forceUpdate = F) {
  datapath <- paste0(DATAIMPORT.ROOT, "wg_staedte.RData")
  # Load cached data if it exists
  if (file.exists(datapath) && !forceUpdate) {
    load(datapath)
  } else {
    for (l in LETTERS) {
      try({
        temp <- fetch.cities.letter(l)
        if (exists("Staedte.temp")) {
          Staedte.temp <- rbind(Staedte.temp, temp)
        } else {
          Staedte.temp <- temp
        }
      })
    }
    Staedte <- Staedte.temp
    Staedte$city_id.n <- as.numeric(Staedte$city_id)
    Staedte.timestamp <- date()
    save(Staedte, Staedte.timestamp, file = datapath)
  }
  # # List to return results
  # l <- list()
  # l$Staedte <- Staedte
  # l$timestamp <- Staedte.timestamp
  # l
  Staedte
}

# ==== Load actual WG-Gesucht Data ====
library(rvest)
fetch.page <- function(session) {
  skip_rows <- 4
  trs <- session %>%
    html_nodes("#table-compact-list tr")
  count <- length(trs) - skip_rows

  df.temp <- data.frame(
    id = numeric(count),
    active = logical(count),

    miete = numeric(count),
    groesse = numeric(count),
    stadtteil = character(count),

    eintrag = character(count),
    frei.ab = character(count),
    frei.bis = character(count),

    m = numeric(count),
    w = numeric(count),
    g.m = numeric(count),
    g.w = numeric(count),
    g.e = numeric(count),

    stringsAsFactors = F
  )

  if (count <= 0) {
    return(df.temp)
  }

  for (i in 1:count) {
    tr <- trs[i + skip_rows]
    children <- html_children(tr)

    df.temp$active[i] <- !grepl("inactive", html_attr(tr, "class"))
    df.temp$id[i] <- txt.to.num(html_attr(tr, "adid"))

    # m[i] <- 0 # Männliche
    # w[i] <- 0 # Weibliche
    # g.m[i] <- 0 # Männliche gesucht
    # g.w[i] <- 0 # Weibliche gesucht
    # g.e[i] <- 0 # egal gesucht
    for (img in children[2] %>% html_nodes("img")) {
      p <- strsplit(strsplit(html_attr(img, "src"), "img/wg")[[1]][2], ".gif")[[1]][1]
      switch(p,
             w = {df.temp$w[i] <- df.temp$w[i] + 1},
             m = {df.temp$m[i] <- df.temp$m[i] + 1},
             wg = {df.temp$g.w[i] <- df.temp$g.w[i] + 1},
             mg = {df.temp$g.m[i] <- df.temp$g.m[i] + 1},
             eg = {df.temp$g.e[i] <- df.temp$g.e[i] + 1}
      )
    }
    df.temp$miete[i] <- html.to.num(children[4])
    df.temp$groesse[i] <- html.to.num(children[5])
    df.temp$stadtteil[i] <- html.to.txt(children[6])
    df.temp$eintrag[i] <- html.to.txt(children[3])
    df.temp$frei.ab[i] <- html.to.txt(children[7])
    df.temp$frei.bis[i] <- html.to.txt(children[8])
    # for (j in 1:length(children)) {
    #   print(html.to.txt(children[j]))
    # }
  }
  df.temp
}

fetch.dataframe <- function(city, limit, wohntyp, onUpdate = NULL) {
  sess <- html_session(paste0("http://www.wg-gesucht.de/wg-zimmer-in-Konstanz.", city ,".0.0.0.html"))
  session <- sess

  df <- fetch.page(sess) # erste Seite laden
  if (!is.null(onUpdate)) { onUpdate(1 / limit) }

  if (limit > 1 && nrow(df) == ANZEIGEN.PRO.SEITE) {
    for(i in 1:(limit - 1)) {
      sess <- jump_to(sess, paste0("http://www.wg-gesucht.de/wg-zimmer-in-Konstanz.", city ,".0.0.", i,".html")) # nächste Seiten laden

      df.neu <- fetch.page(sess)
      df <- rbind(df, df.neu)
      if (!is.null(onUpdate)) { onUpdate((i + 1) / limit) }

      if (nrow(df) < ANZEIGEN.PRO.SEITE) { break } # Schleife verlassen wenn keine nächste Seite vorhanden
    }
  }

  df
}

load.data <- function(city = 74, wohntyp = WOHNTYP.WG, rows = 100, forceUpdate = F, onUpdate = NULL) {
  datapath <- paste0(DATAIMPORT.ROOT, "wg_data_", city, "_", wohntyp, ".RData")

  update <- TRUE
  # Load cached data if it exists
  if (file.exists(datapath) && !forceUpdate) {
    load(datapath)

    if(nrow(Daten) > rows) {
      update <- FALSE
    }
  }

  if (update) {
    Daten <- fetch.dataframe(city = city, wohntyp = wohntyp, limit = ceiling(rows / 20), onUpdate = onUpdate)
    Daten.timestamp <- date()
    save(Daten, Daten.timestamp, file = datapath)
  } else {
    if (!is.null(onUpdate)) { onUpdate(1) } # progress auf 1 setzen
  }

  if (nrow(Daten) > rows) {
    Daten <- head(Daten, rows)
  }

  # List to return results
  l <- list()
  l$Daten <- data.process(Daten)
  l$timestamp <- Daten.timestamp
  l
}

# Verarbeitung der Daten / Vorbereiten von Werten
data.process <- function(data) {
  # Bewohneranzahlen berechnen
  data$bewohner.ges <- data$g.m + data$g.w + data$g.e
  data$bewohner <- data$m + data$w + data$bewohner.ges

  # Geschlechtsverhältnis berechnen
  data$geschl.verh <- data$m / (data$m + data$w)
  if (nrow(data) > 0 && sum(is.nan(data$geschl.verh)) > 0) data[is.nan(data$geschl.verh),]$geschl.verh <- NA # replace all NaN with NA

  # Miete pro m^2 berechnen
  data$miete.proqm <- round(data$miete / data$groesse, digits = 2)
  data
}

# Daten nach Stadtteil gruppieren
data.by.stadtteil <- function(Daten) {
  Daten %>%
    mutate(stadtteil.lower = tolower(stadtteil)) %>%
    group_by(stadtteil.lower) %>%
    summarise(
      count = n(),
      miete.proqm = mean(miete.proqm, na.rm = T),
      bewohner = mean(bewohner, na.rm = T),
      geschl.verh = mean(geschl.verh, na.rm = T),
      stadtteil = stadtteil[1]
    ) %>%
    filter(count > 1)
}

data.same.n <- function(Daten1, Daten2) {
  n1 <- nrow(Daten1)
  n2 <- nrow(Daten2)

  if (n1 > n2) {
    Daten1 <- sample_n(Daten1, n2)
  } else if (n2 > n1) {
    Daten2 <- sample_n(Daten2, n1)
  }
  l <- list()
  l$Daten1 <- Daten1
  l$Daten2 <- Daten2
  l
}
jansim/WG-SuchR documentation built on July 10, 2022, 9:41 p.m.