pttR: Retrieve and Process Textual Data from PTT

library(rvest)
library(dplyr)
library(tidyr)
library(stringr)
devtools::load_all()

## Links & Raw data
Links <- c(
  newest = "http://zh.pttpedia.wikia.com/index.php?title=%E7%89%B9%E6%AE%8A%3A%E6%96%B0%E9%A0%81%E9%9D%A2&namespace=0&username=&limit=500",
  board_name = "http://zh.pttpedia.wikia.com/wiki/%E5%88%86%E9%A1%9E:PTT%E7%9C%8B%E6%9D%BF",
  event_name = "http://zh.pttpedia.wikia.com/wiki/%E5%88%86%E9%A1%9E:PTT%E4%BA%8B%E4%BB%B6",
  famous_people = "http://zh.pttpedia.wikia.com/wiki/%E5%88%86%E9%A1%9E:PTT%E5%90%8D%E4%BA%BA",
  culture = "http://zh.pttpedia.wikia.com/wiki/%E5%88%86%E9%A1%9E:PTT%E6%96%87%E5%8C%96",
  notation = "http://zh.pttpedia.wikia.com/wiki/%E5%88%86%E9%A1%9E:PTT%E6%B5%81%E8%A1%8C%E7%AC%A6%E8%99%9F",
  basic_term = "http://zh.pttpedia.wikia.com/wiki/%E5%88%86%E9%A1%9E:PTT%E5%9F%BA%E6%9C%AC%E7%94%A8%E8%AA%9E",
  post_type = "http://zh.pttpedia.wikia.com/wiki/PTT%E7%9A%84%E5%90%84%E9%A1%9E%E6%96%87%E7%AB%A0"
)

rawhtml <- vector("list", length = 8)
for (i in seq_along(rawhtml)) {
  rawhtml[[i]] <- read_html(Links[i])
}

## Helper Functions -------------------

# Extract link texts from 'Category Pages'
# Preprocess to rm parentheses and aka text
# in the same a tag text
extract_link_text <- function(rawhtml) {
  rawhtml %>%
    html_node("div#mw-pages") %>%
    html_nodes("li") %>%
    html_nodes("a") %>%
    html_text %>%
    paste(collapse = "_") %>%
    str_replace_all("、", "_") %>%
    str_replace_all("( [\\(（]|[\\(（])", "_") %>%
    str_replace_all("( [\\)）]|[\\)）])", "") %>%
    strsplit("_") %>%
    purrr::flatten_chr()
}

## Data From 'Newest Pages' ------------------

page_names <- rawhtml[[1]] %>%
  html_nodes("a.mw-newpages-pagename") %>%
  html_text() %>%
  str_remove("之亂")

page_date <- rawhtml[[1]] %>%
  html_nodes("span.mw-newpages-time") %>%
  html_text() %>%
  str_extract("^20[0-9]{2}年[0-9]{1,2}月[0-9]{1,2}日") %>%
  str_replace("年", "-") %>%
  str_replace("月", "-") %>%
  str_replace("日", "")

newest_df <- data_frame(value = page_names,
                        source = "newest",
                        date = as.Date(page_date,
                                       tz = "CST")
                        ) %>%
  mutate(value = str_remove(value,
                           "[ （\\(].+[）\\)]$"))

## Data from 'Ptt Board Name' -----------------
board_name <- extract_link_text(rawhtml[[2]]) %>%
  as_data_frame() %>%
  cbind(source = "board_name")


## Data from 'Ptt Event' -----------------
rm_term <- c("之亂$", "在ptt$", "在Ptt$", "在PTT$", "事件$", "與ptt$", " $")

event_name <- extract_link_text(rawhtml[[3]]) %>%
  str_remove(paste0(rm_term, collapse = "|")) %>%
  as_data_frame() %>%
  cbind(source = "event_name")

## Data from 'Ptt Famous People' -----------------
famous_people <- extract_link_text(rawhtml[[4]]) %>%
  as_data_frame() %>%
  cbind(source = "famous_people")

## Data from 'Ptt Culture' -----------------
culture <- extract_link_text(rawhtml[[5]]) %>%
  as_data_frame() %>%
  cbind(source = "culture")

## Data from 'Ptt notation' -----------------
notation <- rawhtml[[6]] %>%
  html_node("div#mw-pages") %>%
  html_nodes("a") %>%
  html_text %>%
  append("崩╰(〒皿〒)╯潰") %>%
  as_data_frame() %>%
  cbind(source = "notation")



## Data from 'Ptt Basic Terms' -----------------
basic_term <- extract_link_text(rawhtml[[7]]) %>%
  as_data_frame() %>%
  cbind(source = "basic_term")

## Data from 'Ptt Post Type' ------------------
post_type <- rawhtml[[8]] %>%
  html_node("div.mw-content-text") %>%
  html_nodes("li") %>%
  html_nodes("a") %>%
  html_text() %>%
  as_data_frame() %>%
  cbind(source = "post_type")


## Combine all source --------------------
ptt_dict <- bind_rows(basic_term,
                  board_name,
                  culture,
                  event_name,
                  famous_people,
                  notation,
                  post_type,
                  newest_df[ ,-3]) %>%
  rename("term" = value, "source" = source) %>%
  distinct(term, .keep_all = T)

Encoding(ptt_dict$source) <- "latin1"
ptt_dict$source<- iconv(ptt_dict$source, "latin1", "UTF-8")


attr(ptt_dict, "date") <- newest_df$date[1]
# attr(ptt_dict, "date")

saveRDS(ptt_dict, "data-raw/ptt_dict.rds")

liao961120/pttR documentation built on Dec. 16, 2019, 2:19 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

liao961120/pttR
Retrieve and Process Textual Data from PTT

data-raw/deprecated/internal-ptt-dict-construct.R
In liao961120/pttR: Retrieve and Process Textual Data from PTT

R Package Documentation

Browse R Packages

We want your feedback!

liao961120/pttR Retrieve and Process Textual Data from PTT

data-raw/deprecated/internal-ptt-dict-construct.R In liao961120/pttR: Retrieve and Process Textual Data from PTT

R Package Documentation

Browse R Packages

We want your feedback!

liao961120/pttR
Retrieve and Process Textual Data from PTT

data-raw/deprecated/internal-ptt-dict-construct.R
In liao961120/pttR: Retrieve and Process Textual Data from PTT