knitr::opts_chunk$set(echo=FALSE, warning=FALSE, message=FALSE, eval = FALSE,
                      fig.align='center',fig.width=10, fig.height=7) # Places figures on their own pages
options(
  htmltools.dir.version = FALSE, 
  formatR.indent = 2, width = 55, 
  digits = 2,scipen=999,tinytex.verbose = TRUE,
  knitr.kable.NA = '',
  fig.width=12, fig.height=8)
require("rvest")
require("xml2")
require("httr")
require("stringr")
require("tidyverse")
require("tidyselect")
require("here")

\newpage

1.研究目标

评价验收

认定获批

2.资料来源

2.1 数据来源

科技部网站

中国政府网

2.2 数据进度

名单公示

验收

评估

异常提示

评价验收

3.数据抓取流程(data-raw到data-extract)

3.1 基本过程

require("rvest")
require("stringr")
require("tidyverse")
require("tidyselect")
require("openxlsx")

建设名单

历史名单

path_xlsx <- "03-list-raw/raw-agri-park-list.xlsx"

dt_read <- openxlsx::read.xlsx(path_xlsx)

dt_out <- dt_read %>% 
  mutate(batch = str_extract(class, "\\d{1}"),
         batch = str_pad(batch, width = 2,
                         side = "left", pad = "0"))  %>%
  filter(type == "国家农业科技园区") %>%
  select(index, batch, name, province) 

list_batch <- sort(unique(dt_out$batch))

i <- 1
for (i in 1:length(list_batch)) { 
  path_out <- glue::glue("xlsx/list-batch-{list_batch[i]}.xlsx")
  dt_out %>%
    filter(batch == list_batch[i]) %>%
    mutate(index = 1:nrow(.)) %>%
    openxlsx::write.xlsx(.,path_out)
  }

更新年度名单

path_html <- "03-list-raw/list-year-2020-batch09.html"

# xpath for list
## for year 2020
xpath_list <-"/html/body/div[5]/div[3]/div[3]/p[position() >= 20  and position() <= 45]"

tbl_raw <- read_html(path_html,encoding = "UTF-8") %>%
  html_nodes(xpath = xpath_list ) %>%
  html_text() %>%
  as_tibble() %>%
  rename_at(all_of(names(.)),  ~all_of("index_name")) %>%
  mutate(index_name = str_replace(index_name, " ", "")) %>%
  separate(col = index_name, into = c("index", "name"), sep = "\\.")  %>%
  mutate(index = as.numeric(index)) 

require(techme)
data("BasicProvince")
batch <- "09"
ptn_province <- paste0(BasicProvince$province, collapse = "|")

tbl_info <- tbl_raw %>%
  add_column(batch = batch, .after = "index") %>%
  # get the province
  mutate(province = str_extract(name,ptn_province))

check <- sum(is.na(tbl_info$province))
if (check >0) warning("Povince with NA, please check!")

# write out
path_out <- glue::glue("xlsx/list-batch-{batch}.xlsx")
openxlsx::write.xlsx(tbl_info, path_out)

验收情况

3.2 抓取html

html 2018

#--------------------------
# this chunk should run only once
#--------------------------

# files html path
files_dir <- "01-check-raw/check-year-2018.html"

# xpath for data table
css_tbl <-"body > div.policyLibraryOverview > div.policyLibraryOverview_content > div.pages_content > p:nth-child(n+13)"

Year <- 2018

tbl_raw <- read_html(files_dir,encoding = "UTF-8") %>%
  html_nodes(css =  css_tbl) %>%
  html_text() %>%
  as_tibble() %>%
  separate(col = "value", into = c("index", "name")) %>%
  add_column(result = "pass",
             year= Year,
             doc_num="国科办农2018-99") %>%
  add_column(id= 1:nrow(.), .before = "index")



# files csv path
path_out <- paste0("xlsx/check-year-",Year, ".xlsx")
write.xlsx(tbl_raw, path_out)

html 2023

#--------------------------
# this chunk should run only once
#--------------------------

# files html path
files_dir <- "01-check-raw/check-year-2023.html"

# xpath for data table
css_tbl <-"body > div.policyLibraryOverview > div.policyLibraryOverview_content > div.pages_content > p:nth-child(n+13)"

xpath_tbl <- "//*[@id='Zoom']/div[3]/table/tbody" # for 2023

Year <- 2023

tbl_raw <- read_html(files_dir,encoding = "UTF-8") %>%
  #html_nodes(css =  css_tbl) %>%
  html_nodes(xpath =  xpath_tbl) %>%
  html_table(header = T) %>%
  .[[1]] %>%
  as_tibble() %>%
  rename_all(., ~c("index", "name")) %>%
  #separate(col = "value", into = c("index", "name")) %>%
  add_column(result = "pass",
             year= Year,
             doc_num="国科办农2023-60") %>%
  add_column(id= 1:nrow(.), .before = "index")



# files csv path
path_out <- paste0("xlsx/check-year-",Year, ".xlsx")
write.xlsx(tbl_raw, path_out)

3.2 抓取word

# install.packages("readtext")
require("readtext")

# files html path
files_dir <- "01-check-raw/check-year-2021.docx"


Year <- 2021

doc_text <- readtext(files_dir) %>%
  select(text) %>%
  str_split(., "\n") %>%
  .[[1]]  

# case coexist pass and fail
row_point1 <- which(str_detect(doc_text,"验收通过的园区"))
row_point2 <- which(str_detect(doc_text,"验收不通过的园区"))
range_pass <- (row_point1 + 1):(row_point2 -1)
range_fail <- (row_point2 + 1):length(doc_text)

# case all pass
row_point1 <- which(str_detect(doc_text,"通过验收"))
range_pass <- (row_point1 + 1):length(doc_text)

docnum <- "国科办农2021-150号"
tbl_out <- doc_text %>%
  as_tibble() %>%
  mutate(result=ifelse(
    row_number() %in% range_pass, 
    "pass",
    #ifelse(row_number() %in% range_fail, 
    #       "fail",
           NA
    #       )
    )
    ) %>%
  filter(!is.na(result)) %>%
  separate(col = "value", into = c("index", "name"),
           sep = "\\.") %>%
  add_column(year= Year,
             doc_num= docnum)  %>%
  add_column(id= 1:nrow(.), .before = "index")




# files csv path
path_out <- paste0("xlsx/check-year-",Year, ".xlsx")
write.xlsx(tbl_out, path_out)

3.3 导出为分析数据

dir_path <- here::here("data-raw","public-site","agri-park","xlsx")

files_xlsx <- list.files(dir_path)



files_target <- files_xlsx[which(str_detect(files_xlsx, "check")) ]

url_xlsx <- paste0(dir_path, "/", files_target)

years_target <- str_extract_all(files_target, "(\\d{4})") %>%
  unlist() %>%
  as.numeric(.)

tbl_out <- NULL

i <- 1
for (i in 1: length(files_target)) {

  tbl_tem <- read.xlsx(url_xlsx[i]) %>%
    mutate(index = as.numeric(index))

  tbl_out <- bind_rows(tbl_out, tbl_tem)
}

path_out <- paste0("data-update/wide-check-upto-year-", max(years_target), ".xlsx")

write.xlsx(tbl_out, path_out)

评价情况

3.2 抓取2019年html

#--------------------------
# this chunk should run only once
#--------------------------

# files html path
Year <- 2019
files_dir <- glue::glue("02-eval-raw/eval-year-{Year}.html")

# xpath for data table
css_tbl <-"body > div.policyLibraryOverview > div.policyLibraryOverview_content > div.pages_content > p:nth-child(n+103)"


tbl_raw <- read_html(files_dir,encoding = "UTF-8") %>%
  html_nodes(css =  css_tbl) %>%
  html_text() %>%
  as_tibble() %>%
  # clean
  mutate(value = mgsub::mgsub(value,                 c(fixed("\u00a0"),fixed("\n")," "),
                              c("", "","")),
         value = str_trim(value))

# detect head
row_point <- which(str_detect(tbl_raw$value,"优秀|达标|不达标"))

range_good <- (row_point[1] + 1):(row_point[2] -1)
range_ok <- (row_point[2] + 1):(row_point[3] -1)
range_fail <- (row_point[3] + 1):length(tbl_raw)

result_chn <- c("优秀","达标","不达标")
result_eng <- c("good", "ok", "fail")
ptn_result <- paste0(result_chn, collapse = "|")

tbl_out <- tbl_raw %>%
  mutate(result = str_extract(value, ptn_result)) %>%
  fill(result, .direction = "down") %>%
  filter(!str_detect(value, ptn_result)) %>%
  separate(col = "value", 
           into = c("index", "name"), sep = "\\.") %>%
  mutate(index = as.numeric(index)) %>%
  mutate(result = mgsub::mgsub(result, 
                               result_chn, result_eng))

require(techme)
data("BasicProvince")
data("ProvinceCity")
ptn_province <- paste0(BasicProvince$province, collapse = "|")
ptn_city <- paste0(unique(ProvinceCity$city_clean), collapse = "|")

tbl_info <-  tbl_out %>%
  add_column(year= Year, .before = "index") %>%
  mutate(province_name = str_extract(name, ptn_province),
         city_clean= str_extract(name, ptn_city)) %>%
  # match
  left_join(., select(ProvinceCity, city_clean, province_clean), by = "city_clean") %>%
  mutate(province = ifelse(is.na(province_name),
                           province_clean,
                           province_name)) %>%
  # check it
  select(year, index, name, result,province)

check <- sum(is.na(tbl_info$province))
if (check >0) warning("Povince with NA, please check!")

# files csv path
path_out <- paste0("xlsx/eval-year-",Year, ".xlsx")
write.xlsx(tbl_info, path_out)

3.2 抓取2020年html

#--------------------------
# this chunk should run only once
#--------------------------

# files html path
Year <- 2020
files_dir <- glue::glue("02-eval-raw/eval-year-{Year}.html")

# xpath for data table
css_tbl <-"body > div.policyLibraryOverview > div.policyLibraryOverview_content > div.pages_content > p:nth-child(n+103)"

css_tbl <- "#Zoom > p:nth-child(n+27)"
tbl_raw <- read_html(files_dir,encoding = "UTF-8") %>%
  html_nodes(css =  css_tbl) %>%
  html_text() %>%
  as_tibble() %>%
  # clean
  mutate(value = mgsub::mgsub(value,                 c(fixed("\u00a0"),fixed("\n")," "),
                              c("", "","")),
         value = str_trim(value))

# detect head
row_point <- which(str_detect(tbl_raw$value,"优秀|达标|不达标"))

range_good <- (row_point[1] + 1):(row_point[2] -1)
range_ok <- (row_point[2] + 1):(row_point[3] -1)
range_fail <- (row_point[3] + 1):length(tbl_raw)

result_chn <- c("优秀","达标","不达标")
result_eng <- c("good", "ok", "fail")
ptn_result <- paste0(result_chn, collapse = "|")

tbl_out <- tbl_raw %>%
  mutate(result = str_extract(value, ptn_result)) %>%
  fill(result, .direction = "down") %>%
  filter(!str_detect(value, ptn_result)) %>%
  separate(col = "value", 
           into = c("index", "name"), sep = "\\.") %>%
  mutate(index = as.numeric(index)) %>%
  mutate(result = mgsub::mgsub(result, 
                               result_chn, result_eng))

require(techme)
data("BasicProvince")
data("ProvinceCity")
ptn_province <- paste0(BasicProvince$province, collapse = "|")
ptn_city <- paste0(unique(ProvinceCity$city_clean), collapse = "|")

tbl_info <-  tbl_out %>%
  add_column(year= Year, .before = "index") %>%
  mutate(province_name = str_extract(name, ptn_province),
         city_clean= str_extract(name, ptn_city)) %>%
  # match
  left_join(., select(ProvinceCity, city_clean, province_clean), by = "city_clean") %>%
  mutate(province = ifelse(is.na(province_name),
                           province_clean,
                           province_name)) %>%
  # check it
  select(year, index, name, result,province)

check <- sum(is.na(tbl_info$province))
if (check >0) warning("Povince with NA, please check!")

# files csv path
path_out <- paste0("xlsx/eval-year-",Year, ".xlsx")
write.xlsx(tbl_info, path_out)

3.2 抓取2021年html

#--------------------------
# this chunk should run only once
#--------------------------

# files html path
Year <- 2021
files_dir <- glue::glue("02-eval-raw/eval-year-{Year}.html")

# xpath for data table
css_tbl <-"body > div.policyLibraryOverview > div.policyLibraryOverview_content > div.pages_content > p:nth-child(n+103)"
#Zoom > p:nth-child(22)

css_tbl <- "#Zoom > p:nth-child(n+22)"
tbl_raw <- read_html(files_dir,encoding = "UTF-8") %>%
  html_nodes(css =  css_tbl) %>%
  html_text() %>%
  as_tibble() %>%
  # clean
  mutate(value = mgsub::mgsub(value,                 c(fixed("\u00a0"),fixed("\n")," "),
                              c("", "","")),
         value = str_trim(value)) %>%
  filter(value!="") %>%
  filter(!str_detect(value,"、"))

# detect head
row_point <- which(str_detect(tbl_raw$value,"优秀|达标|不达标的园区|保留园区资格|取消园区资格"))

range_good <- (row_point[1] + 1):(row_point[2] -1)
range_ok <- (row_point[2] + 1):(row_point[3] -1)
range_fail <- (row_point[3] + 1):(row_point[4] -1)
range_retain <- (row_point[4] + 1):(row_point[5] -1)
range_cancel <- (row_point[5] + 1):length(tbl_raw)

result_chn <- c("优秀","达标","不达标","保留","取消")
result_eng <- c("good", "ok", "fail","retain","cancel")
ptn_result <- paste0(result_chn, collapse = "|")

tbl_out <- tbl_raw %>%
  mutate(result = str_extract(value, ptn_result)) %>%
  fill(result, .direction = "down") %>%
  filter(!str_detect(value, ptn_result)) %>%
  separate(col = "value", 
           into = c("index", "name"), sep = "\\.") %>%
  mutate(index = as.numeric(index)) %>%
  mutate(result = mgsub::mgsub(result, 
                               result_chn, result_eng))

require(techme)
data("BasicProvince")
data("ProvinceCity")
ptn_province <- paste0(BasicProvince$province, collapse = "|")
ptn_city <- paste0(unique(ProvinceCity$city_clean), collapse = "|")

tbl_info <-  tbl_out %>%
  add_column(year= Year, .before = "index") %>%
  mutate(province_name = str_extract(name, ptn_province),
         city_clean= str_extract(name, ptn_city)) %>%
  # match
  left_join(., select(ProvinceCity, city_clean, province_clean), by = "city_clean") %>%
  mutate(province = ifelse(is.na(province_name),
                           province_clean,
                           province_name)) %>%
  # check it
  select(year, index, name, result,province)

check <- sum(is.na(tbl_info$province))
if (check >0) warning("Povince with NA, please check!")

# files csv path
path_out <- paste0("xlsx/eval-year-",Year, ".xlsx")
write.xlsx(tbl_info, path_out)

3.2 抓取2022年html

#--------------------------
# this chunk should run only once
#--------------------------

# files html path
Year <- 2022
files_dir <- glue::glue("02-eval-raw/eval-year-{Year}.html")

# xpath for data table
css_tbl <-"body > div.policyLibraryOverview > div.policyLibraryOverview_content > div.pages_content > p:nth-child(n+103)"
#Zoom > p:nth-child(22)
css_tbl <-"#Zoom > div:nth-child(n+18)"

tbl_raw <- read_html(files_dir,encoding = "UTF-8") %>%
  html_nodes(css =  css_tbl) %>%
  html_text() %>%
  as_tibble() %>%
  # clean
  mutate(value = mgsub::mgsub(value,                 c(fixed("\u00a0"),fixed("\n")," "),
                              c("", "","")),
         value = str_trim(value)) %>%
  filter(value!="") %>%
  mutate(value = str_replace(value, "河南郑州", "1.河南郑州")) #%>%
  #filter(!str_detect(value,"、"))

# detect head
row_point <- which(str_detect(tbl_raw$value,"保留园区资格|取消园区资格"))

#range_good <- (row_point[1] + 1):(row_point[2] -1)
#range_ok <- (row_point[2] + 1):(row_point[3] -1)
#range_fail <- (row_point[3] + 1):(row_point[4] -1)
range_retain <- (row_point[1] + 1):(row_point[2] -1)
range_cancel <- (row_point[2] + 1):length(tbl_raw)

result_chn <- c("优秀","达标","不达标","保留","取消")
result_eng <- c("good", "ok", "fail","retain","cancel")
ptn_result <- paste0(result_chn, collapse = "|")

tbl_out <- tbl_raw %>%
  mutate(result = str_extract(value, ptn_result)) %>%
  fill(result, .direction = "down") %>%
  filter(!str_detect(value, ptn_result)) %>%
  separate(col = "value", 
           into = c("index", "name"), sep = "\\.") %>%
  mutate(index = as.numeric(index)) %>%
  mutate(result = mgsub::mgsub(result, 
                               result_chn, result_eng))

require(techme)
data("BasicProvince")
data("ProvinceCity")
ptn_province <- paste0(BasicProvince$province, collapse = "|")
ptn_city <- paste0(unique(ProvinceCity$city_clean), collapse = "|")

tbl_info <-  tbl_out %>%
  add_column(year= Year, .before = "index") %>%
  mutate(province_name = str_extract(name, ptn_province),
         city_clean= str_extract(name, ptn_city)) %>%
  # match
  left_join(., select(ProvinceCity, city_clean, province_clean), by = "city_clean") %>%
  mutate(province = ifelse(is.na(province_name),
                           province_clean,
                           province_name)) %>%
  # check it
  select(year, index, name, result,province)

check <- sum(is.na(tbl_info$province))
if (check >0) warning("Povince with NA, please check!")

# files csv path
path_out <- paste0("xlsx/eval-year-",Year, ".xlsx")
write.xlsx(tbl_info, path_out)


huhuaping/techme documentation built on June 16, 2024, 3:38 a.m.