knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE, eval = FALSE,
  fig.align = 'center', fig.width = 10, fig.height = 7  # Places figures on their own pages
)
options(
  htmltools.dir.version = FALSE, formatR.indent = 2,
  width = 55, digits = 2, scipen = 999, tinytex.verbose = TRUE,
  knitr.kable.NA = '', fig.width = 12, fig.height = 8
)
\newpage
National Agricultural Scientific Observation and Experiment Stations of the Ministry of Agriculture (MOA)
Field Scientific Observation and Research Stations of the Ministry of Education (MOE)
[Ministry of Agriculture (MOA)]
2020-01-02: Notice of the General Office of the Ministry of Agriculture and Rural Affairs on designating the second batch of National Agricultural Scientific Observation and Experiment Stations; 80 stations, published as an HTML table
2018-01-30: Notice of the General Office of the Ministry of Agriculture on designating the first batch of National Agricultural Scientific Observation and Experiment Stations; 36 stations, published as a docx attachment
[Ministry of Education (MOE)]
The 2016 discipline-cluster system is the most recently published list, and it includes wheat.
The two batches of National Agricultural Scientific Observation and Experiment Stations do not explicitly single out wheat.
require("rvest") require("xml2") require("httr") require("stringr") require("tidyverse") require("tidyselect") require("here") require("openxlsx")
# MOA first batch (2018): read the station list from the xlsx converted
# from the docx attachment
Year <- 2018

# locate the source file under html/
file_dir <- "html/"
file_name <- list.files(file_dir)
file_id <- which(str_detect(file_name, glue::glue("moa-year-{Year}.*xlsx$")))
file_sel <- file_name[file_id]
file_path <- paste0(file_dir, file_sel)

tbl_read <- openxlsx::read.xlsx(file_path, colNames = TRUE)

names_eng <- c("index", "name", "institution")

tbl_out <- tbl_read %>%
  rename_all(., ~all_of(names_eng)) %>%
  mutate(index = 1:nrow(.)) %>%
  mutate_all(., .funs = str_trim) %>%
  add_column(year = Year, .before = "index") %>%
  add_column(officer = "MOA", .before = "year")

file_name <- str_extract(file_sel, "(.+)(?=\\.)")
path_out <- glue::glue("xlsx/{file_name}.xlsx")
openxlsx::write.xlsx(tbl_out, path_out)
#--------------------------
# this chunk should run only once
#--------------------------
# MOA second batch: parse the station table from the saved html page
Year <- 2019

# locate the source file under html/
file_dir <- "html/"
file_name <- list.files(file_dir)
file_id <- which(str_detect(file_name, glue::glue("moa-year-{Year}")))
file_sel <- file_name[file_id]
file_path <- paste0(file_dir, file_sel)

xpath_tbl <- "/html/body/div[4]/div[1]/div[2]/div[1]/div/div/table"
tbl_raw <- read_html(file_path, encoding = "UTF-8") %>%
  html_nodes(xpath = xpath_tbl) %>%
  html_table(fill = TRUE, header = TRUE) %>%
  .[[1]]

names_eng <- c("index", "name", "institution")

tbl_out <- tbl_raw %>%
  rename_all(., ~all_of(names_eng)) %>%
  mutate(index = 1:nrow(.)) %>%
  mutate_all(., .funs = str_trim) %>%
  add_column(year = Year, .before = "index") %>%
  add_column(officer = "MOA", .before = "year")

file_name <- str_extract(file_sel, "(.+)(?=\\.)")
path_out <- glue::glue("xlsx/{file_name}.xlsx")
openxlsx::write.xlsx(tbl_out, path_out)
#--------------------------
# this chunk should run only once
#--------------------------
# MOE field stations: parse the station table from the saved html page
Year <- 2019

# locate the source file under html/
file_dir <- "html/"
file_name <- list.files(file_dir)
file_id <- which(str_detect(file_name, glue::glue("moe-year-{Year}")))
file_sel <- file_name[file_id]
file_path <- paste0(file_dir, file_sel)

xpath_tbl <- "/html/body/div[1]/div[2]/div[1]/main/article/div/div[2]/table"
tbl_raw <- read_html(file_path, encoding = "UTF-8") %>%
  html_nodes(xpath = xpath_tbl) %>%
  html_table(fill = TRUE, header = TRUE) %>%
  .[[1]]

names_eng <- c("index", "name", "institution")

tbl_out <- tbl_raw %>%
  rename_all(., ~all_of(names_eng)) %>%
  mutate(index = 1:nrow(.)) %>%
  mutate_all(., .funs = str_trim) %>%
  mutate(name = str_replace(name, "\\n", ""),
         institution = str_replace(institution, "\\n", "")) %>%
  add_column(year = Year, .before = "index") %>%
  add_column(officer = "MOE", .before = "year")

file_name <- str_extract(file_sel, "(.+)(?=\\.)")
path_out <- glue::glue("xlsx/{file_name}.xlsx")
openxlsx::write.xlsx(tbl_out, path_out)
#--------------------------
# this chunk should run only once
#--------------------------
# MOST (2019): parse the station table, keep the administrator column,
# and drop the result column
Year <- 2019

# locate the source file under html/
file_dir <- "html/"
file_name <- list.files(file_dir)
file_id <- which(str_detect(file_name, glue::glue("most-year-{Year}")))
file_sel <- file_name[file_id]
file_path <- paste0(file_dir, file_sel)

xpath_tbl <- "//div[contains(@id,'Zoom')]//table"
tbl_raw <- read_html(file_path, encoding = "UTF-8") %>%
  html_nodes(xpath = xpath_tbl) %>%
  html_table(fill = TRUE, header = TRUE) %>%
  .[[1]]

names_eng <- c("index", "name", "institution", "administrator", "result")

tbl_out <- tbl_raw %>%
  rename_all(., ~all_of(names_eng)) %>%
  mutate(index = 1:nrow(.)) %>%
  mutate_all(., .funs = str_trim) %>%
  mutate(name = str_replace(name, "\\n", ""),
         institution = str_replace(institution, "\\n", "、"),
         administrator = mgsub::mgsub(administrator, "\U00A0", "、")) %>%
  add_column(year = Year, .before = "index") %>%
  add_column(officer = "MOST", .before = "year") %>%
  mutate(institution = str_replace(institution, "中科院", "中国科学院")) %>% # tidy name
  select(-result) # drop columns

file_name <- str_extract(file_sel, "(.+)(?=\\.)")
path_out <- glue::glue("xlsx/{file_name}.xlsx")
openxlsx::write.xlsx(tbl_out, path_out)
#--------------------------
# this chunk should run only once
#--------------------------
# MOST (2021): parse the station table from the saved html page
Year <- 2021

# locate the source file under html/
file_dir <- "html/"
file_name <- list.files(file_dir)
file_id <- which(str_detect(file_name, glue::glue("most-year-{Year}")))
file_sel <- file_name[file_id]
file_path <- paste0(file_dir, file_sel)

xpath_tbl <- "//table[@class='MsoNormalTable']"
tbl_raw <- read_html(file_path, encoding = "UTF-8") %>%
  html_nodes(xpath = xpath_tbl) %>%
  html_table(fill = TRUE, header = TRUE) %>%
  .[[1]]

names_eng <- c("index", "name", "institution", "administrator")

tbl_out <- tbl_raw %>%
  rename_all(., ~all_of(names_eng)) %>%
  mutate(index = 1:nrow(.)) %>%
  mutate_all(., .funs = str_trim) %>%
  mutate(name = str_replace(name, "\\n", ""),
         institution = str_replace(institution, "\\n", "")) %>%
  add_column(year = Year, .before = "index") %>%
  add_column(officer = "MOST", .before = "year")

file_name <- str_extract(file_sel, "(.+)(?=\\.)")
path_out <- glue::glue("xlsx/{file_name}.xlsx")
openxlsx::write.xlsx(tbl_out, path_out)
file_dir <- "xlsx/" file_name <- list.files(file_dir) file_path <- paste0(file_dir, file_name) tbl_out <- tibble(url = file_path) %>% mutate(dt = map(url, openxlsx::read.xlsx)) %>% select(-url) %>% unnest(dt)
require("techme") data("queryTianyan") dt_match <- queryTianyan %>% select(name_origin, province) %>% rename(institution = "name_origin") data("ProvinceCity") dt_city <- ProvinceCity %>% select( city_clean, province_clean) ptn_province <- paste0(unique(ProvinceCity$province_clean), collapse = "|") ptn_city <- paste0(unique(ProvinceCity$city_clean), collapse = "|") list_institution <- tbl_out %>% mutate(institution_raw = institution) %>% #use only the first institution mutate(institution = map_chr(.x = institution_raw, .f = function(x) as.character(unlist(str_split(x, pattern='、'))[[1]]) ) ) %>% select(institution) %>% unique() %>% left_join(., dt_match, by= "institution" ) %>% # filter obvious province info mutate(province_raw = str_extract(institution, ptn_province)) %>% mutate(province = ifelse(is.na(province), province_raw, province)) %>% # match city mutate(city_clean = str_extract(institution, ptn_city)) %>% left_join(., dt_city, by= "city_clean" ) %>% mutate(province = ifelse(is.na(province), province_clean, province)) %>% filter(is.na(province)) %>% select(institution) %>% arrange(institution) # check check <- sum(is.na(list_institution$province)) if(check > 0) stop("please check the name of institution!") dir_xlsx <- "d:/github/techme/data-raw/data-tidy/hack-tianyan/ship/" file_xlsx <- glue::glue("ship-tot{nrow(list_institution)}-{Sys.Date()}.xlsx") path_xlsx <- paste0(dir_xlsx, file_xlsx) openxlsx::write.xlsx(list_institution, path_xlsx)
Query the exported institutions in a loop.
Collect the results and confirm them manually.
Re-integrate the confirmed results and update queryTianyan (see the sketch after these steps).
Build techme and push it.
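A minimal sketch of the re-integration step is shown below. It assumes the manually confirmed Tianyancha results are saved as a hypothetical ship-confirmed.xlsx containing at least the name_origin and province columns used by queryTianyan; the file name, its layout, and the use of usethis::use_data() are assumptions, not part of the original workflow.

```r
# Sketch only: re-integrate manually confirmed results into queryTianyan.
# The file `ship-confirmed.xlsx` and its column layout are hypothetical.
require("tidyverse")
require("openxlsx")
require("techme")

data("queryTianyan")

# hypothetical path to the manually confirmed query results
path_confirmed <- "d:/github/techme/data-raw/data-tidy/hack-tianyan/ship/ship-confirmed.xlsx"

queryTianyan <- openxlsx::read.xlsx(path_confirmed) %>%
  as_tibble() %>%
  bind_rows(., queryTianyan) %>%            # newly confirmed entries first
  distinct(name_origin, .keep_all = TRUE)   # so they win over older rows

# store the updated dataset in the techme source tree,
# then rebuild and push the package
# usethis::use_data(queryTianyan, overwrite = TRUE)
```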
Match the province information.
The resulting dataset is named PubObsStation.
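As a rough illustration of these last two steps, the sketch below joins province information onto the combined table from the chunks above and stores the result as PubObsStation. It assumes the updated queryTianyan covers every institution and mirrors the first-institution matching used earlier; the released dataset may include additional tidying not shown here.

```r
# Sketch only: build PubObsStation by joining province info onto tbl_out.
# Assumes tbl_out from the chunks above and an updated queryTianyan.
require("tidyverse")
require("techme")

data("queryTianyan")

PubObsStation <- tbl_out %>%
  # match on the first listed institution, as in the chunk above
  mutate(institution_first = map_chr(
    institution, ~ unlist(str_split(.x, pattern = "、"))[[1]]
  )) %>%
  left_join(
    queryTianyan %>%
      select(name_origin, province) %>%
      rename(institution_first = "name_origin"),
    by = "institution_first"
  ) %>%
  select(-institution_first)

# usethis::use_data(PubObsStation, overwrite = TRUE)
```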