#----------------Scripting Wiki Pages----------------
# Scripting different tables from wiki pages about coronavirus
#----------------Functions----------------
`%>%` <- magrittr::`%>%`
#----------------US----------------
# Summarise table of cases in the US
# Using : https://en.wikipedia.org/wiki/2020_coronavirus_outbreak_in_the_United_States
url <- "https://en.wikipedia.org/wiki/2020_coronavirus_outbreak_in_the_United_States"
us_raw <- url %>%
xml2::read_html() %>%
rvest::html_node(xpath = '//*[@id="mw-content-text"]/div/table[6]') %>%
rvest::html_table(fill = TRUE,
header = TRUE)
# Setting the names
names(us_raw)[min(which(names(us_raw) == "Location"))] <- "county_city"
names(us_raw)[min(which(names(us_raw) == "Location"))] <- "state"
us_raw$date <- lubridate::mdy(us_raw$`Date announced`)
# Dropping details raw
us <- us_raw %>% dplyr::filter(!is.na(date)) %>%
dplyr::select(- `Date announced`) %>%
dplyr::select(case_no_temp = `Case no.`,
date,
status = Status,
cdc_origin_type = `CDC origin type`,
origin = Origin,
county_city,
state,
treatment_facility = `Treatment facility`,
sex = Sex,
age = Age) %>%
dplyr::mutate(case_no = strsplit(case_no_temp, split = "\\[") %>%
purrr::map(~.x[1]) %>%
as.numeric()) %>%
dplyr::select(-case_no_temp) %>%
dplyr::select(case_no, dplyr::everything())
head(us)
tail(us)
str(us)
#----------------South Korea----------------
# Summarise table of cases in the South Korea
# Using : https://en.wikipedia.org/wiki/2020_coronavirus_outbreak_in_South_Korea
`%>%` <- magrittr::`%>%`
url_sk <- "https://en.wikipedia.org/wiki/2020_coronavirus_outbreak_in_South_Korea"
sk_raw <- url_sk %>%
xml2::read_html() %>%
rvest::html_node(xpath = '//*[@id="mw-content-text"]/div/table[8]') %>%
rvest::html_table(fill = TRUE,
header = TRUE)
head(sk_raw)
sk_prov_map <- data.frame(province = c("Gyeonggi", "Gyeonggi", "Gyeonggi",
"Gangwon",
"Gyeongsang", "Gyeongsang", "Gyeongsang", "Gyeongsang","Gyeongsang",
"Chungcheong", "Chungcheong", "Chungcheong", "Chungcheong",
"Jeolla", "Jeolla", "Jeolla", "Jeolla"),
city = c("Incheon", "Seoul", "Gyeonggi",
"Gangwon",
"Gyeongbuk", "Daegu", "Gyeongnam", "Busan", "Ulsan",
"Chungbuk", "Sejong", "Daejeon", "Chungnam",
"Jeonbuk", "Gwangju", "Jeonnam", "Jeju"),
stringsAsFactors = FALSE)
sk_prov_map$province <- tolower(sk_prov_map$province)
sk_prov_map$city <- tolower(sk_prov_map$city)
sk_names <- c("date", "time",
"Incheon", "Seoul", "Gyeonggi",
"Gangwon",
"Gyeongbuk", "Daegu", "Gyeongnam", "Busan", "Ulsan",
"Chungbuk", "Sejong", "Daejeon", "Chungnam",
"Jeonbuk", "Gwangju", "Jeonnam", "Jeju",
"quarantine_station",
"confirmed_new", "confirmed_total",
"death_new", "death_total",
"tested_total", "tested_current",
"discharged_total",
"source")
sk_names <- sk_names %>% tolower()
sk_df <- sk_raw[-1,] %>% stats::setNames(sk_names) %>%
dplyr::select(-time, -source, quarantine_station) %>%
dplyr::mutate(date = lubridate::ymd(date)) %>%
dplyr::filter(!is.na(date))
head(sk_df)
tail(sk_df)
str(sk_df)
sk_df1 <- sk_df %>% dplyr::select(date)
# Removing brackets
for(i in 2:ncol(sk_df)){
print(i)
x <- ifelse(grepl(")", x = sk_df[,i]),strsplit(sk_df[, i], split = ")") %>% purrr::map_chr(~.x[2]), sk_df[, i] )
x <- ifelse(grepl("\\[", x = x),strsplit(x, split = "\\[") %>% purrr::map_chr(~.x[1]), x )
x <- ifelse(grepl(",", x = x), gsub(pattern = ",", replacement = "", x), x)
x <- gsub(pattern = ",", replacement = "", x)
sk_df1[[names(sk_df)[i]]] <- ifelse(is.na(as.numeric(x)), 0, x)
}
# View(sk_df1)
totals_sk <- c("confirmed_new", "confirmed_total",
"death_new", "death_total",
"tested_total", "tested_current",
"discharged_total")
sk_df2 <- sk_df1 %>%
tidyr::pivot_longer(cols = c(-date), names_to = "city") %>%
dplyr::mutate(cases = as.numeric(value)) %>%
dplyr::select(-value)
head(sk_df2)
sk_df3 <- sk_df2 %>% dplyr::filter(city %in% totals_sk) %>%
dplyr::group_by(date, city) %>%
dplyr::summarise(total = max(cases, na.rm = TRUE)) %>%
dplyr::ungroup()
head(sk_df3)
covid_south_korea <- sk_df2 %>%
dplyr::filter(!city %in% totals_sk) %>%
dplyr::group_by(date, city) %>%
dplyr::summarise(total = sum(cases, na.rm = TRUE)) %>%
dplyr::ungroup() %>%
dplyr::left_join(sk_prov_map, by = "city") %>%
dplyr::select(date, city, province, total) %>%
as.data.frame()
str(covid_south_korea)
View(covid_south_korea)
usethis::use_data(covid_south_korea, overwrite = TRUE)
write.csv(covid_south_korea, "/Users/ramikrispin/R/packages/coronavirus_csv/south_korea/covid_south_korea_long.csv", row.names = FALSE)
write.csv(sk_df1, "/Users/ramikrispin/R/packages/coronavirus_csv/south_korea/covid_south_korea_wide.csv", row.names = FALSE)
write.csv(sk_prov_map, "/Users/ramikrispin/R/packages/coronavirus_csv/south_korea/sk_city_prov_mapping.csv", row.names = FALSE)
#----------------Italy----------------
# Summarise table of cases in the Italy
# Using : https://en.wikipedia.org/wiki/2020_coronavirus_outbreak_in_Italy
# url_italy <- "https://en.wikipedia.org/wiki/2020_coronavirus_outbreak_in_Italy"
#
# italy_raw <- url_italy %>%
# xml2::read_html() %>%
# rvest::html_node(xpath = '//*[@id="mw-content-text"]/div/table[4]') %>%
# rvest::html_table(fill = TRUE,
# header = TRUE)
#
#
#
# names(italy_raw)
#
# # View(italy_raw)
# head(italy_raw)
#
# italy_region_mapping <- data.frame(area = c("North-West", "North-West", "North-West", "North-West",
# "North-East", "North-East", "North-East", "North-East", "North-East",
# "Center", "Center", "Center", "Center",
# "South","South", "South", "South", "South", "South",
# "Islands", "Islands"),
# sub_region = c("VDA", "LIG", "PIE", "LOM", "VEN", "TN", "BZ",
# "FVG", "EMR", "MAR", "TOS", "UMB", "LAZ", "ABR", "MOL",
# "CAM", "BAS", "PUG", "CAL", "SIC", "SAR"),
# province = c(),
# stringsAsFactors = FALSE)
# italy_names <- c("Date","VDA", "LIG", "PIE", "LOM", "VEN", "TN", "BZ",
# "FVG", "EMR", "MAR", "TOS", "UMB", "LAZ", "ABR", "MOL",
# "CAM", "BAS", "PUG", "CAL", "SIC", "SAR",
# "confirmed_new", "confirmed_total", "death_new", "death_total",
# "recovery_total", "tested_total",
# "refs", "notes")
#
#
# italy1 <- italy_raw[, which(!is.na(names(italy_raw)))] %>%
# stats::setNames(italy_names) %>%
# dplyr::mutate(date = lubridate::ymd(Date)) %>%
# dplyr::filter(!is.na(date)) %>%
# dplyr::select(date, dplyr::everything()) %>%
# dplyr::select(-refs, -Date, - notes)
#
#
#
# italy2 <- italy1 %>% dplyr::select(date)
# # Removing brackets
# for(i in 2:ncol(italy1)){
# x <- ifelse(grepl(")", x = italy1[,i]),strsplit(italy1[, i], split = ")") %>%
# purrr::map_chr(~.x[2]), italy1[, i] )
# x <- ifelse(grepl("\\[", x = x),strsplit(x, split = "\\[") %>%
# purrr::map_chr(~.x[1]), x )
# x <- ifelse(grepl(",", x = x), gsub(pattern = ",", replacement = "", x), x)
# x <- gsub(pattern = ",", replacement = "", x)
# italy2[[names(italy1)[i]]] <- as.numeric(x)
# }
#
# totals_italy <- c("confirmed_new", "confirmed_total",
# "death_new", "death_total",
# "recovery_total", "tested_total")
#
#
# head(italy2)
#
# italy3 <- italy2 %>%
# tidyr::pivot_longer(cols = c(-date), names_to = "sub_region") %>%
# dplyr::mutate(cases = as.numeric(value)) %>%
# dplyr::mutate(cases = ifelse(is.na(cases), 0, cases))
# head(italy3)
#
#
#
# italy4 <- italy3 %>% dplyr::filter(sub_region %in% totals_italy) %>%
# dplyr::group_by(date, sub_region) %>%
# dplyr::summarise(total = max(cases, na.rm = TRUE)) %>%
# dplyr::ungroup()
#
# head(italy4)
#
# covid_italy <- italy3 %>%
# dplyr::filter(!sub_region %in% totals_italy) %>%
# dplyr::group_by(date, sub_region) %>%
# dplyr::summarise(total = sum(cases, na.rm = TRUE)) %>%
# dplyr::ungroup() %>%
# dplyr::left_join(italy_region_mapping, by = "sub_region") %>%
# dplyr::select(date, region, sub_region, total) %>%
# as.data.frame()
#
# str(covid_italy)
# View(covid_italy)
#
# usethis::use_data(covid_italy, overwrite = TRUE)
#
# write.csv(covid_italy, "/Users/ramikrispin/R/packages/coronavirus_csv/italy/covid_italy_long.csv", row.names = FALSE)
# write.csv(italy2, "/Users/ramikrispin/R/packages/coronavirus_csv/italy/covid_italy_wide.csv", row.names = FALSE)
# write.csv(italy_region_mapping, "/Users/ramikrispin/R/packages/coronavirus_csv/italy/italy_region_mapping.csv", row.names = FALSE)
#
#----------------Iran----------------
# Summarise table of cases in the Iran
# Using : https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/Iran_medical_cases
url_iran <- "https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/Iran_medical_cases"
iran_raw <- url_iran %>%
xml2::read_html() %>%
rvest::html_node(xpath = '//*[@id="mw-content-text"]/div/table[2]') %>%
rvest::html_table(fill = TRUE,
header = TRUE)
iran_region_mapping <- data.frame(region = c(rep("Region 1", 7), rep("Region 2", 6),
rep("Region 3", 6), rep("Region 4", 6),
rep("Region 5", 6)) ,
province_abb = c("Qom", "Teh", "Maz", "Alb",
"Sem", "Gol", "Qaz", "Esf",
"Frs", "Hor", "Koh", "Cha",
"Bus","Gil", "Ard", "Azs",
"Azg", "Kur", "Zan", "Mar",
"Ham", "Khz", "Krs", "Lor",
"Ilm", "Khr", "Sis", "Yaz",
"Khs", "Ker", "Khn"),
province = c("Qom", "Tehran", "Mazandaran", "Alborz",
"Semnan", "Golestan", "Qazvin", "Esfahan",
"Fars", "Hormozgan", "Kohgiluyeh and Buyer Ahmad", "Chahar Mahall and Bakhtiari",
"Bushehr", "Gilan", "Ardebil", "East Azarbaijan",
"West Azarbaijan", "Kordestan", "Zanjan", "Markazi",
"Hamadan", "Khuzestan", "Kermanshah", "Lorestan",
"Ilam", "Razavi Khorasan", "Sistan and Baluchestan", "Yazd",
"South Khorasan", "Kerman", "North Khorasan"),
stringsAsFactors = FALSE) %>%
dplyr::arrange(province)
iran_region_mapping
iran_names <- iran_raw[1,] %>% as.character()
iran_names[33] <- "confirmed_new"
iran_names[34] <- "confirmed_total"
iran_names[35] <- "death_new"
iran_names[36] <- "death_total"
covid_iran <- iran_raw[-1, ] %>% stats::setNames(iran_names) %>%
dplyr::select(- Sources) %>%
dplyr::mutate(date = as.Date(Date)) %>%
dplyr::select(-Date, -confirmed_new, -confirmed_total, -death_new, -death_total) %>%
dplyr::filter(!is.na(date)) %>%
tidyr::pivot_longer(cols = -date, names_to = "province_abb") %>%
dplyr::mutate(cases = as.numeric(value)) %>%
dplyr::mutate(cases = ifelse(is.na(cases), 0, cases)) %>%
dplyr::select(-value) %>%
dplyr::left_join(iran_region_mapping , by = "province_abb") %>%
dplyr::select(date, region, province, cases) %>%
as.data.frame()
covid_iran_wide <- covid_iran %>% tidyr::pivot_wider(names_from = province, values_from = cases)
str(covid_iran)
head(covid_iran)
View(covid_iran)
usethis::use_data(covid_iran, overwrite = TRUE)
write.csv(covid_iran, "/Users/ramikrispin/R/packages/coronavirus_csv/iran/covid_iran_long.csv", row.names = FALSE)
write.csv(covid_iran_wide, "/Users/ramikrispin/R/packages/coronavirus_csv/iran/covid_iran_wide.csv", row.names = FALSE)
write.csv(iran_region_mapping, "/Users/ramikrispin/R/packages/coronavirus_csv/iran/iran_region_mapping.csv", row.names = FALSE)
#----------------Germany----------------
# Summarise table of cases in the Germany
# Using : https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_Germany
url_gr <- "https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_Germany"
gr_raw <- url_gr %>%
xml2::read_html() %>%
rvest::html_node(xpath = '//*[@id="mw-content-text"]/div/table[3]') %>%
rvest::html_table(fill = TRUE,
header = TRUE)
gr_states <- gr_raw[2:18,1]
dates <- lubridate::mdy(paste(names(gr_raw)[-1], gr_raw[1,-1], 2020, sep = "-"))
gr_df <- lapply(gr_states, function(i){
r <- NULL
r <- which(gr_raw$State == i)
df <- data.frame(dates = dates,
cases = as.character(gr_raw[r, -1]))
})
gr_names <- c("date_temp", gr_raw[1, 2:ncol(gr_raw)])
gr_raw2 <- gr_raw[-1, ] %>% as.data.frame() %>%
setNames(gr_names) %>%
dplyr::add_rownames(var = "month")
View(gr_raw2)
sk_prov_map <- data.frame(province = c("Gyeonggi", "Gyeonggi", "Gyeonggi",
"Gangwon",
"Gyeongsang", "Gyeongsang", "Gyeongsang", "Gyeongsang","Gyeongsang", "Gyeongsang",
"Chungcheong", "Chungcheong", "Chungcheong", "Chungcheong",
"Jeolla", "Jeolla", "Jeolla", "Jeolla"),
city = c("Incheon", "Seoul", "Gyeonggi",
"Gangwon",
"Daegu", "Gyeongbuk","Gyeongnam", "Gyeongsang", "Busan", "Ulsan",
"Chungbuk", "Chungnam", "Sejong", "Daejeon",
"Jeonbuk", "Jeonnam", "Gwangju", "Jeju"),
stringsAsFactors = FALSE)
sk_prov_map$province <- tolower(sk_prov_map$province)
sk_prov_map$city <- tolower(sk_prov_map$city)
sk_names <- sk_raw[1, ] %>% as.character() %>% tolower()
sk_names <- sk_names[- which(sk_names == "gyeongsang")]
sk_names[20] <- "confirmed_new"
sk_names[21] <- "confirmed_total"
sk_names[22] <- "death_new"
sk_names[23] <- "death_total"
sk_names[24] <- "tested_total"
sk_names <- c(sk_names, "source")
sk_df <- sk_raw[-1,] %>% stats::setNames(sk_names) %>% dplyr::select(-time, -source) %>%
dplyr::mutate(date = lubridate::ymd(date)) %>%
dplyr::filter(!is.na(date))
head(sk_df)
tail(sk_df)
sk <- sk_df %>% tidyr::pivot_longer(cols = c(-date), names_to = "city") %>%
dplyr::mutate(cases = strsplit(value, split = "\\[") %>%
purrr::map_chr(~.x[1]))
sk$tested <- gsub(",", "", sk$tested) %>% as.numeric
sk <- sk %>% dplyr::left_join(sk_prov_map, by = "city") %>%
dplyr::select(date, city, province, cases)
tail(sk, 20)
sk %>% dplyr::filter(city == "seoul")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.