# drop a trailing "/" so that e.g. "https://x.de/" and "https://x.de" are treated as the same link
equalize_links <- function(link){
  rm_last_char <- substring(link, first = nchar(link)) == "/"
  ifelse(rm_last_char, yes = substring(link, first = 1, last = nchar(link) - 1), no = link)
}
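# Minimal usage sketch (not part of the original code; the example URL is made up):
if (FALSE) {
  equalize_links("https://www.example.de/jobs/")  # -> "https://www.example.de/jobs"
  equalize_links("https://www.example.de/jobs")   # unchanged
}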
filter_links <- function(links, domain, parsed_links, filter_domain = FALSE){
  links$href <- sapply(links$href, equalize_links)
  # some links are reported like: "//www.url.de".
  # example (url <- dc("2f 2f 77 77 77 2e 73 61 74 75 72 6e 2e 64 65 2f 64 65 2f 63 61 74 65 67 6f 72 79 2f 5f 74 65 65 67 65 72 e4 74 65 2d 34 37 36 31 32 30 2e 68 74 6d 6c"))
  # relative links starting with "/" or "?" need the domain prepended
  needs_start <- substring(text = links$href, first = 1, last = 1) %in% c("/", "?") &
    !grepl(x = links$href, pattern = "http") &
    !grepl(x = links$href, pattern = "www.", fixed = TRUE)
  needs_start[is.na(needs_start)] <- FALSE
  links$href %<>%
    {ifelse(test = needs_start, yes = paste0("https://", domain, .), no = .)}
  # flag links that should be dropped: in-page anchors, empty hrefs,
  # links that were already parsed, and links pointing to files rather than html pages
  hash_link <- substr(links$href, 1, 1) == "#"
  is_empty <- !nchar(links$href)
  is_na <- is.na(links$href)
  alr_exist <- links$href %in% parsed_links$href
  is_html <- sapply(c("mailto:", "javascript:", ".tif", ".mp4", ".mp3", ".tiff", ".png", ".gif", ".jpeg", ".jpg", ".zip", ".pdf"),
                    FUN = grepl, x = tolower(links$href)) %>%
    as.matrix %>%
    rowSums %>%
    magrittr::not()
  # !is_na is deliberately not part of `keep`: NA hrefs are needed for clicking with selenium
  keep <- !hash_link & !is_empty & !alr_exist & is_html
  keep[is.na(keep)] <- TRUE
  links %<>%
    .[keep, ] %>%
    {.[!duplicated(.), ]}
  domains <- links$href %>%
    urltools::domain()
  # can't always strip "www." here - some sites only work with the www. prefix, and
  # urltools::domain("https://www.google.de") yields "www.google.de" while others come without www.
  # https://superuser.com/questions/453673/some-websites-dont-work-with-the-www-prefix#_=_
  # therefore the following step is intentionally not applied:
  # %>% gsub(pattern = "www.", replacement = "")
  # e.g. jobs.lidl.de should count as same_domain as lidl.de - therefore use grepl instead of "=="
  same_domain <- is.na(domains) | grepl(pattern = domain, x = domains)
  if(filter_domain) links %<>% .[same_domain, ]
  # relative links without a leading "/" (first char is a letter, no http/www) also need domain + "/" prepended
  not_matched <- grepl(substring(text = links$href, first = 1, last = 1), pattern = "[a-z]", perl = TRUE) &
    !grepl(links$href, pattern = "https://") &
    !grepl(links$href, pattern = "http://") &
    !grepl(links$href, pattern = "www.") &
    !is.na(links$href)
  links$href %<>%
    {ifelse(
      test = not_matched,
      yes = paste0("https://", domain, "/", .),
      no = .)
    }
  return(links)
}
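# Minimal usage sketch (not part of the original code; the column names follow what the
# function accesses above, the URLs and domain are made-up examples; requires magrittr
# to be attached for the pipes and urltools to be installed):
if (FALSE) {
  library(magrittr)
  links <- data.frame(
    href = c("/karriere/", "#top", "mailto:info@example.de", "https://jobs.example.de/stellen"),
    text = c("Karriere", "nach oben", "Kontakt", "Offene Stellen"),
    stringsAsFactors = FALSE
  )
  parsed_links <- data.frame(href = character(0), stringsAsFactors = FALSE)
  filter_links(links, domain = "example.de", parsed_links = parsed_links, filter_domain = TRUE)
  # -> keeps "https://example.de/karriere" and "https://jobs.example.de/stellen"
}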
sort_links <- function(links){
  if(!dim(links)[1]){
    warning("No links to sort: links are empty.")
    return(links)
  }
  # urls have to stay case sensitive (only the link text is lowercased),
  # see https://www.saturn.de/webapp/wcs/stores/servlet/MultiChannelAllJobsOverview.
  links$text <- tolower(links$text)
  direct_match <- paste(c("(?=.*jobs)(?=.*suche)", "(?=.*jobs)(?=.*suche)(?=.*page=)", "successfactors",
                          "all open positions", "jobportal", "sjobs.brassring", "Ergebnisse 1 – 25 von",
                          "stellenboerse", "vacancies", "current-vacancies", "Artikel pro Seite",
                          "1 – 10 of ", "bewerbungsportal", "myworkdayjobs"), collapse = "|")
  # todo: could add an explicit ordering here - "stellenangebote" before "über uns"
  prioritize <- c("stellenmarkt", "stellenboerse", "current-vacancies", "Explore Opportunities",
                  "recruiting.ultipro.com/", "offenepositionen", "vacancies", "bewerber", "jobfinder ",
                  "stellen suchen", "jobportal", "jobbörse", "jobboerse", "jobs", "job", "all-jobs",
                  "jobsuche", "offenestellen", "offene-stellen", "stellenangebote", "job offers",
                  "careers", "career", "karriere", "beruf", "über uns", "ueber uns", "ueber-uns",
                  "uber ", "über ", "ueber ", "all open positions", "brassring")
  de_prioritize <- c("impressum", "paypal", "nutzungsbedingungen", "kontakt", "standort", "veranstaltungen",
                     "newsletter", "datenschutz", "facebook", "instagram", "lpsnmedia.net",
                     "google.com/recaptcha", "usercentrics", "linkedin", "googletagmanager", "cookies",
                     "addthis.com", "xing.com", "youtube", "cookiebot.com", "google.com",
                     "youtube-nocookie", "twitter", "linkedin", "signup", "request-password", "checkpoint",
                     "signup", "wa.me", "vimeo", "google.de/maps", "google.de/intl", "fls.doubleclick.net")
  # hrefs that directly look like a job search / job portal page come first
  direct <- sapply(links$href, grepl, perl = TRUE, pattern = direct_match, USE.NAMES = FALSE) %>%
    which
  # count how often each prioritized keyword occurs in the href and in the link text
  href <- sapply(unique(prioritize), stringr::str_count, string = tolower(links$href))
  href[is.na(href)] <- 0
  text <- sapply(unique(prioritize), stringr::str_count, string = links$text)
  text[is.na(text)] <- 0
  # todo: only heuristic - keywords earlier in `prioritize` get quadratically larger weights
  dims <- dim(text)
  weights <- matrix(rep(rev(seq(dims[2])^2), dims[1]), nrow = dims[1], byrow = TRUE)
  all <- as.matrix(href + text)
  first <- as.matrix(all * weights) %>%
    rowSums(na.rm = TRUE) %>%
    {order(., decreasing = TRUE)[0:sum(. > 0)]}
  # same weighting scheme for keywords that de-prioritize a link (imprint, social media, trackers, ...)
  href <- sapply(unique(de_prioritize), stringr::str_count, string = tolower(links$href))
  text <- sapply(unique(de_prioritize), stringr::str_count, string = links$text)
  # todo: only heuristic so far
  dims <- dim(text)
  weights <- matrix(rep(rev(seq(dims[2])^2), dims[1]), nrow = dims[1], byrow = TRUE)
  all <- as.matrix(href + text)
  last <- as.matrix(all * weights) %>%
    rowSums(na.rm = TRUE) %>%
    {order(., decreasing = TRUE)[0:sum(. > 0)]}
  # final order: iframes first, then direct matches and prioritized links,
  # then everything unclassified, and de-prioritized links last
  iframe <- which(links$text == "iframe")
  iframe <- setdiff(iframe, last)
  first <- setdiff(first, last)
  order <- c(iframe, direct, first, setdiff(seq(links$href), c(first, last, direct)), last) %>%
    unique
  return(links[order, ])
}
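# Minimal usage sketch chaining both helpers (not part of the original code; the data
# frame, URLs and domain are made-up examples; requires magrittr and stringr):
if (FALSE) {
  library(magrittr)
  links <- data.frame(
    href = c("https://example.de/impressum", "https://example.de/karriere", "https://example.de/jobs/suche?page=1"),
    text = c("Impressum", "Karriere", "Jobsuche"),
    stringsAsFactors = FALSE
  )
  parsed_links <- data.frame(href = character(0), stringsAsFactors = FALSE)
  filter_links(links, domain = "example.de", parsed_links = parsed_links) %>%
    sort_links()
  # -> the jobs/suche and karriere links are ordered before the impressum link
}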