#' Generate a fetch list of URLs from the crawlDB
#'
#' @description
#' Queries the crawlDB for URLs matching the given parameters and writes the
#' resulting fetch list to a new SQLite database in the output directory.
#'
#' @param out_dir (Required) Output directory for this crawl.
#' @param work_dir (Required) Working directory for this crawl; must contain crawlDB.sqlite.
#' @param regExOut RegEx URL filter - omit links whose path contains these keywords.
#' @param regExIn RegEx URL filter - keep links whose path contains these keywords.
#' @param max_depth Maximum depth for selected URLs.
#' @param topN Select only the top N links.
#' @param external_site Logical. If FALSE, hosts outside the seed list will NOT be crawled.
#' @param max_urls_per_host Maximum number of URLs to generate per host.
#' @param crawl_delay Crawl delay for requests to the same host.
#' @param log_file Name of log file. If NULL, writes to stdout().
#' @param seeds_only Logical. If TRUE, generate a fetch list from seed URLs only.
#' @param min_score Minimum score for a URL to be selected.
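#' @examples
#' ## Minimal illustrative call (directory paths here are assumptions; both
#' ## should end with "/", and work_dir must already contain crawlDB.sqlite,
#' ## created by inject.R):
#' \dontrun{
#' generateR(out_dir = "~/crawl/output/",
#'           work_dir = "~/crawl/",
#'           regExIn = "news|sports",
#'           regExOut = "login",
#'           max_depth = 3,
#'           topN = 1000,
#'           max_urls_per_host = 10)
#' }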
#' @export
#'
generateR <- function(out_dir=NULL,
work_dir=NULL,
regExOut=NULL,
regExIn=NULL,
max_depth=NULL,
topN=NULL,
external_site=FALSE,
max_urls_per_host=10,
crawl_delay=NULL,
log_file=NULL,
seeds_only=FALSE,
min_score=0
){
log_con<-set_log_file(log_file)
tryCatch({
## Check required arguments before creating any output
if(is.null(work_dir)){stop('Work directory was not provided.')}
if(is.null(out_dir)){stop('Output directory was not provided.')}
if(!file.exists(paste0(work_dir,"crawlDB.sqlite"))){stop("Can't find crawlDB. Run inject.R to create it.")}
## create output directory for this fetch list
this_dir <- paste0(out_dir,"fetch_",gsub(":|-| ","",Sys.time()),"/")
dir.create(this_dir)
## connect to the crawlDB (created by inject.R)
crawlDB <- DBI::dbConnect(RSQLite::SQLite(), paste0(work_dir,"crawlDB.sqlite"))
today <- as.numeric(Sys.Date())
## build a WHERE-clause fragment selecting URLs due for crawling
## (currently just "crawled = 0"; crawl_int is accepted but not yet used)
get_next_crawl <- function(today, tbl,crawl_int=NULL){
if(is.null(crawl_int)) crawl_int<-100*365
if(!is.null(tbl)) tbl <- paste0(tbl,".")
paste0( tbl,'crawled = 0 ')
}
## filter in: keep URLs whose path contains any of the '|'-separated keywords
get_filtin <- function(regExIn, tbl){
if(is.null(regExIn)) return(NULL)
if(!is.null(tbl)) tbl <- paste0(tbl,".")
val <- paste0(" ",tbl,"path like ")
q <- strsplit(regExIn,'\\|')[[1]]
q <- paste0(val," '%",q,"%' ", collapse=' OR ')
q <- paste( " ( ", q, " ) ")
return(q)
}
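## Example (illustrative): get_filtin("news|sports", "t1") produces, whitespace
## aside, roughly:
##   " ( t1.path like '%news%' OR t1.path like '%sports%' ) "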
## filter out: drop URLs whose path contains any of the '|'-separated keywords
get_filtout <- function(regExOut, tbl){
if(is.null(regExOut)) return(NULL)
if(!is.null(tbl)) tbl <- paste0(tbl,".")
val <- paste0(" ",tbl,"path not like ")
q <- strsplit(regExOut,'\\|')[[1]]
q <- paste0(val," '%",q,"%' ", collapse=' AND ')
q <- paste( " ", q, " ")
return(q)
}
## per-host LIMIT fragment (currently unused; see get_limit below)
get_subquery<- function(max_urls_per_host, tbl=NULL){
if(is.null(max_urls_per_host)) return(NULL)
paste(" limit ",max_urls_per_host )
}
## max-depth condition for the WHERE clause
get_max_depth<- function(max_depth, tbl=NULL){
if(is.null(max_depth)) return(NULL)
paste(" and depth <= ",max_depth )
}
## LIMIT clause; called with topN below to cap the total number of URLs
get_limit <- function(max_urls_per_host, tbl=NULL){
if(is.null(max_urls_per_host)) return(NULL)
paste(" limit ",max_urls_per_host )
}
## build query
generate_query <- function(q="select * from linkDB ",
next_crawl=NULL,
max_depth=NULL,
fin=NULL,
fout=NULL,
seeds_only=FALSE,
subquery=NULL,
limit=NULL,
ext_site=external_site,
min_score=0
){
## if external sites not allowed
if(!ext_site){
q <-paste(q,'inner join (select t3.server from linkDB t3 where t3.is_seed=1) t2 on t1.server = t2.server')
}
## handle next crawl
q <-paste(q, 'where', next_crawl)
## handle max depth
q <-paste(q, max_depth)
## handle filter in and filter out
if(!is.null(fin) && !is.null(fout)){
q <- paste(q,'and ( ', fin, 'and', fout,' and t1.score >= ', min_score, ' or t1.depth = 0 )')
}else if(!is.null(fin)){
q <- paste(q,'and (', fin, ' and t1.score >= ', min_score, ' or t1.depth = 0 )')
}else if(!is.null(fout)){
q <- paste(q,'and (', fout,' and t1.score >= ', min_score, ' or t1.depth = 0 )')
}
## handle seeds only crawl
if(seeds_only){
q <- paste(q,'and t1.is_seed=1')
}
## if limit is given
if(!is.null(limit)){
q <- paste(q, ' ', limit)
}
return(q)
}
## base query: number URLs within each host (PARTITION BY server) so per-host
## limits can be applied later via the batch column
base_q <- paste0(
"SELECT ROW_NUMBER ()
OVER ( PARTITION BY t1.server ORDER BY score desc, depth asc ) batch, t1.*
FROM linkDB t1 ") # ORDER BY is_seed desc,
## create query to select urls with parameters
q <-generate_query(
q = base_q,
next_crawl=get_next_crawl(today, 't1'),
max_depth=get_max_depth(max_depth, 't1'),
fin=get_filtin(regExIn, 't1'),
fout=get_filtout(regExOut, 't1'),
seeds_only=seeds_only,
subquery=NULL,
limit=NULL,
ext_site=external_site,
min_score=min_score)
## create a temporary table for the fetch list using the query. order by
## score and depth to select most relevant.
q <- paste("create temporary table fetch_list as ",q,
" order by score desc, depth asc" )
st<-Sys.time()
writeLines(paste('GenerateR: Beginning generate query - ',st), con=log_con)
# creating fetch list
DBI::dbExecute(crawlDB,q)
## keep at most max_urls_per_host URLs per host (batch numbers rows within each host)
DBI::dbExecute(crawlDB,paste0("delete from fetch_list where batch > ",max_urls_per_host))
## cap the total number of URLs in the fetch list at topN
DBI::dbExecute(crawlDB,paste0(
"delete from fetch_list where url not in (
select url from fetch_list ",get_limit(topN), " )")) # order by is_seed desc, RANDOM()
## end time for query
et <- Sys.time()
## write log info
num <- DBI::dbGetQuery(crawlDB, "select count(*) from fetch_list")[[1]]
writeLines(paste('GenerateR: Finished generate query - ',et), con=log_con)
writeLines(paste('GenerateR: Found', num, 'urls.'), con=log_con)
writeLines(paste('GenerateR: Query time - ',round(as.numeric(et)-as.numeric(st)),'seconds.'), con=log_con)
## mark the selected URLs as crawled and record the crawl dates
DBI::dbExecute(crawlDB,
paste("update linkDB set crawled = 1",
",next_crawl = ", as.numeric(Sys.Date()),
",last_crawl = ", as.numeric(Sys.Date()),
"where url in ( select t2.url from fetch_list t2)"))
## create fetch_db, attach, and insert fetch_list table
fetchDB_fname <-path.expand(paste0(this_dir,"fetch_list.sqlite"))
DBI::dbExecute(crawlDB, paste0("attach database '",fetchDB_fname,"' as fetch_db;"))
DBI::dbExecute(crawlDB, "create table fetch_db.fetch_list(batch integer, depth integer ,url text, server text);")
DBI::dbExecute(crawlDB, "insert into fetch_db.fetch_list select batch, depth, url, server from fetch_list;")
DBI::dbExecute(crawlDB, "detach database fetch_db;")
## index the batch column for faster per-batch subsetting on large fetch lists
fetchDB <- DBI::dbConnect(RSQLite::SQLite(),fetchDB_fname )
DBI::dbExecute(fetchDB, "create index batch ON fetch_list(batch);")
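## Illustrative downstream read (assumption: a fetcher pulls one batch of
## per-host URLs at a time from the generated fetch_list.sqlite), e.g.:
##   DBI::dbGetQuery(fetchDB, "select url, server from fetch_list where batch = 1")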
## drop temp table fetch_list
DBI::dbExecute(crawlDB, " drop table fetch_list")
# ## query the final fetch list
# fetch_list<- DBI::dbGetQuery(crawlDB, paste0('select * from fetch_list'))
# fetch_list$RowNum <- NULL
#
# ## prep and save fetch list for output
# fetch_list <- create_fetch_list(fetch_list,crawl_delay)
#save(fetch_list,file=paste0(this_dir,'fetch_list.rda'))
#f_out<-gzfile(paste0(this_dir,'fetch_list.csv.gz'))
#write.csv(fetch_list, file=f_out)
},
error = function(e){
## log the error and return the condition object to the caller
writeLines(paste('GenerateR: ',e), con=log_con)
class(e) <- 'error'
e
},
finally = {
## disconnect only if the connections were actually created
if(exists("crawlDB", inherits=FALSE)) DBI::dbDisconnect(crawlDB)
if(exists("fetchDB", inherits=FALSE)) DBI::dbDisconnect(fetchDB)
if(inherits(log_con,"file")) close(log_con)
})
}