Install with:
```r
devtools::install_github("javierluraschi/sparkwarc")
```
The following example loads a very small subset of a WARC file from Common Crawl, a nonprofit 501(c)(3) organization that crawls the web and freely provides its archives and datasets to the public.
```r
library(sparkwarc)
library(sparklyr)
library(DBI)
library(dplyr)

sc <- spark_connect(master = "local")

spark_read_warc(sc, path = spark_warc_sample_path(), name = "WARC")
```
```{sql query-1, connection=sc, max.print=1}
SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<html', 0)) > 0
```
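The same count can also be retrieved from R through the DBI interface that sparklyr connections implement, which is why DBI is loaded above. A minimal sketch mirroring the SQL chunk:

```r
# Run the same query over the registered WARC table via DBI;
# returns a one-row data frame with the page count
dbGetQuery(sc, "
  SELECT count(value) AS pages
  FROM WARC
  WHERE length(regexp_extract(value, '<html', 0)) > 0
")
```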
```r
cc_regex <- function(ops) {
  ops %>%
    filter(regval != "") %>%
    group_by(regval) %>%
    summarize(count = n()) %>%
    arrange(desc(count)) %>%
    head(100)
}

cc_stats <- function(regex) {
  tbl(sc, "warc") %>%
    transmute(regval = regexp_extract(value, regex, 1)) %>%
    cc_regex()
}
```
cc_stats("http-equiv=\"Content-Language\" content=\"(.*)\"")
cc_stats("<script .*src=\".*/(.+)\".*")
cc_stats("<([a-zA-Z]+)>")
cc_stats(" ([a-zA-Z]{5,10}) ")
cc_stats("<meta .*keywords.*content=\"([^,\"]+).*")
cc_stats("<script .*src=\".*/([^/]+.js)\".*")
```r
spark_disconnect(sc)
```
```r
warc_big <- normalizePath("~/cc.warc.gz")            # name a 5GB WARC file
if (!file.exists(warc_big))                          # if the file does not exist,
  download.file(                                     # download it by
    gsub("s3n://commoncrawl/",                       # mapping the S3 bucket URL
         "https://commoncrawl.s3.amazonaws.com/",    # to a downloadable URL
         sparkwarc::cc_warc(1)),                     # for the first archive file
    warc_big)
```
```r
config <- spark_config()
config[["spark.memory.fraction"]] <- "0.9"
config[["spark.executor.memory"]] <- "10G"
config[["sparklyr.shell.driver-memory"]] <- "10G"

sc <- spark_connect(master = "local", config = config)
```
```r
spark_read_warc(
  sc,
  "warc",
  warc_big,
  repartition = 8  # split into 8 partitions to parallelize across cores
)
```
```{sql query-8, connection=sc, max.print=1}
SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<([a-z]+)>', 0)) > 0
```
```{sql query-9, connection=sc, max.print=1}
SELECT count(value)
FROM WARC
WHERE length(regexp_extract(value, '<html', 0)) > 0
```
cc_stats("http-equiv=\"Content-Language\" content=\"([^\"]*)\"")
cc_stats("WARC-Target-URI: http://([^/]+)/.*")
cc_stats("<([a-zA-Z]+)>")
cc_stats("<meta .*keywords.*content=\"([a-zA-Z0-9]+).*")
```r
spark_disconnect(sc)
```
Running sparklyr on Amazon EMR, one can configure a cluster and load about 5GB of data with:
sc <- spark_connect(master = "yarn-client") spark_read_warc(sc, "warc", cc_warc(1, 1)) tbl(sc, "warc") %>% summarize(n = n()) spark_disconnect_all()
To read the first 200 files, or about 1TB of data, first scale up the cluster, and consider maximizing resource allocation with the following EMR configuration:
[ { "Classification": "spark", "Properties": { "maximizeResourceAllocation": "true" } } ]
Then load the [1, 200] file range with:
sc <- spark_connect(master = "yarn-client") spark_read_warc(sc, "warc", cc_warc(1, 200)) tbl(sc, "warc") %>% summarize(n = n()) spark_disconnect_all()
To query the entire crawl, roughly 1PB of data, a custom script would be needed to load all the WARC files, as sketched below.
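A minimal sketch of such a script, assuming an active connection `sc` and that `spark_read_warc()` accepts sparklyr's usual `overwrite` argument: the crawl is processed in batches of 200 files, each batch replaces the previous table, and only the aggregated counts are kept. The total file count below is a hypothetical placeholder.

```r
total_files <- 64000  # hypothetical; take the real count from the crawl's warc.paths manifest
batch_size  <- 200

counts <- sapply(seq(1, total_files, by = batch_size), function(first) {
  last <- min(first + batch_size - 1, total_files)
  # Replace the previous batch's table with the next range of WARC files
  spark_read_warc(sc, "warc", cc_warc(first, last), overwrite = TRUE)
  # Keep only the aggregate for this batch
  tbl(sc, "warc") %>% summarize(n = n()) %>% collect() %>% pull(n)
})

sum(counts)
```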