## Tools for working with PatentsView.org
## ================================================================================
## ================================================================================
## Utilites functions
## ================================================================================
is.0 <- function(x) length(x) == 0
## Calculates number of lines a file has
get.file.nlines <- function(file.name, dir.path = getwd()) {
file.name %>%
file.path(dir.path, .) %>%
normalizePath %>%
paste("grep -c $", .) %>%
system(intern = TRUE) %>%
as.numeric
}
## Tests
## "patview-tsv/nber.category.tsv" %>% get.file.nlines
## --------------------------------------------------------------------------------
#' Downloads list of files from PatentsView.org that are available for download
#'
#' @param patview.files.list.path A path to the html page with the table that contains the list of files to download. Will try to download it if it not founl. The default is in R working directury "patview.files.list.html"
#' @return Saved file path.
#' @export
patview.download.files.list <- function(patview.files.list.path = file.path(getwd(), "patview.files.list.html")) {
if(!file.exists(patview.files.list.path)) {
download.file("http://www.patentsview.org/download/"
, destfile = patview.files.list.path)
message("File downloaded - ", patview.files.list.path)
return(patview.files.list.path)
} else message("File exists. Delete it if you what a new one or set new name")
}
## Tests
## patview.download.files.list()
## --------------------------------------------------------------------------------
#' Gets list of files from PatentsView.org that are available for download
#'
#' @param patview.files.list.path A path to the html page with the table that contains the list of files to download. Will try to download it if it not founl. The default is in R working directury "patview.files.list.html"
#' @return Saved file path.
#' @import magrittr XML
#' @export
patview.get.files.list <- function(patview.files.list.path =
file.path(getwd(), "patview.files.list.html")) {
if(!file.exists(patview.files.list.path))
patview.files.list.path %<>% patview.download.files.list
if(file.exists(patview.files.list.path)) {
cbind(patview.files.list.path %>%
htmltab %>%
set_names(names(.) %>% make.names %>% tolower)
, url = patview.files.list.path %>%
htmltab::htmltab(which = 1, bodyFun = "getHTMLLinks") %>%
set_names(names(.) %>%
make.names %>%
tolower) %>%
extract2("table.name")
, stringsAsFactors = FALSE) %>%
dplyr::transmute(name = stringr::str_extract(table.name, "^[a-zA-Z_]+")
, url
, size = stringr::str_replace(table.name, "^[a-zA-Z_]+", "")
, description
, rows = x..of.rows
, origin) %>%
return()
} else {
message("Can't find the file with list of files and their urls.")
return()
}
}
## Tests
## patview.get.files.list()
## ================================================================================
## Main functions
## ================================================================================
## --------------------------------------------------------------------------------
#' Downloads PatentsView.org files
#'
#' @param url URL of the file
#' @param dest.file Safe as name. Default is the same as it is in the url.
#' @param dest.dir Where to save. Default to the working directory patview-data-XXXXXXXX-zip, where XXXXXXXX is the verstion (i.e., date) of files automatically detected from the url (assumes certain form of url)
#' @return Saved file path.
#' @import magrittr stringr
#' @export
patview.download.file <- function(url
, dest.file = character(0)
, dest.dir = character(0)) {
## Assumes certain form of the link
patview.data.version <- url %>%
str_extract("\\d{8}") %>%
ifelse(is.na(.), "", .)
patview.data.name <- url %>% str_extract("[^/]+$")
if(dest.file %>% is.0) {
dest.file <- patview.data.name %>%
str_replace(patview.data.version, "") %>%
str_replace_all("[_-]+", ".")
}
if(dest.dir %>% is.0) {
dest.dir <- "patview-zip" %>%
file.path(getwd(), .)
}
if(!dir.exists(dest.dir)) dir.create(dest.dir)
dest.file.name <- paste0("patview."
, patview.data.version, "."
, dest.file) %>%
str_replace_all("\\.+", ".")
dest.file.path <- dest.file.name %>%
file.path(dest.dir, .)
if(dest.file.path %>% file.exists) {
message("File '", dest.file, "' is already exist. Exiting.")
return() %>% invisible
} else {
download.file(url
, destfile = dest.file.path
, method = "curl")
message("File downloaded - ", dest.file)
return(dest.file.name)
}
}
## Tests
## "http://www.patentsview.org/download/"
## "http://www.patentsview.org/data/20171226/nber_category.tsv.zip" %>%
## patview.download.file
## "http://s3.amazonaws.com/data.patentsview.org/20181127/download/cpc_subsection.tsv.zip" %>% patview.download.file
## --------------------------------------------------------------------------------
#' Unzips (PatentsView.org bulk data) files.
#'
#' It also renames the files ("_" -> ".")
#' @param zipfile File name to unzip.
#' @param zipdir Default is in working directory "patview-zip"
#' @param exdir A path where to extract zip file. Default is in working directory "patview-tsv"
#' @return Unzipped file path.
#' @import magrittr stringr
#' @export
patview.unzip.file <- function(zipfile
, zipdir = file.path(getwd(), "patview-zip")
, exdir = file.path(getwd(), "patview-tsv")) {
if(!dir.exists(exdir)) dir.create(exdir)
if(str_detect(zipfile, exdir %>% list.files) %>% any) {
message("Seems lile file ", zipfile, " is already extracted. Exiting.")
} else {
zipfile %>%
basename %>%
file.path(zipdir,.) %>%
unzip(exdir = exdir, junkpaths =TRUE) %T>%
file.rename(str_replace_all(.,"_", ".")) %>%
str_replace_all("_", ".") %>%
return
}
}
## Tests
## "patview-20171226-nber.category.tsv.zip" %>% patview.unzip.file
## "patview.20181127.cpc.subsection.tsv.zip" %>% patview.unzip.file
## --------------------------------------------------------------------------------
#' Reads PatentsView .tsv files and saves them to .rds (also optional in batches)
#'
#' @param file File name (.tsv is expected)
#' @param dir Directory where the file is. Default is in working directory "patview-tsv"
#' @param dir.rds Where to save. Default is in working directory "patview-rds"
#' @param batch.lines How many lines is to read in one batch (10^7 is recomended). The default is 0, meanning that reading will be done in one batch as a single file.
#' @param file.lines Length of the .tsv file. The default is 0. If is is not changed and batch.lines is specified then it will try to calculate it with grep.
#' @param compress.rds Should .rds be compressed. Default is FALSE
#' @param return.table Returns table if TRUE. Default is FALSE in which case returns list of saved files. Works only for batch.lines = 0
#' @return Saved file(s) path.
#' @import magrittr stringr data.table
#' @export
patview.save.rds <- function(file
, dir = file.path(getwd(), "patview-tsv")
, dir.rds = file.path(getwd(), "patview-rds")
, batch.lines = 0
, file.lines = 0
, return.table = FALSE
, compress.rds = FALSE) {
if(!dir.exists(dir.rds)) dir.create(dir.rds)
if(batch.lines == 0) {
file.rds.path <- file %>%
basename %>%
str_replace("\\.tsv$", "") %>%
paste0(".rds") %>%
file.path(dir.rds, .)
if(file.rds.path %>% file.exists) {
message("File ", file, " exists. Delete it if you want to replace.")
return()
}
field.names <-
file.path(dir, file) %>%
fread(nrows = 1
, header = FALSE) %>%
make.names
tab <- file.path(dir, file) %>%
fread(showProgress = TRUE
, strip.white = FALSE
, quote = ""
, sep = "\t"
, stringsAsFactors = FALSE
, colClasses = rep("character", length(field.names)))
tab %>% saveRDS(file = file.rds.path, compress = compress.rds)
message("File saved!")
if(return.table) return(tab)
else return(file.rds.path)
} else {
if(file.lines == 0) {
message("Counting lines in the input file...")
file.lines <- get.file.nlines(file, dir)
message("The file '", file, "' has - ", file.lines, " lines.")
}
batch.file.format <- paste0("%0", nchar(file.lines), "d")
## Set start read rows for fread
rows.skip <- seq(from = 1
, to = file.lines
, by = batch.lines)
rows.read <- rows.skip[-1] %>%
c(file.lines) %>%
'-'(rows.skip)
field.names <-
file.path(dir, file) %>%
fread(nrows = 1
, header = FALSE) %>%
make.names
sapply(1:length(rows.read), function(i) {
## extract batch
message("* Reading lines from ", rows.skip[i])
started <- Sys.time()
file.rds.path <- file %>%
str_replace("\\.tsv$", "") %>%
paste0("-"
, sprintf(batch.file.format, rows.skip[i]), "-" # add padding
, sprintf(batch.file.format, rows.skip[i] + rows.read[i] - 1)
, ".rds") %>%
file.path(dir.rds, .)
if(file.rds.path %>% file.exists) {
message("File exists. Delete it if you want to replace.")
} else {
message(" - Started: ", date())
file.path(dir, file) %>%
fread(nrows = rows.read[i]
, header = FALSE
, skip = rows.skip[i]
, showProgress = TRUE
, strip.white = FALSE
, quote = ""
, sep = "\t"
, stringsAsFactors = FALSE
, colClasses = rep("character", length(field.names))) %>%
set_names(field.names) %>%
saveRDS(file.rds.path)
gc()
message(" - Done! (in ", as.numeric(Sys.time() - started) %>% round, " minutes)")
}
return(file.rds.path)
}) %>% return
}
}
## Tests
## "nber.category.tsv" %>% patview.save.rds
## "nber.category.tsv" %>% patview.save.rds(batch.lines = 2)
## readRDS("patview-rds/nber.category-4-5.rds")
## "cpc.subsection.tsv" %>% patview.save.rds(return.table = TRUE)
## --------------------------------------------------------------------------------
#' Deploys PatentsView dataset. Needs lot of disk space and memory
#'
#' @return files path
#' @import magrittr
#' @export
patview.full.deploy <- function() {
patview.download.files.list() %>%
message("List of PatView bulk files downloaded!") %T>%
patview.get.files.list %>%
extract2("url") %>%
pblapply(patview.download.file) %>%
message("All PatView bulk files downloaded!") %T>%
pblapply(patview.unzip.file) %>%
message("All PatView bulk files Unzipped!") %T>%
message("Reading files....") %T>%
pblapply(patview.save.rds)
}
## --------------------------------------------------------------------------------
#' Filter tables of PatentsView.org bulk data
#'
#' @description
#' Similar to dplyr::filter but for tables of PatentsView.org bulk data saved in multiple .rds files
#' @param file.dir A path to directory with .rds files. Default is in working directory "patview-rds"
#' @param file.pattern A pattern for getting a file or a set of files (data batches)
#' @param ... A filtering conditions to fetch certain rows. (See dplyr::filter)
#' @return A data.table with a subset the data.
#' @import pbapply magrittr data.table
#' @export
patview.filter <- function(file.pattern, ...
, file.dir = file.path(getwd(), "patview-rds")
, pbapply = TRUE
, cols = character(0)) {
if(pbapply) {
file.dir %>%
file.path(list.files(., pattern = file.pattern)) %>%
pblapply(function(file.path)
file.path %>%
readRDS %>%
dplyr::filter(...) %>%
dplyr::select(if(cols %>% is.0) everything() else cols)) %>%
rbindlist %>%
return
} else {
file.dir %>%
file.path(list.files(., pattern = file.pattern)) %>%
lapply(function(file.path)
file.path %>%
readRDS %>%
dplyr::filter(...) %>%
dplyr::select(if(cols %>% is.0) everything() else cols)) %>%
rbindlist %>%
return
}
}
## --------------------------------------------------------------------------------
## Tests
## "nber.category.rds" %>% patview.filter(TRUE)
## "nber.category.rds" %>% patview.filter(id == 4)
## "nber.category" %>% patview.filter(str_detect(title, "m"))
## "nber.category.rds" %>% patview.filter(TRUE, select.fields = c("title", "id"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.