# Synonym table: one "original|replacement" pair per line.
# Whitespace inside a pair is significant: "neural net " carries a trailing
# space (add a space after "net"), and some replacements start with a space.
custom_synonyms <- "2D|2-D
3D|3-D
4D|4-D
algorithms|algorithm
big data|big-data
bottom-hole|bottomhole
cased hole|cased-hole
data driven|data-driven
deep-water|deepwater
deep water|deepwater
feedforward|feed-forward
fracture| frack
fracturing| frack
fractures| frack
fractured| frack
fracks|frack
frackd|frack
modelling|modeling
multi phase|multiphase
multi-phase|multiphase
neural net |neural network
neural nets|neural network
neural-network|neural network
neural networks|neural network
pressures|pressure
real time|real-time
reservoirs|reservoir
systems|system
time lapse|time-lapse
underbalanced drilling|UBD
underbalance drilling|UBD
two phase|two-phase
well-head|wellhead
"

# read text table and split rows at the line break: one row per pair, in V1
custom_synonyms <- read.table(
  text = custom_synonyms,
  header = FALSE,
  sep = "\n",
  stringsAsFactors = FALSE
)

# split the single column at the literal delimiter "|" into columns X1
# (original term) and X2 (replacement)
custom_synonyms <- data.frame(
  do.call("rbind", strsplit(custom_synonyms$V1, split = "|", fixed = TRUE)),
  stringsAsFactors = FALSE
)
custom_synonyms
# save custom words to .rda file in package ./data
pkg_data_dir <- system.file("data", package = "petro.One")
# system.file() returns "" when the package or the folder cannot be found;
# without this guard the file would be written to "/synonyms.rda".
if (!nzchar(pkg_data_dir)) {
  stop("folder 'data' of package 'petro.One' was not found", call. = FALSE)
}
synonyms_rda <- file.path(pkg_data_dir, "synonyms.rda")
save(custom_synonyms, file = synonyms_rda)
# save custom words to .rda file in project ./data
# NOTE(review): PROJHOME is not defined in this file; presumably it holds the
# project root folder -- confirm where it is set.
synonyms_rda <- file.path(PROJHOME, "data", "synonyms.rda")
save(custom_synonyms, file = synonyms_rda)
# Unfinished helper: validates `where`, but no saving is implemented yet.
#
# where: destination selector; only the value "project" is tested for, and
#        its branch is still empty.
# Returns NULL. Raises an error when `where` is not supplied.
saveTo_data <- function(where = NULL) {
  if (is.null(where)) {
    stop("'where' must be supplied", call. = FALSE)
  }
  # identical() keeps the test scalar-safe for NA or longer vectors
  if (identical(where, "project")) {
    # TODO: not implemented
  }
}
# write to a text file with a bar separated synonyms (installed package copy)
pkg_extdata_dir <- system.file("extdata", package = "petro.One")
# system.file() returns "" when the package or the folder cannot be found;
# without this guard the file would be written to "/synonyms.txt".
if (!nzchar(pkg_extdata_dir)) {
  stop("folder 'extdata' of package 'petro.One' was not found", call. = FALSE)
}
synonyms_txt <- file.path(pkg_extdata_dir, "synonyms.txt")
# no header row and no quoting: each line is written as original|replacement
write.table(
  custom_synonyms,
  file = synonyms_txt,
  sep = "|",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
# write to a text file with a bar separated synonyms (project copy)
# NOTE(review): PROJHOME is not defined in this file; presumably it holds the
# project root folder -- confirm where it is set.
synonyms_txt <- file.path(PROJHOME, "inst", "extdata", "synonyms.txt")
synonyms_txt
# no header row and no quoting: each line is written as original|replacement
write.table(
  custom_synonyms,
  file = synonyms_txt,
  sep = "|",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
# NOTE(review): the package is referred to as "petro.One" in every
# system.file() call of this file, so the same name is attached here
# (the original line read `library(petroOne)`) -- confirm the installed name.
library(petro.One)
library(tm)

# read the three result pages and stack them into one data frame
p1 <- onepetro_page_to_dataframe("1000_conference.html")
p2 <- onepetro_page_to_dataframe("2000_conference.html")
p3 <- onepetro_page_to_dataframe("3000_conference.html")
nn_papers <- rbind(p1, p2, p3)

# create corpus from the paper titles and convert it to lower case
vdocsSyn <- VCorpus(VectorSource(nn_papers$title_data))
vdocsSyn <- tm_map(vdocsSyn, content_transformer(tolower))
# enhanced replaceBy including the dataframe to use as a replacement
#
# x:  character vector in which the replacements are made.
# df: data frame of synonyms. Columns X1 (text to find) and X2 (replacement)
#     are used when both are present; otherwise the first two columns are
#     used, so a table whose columns carry other names (for example
#     "original"/"replace_by") works as well.
# Returns x after applying every row of df in order, as literal (non-regex)
# substitutions.
replaceBy <- function(x, df) {
  use_named <- all(c("X1", "X2") %in% names(df))
  find <- as.character(if (use_named) df[["X1"]] else df[[1L]])
  swap <- as.character(if (use_named) df[["X2"]] else df[[2L]])
  for (k in seq_along(find)) {
    x <- gsub(find[[k]], swap[[k]], x, fixed = TRUE)
  }
  x
}

# apply the synonym replacements to every document of the corpus
vdocs_proc <- tm_map(
  vdocsSyn,
  content_transformer(function(x) replaceBy(x, custom_synonyms))
)
# For every synonym pattern, print how many titles contain it and collect the
# indices of the matching titles in vcum.
# Patterns are matched literally (fixed = TRUE), the same way replaceBy()
# applies them.
hits <- vector("list", nrow(custom_synonyms))
for (ix in seq_len(nrow(custom_synonyms))) {
  pattern <- custom_synonyms$X1[ix]
  v <- grep(pattern, nn_papers$title_data, fixed = TRUE)
  cat(pattern, length(v), "\n")
  hits[[ix]] <- v
}
# flatten once instead of growing the vector inside the loop
vcum <- unlist(hits)
# vcum
corpus_range(vdocs_proc, vcum)
# show the processed documents at the collected match indices
corpus_range(corp = vdocs_proc, vec = vcum)
# build the term-document matrix of the corpus
# NOTE(review): `vdocs` is not created in any chunk visible here (only
# `vdocsSyn` and `vdocs_proc` are) -- confirm which corpus is intended.
tdm <- tm::TermDocumentMatrix(x = vdocs)
# inspect the titles that contain the first synonym pattern
pattern <- custom_synonyms$X1[1]
# literal match (fixed = TRUE), the same way replaceBy() applies the pattern
v1 <- grep(pattern, nn_papers$title_data, fixed = TRUE)
v1
grep(pattern, nn_papers$title_data, fixed = TRUE, value = TRUE)
# inspect the titles that contain the second synonym pattern
pattern <- custom_synonyms$X1[2]
# shQuote() is used for display only: searching with the quoted string would
# require the quote characters themselves to appear in the title
shQuote(pattern)
# literal match (fixed = TRUE), the same way replaceBy() applies the pattern
v2 <- grep(pattern, nn_papers$title_data, fixed = TRUE)
v2
grep(pattern, nn_papers$title_data, fixed = TRUE, value = TRUE)
# Collect the content of selected corpus documents in a data frame.
#
# corp: object whose elements are reached with corp[[i]] and expose a
#       `content` field.
# vec:  indices of the documents to extract; may be empty or NULL.
# Returns a data frame with columns corp.num (the index) and corp.content
# (the document content); an empty data frame when vec has no elements.
corpus_range <- function(corp, vec) {
  if (length(vec) == 0L) {
    return(data.frame())
  }
  # build all pieces first and bind once, instead of growing a data frame
  # with rbind() on every iteration
  pieces <- lapply(vec, function(i) {
    data.frame(corp.num = i, corp.content = corp[[i]]$content)
  })
  do.call(rbind, pieces)
}
# print the processed content of the documents matched by the first pattern
# (a hand-picked vec <- c(255, 355, 387, 472) used to be assigned first, but
# it was overwritten immediately and never used)
vec <- v1
x <- vdocs_proc
for (i in vec) {
  cat(i, x[[i]]$content, "\n")
}
# inspect the titles that contain the first synonym pattern
pattern <- custom_synonyms$X1[1]
# literal match (fixed = TRUE), the same way replaceBy() applies the pattern
v1 <- grep(pattern, nn_papers$title_data, fixed = TRUE)
v1
grep(pattern, nn_papers$title_data, fixed = TRUE, value = TRUE)
# show the processed documents at the collected match indices
corpus_range(corp = vdocs_proc, vec = vcum)
# Read the bar-separated synonyms file shipped in extdata of petro.One.
#
# Returns a data frame with character columns `original` and `replace_by`.
# Raises an error when the file cannot be located in the installed package.
load_synonyms <- function() {
  synfile <- system.file("extdata", "synonyms.txt", package = "petro.One")
  print(synfile)
  # system.file() returns "" when nothing is found, and read.table(file = "")
  # would then wait for input on stdin
  if (!nzchar(synfile)) {
    stop("'extdata/synonyms.txt' not found in package 'petro.One'",
         call. = FALSE)
  }
  # The file is written without a header row and without quoting, so it is
  # read back the same way; header = TRUE would consume the first synonym
  # pair as column names.
  custom_synonyms <- utils::read.table(
    file = synfile,
    header = FALSE,
    sep = "|",
    quote = "",
    comment.char = "",
    colClasses = "character",
    stringsAsFactors = FALSE
  )
  names(custom_synonyms) <- c("original", "replace_by")
  custom_synonyms
}

load_synonyms()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.