#' Create a database of text and metadata to feed into make_model
#'
#' @param xmlfolder string input folder with text files to run. Typically contains either XML files
#' from the MJP or files cleaned with stripXML
#' @param date_vec vector vector of dates, usually from strip_dates
#' @param wordsToRemove vector a vector of words to remove from the corpus; an initial stopword list
#' @param stemDoc logical if TRUE, runs SnowballC's stemDocument function
#' @param pattern string pattern of the endings of the files of xmlfolder; defaults to
#' '.tei.xml', but if xmlfolder is taken from stripXML, should be '.txt'
#' @param journalVec vector a vector of the names of journals to be included.
#' possibile examples are "Blast", "Egoist", "Poetry Magazine", "Freewoman", "NewFreewoman"
#' @param yearRangeRule string the rule specifying which years are to be included. Can take values like
#' "> 1900", "==1919", etc.
#' @return a data.frame of text for running with MALLET
#' @seealso \code{\link{strip_dates}} and \code{\link{stripXML}} which this depends on, as well as
#' \code{\link{make_model}}, which takes the data.frame created by this function as input
#' @export
#' @examples
#' dataframe <- stripped_xmlData
# this function runs some of the snowballC and tm corpus prep functions so that we can
# feed into MALLET
# takes as input the folder where the xml files are stored, and the vector of dates stripped from
# those files
prep_corpus <- function(xmlfolder, date_vec = date_vec, wordsToRemove=NULL, stemDoc=FALSE, pattern = '.tei.xml', journalVec = NULL, yearRangeRule = NULL){
library(tm);
source <- DirSource(xmlfolder)
xml_words <- c("ab", "div", "npb", "abthe", "tei", "</body>", "<body>", "</text>", "<text>", "TEI",
"<ab>", "</ab>", "type=", "teiHeader", "fileDesc", "titleStmt", "publicationStmt",
"</title>", "<title>", "idno", "n=", "sourceDesc", "</p>", "<p>", "npb", "pb")
wordsToRemove <- unlist(wordsToRemove)
wordsToRemove <- unname(wordsToRemove)
wordsToRemove <- as.vector(wordsToRemove)
# not used right now
# xml_words <- c("filedesc", "titlestmt", "titleblast", "issu", "titl", "titlestmt",
# "egoisttitl", "titleblasttitl", "publicationstmt", "idno", "typecdi",
# "nmojp", "typemetsid", "sourcedesc", "platerp", "titleblasttitle",
# "platerp", "div", "tei", "teihead", "text", "front", "pb", "nfoutsidecov",
# "teiheader", "nfoutsidecover", "nfrontinsidecover", "titlethe", "egoisttitle",
# "pan", "electronic", "transcription", "modernist", "journals", "project", "ocrd",
# "taggedpdfteixslp", "pan", "titlethe", "egoisttitle", "insert", "mojp", "id",
# "transcription", "titlepage", "titlepart", "th", "blast", "edited"
# )
# # these either
# journal_words <- c("joyce", "dedalus", "bloom", "rouveyr", "lg", "pthe", "everi",
# "large", "obvious", "possibly", "inde", "pwe", "deasi", "nd", "ab",
# "mme", "thee", "little", "realize", "post", "section", "yang", "quit",
# "top", "mere", "note", "definit", "large", "sir", "perhap", "hot", "arghol",
# "notic", "yo", "jean", "tabl", "juli", "section", "di", "fanett", "net",
# "armi", "becom", "miss", "alreadi", "dolmetsch", "decemb", "dun", "roll",
# "stood", "told", "something", "found", "develop", "meet", "relate",
# "berkeley", "richard", "fraulein", "marsden", "effect", "appear",
# "dont", "thi", "thou", "alway", "cran", "set", "chapter", "comm",
# "pankhurst", "tri", "pa", "leave", "something", "stand", "direct",
# "phe", "german", "english", "abdiv", "pascas", "freewoman",
# "bitzenko", "februari", "januari", "els", "blenner", "pi", "yer",
# "novemb", "alphons", "karo", "mean", "cant", "typearticl", "op",
# "miriam", "conme", "sawat", "attent", "imit", "mackenzi", "laddhako",
# "weining",
# "em", "ess", "vo", "hobson", "roddi", "florence", "drysdal",
# "cours", "norman", "item", "pari", "diomed", "noth", "howev",
# "cunningham", "debussi", "edward", "paracelsus",
# "ibsen", "kreisler", "starestski", "les", "qui",
# "que", "des","plus", "pour", "est", "pas", "like",
# "may", "dans", "une", "upon", "esse", "can",
# "nos", "avec", "hir", "january", "february", "
# march", "april", "june", "july", "august", "september",
# "october", "november", "december", "schlaf", "maria",
# "james", "must", "nous", "sur", "tout", "vous",
# "supplement", "comme", "ces", "bien", "aux", "mon",
# "ma", "jamess", "conmee", "felt", "twelve", "two",
# "three", "four", "five", "six", "seven", "eight", "nine",
# "ten", "eleven", "polti", "roddy", "stephen", "bloy", "roddy",
# "bertha", "kueifei", "medtner", "alphonse", "berthe", "george",
# "casey", "jane", "delius", "laura", "tarr", "anastasya", "rebecca",
# "pƩguy", "soltyk", "one", "egoist", "thing", "men")
myCorpus <- tm::Corpus(source, readerControl=list(reader=readPlain))
myCorpus <- tm::tm_map(myCorpus, PlainTextDocument)
library(SnowballC);
myCorpus <- tm::tm_map(myCorpus, removeWords, xml_words)
myCorpus <- tm::tm_map(myCorpus, removePunctuation)
myCorpus <- tm::tm_map(myCorpus, removeNumbers)
myCorpus <- tm::tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm::tm_map(myCorpus, removeWords, stopwords("english"))
myCorpus <- tm::tm_map(myCorpus, removeWords, stopwords("french"))
myCorpus <- tm::tm_map(myCorpus, removeWords, stopwords("german"))
# remove specific words
# not run right now
if (!is.null(wordsToRemove)) myCorpus <- tm::tm_map(myCorpus, removeWords, wordsToRemove)
#myCorpus <- tm_map(myCorpus, removeWords, c(journal_words, xml_words))
if (stemDoc) myCorpus <- tm::tm_map(myCorpus, removeWords, stemDocument)
#myCorpus <- tm_map(myCorpus, stemDocument)
# start here
myCorpus <- tm::tm_map(myCorpus, stripWhitespace)
myCorpus <- tm::tm_map(myCorpus, PlainTextDocument)
# load corpus into a data.frame so it can be run by MALLET
textdata <- sapply(myCorpus, toString)
textdata <- unname(textdata)
dataframe <- data.frame(text = textdata, date = date_vec, stringsAsFactors = FALSE)
id <- list.files(xmlfolder, pattern = paste0('*', pattern))
id <- sapply(id, FUN = function(i) sub(pattern, '', i))
dataframe$id <- id
journal <- vector()
for (i in 1:length(myCorpus)){
if (grepl("Egoist", id[i])){
journal[i] <- "Egoist"
} else if (grepl("Blast", id[i])){
journal[i] <- "Blast"
} else if (grepl("NewFreewoman", id[i])){
journal[i] <- "NewFreewoman"
}else if (grepl("Poetry", id[i])){
journal[i] <- "Poetry Magazine"
} else{
journal[i] <- "Freewoman"
}
}
dataframe$journal <- journal
# from here try left_join on journal and date with data.frame of the metadata
dataframe$years <- lubridate::year(dataframe$date)
if (!is.null(journalVec)) dataframe <- subset(dataframe, subset = journal %in% journalVec)
if (!is.null(yearRangeRule)) dataframe <- subset(dataframe, subset = eval(base::parse(text = paste("years", yearRangeRule))))
dataframe$years <- NULL
dataframe$text <- stringr::str_replace_all(dataframe$text, ",", " ")
dataframe
}
# example
# date_vec <- strip_dates("~/Dropbox/The Egoist PDFs/XML")
# dataframe <- prep_corpus("~/Dropbox/The Egoist PDFs/XML/textfiles", date_vec)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.