#' Read the HB WiWo Corpus
#'
#' Reads the XML-files from the HB WiWo corpus and seperates the text and meta
#' data.
#'
#' @param path Character string with Path where the data files are.
#' @param file Character string with names of the XML files.
#' @param do.meta Logical: Should the algorithm collect meta data?
#' @param do.text Logical: Should the algorithm collect text data?
#' @return \item{meta}{ id source date title abstract dachzeile}
#' \item{text}{ Text} \item{metamult}{ person company industry country author category
#' klassifikation (mehrere moeglich) thema sachgruppe serie}
#' @keywords manip
#'
#' @export readHBWiWo
#'
readHBWiWo <- function(path = getwd(), file = list.files(path = path, pattern = "*.xml$",
full.names = FALSE, recursive = TRUE),
do.meta = TRUE, do.text = TRUE){
stopifnot(is.character(file), is.character(path),
is.logical(do.meta), is.logical(do.text),
length(path) == 1, length(do.meta) == 1, length(do.text) == 1)
text <- NULL
meta <- NULL
metamult <- NULL
for(i in 1:length(file)){
cat(paste(file[i]), "\n")
article <- readLines(con = paste(path,file[i], sep="/"), encoding = "latin1")
if(grepl("utf(-){0,1}8", ignore.case = TRUE, article[1])){
article <- readLines(con = paste(path,file[i], sep="/"), encoding = "utf8")
}
article <- gsub(pattern = """, replacement = "\"",article)
article <- gsub(pattern = "&", replacement = "&",article)
article <- gsub(pattern = "'", replacement = "\'",article)
lines <- grep(pattern = "</Dokument>|</Document>", article)
lines <- cbind(c(1,lines[-length(lines)]),lines)
article <- apply(lines, 1, function(x)paste(article[x[1]:x[2]], collapse = " "))
id <- stringr::str_extract(article, "ID=\"(.*?)\"")
id <- gsub(pattern="ID=|\"", replacement="", x=id)
abstract <- stringr::str_extract(article, "<Abstract>(.*?)</Abstract>")
abstract <- removeXML(abstract)
if(do.meta){
source <- stringr::str_extract(article, "<Source>(.*?)</Source>|<Quelle>(.*?)</Quelle>")
source <- removeXML(source)
date <- stringr::str_extract(article, "<Date>(.*?)</Date>|<Datum>(.*?)</Datum>")
date <- as.Date(removeXML(date))
title <- stringr::str_extract(article, "<Title>(.*?)</Title>|<Titel>(.*?)</Titel>|<UB>(.*?)</UB>")
title <- removeXML(title)
dachzeile <- stringr::str_extract(article, "<DZ>(.*?)</DZ>")
dachzeile <- removeXML(dachzeile)
company <- stringr::str_extract_all(article, "<Company>(.*?)</Company>|<Firma>(.*?)</Firma>")
names(company) <- id
tmp <- rep(names(company), lengths(company))
company <- unlist(company)
names(company) <- tmp
company <- trimws(gsub(pattern="<Company>|</Company>|<Firma>|</Firma>", replacement="", x=company))
country <- stringr::str_extract_all(article, "<Country>(.*?)</Country>|<Land>(.*?)</Land>")
names(country) <- id
tmp <- rep(names(country), lengths(country))
country <- unlist(country)
names(country) <- tmp
country <- trimws(gsub(pattern="<Country>|</Country>|<Land>|</Land>", replacement="", x=country))
industry <- stringr::str_extract_all(article, "<Industry>(.*?)</Industry>|<Industrie>(.*?)</Industrie>")
names(industry) <- id
tmp <- rep(names(industry), lengths(industry))
industry <- unlist(industry)
names(industry) <- tmp
industry <- trimws(gsub(pattern="<Industry>|</Industry>|<Industrie>|</Industrie>", replacement="", x=industry))
author <- stringr::str_extract_all(article, "<Author>(.*?)</Author>|<Autor>(.*?)</Autor>")
names(author) <- id
tmp <- rep(names(author), lengths(author))
author <- unlist(author)
names(author) <- tmp
author <- trimws(gsub(pattern="<Author>|</Author>|<Autor>|</Autor>", replacement="", x=author))
category <- stringr::str_extract_all(article, "<Category>(.*?)</Category>|<Rubrik>(.*?)</Rubrik>")
names(category) <- id
tmp <- rep(names(category), lengths(category))
category <- unlist(category)
names(category) <- tmp
category <- trimws(gsub(pattern="<Category>|</Category>|<Rubrik>|</Rubrik>", replacement="", x=category))
klassifikation <- stringr::str_extract_all(article, "<Klassifikation>(.*?)</Klassifikation>")
names(klassifikation) <- id
tmp <- rep(names(klassifikation), lengths(klassifikation))
klassifikation <- unlist(klassifikation)
names(klassifikation) <- tmp
klassifikation <- trimws(gsub(pattern="<Klassifikation>|</Klassifikation>", replacement="", x=klassifikation))
thema <- stringr::str_extract_all(article, "<Thema>(.*?)</Thema>")
names(thema) <- id
tmp <- rep(names(thema), lengths(thema))
thema <- unlist(thema)
names(thema) <- tmp
thema <- trimws(gsub(pattern="<Thema>|</Thema>", replacement="", x=thema))
sachgruppe <- stringr::str_extract_all(article, "<Sachgruppe>(.*?)</Sachgruppe>")
names(sachgruppe) <- id
tmp <- rep(names(sachgruppe), lengths(sachgruppe))
sachgruppe <- unlist(sachgruppe)
names(sachgruppe) <- tmp
sachgruppe <- trimws(gsub(pattern="<Sachgruppe>|</Sachgruppe>", replacement="", x=sachgruppe))
serie <- stringr::str_extract_all(article, "<Serie>(.*?)</Serie>")
names(serie) <- id
tmp <- rep(names(serie), lengths(serie))
serie <- unlist(serie)
names(serie) <- tmp
serie <- trimws(gsub(pattern="<Serie>|</Serie>", replacement="", x=serie))
person <- stringr::str_extract_all(article, "<Person>(.*?)</Person>")
names(person) <- id
tmp <- rep(names(person), lengths(person))
person <- unlist(person)
names(person) <- tmp
person <- trimws(gsub(pattern="<Person>|</Person>", replacement="", x=person))
mData <- data.frame(id, source, date, title, abstract, dachzeile,
stringsAsFactors = FALSE)
meta <- rbind(meta, mData)
metamult$person <- c(metamult$person, person)
metamult$company <- c(metamult$company, company)
metamult$industry <- c(metamult$industry, industry)
metamult$country <- c(metamult$country, country)
metamult$author <- c(metamult$author, author)
metamult$category <- c(metamult$category, category)
metamult$klassifikation <- c(metamult$klassifikation, klassifikation)
metamult$thema <- c(metamult$thema, thema)
metamult$sachgruppe <- c(metamult$sachgruppe, sachgruppe)
metamult$serie <- c(metamult$serie, serie)
}
if(do.text){
text_new <- stringr::str_extract(article, "<Text>(.*?)</Text>")
text_new <- trimws(gsub(pattern="<Text>|</Text>", replacement="", x=text_new))
text_new[is.na(text_new)] <- ""
abstract[is.na(abstract)] <- ""
text_new <- trimws(paste(abstract, text_new))
names(text_new) <- id
text <- as.list(c(text, text_new))
}
}
res <- list("meta" = meta, "text" = text, "metamult" = metamult)
class(res) <- "textmeta"
if (do.text) res <- deleteAndRenameDuplicates(res)
summary(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.