Sys.setlocale(category="LC_ALL", locale="C");
library('Rstem');
#' Function create.articles.data.frame creates dataframe from preprocesses files lokated in one directory
#'
#' Files should have flat structure and should have format returned by preprocess function from the package
#'
#' @param input.path string directory with preprocessed files
#' @param output.path string parametr is the name (location+name) where the dataframe should be saved
#'
#' @examples
#' create.articles.data.frame("C:\\Users\\vostruk001\\Projects\\Studia\\MOW\\Projekt\\MOW_PROJEKT\\prepr\\", "C:\\Users\\vostruk001\\Projects\\Studia\\MOW\\Projekt\\MOW_PROJEKT\\data\\")
#'
#' @export
create.articles.data.frame <- function(articles.directory, output.df.filename) {
init.stop.words.dictionary();
art.stats <- compute.global.stats(articles.directory);
df.file <- file(output.df.filename, "wt");
add.df.header(df.file, art.stats);
add.df.stats.row(df.file, art.stats);
add.article.rows(art.stats, df.file, articles.directory);
close(df.file);
}
Sys.setlocale(category="LC_ALL", locale="C");
# given the filename function reads and returns all its lines
read.all.lines <- function(file.name) {
handle <- file(file.name, "rt");
lines <- readLines(handle);
close(handle);
return(lines);
}
# function returns vector of categories given the string of categories
get.article.categories <- function(category.line) {
category.vector <- strsplit(category.line, "[ \t]*,[ \t]*");
return( category.vector[[1]] );
}
# given filename function reads preprocessed file and
# returns the list with $abstract (text) and $classes (categories)
# returns NA if something went wrong
read.preprocessed.article <- function(file.name) {
lines <- read.all.lines(file.name);
if( length(lines) != 3 ) {
stop(paste('invalid article file:', file.name));
}
categories <- get.article.categories(lines[1]);
abstract <- lines[3];
return(list(abstract=abstract, categories=categories));
}
#function preprocess text including lowercasing all words, removing non-literal symbols,
#stemming using Porter algorithm, function returns words vector
stem.abstract.text <- function(abstract.text) {
abstract.text <- gsub("[^A-Za-z]+", " ", abstract.text);
abstract.text <- tolower(abstract.text);
abstract.text <- strsplit(abstract.text, "[ \t]+");
abstract.text <- abstract.text[[1]];
MAX.WORD.LENGTH <- 255;
abstract.text <- abstract.text[
nchar(abstract.text) < MAX.WORD.LENGTH & # NIE MOZE BYC &&
nchar(abstract.text) > 0
];
return(wordStem(abstract.text, language="english"));
}
#function removes English stop words (like the, you etc) from the words vector
remove.stop.words <- function(wordsVector) {
return( wordsVector[!is.stop.word(wordsVector)] );
}
#function reads the article and returns list of two vectors: $abstract word vector after stemming
# and $categories vector of the labels (categories) for that abstract
load.article <- function(file.name) {
art <- read.preprocessed.article(file.name);
abstract <- stem.abstract.text(art$abstract);
abstract <- remove.stop.words(abstract);
categories <- art$categories;
return( list(abstract=abstract, categories=categories) );
}
#function adds words from wordsWector to the dictionary env (to create df later)
# env[word] contains number of times word occures in all texts
insert.words <- function(env, words) {
for(w in words) {
if(!exists(w, envir=env)) {
assign(w, 1, envir=env);
}
else {
assign(w, 1+get(w, envir=env), envir=env);
}
}
}
# function counts how many times each word occures in the corpus (for ex. can be useful for tf-idf)
# returning $words - words vocabulary $categories - categories vocabulary, $files.count - number of files preprocessed
compute.global.stats <- function(articles.directory) {
words <- new.env(hash=TRUE, parent=emptyenv());
categories <- new.env(hash=TRUE, parent=emptyenv());
count <- 0;
articles <- list.files(
path=articles.directory,
pattern="*.txt",
full.names=TRUE,
ignore.case=TRUE
);
for(a in articles) {
cat(sprintf("[%s]\n", a));
tmp <- load.article(a);
insert.words(words, tmp$abstract);
insert.words(categories, tmp$categories);
count <- count + 1;
}
return( list(count=count, words=words, categories=categories) );
}
META.COLUMNS <- c('filename', 'wordcount');
# function writes to the files column names of the dataframe
add.df.header <- function(df.file, art.stats) {
word.col.names <- paste('w.', sort(ls(envir=art.stats$words)), sep='');
cats.col.names <- paste('c.', sort(ls(envir=art.stats$categories)), sep='');
meta.col.names <- paste('meta.', META.COLUMNS, sep='');
cat( c(word.col.names, cats.col.names, meta.col.names),
file=df.file, sep='\t' );
cat( '\n', file=df.file, sep='' );
}
write.env.cols <- function(envir, file) {
col.names <- sort(ls(envir=envir));
for(c in col.names) {
tmp <- get(c, envir=envir);
cat(tmp, file=file, sep='');
cat('\t', file=file, sep='');
}
}
# function adds first row with statistics of text corpus to the dataframe
add.df.stats.row <- function(df.file, art.stats) {
write.env.cols(art.stats$words, df.file);
write.env.cols(art.stats$categories, df.file);
for( i in 1:(length(META.COLUMNS)-1) ) {
cat('na\t', file=df.file, sep='');
}
cat('na\n', file=df.file, sep='');
}
write.art.row.cols <- function(aenv, genv, file) {
col.names <- sort(ls(envir=genv));
for(c in col.names) {
if( exists(c, envir=aenv, inherits=FALSE) ) {
tmp <- get(c, envir=aenv);
}
else {
}
}
}
# function adds data row of words from one source file
add.article.row <- function(art, stats, file) {
art.env <- new.env(hash=TRUE, parent=emptyenv());
insert.words(art.env, art$abstract);
category.env <- new.env(hash=TRUE, parent=emptyenv());
insert.words(category.env, art$categories);
write.art.row.cols(art.env, stats$words, file);
write.art.row.cols(category.env, stats$categories, file);
}
# function adds rows representing particular articles to dataframe
add.article.rows <- function(art.stats, file, articles.directory) {
articles <- list.files(
path=articles.directory,
pattern="*.txt",
full.names=TRUE,
ignore.case=TRUE
);
for(a in articles) {
tmp <- load.article(a);
tmp$file.name <- a;
add.article.row(tmp, art.stats, file);
}
}
STOP.WORDS.VECTOR <- c(
"a", "about", "above", "above", "across", "after",
"afterwards", "again", "against", "all", "almost",
"alone", "along", "already", "also","although","always",
"am","among", "amongst", "amoungst", "amount", "an", "and",
"another", "any","anyhow","anyone","anything","anyway", "anywhere",
"are", "around", "as", "at", "back","be","became", "because",
"become","becomes", "becoming", "been", "before", "beforehand",
"behind", "being", "below", "beside", "besides", "between", "beyond",
"bill", "both", "bottom","but", "by", "call", "can", "cannot",
"cant", "co", "con", "could", "couldnt", "cry", "de", "describe",
"detail", "do", "done", "down", "due", "during", "each", "eg",
"eight", "either", "eleven","else", "elsewhere", "empty",
"enough", "etc", "even", "ever", "every", "everyone", "everything",
"everywhere", "except", "few", "fifteen", "fify", "fill", "find",
"fire", "first", "five", "for", "former", "formerly", "forty",
"found", "four", "from", "front", "full", "further", "get", "give",
"go", "had", "has", "hasnt", "have", "he", "hence", "her", "here",
"hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him",
"himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc",
"indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last",
"latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me",
"meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
"move", "much", "must", "my", "myself", "name", "namely", "neither", "never",
"nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not",
"nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one",
"only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves",
"out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re",
"same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she",
"should", "show", "side", "since", "sincere", "six", "sixty", "so", "some",
"somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still",
"such", "system", "take", "ten", "than", "that", "the", "their", "them",
"themselves", "then", "thence", "there", "thereafter", "thereby", "therefore",
"therein", "thereupon", "these", "they", "thickv", "thin", "third", "this",
"those", "though", "three", "through", "throughout", "thru", "thus", "to",
"together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un",
"under", "until", "up", "upon", "us", "very", "via", "was", "we", "well",
"were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter",
"whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which",
"while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will",
"with", "within", "without", "would", "yet", "you", "your", "yours", "yourself",
"yourselves");
STOP.WORDS.ENV <- NA;
init.stop.words.dictionary <- function() {
STOP.WORDS.ENV <<- new.env(hash=TRUE, parent=emptyenv(), size=length(STOP.WORDS.VECTOR));
for(word in STOP.WORDS.VECTOR) {
assign(word, TRUE, envir=STOP.WORDS.ENV);
}
}
is.stop.word <- function(word) {
return(
sapply(word, function(w) exists(w, envir=STOP.WORDS.ENV, inherits=FALSE))
);
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.