r
# Read a space-separated text file with a header row into a data frame,
# keeping character columns as plain strings rather than factors
myTable <- read.table('pathToFile', header = TRUE, sep = " ", stringsAsFactors = FALSE)
r
# Write a data frame to a semicolon-separated CSV file (write.csv2 uses
# European conventions: ';' separator, ',' decimal mark).
# NOTE(review): the original passed only the path, so the path string itself
# would have been written as data; the object to export must be the first
# argument (assumed here to be myTable — TODO confirm). Also, the output
# encoding argument is 'fileEncoding', not 'encoding'.
write.csv2(myTable, 'pathToFile.csv', row.names = FALSE, fileEncoding = 'Windows-1252')
# Load required libraries
library('styloWorkshop')

# Connect to our texts database
db.connect()

# Fetch information about all available texts
texts <- get.texts()
(assuming that the texts metadata were fetched into the `texts` variable)
r
# Print the texts metadata table fetched by get.texts()
texts
r
# List the available values of the 'docsrc' metadata field
get.values(texts, 'docsrc')
r
# Keep only SPORTZTG texts dated 2013-03-12.
# (%in% is used rather than == so NA metadata values never produce NA
# in the filter condition.)
sportztg <- texts %>%
  filter(docsrc %in% 'SPORTZTG' & date %in% '2013-03-12')
r
# Derive a 'year' column from the first four characters of the date
# (assumes dates are formatted YYYY-MM-DD — TODO confirm)
textsWithYear <- texts %>%
  mutate(year = substr(date, 1, 4))
r
# Select all texts from the 'British Fiction' source and write them into
# the 'corpus' directory, grouping files by author and title; 0.9 is the
# share parameter forwarded to write.corpus().
britishFiction <- filter(texts, source == 'British Fiction')
write.corpus(britishFiction, 'author', 'title', 'corpus', 0.9)
r
# Build primary and secondary corpora from 'amc' texts, grouped by
# docsrc and year:
#   - filter to the 'amc' source (%in% avoids NA in the condition)
#   - derive a 'year' column from the first 4 characters of the date
#   - write a primary set from 70% of the texts, capped at 5 million
#     units per file (presumably characters — TODO confirm against the
#     write.corpus() documentation)
#   - pipe the result into a second write.corpus() call for the
#     secondary set. NOTE(review): this chaining implies write.corpus()
#     returns the texts left over from the previous call — verify in the
#     styloWorkshop docs.
texts %>%
filter(source %in% 'amc') %>%
mutate(year = substr(date, 1, 4)) %>%
write.corpus('docsrc', 'year', 'primary_set', 0.7, limit = 5*10^6) %>%
write.corpus('docsrc', 'year', 'secondary_set', limit = 5*10^6)
Parse the texts from the "corpus" directory into word bi-grams, ignoring character case and assuming the texts are in German.
# Parse texts from the 'corpus' directory into case-folded word bi-grams,
# treating the texts as German
features <- load.corpus.and.parse(
  corpus.dir = 'corpus',
  language = 'German',
  features = 'w',         # 'w' = word features (as opposed to characters)
  ngram.size = 2,         # bi-grams
  preserve.case = FALSE   # fold everything to lower case
)
Compute the frequency table from the parsed features, using relative frequencies.
# Compute the feature frequency table using relative frequencies
freqTab <- count.freqs(features, relative = TRUE)
(assuming that `freqTab` holds the computed feature frequency table)
r
# Preview the first 20 features (presumably the most frequent ones,
# assuming columns are ordered by frequency — TODO confirm)
freqTab[, 1:20]
r
# Show the last columns of the frequency table.
# NOTE(review): (fNumber - 20):fNumber spans 21 columns, not 20 —
# confirm whether exactly 20 were intended.
fNumber <- ncol(freqTab)
freqTab[, (fNumber - 20):fNumber]
r
# Preview features in columns 200 through 210
freqTab[, 200:210]
r
# Look up the frequency values of a single feature by its name
freqTab[, 'my Feature']
r
# Keep only the first 200 feature columns
freqTabAdj <- freqTab[, 1:200]
r
# Keep only the trailing feature columns (note: this range spans 51
# columns, not 50 — TODO confirm the intended count)
fNumber <- ncol(freqTab)
freqTabAdj <- freqTab[, (fNumber - 50):fNumber]
r
# Keep feature columns 50 through 250
freqTabAdj <- freqTab[, 50:250]
r
# Apply culling at level 30 (presumably: drop features absent from more
# than 70% of samples — TODO confirm against perform.culling() docs)
freqTabAdj <- perform.culling(freqTab, 30)
r
# Remove the explicitly listed features from the frequency table
freqTabAdj <- delete.stop.words(freqTab, c('sampleFeature', 'my Feature'))
r
# Remove all English pronouns (as listed by stylo.pronouns()) from the table
freqTabAdj <- delete.stop.words(freqTab, stylo.pronouns('English'))
(assuming that the `freqTabAdj` variable holds the final feature frequency table)
# Run a Cluster Analysis on the adjusted frequency table.
# Setting mfw.min = mfw.max = ncol(...) forces stylo() to use every feature.
# NOTE(review): in the collapsed original, the inline '#' comment swallowed
# the 'linkage' argument and the closing parenthesis.
results <- stylo(
  gui = FALSE,
  frequencies = freqTabAdj,
  mfw.min = ncol(freqTabAdj),
  mfw.max = ncol(freqTabAdj),
  analysis.type = 'CA',
  # distance metric: delta/argamon/eder/simple/manhattan/canberra/euclidean/cosine
  distance.measure = 'delta',
  # method of linking texts into the tree: ward/nj/single/complete/average/mcquitty/median/centroid
  linkage = 'ward'
)
r
# Inspect the z-scores computed for every feature
as.data.frame(results$table.with.all.zscores)
r
# Inspect the pairwise distance table between texts
as.data.frame(results$distance.table)
# Run Multidimensional Scaling on all features of the adjusted table
results <- stylo(
  gui = FALSE,
  frequencies = freqTabAdj,
  mfw.min = ncol(freqTabAdj),
  mfw.max = ncol(freqTabAdj),
  analysis.type = 'MDS',
  # distance metric: delta/argamon/eder/simple/manhattan/canberra/euclidean/cosine
  distance.measure = 'delta',
  text.id.on.graphs = 'both',
  label.offset = 4  # adjust if labels intersect points on the plot
)
r
# Inspect the z-scores computed for every feature
as.data.frame(results$table.with.all.zscores)
r
# Inspect the pairwise distance table between texts
as.data.frame(results$distance.table)
# Run Principal Components Analysis on all features of the adjusted table
results <- stylo(
  gui = FALSE,
  frequencies = freqTabAdj,
  mfw.min = ncol(freqTabAdj),
  mfw.max = ncol(freqTabAdj),
  analysis.type = 'PCV',           # PCV / PCR
  pca.visual.flavour = 'classic',  # classic/loadings/technical/symbols
  text.id.on.graphs = 'both',
  label.offset = 4                 # adjust if labels intersect points on the plot
)
r
# Reset the graphics device layout to a single plot per page
par(mfrow = c(1, 1))
(assuming that the `trainFreqTab` variable holds the feature frequency table for the training set and `testFreqTab` holds the one for the test set)
Obtaining information about misclassification after running the analysis:
# Read the classification summary written by classify() and print it.
# (Renamed from 'summary' to avoid masking base::summary(); the original
# also had two statements collapsed onto one line, which is a syntax error.)
resultsSummary <- paste(readLines('final_results.txt'), collapse = '\n')
cat(resultsSummary)
# Delta classification using all features of the training table.
# (The original had a trailing comma after the last argument — a syntax
# error in R — and its inline comment swallowed the closing parenthesis.)
results <- classify(
  gui = FALSE,
  training.frequencies = trainFreqTab,
  test.frequencies = testFreqTab,
  mfw.min = ncol(trainFreqTab),
  mfw.max = ncol(trainFreqTab),
  classification.method = 'delta',
  # distance metric: delta/argamon/eder/simple/manhattan/canberra/euclidean/cosine
  distance.measure = 'delta'
)
# k-nearest-neighbours classification using all features of the training table
results <- classify(
  gui = FALSE,
  training.frequencies = trainFreqTab,
  test.frequencies = testFreqTab,
  mfw.min = ncol(trainFreqTab),
  mfw.max = ncol(trainFreqTab),
  classification.method = 'knn',
  k.value = 1  # number of neighbors taken into account
)
# Support-vector-machine classification using all features of the training table
results <- classify(
  gui = FALSE,
  training.frequencies = trainFreqTab,
  test.frequencies = testFreqTab,
  mfw.min = ncol(trainFreqTab),
  mfw.max = ncol(trainFreqTab),
  classification.method = 'svm',
  svm.kernel = 'linear',  # linear/polynomial/radial
  svm.degree = 3,
  svm.coef0 = 0,
  svm.cost = 1
)
# Naive Bayes classification using all features of the training table
results <- classify(
  gui = FALSE,
  training.frequencies = trainFreqTab,
  test.frequencies = testFreqTab,
  mfw.min = ncol(trainFreqTab),
  mfw.max = ncol(trainFreqTab),
  classification.method = 'naivebayes'
)
# Nearest-shrunken-centroids classification using all features of the training table
results <- classify(
  gui = FALSE,
  training.frequencies = trainFreqTab,
  test.frequencies = testFreqTab,
  mfw.min = ncol(trainFreqTab),
  mfw.max = ncol(trainFreqTab),
  classification.method = 'nsc'
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.