RStudio / R

Initialization

# Attach the workshop package that provides the helpers used below
library('styloWorkshop')

# Open the connection to the texts database
db.connect()

# Retrieve information about every available text and keep it in `texts`
texts <- get.texts()

Corpora preparation

(assuming that the texts' metadata has been fetched into the texts variable)

Features frequency table

Features parsing

Parse texts from the directory "corpus" into word bi-grams, ignoring character case and assuming the texts are in German.

# Parse the texts stored in the 'corpus' directory into features:
# word-level ('w') n-grams of size 2, treated as German texts,
# with letter case not preserved.
features <- load.corpus.and.parse(
  corpus.dir = 'corpus',
  language = 'German',
  preserve.case = FALSE,
  features = 'w',
  ngram.size = 2
)

Features frequency table computation

Compute frequency table using relative frequencies from parsed features

# Build the frequency table from the parsed features,
# requesting relative frequencies.
freqTab <- count.freqs(
  features,
  relative = TRUE
)

Features frequency table manipulation

(assuming that freqTab denotes a variable with computed features frequency table)

Exploratory analysis

(assuming that freqTabAdj variable holds final features frequency table)

Cluster analysis

# Cluster analysis ('CA') on the final frequency table, run without the GUI.
# distance.measure is one of:
#   delta / argamon / eder / simple / manhattan / canberra / euclidean / cosine
# linkage (method of linking texts into a tree) is one of:
#   ward / nj / single / complete / average / mcquitty / median / centroid
results <- stylo(
  gui = FALSE,
  frequencies = freqTabAdj,
  analysis.type = 'CA',
  # both MFW bounds are set to the number of columns of freqTabAdj
  mfw.min = ncol(freqTabAdj),
  mfw.max = ncol(freqTabAdj),
  linkage = 'ward',
  distance.measure = 'delta'
)

Multidimensional scaling

# Multidimensional scaling ('MDS') on the final frequency table, run without
# the GUI.
# distance.measure is one of:
#   delta / argamon / eder / simple / manhattan / canberra / euclidean / cosine
results <- stylo(
  gui = FALSE,
  frequencies = freqTabAdj,
  analysis.type = 'MDS',
  # both MFW bounds are set to the number of columns of freqTabAdj
  mfw.min = ncol(freqTabAdj),
  mfw.max = ncol(freqTabAdj),
  distance.measure = 'delta',
  # increase or decrease label.offset if labels intersect points on the plot
  label.offset = 4,
  text.id.on.graphs = 'both'
)

Principal Component Analysis

# Principal Component Analysis on the final frequency table, run without the
# GUI.
# analysis.type is one of: PCV / PCR
# pca.visual.flavour is one of: classic / loadings / technical / symbols
results <- stylo(
  gui = FALSE,
  frequencies = freqTabAdj,
  analysis.type = 'PCV',
  pca.visual.flavour = 'classic',
  # both MFW bounds are set to the number of columns of freqTabAdj
  mfw.min = ncol(freqTabAdj), mfw.max = ncol(freqTabAdj),
  # increase or decrease label.offset if labels intersect points on the plot
  label.offset = 4,
  text.id.on.graphs = 'both'
)

Classification

(assuming that the trainFreqTab variable holds the features frequency table for the training set and testFreqTab holds the one for the test set)

Obtaining information on misclassification after running analysis:

# Read 'final_results.txt' from the working directory, glue its lines back
# together with newlines and print the resulting text.
# Note: the variable shares its name with base R's summary() function.
summary <- paste(
  readLines('final_results.txt'),
  collapse = '\n'
)
cat(summary)

Delta

# Delta classification of the test set against the training set, run without
# the GUI.
# distance.measure is one of:
#   delta / argamon / eder / simple / manhattan / canberra / euclidean / cosine
# The last argument must not be followed by a comma: a trailing comma passes
# an empty argument to the call, which R reports as an error.
results <- classify(
  gui = FALSE,
  training.frequencies = trainFreqTab,
  test.frequencies = testFreqTab,
  # both MFW bounds are set to the number of columns of trainFreqTab
  mfw.min = ncol(trainFreqTab),
  mfw.max = ncol(trainFreqTab),
  classification.method = 'delta',
  distance.measure = 'delta'
)

k-Nearest Neighbors

# k-Nearest Neighbors classification of the test set against the training set,
# run without the GUI.
results <- classify(
  gui = FALSE,
  classification.method = 'knn',
  # k.value: number of neighbors taken into account
  k.value = 1,
  training.frequencies = trainFreqTab,
  test.frequencies = testFreqTab,
  # both MFW bounds are set to the number of columns of trainFreqTab
  mfw.min = ncol(trainFreqTab),
  mfw.max = ncol(trainFreqTab)
)

SVM

# SVM classification of the test set against the training set, run without
# the GUI.
# svm.kernel is one of: linear / polynomial / radial
results <- classify(
  gui = FALSE,
  classification.method = 'svm',
  svm.kernel = 'linear',
  svm.degree = 3,
  svm.coef0 = 0,
  svm.cost = 1,
  training.frequencies = trainFreqTab,
  test.frequencies = testFreqTab,
  # both MFW bounds are set to the number of columns of trainFreqTab
  mfw.min = ncol(trainFreqTab),
  mfw.max = ncol(trainFreqTab)
)

Naive Bayes

# Naive Bayes classification of the test set against the training set, run
# without the GUI.
results <- classify(
  gui = FALSE,
  classification.method = 'naivebayes',
  training.frequencies = trainFreqTab,
  test.frequencies = testFreqTab,
  # both MFW bounds are set to the number of columns of trainFreqTab
  mfw.min = ncol(trainFreqTab),
  mfw.max = ncol(trainFreqTab)
)

Nearest Shrunken Centroid

# Nearest Shrunken Centroid classification of the test set against the
# training set, run without the GUI.
results <- classify(
  gui = FALSE,
  classification.method = 'nsc',
  training.frequencies = trainFreqTab,
  test.frequencies = testFreqTab,
  # both MFW bounds are set to the number of columns of trainFreqTab
  mfw.min = ncol(trainFreqTab),
  mfw.max = ncol(trainFreqTab)
)


zozlak/styloWorkshop documentation built on May 5, 2019, 1:37 a.m.