# R/classify.R


# Function that performs a number of machine-learning classification methods
# used in computational stylistics: Delta (Burrows, 2002), k-Nearest
# Neighbors classification, Support Vector Machines, Naive Bayes, and
# Nearest Shrunken Centroids (Jockers and Witten, 2010). Most of the options
# are derived from the 'stylo' function.
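#
# A minimal sketch of a call (assuming a working directory that contains
# the "primary_set" and "secondary_set" subfolders with text samples;
# the argument values below are illustrative, not defaults):
#
#   results = classify(gui = FALSE, classification.method = "delta",
#                      mfw.min = 100, mfw.max = 1000, mfw.incr = 100)
#   summary(results)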

classify = function(gui = TRUE,
                    training.frequencies = NULL,
                    test.frequencies = NULL,
                    training.corpus = NULL,
                    test.corpus = NULL,
                    features = NULL,
                    path = NULL,
                    training.corpus.dir = "primary_set",
                    test.corpus.dir = "secondary_set", ...) {



# if any command-line arguments have been passed by the user, they will
# be stored in the following list and used to overwrite the defaults
passed.arguments = list(...)


# variable initialization
cross.validation.summary = c()


# changing working directory (if applicable)
#
# first of all, retrieve the current path name
original.path = getwd()
# then check if anyone wants to change the working dir
if(is.character(path) == TRUE & length(path) > 0) {
  # checking if the desired path exists and if it is a directory
  if(file.exists(path) == TRUE & file.info(path)$isdir == TRUE) {
    # if yes, then set the new working directory
    setwd(path)
  } else {
    # otherwise, stop the script
    stop("there is no directory ", getwd(), "/", path)
  }
} else {
  # if the argument was empty, then relax
  message("using current directory...")
}


if(is.character(training.corpus.dir)==FALSE | nchar(training.corpus.dir)==0) {
  training.corpus.dir = "primary_set"
}
if(is.character(test.corpus.dir) == FALSE | nchar(test.corpus.dir) == 0) {
  test.corpus.dir = "secondary_set"
}

# loading the default settings as defined in the following function
# (it absorbs the arguments passed from the command line)
variables = stylo.default.settings(...)
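# A sketch of the mechanism: a call like
#   classify(gui = FALSE, classification.method = "svm", mfw.min = 500)
# makes stylo.default.settings() receive classification.method and mfw.min
# via '...', overwriting the corresponding default values.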




# optionally, displaying a GUI box
# (it absorbs the arguments passed from the command line)
if (gui == TRUE) {
      # first, checking if the GUI can be displayed
      # (the conditional expression is stolen from the generic function "menu")
      if (.Platform$OS.type == "windows" || .Platform$GUI ==
            "AQUA" || (capabilities("tcltk") && capabilities("X11") &&
            suppressWarnings(tcltk::.TkUp))) {
        variables = gui.classify(...)
      } else {
        message("")
        message("GUI could not be launched -- default settings will be used;")
        message("otherwise please pass your variables as command-line agruments\n")
      }
}




# #############################################################################
# Explicit assignment of all the variables, in order to avoid attach()
# #############################################################################

add.to.margins = variables$add.to.margins
analysis.type = variables$analysis.type
analyzed.features = variables$analyzed.features
classification.method = variables$classification.method
colors.on.graphs = variables$colors.on.graphs
consensus.strength = variables$consensus.strength
corpus.format = variables$corpus.format
corpus.lang = variables$corpus.lang
culling.incr = variables$culling.incr
culling.max = variables$culling.max
culling.min = variables$culling.min
culling.of.all.samples = variables$culling.of.all.samples
delete.pronouns = variables$delete.pronouns
dendrogram.layout.horizontal = variables$dendrogram.layout.horizontal
display.on.screen = variables$display.on.screen
distance.measure = variables$distance.measure
dump.samples = variables$dump.samples
final.ranking.of.candidates = variables$final.ranking.of.candidates
how.many.correct.attributions = variables$how.many.correct.attributions
interactive.files = variables$interactive.files
k.value = variables$k.value
l.value = variables$l.value
label.offset = variables$label.offset
linkage = variables$linkage
mfw.incr = variables$mfw.incr
mfw.list.cutoff = variables$mfw.list.cutoff
mfw.max = variables$mfw.max
mfw.min = variables$mfw.min
ngram.size = variables$ngram.size
number.of.candidates = variables$number.of.candidates
outputfile = variables$outputfile
passed.arguments = variables$passed.arguments
pca.visual.flavour = variables$pca.visual.flavour
plot.custom.height = variables$plot.custom.height
plot.custom.width = variables$plot.custom.width
plot.font.size = variables$plot.font.size
plot.line.thickness = variables$plot.line.thickness
plot.options.reset = variables$plot.options.reset
reference.wordlist.of.all.samples = variables$reference.wordlist.of.all.samples
sample.size = variables$sample.size
sampling = variables$sampling
sampling.with.replacement = variables$sampling.with.replacement
save.analyzed.features = variables$save.analyzed.features
save.analyzed.freqs = variables$save.analyzed.freqs
save.distance.tables = variables$save.distance.tables
start.at = variables$start.at
svm.coef0 = variables$svm.coef0
svm.cost = variables$svm.cost
svm.degree = variables$svm.degree
svm.kernel = variables$svm.kernel
text.id.on.graphs = variables$text.id.on.graphs
titles.on.graphs = variables$titles.on.graphs
txm.compatibility.mode = variables$txm.compatibility.mode
use.custom.list.of.files = variables$use.custom.list.of.files
use.existing.freq.tables = variables$use.existing.freq.tables
use.existing.wordlist = variables$use.existing.wordlist
write.jpg.file = variables$write.jpg.file
write.pdf.file = variables$write.pdf.file
write.png.file = variables$write.png.file
write.svg.file = variables$write.svg.file
z.scores.of.all.samples = variables$z.scores.of.all.samples
# #############################################################################



# newly-added options
relative.frequencies = variables$relative.frequencies
splitting.rule = variables$splitting.rule
preserve.case = variables$preserve.case
encoding = variables$encoding
cv.folds = variables$cv.folds
stop.words = variables$stop.words
sample.overlap = variables$sample.overlap
number.of.samples = variables$number.of.samples
custom.graph.title = variables$custom.graph.title

show.features = variables$show.features


# ['cv' is temporarily switched off; it always performs cv = "stratified"]
# cv = variables$cv






# #############################################################################
# Final settings (you are advised not to change them)
# #############################################################################


# Given a language option ("English", "Polish", "Latin" etc., as described
# above), this procedure selects one of the lists of pronouns
# If no language was chosen (or if a desired language is not supported, or if
# there was a spelling mistake), then the variable will be set to "English".

pronouns = stylo.pronouns(corpus.lang = corpus.lang)


# Since it is not so easy to perform, say, 17.9 iterations, or to analyze
# 543.3 words, the code below rounds all the numerical variables, to prevent
# funny settings from breaking the script. (OK, it is still possible
# to crash it, but we will not give you a hint)
  mfw.min = round(mfw.min)
  mfw.max = round(mfw.max)
  mfw.incr = round(mfw.incr)
  start.at = round(start.at)
  culling.min = round(culling.min)
  culling.max = round(culling.max)
  culling.incr = round(culling.incr)
  mfw.list.cutoff = round(mfw.list.cutoff)

# This also prevents unexpected settings
number.of.candidates = round(number.of.candidates)
if(number.of.candidates < 1) {
  number.of.candidates = 1
  }



###############################################################################
# Backward compatibility: if "use.existing.freq.tables" is switched on, then
# two files with frequency tables will be used, provided that they do exist
  if(use.existing.freq.tables == TRUE
                      & file.exists("freq_table_primary_set.txt") == TRUE
                      & file.exists("freq_table_secondary_set.txt") == TRUE ) {
    training.frequencies = "freq_table_primary_set.txt"
    test.frequencies = "freq_table_secondary_set.txt"
  } else {
    use.existing.freq.tables = FALSE
  }
# Backward compatibility: if "use.existing.wordlist" is switched on, then
# the file "wordlist.txt" will be used, provided that it does exist
  if(use.existing.wordlist == TRUE & file.exists("wordlist.txt") == TRUE ) {
    features = "wordlist.txt"
  } else {
    use.existing.wordlist = FALSE
  }
###############################################################################



# #############################################################################
# #############################################################################














# #################################################
# the module for loading a corpus from text files;
# it can be omitted if the frequency table for
# both primary and secondary sets already exist
# (then "use.existing.freq.tables" should be set
# to TRUE in the preamble of the script/GUI)
# #################################################
#

###############################################################################
# Checking if the argument "features" has been used (e.g. a custom wordlist)
#
# variable initialization:
features.exist = FALSE
#
  # Firstly, checking if the variable has at least two elements
  if(length(features) > 1) {
      # if yes, then checking whether it is a vector
      if(is.vector(features) == TRUE) {
        # if yes, then convert the above object into characters, just in case
        features = as.character(features)
        # link this vector into the variable used for calculations
        mfw.list.of.all = features
      } else {
        message("")
        message("You seem to have chosen an existing set of features")
        message("Unfortunately, something is wrong: check if your variable")
        message("has a form of vector")
        stop("Wrong format: a vector of features (e.g. words) was expected")
      }
    # selecting the above vector as a valid set of features
    features.exist = TRUE
  }
  # Secondly, checking if the variable has exactly one element;
  # presumably, this is a file name where a list of words is stored
  if(length(features) == 1) {
    # to make sure the value is a character string (and not, e.g., an integer)
    features = as.character(features)
      # does the file exist?
      if(file.exists(features) == TRUE) {
        # file with a vector of features will be loaded
        message("\n", "reading a custom set of features from a file...")
        # reading a file: newlines are supposed to be delimiters
        features = scan(features, what = "char", sep = "\n", encoding = encoding)
        # getting rid of the lines beginning with the "#" char
        features = c(grep("^[^#]", features, value = TRUE))
        # link this vector into the variable used for calculations
        mfw.list.of.all = features
      } else {
        # if there's no such file, then don't try to use it
        message("\n", "file \"", features, "\" could not be found")
        stop("Wrong file name")
      }
    # selecting the above vector as a valid set of features
    features.exist = TRUE
  }
###############################################################################
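# A sketch of both ways of passing custom features (the values below are
# purely illustrative):
#
#   my.features = c("the", "of", "and", "in", "to")
#   classify(gui = FALSE, features = my.features)      # a vector of features
#   classify(gui = FALSE, features = "wordlist.txt")   # a file with features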







###############################################################################
# Checking if the argument "frequencies" has been used
#
# Iterating over two sets: the training set and the test set
for(iteration in 1:2) {
    # first iteration: training set
    if(iteration == 1) {
      frequencies = training.frequencies
    }
    # second iteration: test set
    if(iteration == 2) {
      frequencies = test.frequencies
    }


  # variable initialization:
  corpus.exists = FALSE

  # Firstly, checking if the variable has at least two elements
  if(length(frequencies) > 1) {
      # if yes, then checking whether it is a table or matrix
      if(is.matrix(frequencies) == TRUE | is.data.frame(frequencies) == TRUE) {
        # if yes, then convert the above object into a matrix (just in case)
        frequencies = as.matrix(frequencies)
      } else {
        message("")
        message("You seem to have chosen an existing table with frequencies")
        message("Unfortunately, something is wrong: check if your variable")
        message("has a form of matrix/data frame\n")
        stop("Wrong format of the table of frequencies")
      }
      # this code makes sure that the table has variables' names
      if(length(colnames(frequencies)) == 0) {
        colnames(frequencies) = paste("var", 1:length(frequencies[1,]), sep = "_")
      }
      # this code makes sure that the table has samples' names
      if(length(rownames(frequencies)) == 0) {
        rownames(frequencies) = paste("sample", 1:length(frequencies[,1]), sep = "_")
      }
    # selecting the above matrix as a valid corpus
    corpus.exists = TRUE
  }
  # Secondly, checking if the variable has exactly one element;
  # presumably, this is a file name where a table is stored
  if(length(frequencies) == 1) {
    # to make sure the value is a character string (and not, e.g., an integer)
    frequencies = as.character(frequencies)
      # does the file exist?
      if(file.exists(frequencies) == TRUE) {
        # file with frequencies will be loaded
        message("\nreading a file containing frequencies...")
        frequencies = t(read.table(frequencies, encoding = encoding))
      } else {
        # if there's no such file, then don't try to use it
        message("\n", "file \"", frequencies, "\" could not be found")
        stop("Wrong file name")
      }
    # selecting the above matrix as a valid corpus
    corpus.exists = TRUE
  }



  # If a custom set of features was indicated, try to pick the matching variables only
  if(features.exist == TRUE & corpus.exists == TRUE) {
      # checking if the chosen features do match the columns of the table
      if(sum(colnames(frequencies) %in% features) < 2) {
        message("The features you want to analyze do not match the variables' names:")
        message("")
        message("Available features: ", head(colnames(frequencies)), "...")
        message("Chosen features: ", head(features), "...")
        message("")
        message("Check the rotation of your table and the names of its rows and columns.")
        stop("Input data mismatch")
      } else {
        # if everything is right, select the subset of columns from the table:
        frequencies = frequencies[ , colnames(frequencies) %in% features]
      }
  }


  # If no custom features were chosen, take them from the variables' names
  if(features.exist == FALSE & corpus.exists == TRUE) {
     features = colnames(frequencies)
     # this is stupid, but this obsolete variable is needed somewhere (?)
     mfw.list.of.all = features
  }



  # Additionally, check if the table with frequencies is long enough
  if(corpus.exists == TRUE) {
    if(length(frequencies[,1]) < 2 | length(frequencies[1,]) < 2 ) {
      message("")
      message("There is not enough samples and/or features to be analyzed.")
      message("Try to use tables of at least two rows by two columns.\n")
      stop("Wrong size of the table of frequencies")
    }
  }

  # 1st iteration: setting the matrix containing the training set (if applicable)
  if(corpus.exists == TRUE & iteration == 1) {
    freq.I.set.0.culling = frequencies
    message("Training set successfully loaded.")
  }
  # 2nd iteration: setting the matrix containing the test set (if applicable)
  if(corpus.exists == TRUE & iteration == 2) {
    freq.II.set.0.culling = frequencies
    message("Test set successfully loaded.")
  }

# attempts at loading the training set and the test set: the loop returns here
}

# Two iterations completed, another sanity check should be applied
  # First, let's check if the I set was loaded
  if(!exists("freq.I.set.0.culling") & exists("freq.II.set.0.culling")) {
    message("Training set is missing, though.")
    message("Trying to build both tables from scratch.")
    corpus.exists = FALSE
  }
  # Secondly, let's check the II set
  if(exists("freq.I.set.0.culling") & !exists("freq.II.set.0.culling")) {
    message("Test set is missing, though.")
    message("Trying to build both tables from scratch.")
    corpus.exists = FALSE
  }
###############################################################################
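# A sketch of passing ready-made frequency tables (hypothetical objects;
# in-memory matrices/data frames are expected to have samples in rows and
# features in columns):
#
#   classify(gui = FALSE,
#            training.frequencies = my.training.table,
#            test.frequencies = my.test.table)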




# If the tables with frequencies could not be loaded so far (for any reason),
# try to load an external corpus (R object) passed as an argument

###############################################################################
# Checking if the argument "training.corpus" and/or "test.corpus" has been used
#
# Iterating over two sets: the training set and the test set
for(iteration in 1:2) {
    # first iteration: training set
    if(iteration == 1) {
      parsed.corpus = training.corpus
    }
    # second iteration: test set
    if(iteration == 2) {
      parsed.corpus = test.corpus
    }

  # checking if the variable "parsed.corpus" is empty
  if(corpus.exists == FALSE & length(parsed.corpus) > 0) {
      # if the variable was used, check its format
      if(is.list(parsed.corpus) == TRUE & length(parsed.corpus) > 1) {
          # checking if the samples have their names; otherwise, assign generic ones:
          if( length(names(parsed.corpus)) != length(parsed.corpus) ) {
            names(parsed.corpus) = paste("sample", 1:length(parsed.corpus), sep="_")
          }
        # if everything is fine, use this variable as a valid corpus
#        loaded.corpus = parsed.corpus
      } else {
        message("")
        message("The object you've specified as your corpus cannot be used.")
        message("It should be a list containing particular text samples")
        message("(vectors containing sequencies of words/n-grams or other features).")
        message("The samples (elements of the list) should have their names.")
        message("Alternatively, try to build your corpus from text files (default).\n")
        stop("Wrong corpus format")
      }
  }

  # 1st iteration: setting the matrix containing the training set (if applicable)
  if(iteration == 1) {
    corpus.of.primary.set = parsed.corpus
  }
  # 2nd iteration: setting the matrix containing the test set (if applicable)
  if(iteration == 2) {
    corpus.of.secondary.set = parsed.corpus
  }
# attempts at loading the training set and the test set: the loop returns here
}

# Two iterations completed, another sanity check should be applied
if(corpus.exists == FALSE) {
    if(length(corpus.of.primary.set) >1 & length(corpus.of.secondary.set) >1 ) {
      message("Two subcorpora loaded successfully.")
      corpus.exists = TRUE
    } else {
      message("The subcorpora will be loaded from text files...")
      corpus.exists = FALSE
    }
}
###############################################################################
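# A sketch of passing pre-parsed corpora (hypothetical samples; each element
# of the list is a vector of tokens, and each set needs at least two samples):
#
#   train = list(ClassA_1 = c("words", "of", "sample", "one"),
#                ClassB_1 = c("words", "of", "sample", "two"))
#   test  = list(ClassA_2 = c("more", "words", "here"),
#                ClassB_2 = c("and", "some", "more"))
#   classify(gui = FALSE, training.corpus = train, test.corpus = test)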




# If there's still no corpus available, then load and parse text files.
# They are supposed to be stored in a specified corpus subfolder and to follow
# a strictly defined naming convention.

###############################################################################
# Building subcorpora from text files

if(corpus.exists == FALSE) {

  # Checking whether required files and subdirectories exist
  # First check: allow user to choose a suitable folder via GUI
  if(file.exists(training.corpus.dir) == FALSE | file.exists(test.corpus.dir) == FALSE) {
    selected.path = tk_choose.dir(caption = "Select your working directory. It should contain two subdirectories called *primary_set* and *secondary_set*")
    setwd(selected.path)
  }
  
  # If the user failed to provide a suitable folder at this point, abort.
  if(file.exists(training.corpus.dir) == FALSE | file.exists(test.corpus.dir) == FALSE) {
    message("\n\n", "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
            "Working directory should contain two subdirectories:
        \"", training.corpus.dir, "\" and \"", test.corpus.dir, "\"\n",
            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n", sep = "")
    # back to the original working directory
    setwd(original.path)
    # error message
    stop("corpus prepared incorrectly")
  }
  
  # Retrieving the names of samples
  #
  filenames.primary.set = list.files(training.corpus.dir)
  filenames.secondary.set = list.files(test.corpus.dir)
  
  # Checking if the subdirectories contain enough samples
  if(length(filenames.primary.set) <2 | length(filenames.secondary.set) <2) {
    message("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
            "Both subdirectories \"", training.corpus.dir, "\" and \"",
            test.corpus.dir, "\"\nshould contain at least two text samples!\n",
            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
    # back to the original working directory
    setwd(original.path)
    # error message
    stop("corpus prepared incorrectly")
  }

  # loading text files, splitting, parsing, n-gramming, sampling, and so forth
  corpus.of.primary.set = load.corpus.and.parse(files = filenames.primary.set,
                         corpus.dir = training.corpus.dir,
                         encoding = encoding,
                         markup.type = corpus.format,
                         corpus.lang = corpus.lang,
                         splitting.rule = splitting.rule,
                         preserve.case = preserve.case,
                         sample.size = sample.size,
                         sampling = sampling,
                         sampling.with.replacement = sampling.with.replacement,
                         sample.overlap = sample.overlap,
                         number.of.samples = number.of.samples,
                         features = analyzed.features,
                         ngram.size = ngram.size)

  # loading text files: test set
  corpus.of.secondary.set = load.corpus.and.parse(files = filenames.secondary.set,
                         corpus.dir = test.corpus.dir,
                         encoding = encoding,
                         markup.type = corpus.format,
                         corpus.lang = corpus.lang,
                         splitting.rule = splitting.rule,
                         preserve.case = preserve.case,
                         sample.size = sample.size,
                         sampling = sampling,
                         sampling.with.replacement = sampling.with.replacement,
                         sample.overlap = sample.overlap,
                         number.of.samples = number.of.samples,
                         features = analyzed.features,
                         ngram.size = ngram.size)
}
###############################################################################




# At this point, some corpora SHOULD be available. If there are still no
# frequency tables, they will be built at this stage

###############################################################################
# building tables of frequencies

if(!exists("freq.I.set.0.culling") | !exists("freq.II.set.0.culling")) {

  # blank line on the screen
  message("")

  # both corpora (training set and test set) should contain some texts;
  # if the training set has fewer than 2 samples, or the test set is empty,
  # the script will stop
  if(length(corpus.of.primary.set) < 2 || length(corpus.of.secondary.set) < 1) {
    message("\n\n", "either the training set or the test set is empty!", "\n")
    stop("corpus error")
  }

  # If an external vector of features (usually: the most frequent words) has not
  # been specified (cf. the argument "features"), then we need a list of the most
  # frequent words (or n-grams, or anything else) used in the current corpus,
  # in descending order, without frequencies (just a list of words/features).
  if (features.exist == TRUE) {
    message("")
    message("using an existing wordlist (vector of features)...")
    mfw.list.of.all = features
  } else {
    # Extracting all the words (features) used in the texts of the primary set
    # (or both sets, if "reference.wordlist.of.all.samples" is set to TRUE)
    wordlist.of.primary.set = c()
    message("")
    # iterating over the samples stored in corpus.of.primary.set
    for (file in 1 : length(corpus.of.primary.set)) {
      # loading the next sample from corpus.of.primary.set
      current.text = corpus.of.primary.set[[file]]
      # putting the samples together:
      wordlist.of.primary.set = c(wordlist.of.primary.set, current.text)
      # short message on screen
      message(".", appendLF = FALSE)
      if(file/25 == floor(file/25)) { message("")} # a newline every 25th sample
    }
    # including words of the secondary set in the reference wordlist (if specified)
      if (reference.wordlist.of.all.samples == TRUE) {
        wordlist.of.secondary.set = c()
        message("")
        for (file in 1 : length(corpus.of.secondary.set)) {
          # loading the next sample from corpus.of.secondary.set
          current.text = corpus.of.secondary.set[[file]]
          # putting samples together:
          wordlist.of.secondary.set = c(wordlist.of.secondary.set, current.text)
          # short message on screen
          message(".", appendLF = FALSE)
          if(file/25 == floor(file/25)) { message("")} # a newline every 25th sample
        }
      } else {
        # otherwise, create an empty vector
        wordlist.of.secondary.set = c()
      }

    # Preparing a sorted frequency list of the whole primary set (or both sets).
    # short message
    message(" ")
    message(length(c(wordlist.of.primary.set, wordlist.of.secondary.set)), " tokens ",
         "will be used to create a list of features")
    # the core procedure: frequency list
    mfw.list.of.all = sort(table(c(wordlist.of.primary.set, wordlist.of.secondary.set)),
                            decreasing = TRUE)
    # if the whole list is long, then cut off the tail (e.g., > 5000 mfw)
      if (length(mfw.list.of.all) > mfw.list.cutoff) {
        mfw.list.of.all = mfw.list.of.all[1:mfw.list.cutoff]
      }
    # the only thing we need is the words ordered by frequency (no frequencies)
    mfw.list.of.all = names(mfw.list.of.all)
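    # A tiny worked example of the two steps above:
    #   sort(table(c("to", "be", "or", "not", "to", "be")), decreasing = TRUE)
    # counts "be" and "to" twice, "not" and "or" once; names() then reduces
    # the sorted table to the bare vector of words, most frequent first.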

    # Saving the list of features.
    # some comments into the file containing wordlist
    cat("# This file contains the words that were used for building the table",
      "# of frequencies. It can be also used for the next tasks, and for this",
      "# purpose it can be manually revised, edited, deleted, culled, etc.",
      "# You can either delete unwanted words, or mark them with \"#\"",
      "# -----------------------------------------------------------------------",
      "", file = "wordlist.txt", sep = "\n")
    # the current wordlist into a file
      # checking if encoding conversion is needed
      if(encoding == "native.enc") {
        data.to.be.saved = mfw.list.of.all
      } else {
        data.to.be.saved = iconv(mfw.list.of.all, to=encoding)
      }
  # writing the stuff
  cat(data.to.be.saved, file = "wordlist.txt", sep = "\n", append = TRUE)

  }   # <----- conditional expr. if(features.exist == TRUE) terminates here







  # empty the dump-dir if it already existed and create it if it did not previously exist
  if(dump.samples == TRUE){
	  if (file.exists("sample_dump_primary_set")){
		# a dump-dir seems to have been created during a previous run;
		# delete it to remove all of its previous contents
		unlink("sample_dump_primary_set", recursive = TRUE)
	  }
	# (re)create the dump-dir
	dir.create("sample_dump_primary_set")
    # writing the stuff into files
    setwd("sample_dump_primary_set")
      for(i in names(corpus.of.primary.set)) {
        cat(corpus.of.primary.set[[i]], file = paste(names(corpus.of.primary.set[i]), ".txt", sep = ""))
      }
    setwd("..")
  }
  # empty the dump-dir if it already existed and create it if it did not previously exist
  if(dump.samples == TRUE){
	  if (file.exists("sample_dump_secondary_set")){
		# a dump-dir seems to have been created during a previous run;
		# delete it to remove all of its previous contents
		unlink("sample_dump_secondary_set", recursive = TRUE)
	  }
	# (re)create the dump-dir
	dir.create("sample_dump_secondary_set")
    # writing the stuff into files
    setwd("sample_dump_secondary_set")
      for(i in names(corpus.of.secondary.set)) {
        cat(corpus.of.secondary.set[[i]], file = paste(names(corpus.of.secondary.set[i]), ".txt", sep = ""))
      }
    setwd("..")
  }







  # blank line on the screen
  message("")


  # preparing a huge table of all the frequencies for the training set
  freq.I.set.0.culling = make.table.of.frequencies(corpus = corpus.of.primary.set,
                            features = mfw.list.of.all,
                            absent.sensitive = FALSE,
                            relative = relative.frequencies)

  # preparing a huge table of all the frequencies for the test set
  freq.II.set.0.culling = make.table.of.frequencies(corpus = corpus.of.secondary.set,
                            features = mfw.list.of.all,
                            absent.sensitive = FALSE,
                            relative = relative.frequencies)

  # writing the frequency tables to text files (they can be re-used!)
  # first, the training set
      # checking if any re-encoding is needed
      if(encoding == "native.enc") {
        data.to.be.saved = t(freq.I.set.0.culling)
      } else {
        data.to.be.saved = t(freq.I.set.0.culling)
        rownames(data.to.be.saved) = iconv(rownames(data.to.be.saved), to=encoding)
        colnames(data.to.be.saved) = iconv(colnames(data.to.be.saved), to=encoding)
      }
  # writing the stuff
  write.table(data.to.be.saved, file = "freq_table_primary_set.txt")

  # now, the test set
      # checking if any re-encoding is needed
      if(encoding == "native.enc") {
        data.to.be.saved = t(freq.II.set.0.culling)
      } else {
        data.to.be.saved = t(freq.II.set.0.culling)
        rownames(data.to.be.saved) = iconv(rownames(data.to.be.saved), to=encoding)
        colnames(data.to.be.saved) = iconv(colnames(data.to.be.saved), to=encoding)
      }
  # writing the stuff
  write.table(data.to.be.saved, file = "freq_table_secondary_set.txt")


}
###############################################################################
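# Once the two frequency files above exist, subsequent runs can skip the
# corpus parsing entirely (a sketch relying on the backward-compatibility
# switch handled earlier):
#
#   classify(gui = FALSE, use.existing.freq.tables = TRUE)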

# #################################################
# the module for loading the corpus terminates here
# #################################################



# #################################################
# module for exporting config settings
# #################################################

# Finally, we want to save some variable values for later use
cat("", file = "classify_config.txt", append = FALSE)
var.name <- function(x) {
  if(is.character(x) == TRUE) {
    cat(paste(deparse(substitute(x)), " = \"", x, "\"", sep = ""),
        file = "classify_config.txt", sep = "\n", append = TRUE)
  } else {
    cat(paste(deparse(substitute(x)), x, sep = " = "),
        file = "classify_config.txt", sep = "\n", append = TRUE)
  }
}
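# e.g. var.name(corpus.lang) appends a line like:   corpus.lang = "English"
# while var.name(mfw.min) appends:                  mfw.min = 100
# (the actual values depend on the current settings)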

 var.name(corpus.format)
 var.name(corpus.lang)
 var.name(analyzed.features)
 var.name(ngram.size)
 var.name(mfw.min)
 var.name(mfw.max)
 var.name(mfw.incr)
 var.name(start.at)
 var.name(culling.min)
 var.name(culling.max)
 var.name(culling.incr)
 var.name(mfw.list.cutoff)
 var.name(delete.pronouns)
 var.name(preserve.case)
 var.name(encoding)
 var.name(use.existing.freq.tables)
 var.name(use.existing.wordlist)
 var.name(classification.method)
 var.name(culling.of.all.samples)
 var.name(z.scores.of.all.samples)
 var.name(reference.wordlist.of.all.samples)
 var.name(distance.measure)
 var.name(svm.kernel)
 var.name(svm.degree)
 var.name(svm.coef0)
 var.name(svm.cost)
 var.name(k.value)
 var.name(l.value)
 var.name(sampling)
 var.name(sample.size)
 var.name(number.of.samples)
 var.name(final.ranking.of.candidates)
 var.name(how.many.correct.attributions)
 var.name(number.of.candidates)
 var.name(save.distance.tables)
 var.name(save.analyzed.features)
 var.name(save.analyzed.freqs)




# #################################################







# #################################################
# MAIN PROGRAM; the main loop is below
# #################################################


# cleaning the outputfile
cat("", file = outputfile, append = FALSE)

# saving the original mfw.max value in mfw.max.original
# this is useful for subtitles of bootstrap graphs
mfw.max.original = mfw.max

# the general counter for different purposes: initialization
number.of.current.iteration = 0

# useful for diagnostic reasons; this will be reported in the logfile
total.no.of.correct.attrib = c()
total.no.of.possible.attrib = c()

# retrieving the total number of texts to be "guessed"
# (anonymous texts and unique authorial samples will not be counted)
classes.train = c(gsub("_.*", "", rownames(freq.I.set.0.culling)))
classes.test = c(gsub("_.*", "", rownames(freq.II.set.0.culling)))
perfect.guessing = length(classes.test[classes.test %in% classes.train])
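# A worked example of the naming convention assumed above:
#   gsub("_.*", "", "Dickens_GreatExpectations.txt")   # returns "Dickens"
# i.e. everything after the first underscore is stripped, so the class
# (usually the author) has to come first in each sample name.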







# #################################################
# module for culling (THE MAIN LOOP IN THE PROGRAM)
# #################################################


# testing if the desired culling settings are acceptable;
# if too large, they are set to the maximum possible value
  if(culling.max > 100) {
  culling.max = 100
  }
  if(culling.min > 100) {
  culling.min = 100
  }
# if too small, it is set to 0 (i.e. minimal value)
  if(culling.min < 0) {
  culling.min = 0
  }
# if max value is LOWER than min value, make them equal
  if(culling.max < culling.min) {
  culling.max = culling.min
  }
# avoiding infinite loops
  if(culling.incr <= 1) {
  culling.incr = 10
  }
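
# e.g. with culling.min = 0, culling.max = 100 and culling.incr = 20, the
# loop below tests the culling levels 0, 20, 40, 60, 80 and 100%; at 100%
# only the features present in every single sample survive.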

# #################################################



for(j in (culling.min/culling.incr):(culling.max/culling.incr)) {


        current.culling = j * culling.incr


        # applying culling
        # an additional table composed of relative word frequencies
        # of joint primary and secondary sets
        if(culling.of.all.samples == FALSE) {
                # applying the function culling to the I set
                primary.set = perform.culling(freq.I.set.0.culling,
                                             current.culling)
                # selecting the same variables from the II set
                secondary.set = freq.II.set.0.culling[,colnames(primary.set)]
        } else {
                # combining the two sets
                freq.table.both.sets = rbind(freq.I.set.0.culling,
                                             freq.II.set.0.culling)
                # applying the culling function to the combined table
                freq.table.both.sets = perform.culling(freq.table.both.sets,
                                             current.culling)
                # split the combined table into two sets again
                primary.set = freq.table.both.sets[rownames(freq.I.set.0.culling),]
                secondary.set = freq.table.both.sets[rownames(freq.II.set.0.culling),]
        }



        # additionally, deleting pronouns (if applicable)
        if(delete.pronouns == TRUE) {
                primary.set = delete.stop.words(primary.set, pronouns)
                secondary.set = delete.stop.words(secondary.set, pronouns)
        }


        # optionally, deleting stop words
        if(is.vector(stop.words) == TRUE) {
                primary.set = delete.stop.words(primary.set, stop.words)
                secondary.set = delete.stop.words(secondary.set, stop.words)
        }




# #################################################
# culling is done, but we are still inside the main loop





# starting the frequency list at the frequency rank set in the option start.at

# TO TELL THE TRUTH, IT COULD BE DONE MUCH EARLIER: at the moment when
# the frequency list for either the I set or both sets is produced,
# it could be cut there and used for building the frequency tables

primary.set = primary.set[,start.at:length(primary.set[1,])]
secondary.set = secondary.set[,start.at:length(secondary.set[1,])]




# Testing if the desired MFW number is acceptable;
# if it is too large, it is set to the maximum possible value.
  if(mfw.max > length(primary.set[1,])) {
  mfw.max = length(primary.set[1,])
  }
# if too small, it is set to 2 (i.e., minimal value)
  if(mfw.min < 2) {
  mfw.min = 2
  }
# if the max value is smaller than the min value, it will be adjusted
  if(mfw.max < mfw.min) {
  mfw.max = mfw.min
  }
# avoiding infinite loops
  if( (mfw.max != mfw.min) && (mfw.incr == 0) ) {
  mfw.incr = 10
  }





message("")
message("culling @ ", current.culling, "\t", "available words ",
                  length(primary.set[1,]))







# an additional table composed of relative word frequencies
# of joint primary and secondary sets
freq.table.both.sets = rbind(primary.set, secondary.set)




if(tolower(classification.method) == "delta") {
        # a short message on the screen:
        if(distance.measure == "delta") {
                message("Calculating classic Delta distances...")
        }
        if(distance.measure == "argamon") {
                message("Calculating Argamon's Delta distances...")
        }
        if(distance.measure == "eder") {
                message("Calculating Eder's Delta distances...")
        }
        if(distance.measure == "eder") {
                message("Calculating Eder's Simple distances...")
        }
        if(distance.measure == "manhattan") {
                message("Calculating Manhattan distances...")
        }
        if(distance.measure == "canberra") {
                message("Calculating Canberra distances...")
        }
        if(distance.measure == "euclidean") {
                message("Calculating Euclidean distances...")
        }
        if(distance.measure == "cosine") {
                message("Calculating Cosine distances...")
        }
}
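
# For reference: classic Burrows's Delta between a test sample and a training
# sample is the mean absolute difference of their feature z-scores, roughly
#
#   delta = mean(abs(z.scores.test.sample - z.scores.training.sample))
#
# (a sketch only; the actual computation happens inside perform.delta() below)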










# #################################################
# the internal loop starts here (for i = mfw.min : mfw.max)
# #################################################
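# e.g. with mfw.min = 100, mfw.max = 500 and mfw.incr = 100, the loop below
# tests the 100, 200, 300, 400 and 500 most frequent features in turn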



for(i in seq(mfw.min, mfw.max, round(mfw.incr)) ) {

    
mfw = i



# for safety reasons: if the requested MFW number exceeds the available words
if(mfw > length(colnames(freq.table.both.sets)) ) {
  mfw = length(colnames(freq.table.both.sets))
  }



# the current task (number of MFW currently analyzed) echoed on the screen
message(mfw, " ", appendLF = FALSE)































if(tolower(classification.method) == "delta") {
    classification.results = perform.delta(training.set = primary.set[,1:mfw],
                                test.set = secondary.set[,1:mfw],
                                distance = distance.measure,
                                z.scores.both.sets = z.scores.of.all.samples)
    distance.table = classification.results$distance_table
    classification.results = classification.results$y

}


if(tolower(classification.method) == "knn") {
    classification.results = perform.knn(training.set = primary.set[,1:mfw],
                                       test.set = secondary.set[,1:mfw],
                                       k.value = k.value)$y
}


if(tolower(classification.method) == "svm") {
    classification.results = perform.svm(training.set = primary.set[,1:mfw],
                                       test.set = secondary.set[,1:mfw])$y
}


if(tolower(classification.method) == "naivebayes") {
    classification.results = perform.naivebayes(training.set= primary.set[,1:mfw],
                                              test.set = secondary.set[,1:mfw])$y
}


if(tolower(classification.method) == "nsc") {
    classification.results = perform.nsc(training.set = primary.set[,1:mfw],
                                       test.set = secondary.set[,1:mfw],
                                       show.features = show.features)
    classification.results.features = classification.results$features
    classification.results = classification.results$y
}





expected_classes = gsub("_.*","",rownames(secondary.set))
predicted_classes = as.vector(classification.results)

#performance = performance.measures(expected_classes, predicted_classes)







# when requested, lists the misclassified samples and their predicted classes
if(final.ranking.of.candidates == TRUE) {
    cat("\n\n\n", file = outputfile, append = TRUE)
      misclassified.samples =
                   paste(rownames(secondary.set), "\t-->\t",
                   classification.results)[classes.test != classification.results]
      cat(misclassified.samples, file = outputfile, append = TRUE, sep = "\n")
      # temporarily (the results should be made available, eventually)
      rm(misclassified.samples)
}




# returns the number of correct attributions
if(how.many.correct.attributions == TRUE) {
    no.of.correct.attrib = sum(as.numeric(classes.test ==
                                 classification.results))
    total.no.of.correct.attrib =
         c(total.no.of.correct.attrib, no.of.correct.attrib)
    total.no.of.possible.attrib =
         c(total.no.of.possible.attrib, perfect.guessing)
    cat("\n", file = outputfile, append = TRUE)
    cat(mfw, " MFW , culled @ ", current.culling, "%,  ",
         no.of.correct.attrib, " of ", perfect.guessing, "\t(",
         round(no.of.correct.attrib / perfect.guessing * 100, 1), "%)",
         "\n", file = outputfile, append = TRUE, sep = "")
}











if(cv.folds > 0) {

  message("")
  message("cross-validation...")


  #bootstrap.output = "bootstrap_output.txt"
  #cleaning the bootstrapfile
  #cat("", file = bootstrap.output, append = FALSE)

  # creating empty containers for the final success scores
  cross.validation.results = c()
  cross.validation.results.all = c()
  
  
  # accumulating the predictions and the expected classes
  predicted_classes = c()
  expected_classes = c()




  # beginning of k-fold cross-validation (k being the number of iterations)
  for(iterations in 1 : cv.folds) {

    # an additional table combining the frequencies of set I and set II,
    # just for feeding the cross-validation module
    freq.table.both.sets.binded = rbind(primary.set[,1:mfw], secondary.set[,1:mfw])


    names.of.training.set.orig = rownames(primary.set)
    classes.training.set = gsub("_.*", "", rownames(primary.set))
    classes.test.set = gsub("_.*", "", rownames(secondary.set))
    names.both.sets = rownames(freq.table.both.sets.binded)
    classes.both.sets = c(classes.training.set, classes.test.set)

    training.samples = c()
    test.samples = c()


      # iterating over the classes represented in the I set
      for(i in names(table(classes.training.set)) ) {
        #
        # count the number of samples of class i included originally in I set
        no.of.training.samples = sum(as.numeric(classes.training.set == i))
        # determine the class' name, surround the name with word boundary char
        class.name = paste("\\b", i, "\\b", sep = "")
        # in both sets, identify the positions of current class' samples
        pinpoint.samples = grep(class.name, classes.both.sets)
        # sanity check, just in case
        if(length(pinpoint.samples) > no.of.training.samples) {
                # select randomly N items from the pinpointed positions
                training = sample(pinpoint.samples, no.of.training.samples)
                # identify the remaining ones: future test set samples
                test = setdiff(pinpoint.samples, training)
                # pick the names at the positions identified above
                training.samples = c(training.samples, names.both.sets[training])
                # the remaining ones go to the test set
                test.samples = c(test.samples, names.both.sets[test])
        } else {
                test = pinpoint.samples
                test.samples = c(test.samples, names.both.sets[test])
        }
      }
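
      # A tiny worked example of the stratified sampling above: if class "A"
      # originally had 2 training samples and occupies positions c(1, 4, 7)
      # in the combined sets, then sample(c(1, 4, 7), 2) might pick c(7, 1)
      # for training, and setdiff(c(1, 4, 7), c(7, 1)) leaves 4 for the test set.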


#### !!! Anon samples are excluded!!!


  # establishing the training set:
  training.set = freq.table.both.sets.binded[training.samples,]

  # establishing the test set
  test.set = freq.table.both.sets.binded[test.samples,]


 # zscores.training.set = zscores.table.both.sets[training.samples,]
 # zscores.test.set = zscores.table.both.sets[test.samples,]


  if(tolower(classification.method) == "delta") {
      classification.results = perform.delta(training.set,
                                   test.set,
                                   distance = distance.measure,
                                   z.scores.both.sets = z.scores.of.all.samples)
      distance.table = classification.results$distance_table
      classification.results = classification.results$y
  }
  if(tolower(classification.method) == "knn") {
      classification.results = perform.knn(training.set, test.set, k.value)$y
  }
  if(tolower(classification.method) == "svm") {
      classification.results = perform.svm(training.set, test.set)$y
  }
  if(tolower(classification.method) == "nsc") {
      classification.results = perform.nsc(training.set, test.set,
                                         show.features = show.features)$y
  }
  if(tolower(classification.method) == "naivebayes") {
      classification.results = perform.naivebayes(training.set, test.set)$y
  }



  # retrieving classes of the new training set
  classes.training = gsub("_.*", "", rownames(training.set))

  # retrieving classes of the new test set
  classes.test = gsub("_.*", "", rownames(test.set))

  
  
  # accumulating the predictions and the expected classes
  predicted_classes = c(predicted_classes, as.vector(classification.results))
  expected_classes = c(expected_classes, classes.test)


    # returns the number of correct attributions
    if(how.many.correct.attributions == TRUE) {
          no.of.correct.attrib = sum(as.numeric(classes.test ==
                                     classification.results))
      # getting the max. number of samples that could be guessed
      perfect.guessing.cv = sum(as.numeric(classes.test %in% classes.training))
      cat("\n", file = outputfile, append = TRUE)
      cat(mfw, " MFW , culled @ ", current.culling, "%,  ",
               no.of.correct.attrib, " of ", perfect.guessing.cv, "\t(",
               round(no.of.correct.attrib / perfect.guessing.cv * 100, 1), "%)",
               "\n", file = outputfile, append = TRUE, sep = "")
      # percentage of correct attributions
      success.rate.cv = no.of.correct.attrib / perfect.guessing.cv * 100
      # combining results for k folds
      cross.validation.results = c(cross.validation.results, success.rate.cv)
    }

  }


  cross.validation.results.all = cbind(cross.validation.results.all, cross.validation.results)
  colnames(cross.validation.results.all) = paste(mfw, "@", current.culling, sep="")
  
#  performance = performance.measures(expected_classes, predicted_classes)

  
}   # <-- if(cv.folds > 0)






if(exists("cross.validation.results.all")) {
  cross.validation.summary = cbind(cross.validation.summary, cross.validation.results.all)
  rownames(cross.validation.summary) = 1:cv.folds
}





# saving the requested stuff into external files

# writing distance table(s) to a file (if an appropriate option has been chosen)
if(save.distance.tables == TRUE && exists("distance.table") == TRUE) {
  distance.table.filename = paste("distance_table_", mfw, "mfw_", current.culling, "c.txt", sep = "")
    # checking if encoding conversion is needed
    if(encoding == "native.enc") {
      data.to.be.saved = distance.table
    } else {
      data.to.be.saved = distance.table
      rownames(data.to.be.saved) = iconv(rownames(data.to.be.saved), to = encoding)
      colnames(data.to.be.saved) = iconv(colnames(data.to.be.saved), to = encoding)
    }
  # writing the stuff
  write.table(file = distance.table.filename, data.to.be.saved)
}

# writing the words (or features) actually used in the analysis
features.actually.used = colnames(freq.table.both.sets[,1:mfw])
#
if(save.analyzed.features == TRUE) {
    # checking if encoding conversion is needed
    if(encoding == "native.enc") {
      data.to.be.saved = features.actually.used
    } else {
      data.to.be.saved = iconv(features.actually.used, to = encoding)
    }
  # writing the stuff
  cat(data.to.be.saved,
     file = paste("features_analyzed_", mfw, "mfw_", current.culling, "c.txt", sep = ""),
     sep = "\n")
}

# writing the frequency table that was actually used in the analysis
if(save.analyzed.freqs == TRUE) {
    # checking if encoding conversion is needed
    if(encoding == "native.enc") {
      data.to.be.saved = t(freq.table.both.sets[,1:mfw])
    } else {
      data.to.be.saved = t(freq.table.both.sets[,1:mfw])
      rownames(data.to.be.saved) = iconv(rownames(data.to.be.saved), to = encoding)
      colnames(data.to.be.saved) = iconv(colnames(data.to.be.saved), to = encoding)
    }
  # writing the stuff -- the file name will be changed accordingly
  write.table(data.to.be.saved,
     file = paste("frequencies_analyzed_", mfw, "mfw_", current.culling, "c.txt", sep = ""))
}









#######
####### the features, confusion matrices, etc., should be captured here!!!!!!
#######






}    # <-- the internal loop for(i) returns here
# #################################################

# blank line on the screen
message("")


}    # <-- the main loop for(j) returns here
# #################################################



all.guesses = total.no.of.correct.attrib / total.no.of.possible.attrib * 100
total.no.of.correct.attrib = sum(total.no.of.correct.attrib)
total.no.of.possible.attrib = sum(total.no.of.possible.attrib)



# information about the current task into the logfile
cat("\nGeneral attributive success:  ", total.no.of.correct.attrib, " of ",
           total.no.of.possible.attrib, " (",
           round(total.no.of.correct.attrib/total.no.of.possible.attrib*100, 1),
           "%)\n", file = outputfile, append = TRUE, sep = "")
cat("\nMFWs from ", mfw.min, " to ", mfw.max.original,
                  " @ increment ", mfw.incr, "\nCulling from ", culling.min,
                  " to ", culling.max, " @ increment ", culling.incr,
                  "\nPronouns deleted: ", delete.pronouns, "\n",
				  file = outputfile, append = TRUE, sep = "")
# additional empty line in outputfile (EOF)
cat("\n", file = outputfile, append = TRUE)


# the same information (about the current task) on screen
message("\nGeneral attributive success:  ", total.no.of.correct.attrib, " of ",
           total.no.of.possible.attrib, " (",
           round(total.no.of.correct.attrib/total.no.of.possible.attrib*100,1),
           "%, sd =", round(sd(all.guesses),1),"%)")
message("\nMFWs from ", mfw.min, " to ", mfw.max.original,
                  " @ increment ", mfw.incr, "\nCulling from ", culling.min,
                  " to ", culling.max, " @ increment ", culling.incr,
                  "\nPronouns deleted: ", delete.pronouns, sep = "")
message("")


###########################################################




# Names of many variables are incredibly unfashionable: they were acceptable
# in ver. 0.0.1 of the script, which provided just a basic Delta test
# with no additional options. Since it is quite a lot of work to modernize
# the variables' names in the code (and perhaps it is too late now...),
# these simple wrappers will rename at least the variables to be exported:
success.rate = all.guesses
  if(length(success.rate) >1) {
    overall.success.rate = mean(all.guesses)
  }
frequencies.training.set = freq.I.set.0.culling
frequencies.test.set = freq.II.set.0.culling
frequencies.both.sets = freq.table.both.sets
features.actually.used = colnames(freq.table.both.sets[,1:mfw])
features = mfw.list.of.all


if(exists("classification.results.features")) {
    distinctive.features = classification.results.features
}


# what about removing some of the variables? (suppose there are thousands
# of texts and dozens of features, and only 2GB of RAM...)







# #################################################
# preparing final results: building a class




if(exists("misclassified.samples")) {
  attr(misclassified.samples, "description") = "texts (samples) that were not correctly classified"
}
if(exists("cross.validation.summary") & length(cross.validation.summary) >0 ) {
  attr(cross.validation.summary, "description") = "correctly guessed samples (cross-validation folds)"
  if(dim(as.matrix(cross.validation.summary))[2] >1) {
    class(cross.validation.summary) = c("stylo.data", "matrix")
  }
}
if(exists("success.rate")) {
  attr(success.rate, "description") = "percentage of correctly guessed samples"
  class(success.rate) = "stylo.data"
}
if(exists("overall.success.rate")) {
  attr(overall.success.rate, "description") = "average percentage of correctly guessed samples"
  class(overall.success.rate) = "stylo.data"
}
if(exists("distance.table")) {
  attr(distance.table, "description") = "final distances between each pair of samples"
  class(distance.table) = "stylo.data"
}
if(exists("distinctive.features")) {
  attr(distinctive.features, "description") = "most distinctive features"
}
if(exists("frequencies.both.sets")) {
  attr(frequencies.both.sets, "description") = "frequencies of words/features across the corpus"
  class(frequencies.both.sets) = "stylo.data"
}
if(exists("features") & length(features) >0 ) {
  attr(features, "description") = "features (e.g. words, n-grams, ...) applied to data"
  class(features) = "stylo.data"
}
if(exists("features.actually.used")) {
  attr(features.actually.used, "description") = "features (e.g. frequent words) actually analyzed"
  class(features.actually.used) = "stylo.data"
}
if(exists("zscores.both.sets")) {
  attr(zscores.both.sets, "description") = "z-scored frequencies across the whole corpus"
  class(zscores.both.sets) = "stylo.data"
}
if(exists("frequencies.training.set")) {
  attr(frequencies.training.set, "description") = "frequencies of words/features in the training set"
  class(frequencies.training.set) = "stylo.data"
}
if(exists("frequencies.test.set")) {
  attr(frequencies.test.set, "description") = "frequencies of words/features in the test set"
  class(frequencies.test.set) = "stylo.data"
}
if(exists("performance")) {
  attr(performance, "description") = "precision, recall, accuracy, and the f1 measure"
}
if(exists("predicted_classes")) {
  predicted = predicted_classes
  attr(predicted, "description") = "a vector of classes predicted by the classifier"
}
if(exists("expected_classes")) {
  expected = expected_classes
  attr(expected, "description") = "ground truth, or a vector of expected classes"
}






# creating an object (list) that will contain the final results,
# tables of frequencies, etc.etc.
results.classify = list()
# elements that we want to add to this list
variables.to.save = c("misclassified.samples",
                      "success.rate",
                      "overall.success.rate",
                      "performance",
                      "predicted",
                      "expected",
                      "distance.table",
                      "distinctive.features",
                      "features",
                      "features.actually.used",
                      "zscores.both.sets",
                      "frequencies.both.sets",
                      "frequencies.training.set",
                      "cross.validation.summary",
                      "frequencies.test.set")
# checking if they really exist; getting rid of non-existing ones:
filtered.variables = ls()[ls() %in% variables.to.save]
# adding them to the list
for(i in filtered.variables) {
  results.classify[[i]] = get(i)
}



# adding some information about the current function call
# to the final list of results
results.classify$call = match.call()
results.classify$name = call("classify")


# This assigns the list of final results to the class "stylo.results";
# the same class will be used to handle the output of stylo(),
# rolling.delta() and oppose(). See the files "print.stylo.results.R"
# and "summary.stylo.results.R" (no help files are provided, since
# these two functions are not visible to the users).
class(results.classify) = "stylo.results"
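
# A sketch of inspecting the returned object (which fields are present
# depends on the options chosen above):
#
#   results = classify(gui = FALSE)
#   summary(results)
#   results$success.rate
#   results$distance.table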






# back to the original working directory
setwd(original.path)

# return the value of the function
return(results.classify)
}
