# R/make.frequency.list.optimized.R
# In stylo: Stylometric Multivariate Analyses

# #################################################
# Function for generating a frequency list of words or other (linguistic)
# features. It basically counts the elements of a vector and returns a vector
# of these elements in descending order of frequency.
# Refer to help(make.frequency.list) for further details.
# #################################################

make.frequency.list.optimized = function(data, 
                               value = FALSE,
                               head = NULL,
                               relative = TRUE) {
     
                                       
                                       
     #####################################
     # first, sanitize the input dataset
     
     # test if the dataset belongs to 'stylo.corpus' class
     if(inherits(data, "stylo.corpus") | is.list(data) == TRUE) {
             # unlist, or make one long text out of the corpus samples
#             data = unlist(data)
     # otherwise, test if the dataset is a vector
     } else if(is.vector(data) == FALSE) {        
             # whet it is not, produce an error message and stop
             stop("unable to make a list of frequencies")
     }
     
     # test if the dataset has at least two elements
     if(length(data) < 3) {
             stop("you try to measure frequencies of an empty vector!")
     }
     #####################################

     
     aggregate.values = c()
     aggregate.length = 0
     
for(k in 1: length(data)) {
#for(k in 1: 3) {
	message(sprintf("%7s", k), appendLF = FALSE) 
#	cat(k)#round(k/length(data), 3))

	current.values = sort(table(data[[k]]), decreasing = TRUE)
        if(length(current.values) > 10000) {
            current.values = current.values[1:10000]
        }
      current.length = length(data[[k]])
      current.names  = names(current.values)
        # for the first time, there is nothing to combine
        if(k == 1) {
            aggregate.values = current.values
        }
      aggregate.names = unique(c(current.names, names(aggregate.values)))
      aggregate.length = aggregate.length + current.length

 tmp.values1 = current.values[aggregate.names] / current.length
 tmp.values2 = aggregate.values[aggregate.names] / aggregate.length
 names(tmp.values1) = aggregate.names
 names(tmp.values2) = aggregate.names

 tmp.values = rbind(tmp.values1, tmp.values2)
 tmp.values[which(is.na(tmp.values))] = 0
 
 aggregate.values = sort(colMeans(tmp.values), decreasing = TRUE)
         if(length(aggregate.values) > 10000) {
            aggregate.values = aggregate.values[1:10000]
        }

        message("\b\b\b\b\b\b\b", appendLF = FALSE)
}

message("       ")


frequent.features = aggregate.values[1:5000]

#     #####################################     
#     # the dataset sanitized, let counting the features begin!     
#     frequent.features = sort(table(data), decreasing = TRUE)
#     #####################################
#
#     
#     # if relative frequencies were requested, they are normalized
#     if(relative == TRUE) {
#             frequent.features = frequent.features / length(data) * 100
#     }
#     
#     # additionally, one might limit the number of the most frequent features;
#     # this will return first n elements only (this is the argument 'head')
#     if(is.numeric(head) == TRUE) {
#             # sanitizing the indicated number
#             head = abs(round(head))
#             if(head == 0) head = 1
#             # cutting off the list
#             frequent.features = frequent.features[1:head]
#     }
#     
#     # in most cases, one needs just a list of features, without frequencies
     if(value == FALSE) {
             frequent.features = names(frequent.features)
     }
     
     
return(frequent.features)     
}