# R/RcppExports.R

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
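
# The functions below are thin .Call() wrappers around textTinyR's compiled
# C++ routines; they are internal (not exported), so from outside the package
# namespace they are reached with the `:::` operator. A minimal sketch,
# assuming the installed package (the expected result is an assumption drawn
# from the function name, not from the C++ source):
if (FALSE) {
    textTinyR:::modulus(10L, 3L)   # presumably the integer remainder, i.e. 1
}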

Path_2vector <- function(path_2folder = "", path_2file = "") {
    .Call(`_textTinyR_Path_2vector`, path_2folder, path_2file)
}

Frequency_distribution <- function(x, path_2folder = "", path_2file = "", file_delimiter = '\n') {
    .Call(`_textTinyR_Frequency_distribution`, x, path_2folder, path_2file, file_delimiter)
}

Count_characters <- function(x, path_2folder = "", path_2file = "", file_delimiter = '\n') {
    .Call(`_textTinyR_Count_characters`, x, path_2folder, path_2file, file_delimiter)
}

Collocations_ngrams <- function(x, path_2folder = "", path_2file = "", file_delimiter = '\n', n_gram_delimiter = "_") {
    .Call(`_textTinyR_Collocations_ngrams`, x, path_2folder, path_2file, file_delimiter, n_gram_delimiter)
}

Dice_similarity <- function(x, y, n_grams) {
    .Call(`_textTinyR_Dice_similarity`, x, y, n_grams)
}

Levenshtein_dist <- function(s, t) {
    .Call(`_textTinyR_Levenshtein_dist`, s, t)
}

Cosine_dist <- function(x, y, split_separator = " ") {
    .Call(`_textTinyR_Cosine_dist`, x, y, split_separator)
}
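
# The three wrappers above are plain string-similarity utilities. A minimal
# sketch (inputs are hypothetical; exact return conventions, e.g. distance
# versus similarity for Cosine_dist, are not visible from the wrappers):
if (FALSE) {
    Dice_similarity("guitar", "guitarist", n_grams = 2L)      # Dice overlap of character 2-grams
    Levenshtein_dist("kitten", "sitting")                     # classic example; the textbook edit distance is 3
    Cosine_dist("this is a sentence", "this is another one")  # token-level cosine, split on " "
}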

Dissimilarity_mat <- function(words, dice_n_gram = 2L, method = "dice", split_separator = " ", dice_thresh = 0.3, upper = TRUE, diagonal = TRUE, threads = 1L) {
    .Call(`_textTinyR_Dissimilarity_mat`, words, dice_n_gram, method, split_separator, dice_thresh, upper, diagonal, threads)
}
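
# Dissimilarity_mat() builds a pairwise word-dissimilarity matrix for a whole
# character vector in one call. A minimal sketch with made-up words:
if (FALSE) {
    Dissimilarity_mat(c("word", "words", "wordy"), dice_n_gram = 2L,
                      method = "dice", threads = 1L)
}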

Look_up_tbl <- function(VEC, n_grams) {
    .Call(`_textTinyR_Look_up_tbl`, VEC, n_grams)
}

res_token <- function(x, language, language_spec, LOCALE_UTF, FLAG_path, read_file_delimiter, max_num_char, remove_char = "", cpp_to_lower = FALSE, cpp_to_upper = FALSE, cpp_remove_punctuation = FALSE, remove_punctuation_vector = FALSE, cpp_remove_numbers = FALSE, cpp_trim_token = FALSE, cpp_tokenization_function = FALSE, cpp_string_separator = "-*", cpp_remove_stopwords = FALSE, min_num_char = 1L, stemmer = "NULL", min_n_gram = 1L, max_n_gram = 1L, skip_n_gram = 1L, skip_distance = 0L, n_gram_delimiter = " ", concat_delimiter = "NULL", path_2file = "", stemmer_ngram = 4L, stemmer_gamma = 0.0, stemmer_truncate = 3L, stemmer_batches = 1L, threads = 1L, verbose = FALSE, save_2single_file = FALSE, path_extend = "output_token.txt", vocabulary_path = "") {
    .Call(`_textTinyR_res_token`, x, language, language_spec, LOCALE_UTF, FLAG_path, read_file_delimiter, max_num_char, remove_char, cpp_to_lower, cpp_to_upper, cpp_remove_punctuation, remove_punctuation_vector, cpp_remove_numbers, cpp_trim_token, cpp_tokenization_function, cpp_string_separator, cpp_remove_stopwords, min_num_char, stemmer, min_n_gram, max_n_gram, skip_n_gram, skip_distance, n_gram_delimiter, concat_delimiter, path_2file, stemmer_ngram, stemmer_gamma, stemmer_truncate, stemmer_batches, threads, verbose, save_2single_file, path_extend, vocabulary_path)
}

res_token_vector <- function(VEC, language, language_spec, LOCALE_UTF, max_num_char, remove_char, cpp_to_lower, cpp_to_upper, cpp_remove_punctuation, remove_punctuation_vector, cpp_remove_numbers, cpp_trim_token, cpp_tokenization_function, cpp_string_separator, cpp_remove_stopwords, min_num_char, stemmer, min_n_gram, max_n_gram, skip_n_gram, skip_distance, n_gram_delimiter, concat_delimiter, path_2file, stemmer_ngram, stemmer_gamma, stemmer_truncate, stemmer_batches, threads, verbose, vocabulary_path) {
    .Call(`_textTinyR_res_token_vector`, VEC, language, language_spec, LOCALE_UTF, max_num_char, remove_char, cpp_to_lower, cpp_to_upper, cpp_remove_punctuation, remove_punctuation_vector, cpp_remove_numbers, cpp_trim_token, cpp_tokenization_function, cpp_string_separator, cpp_remove_stopwords, min_num_char, stemmer, min_n_gram, max_n_gram, skip_n_gram, skip_distance, n_gram_delimiter, concat_delimiter, path_2file, stemmer_ngram, stemmer_gamma, stemmer_truncate, stemmer_batches, threads, verbose, vocabulary_path)
}

res_token_list <- function(VEC, language, language_spec, LOCALE_UTF, max_num_char, remove_char, cpp_to_lower, cpp_to_upper, cpp_remove_punctuation, remove_punctuation_vector, cpp_remove_numbers, cpp_trim_token, cpp_tokenization_function, cpp_string_separator, cpp_remove_stopwords, min_num_char, stemmer, min_n_gram, max_n_gram, skip_n_gram, skip_distance, n_gram_delimiter, concat_delimiter, path_2file, stemmer_ngram, stemmer_gamma, stemmer_truncate, stemmer_batches, threads, verbose, vocabulary_path) {
    .Call(`_textTinyR_res_token_list`, VEC, language, language_spec, LOCALE_UTF, max_num_char, remove_char, cpp_to_lower, cpp_to_upper, cpp_remove_punctuation, remove_punctuation_vector, cpp_remove_numbers, cpp_trim_token, cpp_tokenization_function, cpp_string_separator, cpp_remove_stopwords, min_num_char, stemmer, min_n_gram, max_n_gram, skip_n_gram, skip_distance, n_gram_delimiter, concat_delimiter, path_2file, stemmer_ngram, stemmer_gamma, stemmer_truncate, stemmer_batches, threads, verbose, vocabulary_path)
}
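
# res_token(), res_token_vector() and res_token_list() back the package's
# tokenization / transformation utilities. Rather than filling in their long
# positional argument lists by hand, user code normally goes through the
# exported front ends (a sketch assuming textTinyR's documented API; the
# argument names here are an assumption):
if (FALSE) {
    textTinyR::tokenize_transform_text(object = "A word, another word!",
                                       to_lower = TRUE, split_string = TRUE)
}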

big_splitter_bytes <- function(input_path, batches, end_query, OUTPUT_PATH, trimmed_line = FALSE, verbose = FALSE) {
    invisible(.Call(`_textTinyR_big_splitter_bytes`, input_path, batches, end_query, OUTPUT_PATH, trimmed_line, verbose))
}

big_parser <- function(input_path_folder, start_query, end_query, output_path_folder, min_lines = 1L, trimmed_line = FALSE, verbose = FALSE) {
    invisible(.Call(`_textTinyR_big_parser`, input_path_folder, start_query, end_query, output_path_folder, min_lines, trimmed_line, verbose))
}

file_parser <- function(input_path_file, start_query, end_query, output_path_file = "", min_lines = 1L, trimmed_line = FALSE, verbose = FALSE) {
    invisible(.Call(`_textTinyR_file_parser`, input_path_file, start_query, end_query, output_path_file, min_lines, trimmed_line, verbose))
}

convert_bytes <- function(input_path_file, unit = "GB") {
    .Call(`_textTinyR_convert_bytes`, input_path_file, unit)
}

big_tokenize <- function(input_path_folder, output_path_folder, batches, language, language_spec, LOCALE_UTF, read_file_delimiter, max_num_char, increment_batch_no = 1L, remove_char = "", cpp_to_lower = FALSE, cpp_to_upper = FALSE, cpp_remove_punctuation = FALSE, remove_punctuation_vector = FALSE, cpp_remove_numbers = FALSE, cpp_trim_token = FALSE, cpp_tokenization_function = FALSE, cpp_string_separator = "-*", cpp_remove_stopwords = FALSE, min_num_char = 1L, stemmer = "NULL", min_n_gram = 1L, max_n_gram = 1L, skip_n_gram = 1L, skip_distance = 0L, n_gram_delimiter = " ", concat_delimiter = "NULL", stemmer_ngram = 4L, stemmer_gamma = 0.0, stemmer_truncate = 3L, stemmer_batches = 1L, threads = 1L, save_2single_file = FALSE, vocabulary_folder = "", verbose = FALSE) {
    invisible(.Call(`_textTinyR_big_tokenize`, input_path_folder, output_path_folder, batches, language, language_spec, LOCALE_UTF, read_file_delimiter, max_num_char, increment_batch_no, remove_char, cpp_to_lower, cpp_to_upper, cpp_remove_punctuation, remove_punctuation_vector, cpp_remove_numbers, cpp_trim_token, cpp_tokenization_function, cpp_string_separator, cpp_remove_stopwords, min_num_char, stemmer, min_n_gram, max_n_gram, skip_n_gram, skip_distance, n_gram_delimiter, concat_delimiter, stemmer_ngram, stemmer_gamma, stemmer_truncate, stemmer_batches, threads, save_2single_file, vocabulary_folder, verbose))
}

vocabulary_counts_big_tokenize <- function(input_path_folder, output_path_file, max_num_chars = 1000L, verbose = FALSE) {
    invisible(.Call(`_textTinyR_vocabulary_counts_big_tokenize`, input_path_folder, output_path_file, max_num_chars, verbose))
}

vocabulary_counts <- function(input_path_file, start_query, end_query, language, output_path_file = "", min_lines = 1L, trimmed_line = FALSE, query_transform = FALSE, language_spec = "english", LOCALE_UTF = "", max_num_char = 1000000000L, remove_char = "", cpp_to_lower = FALSE, cpp_to_upper = FALSE, cpp_remove_punctuation = FALSE, remove_punctuation_vector = FALSE, cpp_remove_numbers = FALSE, cpp_trim_token = FALSE, cpp_tokenization_function = FALSE, cpp_string_separator = " \r\n\t.,;:()?!//", cpp_remove_stopwords = FALSE, min_num_char = 1L, stemmer = "NULL", min_n_gram = 1L, max_n_gram = 1L, skip_n_gram = 1L, skip_distance = 0L, n_gram_delimiter = " ", threads = 1L, verbose = FALSE) {
    invisible(.Call(`_textTinyR_vocabulary_counts`, input_path_file, start_query, end_query, language, output_path_file, min_lines, trimmed_line, query_transform, language_spec, LOCALE_UTF, max_num_char, remove_char, cpp_to_lower, cpp_to_upper, cpp_remove_punctuation, remove_punctuation_vector, cpp_remove_numbers, cpp_trim_token, cpp_tokenization_function, cpp_string_separator, cpp_remove_stopwords, min_num_char, stemmer, min_n_gram, max_n_gram, skip_n_gram, skip_distance, n_gram_delimiter, threads, verbose))
}
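
# Taken together, the wrappers above form the big-text-files pipeline:
# big_splitter_bytes() splits a large input file into batches,
# big_parser() / file_parser() extract the text lying between start_query and
# end_query, big_tokenize() tokenizes each batch, and
# vocabulary_counts_big_tokenize() accumulates vocabulary counts over the
# resulting folder. A minimal sketch with hypothetical paths and queries:
if (FALSE) {
    convert_bytes("input/big_file.txt", unit = "MB")   # size of the raw input, useful for choosing `batches`
    big_splitter_bytes("input/big_file.txt", batches = 4L,
                       end_query = "</doc>", OUTPUT_PATH = "batches/")
    big_parser("batches/", start_query = "<doc>", end_query = "</doc>",
               output_path_folder = "parsed/")
}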

batch_2file <- function(INPUT_FILE, OUTPUT_PATH, batches, read_file_delimiter, language, language_spec, LOCALE_UTF, max_num_char, remove_char = "", cpp_to_lower = FALSE, cpp_to_upper = FALSE, cpp_remove_punctuation = FALSE, remove_punctuation_vector = FALSE, cpp_remove_numbers = FALSE, cpp_trim_token = FALSE, cpp_tokenization_function = FALSE, cpp_string_separator = "-*", cpp_remove_stopwords = FALSE, min_num_char = 1L, stemmer = "NULL", min_n_gram = 1L, max_n_gram = 1L, skip_n_gram = 1L, skip_distance = 0L, n_gram_delimiter = " ", stemmer_ngram = 4L, stemmer_gamma = 0.0, stemmer_truncate = 3L, stemmer_batches = 1L, threads = 1L, concat_delimiter = "\n", vocabulary_path = "", verbose = FALSE) {
    invisible(.Call(`_textTinyR_batch_2file`, INPUT_FILE, OUTPUT_PATH, batches, read_file_delimiter, language, language_spec, LOCALE_UTF, max_num_char, remove_char, cpp_to_lower, cpp_to_upper, cpp_remove_punctuation, remove_punctuation_vector, cpp_remove_numbers, cpp_trim_token, cpp_tokenization_function, cpp_string_separator, cpp_remove_stopwords, min_num_char, stemmer, min_n_gram, max_n_gram, skip_n_gram, skip_distance, n_gram_delimiter, stemmer_ngram, stemmer_gamma, stemmer_truncate, stemmer_batches, threads, concat_delimiter, vocabulary_path, verbose))
}

res_term_matrix <- function(vector_corpus, language, language_spec, LOCALE_UTF, max_num_char, document_term_matrix = TRUE, path_2documents_file = "NULL", sort_columns = FALSE, remove_char = "", cpp_to_lower = FALSE, cpp_to_upper = FALSE, cpp_remove_punctuation = FALSE, remove_punctuation_vector = FALSE, cpp_remove_numbers = FALSE, cpp_trim_token = FALSE, cpp_tokenization_function = FALSE, cpp_string_separator = "-*", cpp_remove_stopwords = FALSE, min_num_char = 1L, stemmer = "NULL", min_n_gram = 1L, max_n_gram = 1L, skip_n_gram = 1L, skip_distance = 0L, n_gram_delimiter = " ", stemmer_ngram = 4L, stemmer_gamma = 0.0, stemmer_truncate = 3L, stemmer_batches = 1L, threads = 1L, verbose = FALSE, print_every_rows = 1000L, normalize_tf = "NULL", tf_idf = FALSE) {
    .Call(`_textTinyR_res_term_matrix`, vector_corpus, language, language_spec, LOCALE_UTF, max_num_char, document_term_matrix, path_2documents_file, sort_columns, remove_char, cpp_to_lower, cpp_to_upper, cpp_remove_punctuation, remove_punctuation_vector, cpp_remove_numbers, cpp_trim_token, cpp_tokenization_function, cpp_string_separator, cpp_remove_stopwords, min_num_char, stemmer, min_n_gram, max_n_gram, skip_n_gram, skip_distance, n_gram_delimiter, stemmer_ngram, stemmer_gamma, stemmer_truncate, stemmer_batches, threads, verbose, print_every_rows, normalize_tf, tf_idf)
}

idf_global_term_weights <- function(Tmat, Terms) {
    .Call(`_textTinyR_idf_global_term_weights`, Tmat, Terms)
}
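
# For reference, the textbook inverse-document-frequency weight that a routine
# such as idf_global_term_weights() is expected to compute for a term t is
#     idf(t) = log(N / df(t))
# where N is the number of documents and df(t) the number of documents that
# contain t (the exact variant implemented in C++ is not visible from here).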

Adj_Sparsity <- function(column_indices, row_indices, docs_counts, Terms, sparsity_thresh = 1.0) {
    .Call(`_textTinyR_Adj_Sparsity`, column_indices, row_indices, docs_counts, Terms, sparsity_thresh)
}

Associations_Cpp <- function(column_indices_, row_indices_, docs_counts_, target_size, Terms, mult_target_var, keepTerms = 0L, target_var = -1L, normalize_TF = "NULL", tf_IDF = FALSE, verbose = FALSE) {
    .Call(`_textTinyR_Associations_Cpp`, column_indices_, row_indices_, docs_counts_, target_size, Terms, mult_target_var, keepTerms, target_var, normalize_TF, tf_IDF, verbose)
}

Most_Freq_Terms <- function(sparse_data, Terms, keepTerms = 0L, flag_dtm = FALSE, threads = 1L, verbose = FALSE) {
    .Call(`_textTinyR_Most_Freq_Terms`, sparse_data, Terms, keepTerms, flag_dtm, threads, verbose)
}

sparsity_float <- function(data) {
    invisible(.Call(`_textTinyR_sparsity_float`, data))
}

dense_2sparse_mat <- function(x) {
    .Call(`_textTinyR_dense_2sparse_mat`, x)
}

sp_sums <- function(sp_data, rowSums = FALSE) {
    .Call(`_textTinyR_sp_sums`, sp_data, rowSums)
}

tf_idf_exclude <- function(tmp_mat, document_term_matrix = TRUE) {
    .Call(`_textTinyR_tf_idf_exclude`, tmp_mat, document_term_matrix)
}

sp_means <- function(sp_data, rowMeans = FALSE) {
    .Call(`_textTinyR_sp_means`, sp_data, rowMeans)
}

save_sparse_ <- function(x, file_name = "save_sparse.mat") {
    invisible(.Call(`_textTinyR_save_sparse_`, x, file_name))
}

load_sparse_ <- function(file_name = "load_sparse.mat") {
    .Call(`_textTinyR_load_sparse_`, file_name)
}
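
# The sparse-matrix helpers above round-trip between dense and sparse storage.
# A minimal sketch (the file name is hypothetical; the exact sparse class
# returned by dense_2sparse_mat() is determined by the C++ side):
if (FALSE) {
    m  <- matrix(c(0, 1, 0, 2, 0, 0), nrow = 2)
    sm <- dense_2sparse_mat(m)            # dense -> sparse conversion
    sp_sums(sm, rowSums = TRUE)           # per-row sums of the sparse matrix
    sp_means(sm, rowMeans = FALSE)        # column means
    save_sparse_(sm, "example.mat")       # binary save ...
    sm2 <- load_sparse_("example.mat")    # ... and reload
}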

read_CHARS <- function(input_file, characters = 200L, write_2file = "") {
    .Call(`_textTinyR_read_CHARS`, input_file, characters, write_2file)
}

read_ROWS <- function(input_file, write_2file = "", read_delimiter = ' ', rows = 200L) {
    .Call(`_textTinyR_read_ROWS`, input_file, write_2file, read_delimiter, rows)
}

Not_Duplicated <- function(x) {
    .Call(`_textTinyR_Not_Duplicated`, x)
}

sublist <- function(input, ids) {
    .Call(`_textTinyR_sublist`, input, ids)
}

vec_parser <- function(input_path_file, start_query, end_query, trimmed_line = FALSE, verbose = FALSE) {
    .Call(`_textTinyR_vec_parser`, input_path_file, start_query, end_query, trimmed_line, verbose)
}

DIST <- function(MATRIX_1st, MATRIX_2nd, method, threads, eps = 1.0e-6) {
    .Call(`_textTinyR_DIST`, MATRIX_1st, MATRIX_2nd, method, threads, eps)
}
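
# DIST() computes a pairwise distance matrix between the rows of two numeric
# matrices. A minimal sketch ("euclidean" as the method string is an
# assumption; the accepted method names live in the C++ source):
if (FALSE) {
    m1 <- matrix(rnorm(20), nrow = 4)
    m2 <- matrix(rnorm(20), nrow = 4)
    DIST(m1, m2, method = "euclidean", threads = 1L)
}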

cosine_dist <- function(x, y, separator) {
    .Call(`_textTinyR_cosine_dist`, x, y, separator)
}

COS <- function(TEXT_SEQ1, TEXT_SEQ2, threads, separator) {
    .Call(`_textTinyR_COS`, TEXT_SEQ1, TEXT_SEQ2, threads, separator)
}

UNIQUE <- function(x) {
    .Call(`_textTinyR_UNIQUE`, x)
}

INTERSECT <- function(v1, v2) {
    .Call(`_textTinyR_INTERSECT`, v1, v2)
}

UNION <- function(v1, v2) {
    .Call(`_textTinyR_UNION`, v1, v2)
}

JACCARD <- function(vec1, vec2) {
    .Call(`_textTinyR_JACCARD`, vec1, vec2)
}

DICE <- function(vec1, vec2) {
    .Call(`_textTinyR_DICE`, vec1, vec2)
}

inner_jd <- function(VEC1, VEC2, method, j) {
    .Call(`_textTinyR_inner_jd`, VEC1, VEC2, method, j)
}

jaccard_dice <- function(VEC1, VEC2, method, threads = 1L) {
    .Call(`_textTinyR_jaccard_dice`, VEC1, VEC2, method, threads)
}
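
# JACCARD() and DICE() above compare two vectors as sets; the standard
# definitions they correspond to are
#     J(A, B)    = |intersect(A, B)| / |union(A, B)|
#     Dice(A, B) = 2 * |intersect(A, B)| / (|A| + |B|)
# (whether the C++ routines return the similarity or 1 - similarity is not
# visible from the wrappers). A minimal sketch:
if (FALSE) {
    JACCARD(c("a", "b", "c"), c("b", "c", "d"))   # textbook value: 2 / 4 = 0.5
}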

inner_cm <- function(x, y, i) {
    .Call(`_textTinyR_inner_cm`, x, y, i)
}

COR_MATR <- function(x, y, threads) {
    .Call(`_textTinyR_COR_MATR`, x, y, threads)
}

keep_idxs <- function(x, exclude_idx) {
    .Call(`_textTinyR_keep_idxs`, x, exclude_idx)
}

inner_reduce_dims <- function(x, ALL_OTHER_IDXs, i, current_col) {
    .Call(`_textTinyR_inner_reduce_dims`, x, ALL_OTHER_IDXs, i, current_col)
}

reduce_dims_with_correlation <- function(x, y, response_lower_thresh = 0.2, predictors_upper_thresh = 0.65, threads = 1L) {
    .Call(`_textTinyR_reduce_dims_with_correlation`, x, y, response_lower_thresh, predictors_upper_thresh, threads)
}
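
# Judging from the parameter names (an assumption, not a documented contract),
# reduce_dims_with_correlation() keeps the predictor columns of x whose
# correlation with the response y exceeds response_lower_thresh and then
# prunes predictors whose mutual correlation exceeds predictors_upper_thresh.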

modulus <- function(a, b) {
    .Call(`_textTinyR_modulus`, a, b)
}

batch_calculation <- function(nr_rows, batches) {
    .Call(`_textTinyR_batch_calculation`, nr_rows, batches)
}

DISTINCT_WORD_INTERSECT <- function(VEC1, VEC2) {
    .Call(`_textTinyR_DISTINCT_WORD_INTERSECT`, VEC1, VEC2)
}

NUM_LETTERS_DISTINCT <- function(VEC) {
    .Call(`_textTinyR_NUM_LETTERS_DISTINCT`, VEC)
}

COUNTS_INTERSECT <- function(SUBL1, SUBL2, distinct = TRUE, num_letters = FALSE) {
    .Call(`_textTinyR_COUNTS_INTERSECT`, SUBL1, SUBL2, distinct, num_letters)
}

RATIO_DISTINCT <- function(SUBL1, SUBL2, distinct = TRUE, num_letters = FALSE) {
    .Call(`_textTinyR_RATIO_DISTINCT`, SUBL1, SUBL2, distinct, num_letters)
}
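
# DISTINCT_WORD_INTERSECT(), COUNTS_INTERSECT() and RATIO_DISTINCT() compute
# token-overlap features between two lists of tokenized text (counts or
# ratios of shared tokens, optionally restricted to distinct tokens or taken
# at the letter level, judging from the `distinct` and `num_letters` flags).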

read_ROWS_wv <- function(input_file, read_delimiter = ' ') {
    .Call(`_textTinyR_read_ROWS_wv`, input_file, read_delimiter)
}

count_rows <- function(FILE, verbose = FALSE) {
    .Call(`_textTinyR_count_rows`, FILE, verbose)
}

reduced_word_vectors <- function(FILE, unique_tokens, vector_dimensions, print_every_rows = 10000L, verbose = FALSE, copy_data = FALSE) {
    .Call(`_textTinyR_reduced_word_vectors`, FILE, unique_tokens, vector_dimensions, print_every_rows, verbose, copy_data)
}

word_vectors_methods <- function(rcpp_list, INPUT_list, FILE, method, unique_tokens, vector_dimensions, gtw_terms, gtw_weights, print_every_rows = 10000L, verbose = FALSE, threads = 1L, copy_data = FALSE) {
    .Call(`_textTinyR_word_vectors_methods`, rcpp_list, INPUT_list, FILE, method, unique_tokens, vector_dimensions, gtw_terms, gtw_weights, print_every_rows, verbose, threads, copy_data)
}
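
# reduced_word_vectors() and word_vectors_methods() operate on a pretrained
# word-vector text file (presumably one token followed by its numeric vector
# per row, as produced by GloVe or fastText). A minimal sketch with a
# hypothetical file and vocabulary:
if (FALSE) {
    reduced_word_vectors("glove_vectors.txt", unique_tokens = c("good", "bad"),
                         vector_dimensions = 300L)
}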

append_data <- function(x, y) {
    .Call(`_textTinyR_append_data`, x, y)
}