#######################################################################
# arules - Mining Association Rules and Frequent Itemsets
# Copyright (C) 2011-2015 Michael Hahsler, Christian Buchta,
# Bettina Gruen and Kurt Hornik
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#######################################################################

#' Read Transaction Data
#'
#' Reads transaction data from a file and creates a
#' [transactions] object.
#'
#' For _basket_ format, each line in the transaction data file
#' represents a transaction where the items (item labels) are separated by the
#' characters specified by `sep`. For _single_ format, each line
#' corresponds to a single item and contains at least the ids of the
#' transaction and the item.
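#'
#' A minimal sketch of the two layouts (file contents only; the file
#' contents and item labels are just examples). Basket format with
#' `sep = ","`:
#'
#' ```
#' item1,item2,item3
#' item1,item3
#' ```
#'
#' Single format with `sep = ","`, where column 1 holds the transaction id
#' and column 2 the item:
#'
#' ```
#' trans1,item1
#' trans1,item2
#' trans2,item1
#' ```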
#'
#' @name read
#' @family import/export
#'
#' @param file the file name or a connection.
#' @param format a character string indicating the format of the data set. One
#' of `"basket"` or `"single"`, can be abbreviated.
#' @param header a logical value indicating whether the file contains the names
#' of the variables as its first line.
#' @param sep a character string specifying how fields are separated in the
#' data file. The default (`""`) splits at whitespace.
#' @param cols For the _single_ format, `cols` is a numeric or
#' character vector of length two giving the numbers or names of the columns
#' (fields) with the transaction and item ids, respectively. If character, the
#' first line of `file` is assumed to be a header with column names. For
#' the _basket_ format, `cols` can be a numeric scalar giving the
#' number of the column (field) with the transaction ids. If
#' `cols = NULL`, the data do not contain transaction ids.
#' @param rm.duplicates a logical value specifying if duplicate items should be
#' removed from the transactions.
#' @param quote a character string specifying the characters used as quotes
#' when reading.
#' @param skip the number of lines to skip in the file before starting to
#' read data.
#' @param encoding character string indicating the encoding which is passed to
#' [readLines()] or [scan()] (see [Encoding] for character encoding).
#' @return Returns an object of class [transactions].
#' @author Michael Hahsler and Kurt Hornik
#' @keywords file
#' @examples
#' ## create a demo file using basket format for the example
#' data <- paste(
#'   "# this is some test data",
#'   "item1, item2",
#'   "item1",
#'   "item2, item3",
#'   sep = "\n")
#' cat(data)
#' write(data, file = "demo_basket.txt")
#'
#' ## read demo data (skip the comment in the first line)
#' tr <- read.transactions("demo_basket.txt", format = "basket", sep = ",", skip = 1)
#' inspect(tr)
#' ## always make sure that the items were properly separated
#' itemLabels(tr)
#'
#' ## create a demo file using single format for the example
#' ## column 1 contains the transaction ID and column 2 contains one item
#' data <- paste(
#'   "trans1 item1",
#'   "trans2 item1",
#'   "trans2 item2",
#'   sep = "\n")
#' cat(data)
#' write(data, file = "demo_single.txt")
#'
#' ## read demo data
#' tr <- read.transactions("demo_single.txt", format = "single", cols = c(1,2))
#' inspect(tr)
#'
#' ## create a demo file using single format with column headers
#' data <- paste(
#'   "item_id;trans_id",
#'   "item1;trans1",
#'   "item1;trans2",
#'   "item2;trans2",
#'   sep = "\n")
#' cat(data)
#' write(data, file = "demo_single.txt")
#'
#' ## read demo data
#' tr <- read.transactions("demo_single.txt", format = "single",
#' header = TRUE, sep = ";", cols = c("trans_id", "item_id"))
#' inspect(tr)
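#'
#' ## a sketch: a connection can be used instead of a file name, and
#' ## rm.duplicates = TRUE drops items repeated within a transaction
#' con <- textConnection("item1,item1,item2\nitem2,item3")
#' tr <- read.transactions(con, format = "basket", sep = ",",
#'   rm.duplicates = TRUE)
#' close(con)
#' inspect(tr)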
#'
#' ## tidy up
#' unlink("demo_basket.txt")
#' unlink("demo_single.txt")
#' @export read.transactions
read.transactions <-
  function(file,
           format = c("basket", "single"),
           header = FALSE,
           sep = "",
           cols = NULL,
           rm.duplicates = FALSE,
           quote = "\"'",
           skip = 0,
           encoding = "unknown") {
    format <- match.arg(format)

    if (is.character(file)) {
      file <- file(file, "r")
      on.exit(close(file))
    }

    if (format == "basket") {
      data <- lapply(
        readLines(file, encoding = encoding),
        FUN = function(l)
          scan(
            text = l,
            what = 'character',
            sep = sep,
            quote = quote,
            quiet = TRUE,
            encoding = encoding
          )
      )

      if (header)
        skip <- skip + 1

      ## skip
      if (skip > 0)
        data <- data[-seq_len(skip)]

      if (!is.null(cols)) {
        if (!(is(cols, "numeric") && (length(cols) == 1)))
          stop("'cols' must be a numeric scalar for 'basket'.")
        cols <- as(cols, "integer")
        names(data) <- sapply(data, "[", cols)
        data <- lapply(data, "[", -cols)
      }

      ## remove leading and trailing white space
      data <- lapply(data, function(x) trimws(x))

      ## remove items with length(label) == 0
      data <- lapply(data, function(x) x[nchar(x) > 0])

      if (rm.duplicates)
        data <- .rm.duplicates(data)

      return(as(data, "transactions"))
    }
    ## If format is "single", have lines with TIDs and IIDs in the
    ## columns specified by 'cols'.

    ## If cols is a character vector of length 2 we assume the file
    ## has a header with colnames (added by F. Leisch)
    if (header) {
      colnames <- scan(
        file = file,
        what = "",
        sep = sep,
        quote = quote,
        quiet = TRUE,
        skip = skip,
        nlines = 1,
        encoding = encoding
      )

      if (is(cols, "character")) {
        cols <- match(cols, colnames)
        if (any(is.na(cols)))
          stop("'cols' does not match entries in the header of the file.")
      }

      ## the connection continues at the current position
      skip <- 0
    }

    ## Else we get the numbers of the columns directly
    if (length(cols) != 2)
      stop("'cols' must be a vector of length 2 for 'single'.")
    if (!is(cols, "numeric"))
      stop("'cols' must be numeric (character is only allowed for header = TRUE).")
    cols <- as(cols, "integer")

    ## Thanks to BDR for indicating how to only read in the relevant
    ## columns.
    what <- vector("list", length = max(cols))
    what[cols] <- ""

    entries <- scan(
      file = file,
      sep = sep,
      quote = quote,
      what = what,
      flush = TRUE,
      quiet = TRUE,
      skip = skip,
      encoding = encoding
    )

    tids <- factor(entries[[cols[1]]])
    items <- factor(entries[[cols[2]]])

    ## Note: duplicate TID/item pairs collapse automatically when the
    ## triplet matrix is coerced to column-sparse form, so rm.duplicates
    ## is not needed here
    ngT <- new(
      "ngTMatrix",
      i = as.integer(items) - 1L,
      j = as.integer(tids) - 1L,
      Dim = c(length(levels(items)), length(levels(tids))),
      Dimnames = list(levels(items), NULL)
    )

    trans <- as(as(ngT, "CsparseMatrix"), "transactions")
    transactionInfo(trans) <- data.frame(transactionID = levels(tids))

    return(trans)
  }
## helper: removes duplicate items within each transaction and prints the
## distribution of the number of duplicates removed per transaction, e.g.,
##   .rm.duplicates(list(c("a", "a", "b"), "b"))
## reports one transaction with one duplicate and returns list(c("a", "b"), "b")
.rm.duplicates <- function(x) {
  n <- sapply(x, length, USE.NAMES = FALSE)
  x <- lapply(x, unique)
  n <- n - sapply(x, length, USE.NAMES = FALSE)

  if (any(n)) {
    n <- table(items = n)[-1]
    cat("distribution of transactions with duplicates:\n")
    print(n)
  }

  x
}
#' @rdname write
setGeneric("write",
function(x, file = "", ...) base::write(x, file, ...))
#' Write Transactions or Associations to a File
#'
#' Provides the generic function `write()` and the methods to write
#' [transactions] or [associations] to a file.
#'
#' For associations ([rules] and [itemsets]), `write()` first coerces `x` to a
#' data.frame to obtain a printable form and then uses
#' [utils::write.table()] to write the data to disk. This is only intended to
#' export the associations in human-readable form; the exported associations
#' cannot be read back in as rules. To save and load associations in compact
#' form, use [save()] and [load()] from the \pkg{base} package.
#' Alternatively, associations can be written to disk in PMML (Predictive
#' Model Markup Language) via [write.PMML()]. This requires package \pkg{pmml}.
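#'
#' A minimal sketch of saving associations in compact form and loading them
#' back (object and file names are only examples):
#'
#' ```
#' save(rules, file = "rules.rda")
#' load("rules.rda")
#' ```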
#'
#' Transactions can be saved in _basket_ (one line per transaction) or in _single_
#' (one line per item) format.
#'
#' @name write
#' @family import/export
#'
#' @aliases write write.csv
#' @param x the [transactions] or [associations] ([rules], [itemsets], etc.) object.
#' @param file either a character string naming a file or a connection open for
#' writing. '""' indicates output to the console.
#' @param format format to write transactions.
#' @param sep the field separator string. Values within each row of `x` are
#' separated by this string. Use `quote = TRUE` and `sep = ","` to save the
#' data in CSV format.
#' @param quote a logical value. Should fields be quoted when writing?
#' @param \dots further arguments passed on to [write.table()].
#' Use `fileEncoding` to set the encoding used for
#' writing the file.
#' @author Michael Hahsler
#' @keywords file
#' @examples
#' data("Epub")
#'
#' ## write the formatted transactions to screen (basket format)
#' write(head(Epub))
#'
#' ## write the formatted transactions to screen (single format)
#' write(head(Epub), format="single")
#'
#' ## write the formatted result to a file in CSV format
#' write(Epub, file = "data.csv", format = "single", sep = ",")
#'
#' ## write rules in CSV format
#' rules <- apriori(Epub, parameter=list(support = 0.0005, conf = 0.8))
#' write(rules, file = "data.csv", sep = ",")
#'
#' unlink("data.csv") # tidy up
NULL
#' @rdname write
setMethod("write", signature(x = "transactions"),
function(x,
file = "",
format = c("basket", "single"),
sep = " ",
quote = TRUE,
...) {
format <- match.arg(format)
if (format == "basket") {
l <- LIST(x)
## quotes?
if (quote)
l <- lapply(
l,
FUN = function(s)
sprintf('"%s"', s)
)
dat <- unlist(list(lapply(l, paste, collapse = sep)))
write(dat, file = file, ...)
} else {
## format single
l <- LIST(x)
dat <-
data.frame(
transactionID = rep(labels(l), lapply(l, length)),
item = unlist(l),
row.names = NULL
)
utils::write.table(
dat,
file = file,
sep = sep,
quote = quote,
row.names = FALSE,
col.names = FALSE,
...
)
}
invisible(dat)
})
#' @rdname write
setMethod("write", signature(x = "associations"),
function(x,
file = "",
sep = " ",
quote = TRUE,
...)
utils::write.table(
as(x, "data.frame"),
file = file,
sep = sep,
quote = quote,
...
))