readtext: Import and Handling for Plain and Formatted Text Files

# TODO: re-do docs
# TODO: Check and remove extranous codes # TODO: recurse file listing for e.g. remote ZIP file
# TODO: readtext with csv doesn"t seem to require text_field

dvars <- function(x) as.data.frame(x)[, -c(1:2)]

test_that("test readtext with single filename", {
    fox <- c(fox.txt = "The quick brown fox jumps over the lazy dog.")
    expect_equal(
        as.character(readtext("../data/fox/fox.txt")),
        fox
    )
})

test_that("test for deprecated warning for text()", {
    fox <- c(fox.txt = "The quick brown fox jumps over the lazy dog.")
    expect_warning(
        texts(readtext("../data/fox/fox.txt")),
        "'texts.readtext' is deprecated.
Use 'Use as.character() instead' instead.",
        fixed = TRUE
    )
})

test_that("test readtext with vector of filenames", {
    expect_equal(
        length(as.character(readtext(
            c(
                "../data/fruits/apple.txt",
                "../data/fruits/orange.txt"
            )
        ))),
        2
    )
})

test_that("test readtext with glob-style mask", {
    expect_equal(
        length(as.character(readtext(
            "../data/glob/*.txt"
        ))),
        5
    )

    expect_equal(
        length(as.character(readtext(
            "../data/glob/?.txt"
        ))),
        4
    )

    # Glob in non-last part of filename
    expect_equal(
        length(as.character(readtext(
            "../data/glob/*/test.txt"
        ))),
        2
    )

    # It would be nice to be able to test that escaped glob doesn"t special
    # characters in filename, but R check won"t allow a file of this name to
    # exist in the package... This should still pass if run the test manually
    # (having created the file, supposing your platform supports it)
    #  expect_equal(
    #      length(as.character(readtext(
    #              "../data/glob/special/\\*.txt"
    #      ))),
    #      1
    #  )
})

test_that("test structured readtext with glob-style mask", {
    expect_equal(
        nrow(readtext("../data/csv/test*.csv", text_field = "text")),
        6
    )
    expect_equal(
        nrow(readtext("../data/csv/test*.csv", text_field = "text")),
        6
    )
})


test_that("test remote text file", {
    expect_equal(
        as.character(readtext("https://raw.githubusercontent.com/kbenoit/readtext/master/tests/data/fox/fox.txt")),
        c(fox.txt = "The quick brown fox jumps over the lazy dog.")
    )
    # ignore_missing_files with an existing file should make no difference
    expect_equal(
        as.character(readtext("https://raw.githubusercontent.com/kbenoit/readtext/master/tests/data/fox/fox.txt",
                       ignore_missing_files = TRUE)),
        c(fox.txt = "The quick brown fox jumps over the lazy dog.")
    )
    
    # ingores parameters in URL
    expect_equal(
        as.character(readtext("https://raw.githubusercontent.com/kbenoit/readtext/master/tests/data/fox/fox.txt?x=1&y=2",
                       ignore_missing_files = TRUE)),
        c(fox.txt = "The quick brown fox jumps over the lazy dog.")
    )

    
})


test_that("test remote csv file", {
    expect_equal(
        as.character(readtext("https://raw.githubusercontent.com/kbenoit/readtext/master/tests/data/csv/test.csv", text_field = "text")),
        c(test.csv.1 = "Lorem ipsum.", test.csv.2 = "Dolor sit")
    )
})


context("test that require recursive invocation of listFileNames (i.e. because a special filename resolves to another special filename)")

test_that("test zip file", {
    skip_on_cran()
    skip_on_os("windows")
    DATA_DIR <- system.file("extdata/", package = "readtext")
        expect_equal(
        length(as.character(readtext(paste0(DATA_DIR, "/data_files_encodedtexts.zip")))),
        36
    )
})

test_that("test warning for unrecognized filetype", {
    expect_warning(
        readtext("../data/empty/empty.nonesuch"),
        paste0("Unsupported extension ", sQuote("nonesuch"), " of file *")
    )

    # But test that it still loads
    expect_warning(
        readtext("../data/unknown/unknown"),
        paste0("Unsupported extension ", sQuote(""), " of file *")
    )
    expect_equal(
        readtext("../data/unknown/unknown", verbosity = 0)$text,
        "The quick brown fox jumps over the lazy dog."
    )
})


# TODO: Refactor this to loop over filetypes
test_that("test csv files", {
    # Test corpus object
    testcorpus <- readtext("../data/csv/test.csv", text_field = "text")
    expect_identical(
        as.data.frame(testcorpus)[, -c(1:2)],
        data.frame(list(colour = c("green", "red"), number = c(42L, 99L)))
    )
    expect_equal(
        as.character(testcorpus),
        c(test.csv.1 = "Lorem ipsum.", test.csv.2 = "Dolor sit")
    )

    expect_error(
        readtext("../data/csv/*", text_field = "nonesuch"),
        "There is no field called"
    )

    expect_error(
        readtext("../data/csv/*", text_field = 9000),
        "There is no 9000th field"
    )
})

test_that("test tab files", {
    testcorpus <- readtext("../data/tab/test.tab", text_field = "text")
    expect_equal(
        as.data.frame(testcorpus)[, -c(1:2)],
        data.frame(list(colour = c("green", "red"), number = c(42, 99)), 
                          stringsAsFactors = FALSE)
    )
    expect_equal(
        testcorpus$text,
        unname(c(test.tab.1 = "Lorem ipsum.", test.tab.2 = "Dolor sit"))
    )
    
    expect_error(
        readtext("../data/tab/test.tab", text_field = "nonexistent"),
                 "There is no field called nonexistent"
    )
    
})

test_that("test tsv files", {
    testcorpus <- readtext("../data/tsv/test.tsv", text_field = "text")
    expect_that(
        as.data.frame(testcorpus)[, -c(1:2)],
        equals(data.frame(list(colour = c("green", "red"), number = c(42, 99)), 
                          stringsAsFactors = FALSE))
    )
    expect_that(
        as.character(testcorpus),
        equals(c(test.tsv.1 = "Lorem ipsum.", test.tsv.2 = "Dolor sit"))
    )
    
    expect_error(
        readtext("../data/tsv/test.tsv", text_field = "nonexistant"),
                 "There is no field called nonexistant"
    )
    
})


test_that("test xml files", {
    # Test corpus object
    testcorpus <- readtext("../data/xml/test.xml", text_field = "text")
    expect_equal(
        data.frame(testcorpus[, -c(1, 2)]),
        data.frame(list(colour = c("green", "red"), number = c(42, 99)), 
                          stringsAsFactors = FALSE)
    )
    expect_equal(
        unname(as.character(testcorpus)),
        c("Lorem ipsum.", "Dolor sit")
    )
    expect_identical(
        testcorpus$doc_id,
        c("test.xml.1", "test.xml.2")
    )

    expect_warning(
        readtext("../data/xml/test.xml", text_field = 1),
        "You should specify text_field by name.*"
    )
    expect_identical(
        unname(as.character(suppressWarnings(readtext("../data/xml/test.xml", text_field = 1)))),
        c("Lorem ipsum.", "Dolor sit")
    )
    expect_identical(
        testcorpus$doc_id,
        c("test.xml.1", "test.xml.2")
    )
    
    expect_error(
        readtext("../data/xml/test.xml", text_field = "nonesuch"),
        "There is no field called"
    )
    
    expect_error(
        readtext("../data/xml/test.xml", text_field = 9000),
        "There is no 9000th field"
    )
})


test_that("test xml files with XPath", {

    expected <- c("The quick brown fox")
    names(expected) <- "tei.xml"

    actual <- readtext("../data/xml/tei.xml",
                      text_field = "/d1:TEI/d1:text/d1:body//d1:p")#,
                      # namespaces = c(tei = "https://www.tei-c.org/ns/1.0"))
    expect_equal(as.character(actual), expected)


    actual <- readtext("../data/xml/tei.xml", collapse = "P",
                       text_field = "/d1:TEI/d1:text/d1:body//d1:p")
                       # namespaces = c(tei = "https://www.tei-c.org/ns/1.0"))
    expect_equal(unname(as.character(actual)), "The Pquick Pbrown Pfox")

    actual <- readtext("../data/xml/tei.xml", collapse = "P",
                      text_field = "/d1:TEI//*/text()")#,
                      # namespaces = c(tei = "https://www.tei-c.org/ns/1.0"))
    expect_equal(unname(as.character(actual)), "Lorem Ipsum 1PSome PlacePAnywhere, USPNopePThe Pquick Pbrown PfoxPNope")

})


test_that("test readtext() with docvarsfrom = filenames", {
    
    expect_that(
        dvars(readtext("../data/docvars/one/*", docvarsfrom = "filenames")),
        equals(data.frame(list(docvar1 = c(1L, 2L), docvar2 = c("apple", "orange")), 
                          stringsAsFactors = FALSE))
    )
    
    expect_that(
        dvars(readtext("../data/docvars/dash/*", docvarsfrom = "filenames", dvsep = "-")),
        equals(data.frame(list(docvar1 = c(1,2), docvar2 = c("apple", "orange")), 
                          stringsAsFactors = FALSE))
    )
    
    
    expect_that(
        dvars(readtext("../data/docvars/two/*txt", docvarsfrom = "filenames")),
        equals(data.frame(list(docvar1 = c(1,2), docvar2 = c("apple", "orange")), docvar3 = c("red", "orange"), 
                          stringsAsFactors = FALSE))
    )
    
    expect_error(
        readtext("../data/docvars/two/*json", text_field = "nonesuch", docvarsfrom = "filenames"),
                "There is no field called"
    )
    
    expect_error(
        readtext("../data/docvars/unequal/*", docvarsfrom = "filenames"),
        "Filename elements are not equal in length."
    )
    
    expect_that(
        dvars(readtext("../data/docvars/two/*txt", docvarsfrom = "filenames",
                         docvarnames = c("id", "fruit", "colour"))),
        equals(data.frame(list(id = c(1,2), fruit = c("apple", "orange")), 
                          colour = c("red", "orange"), stringsAsFactors = FALSE))
    )

    expect_warning(
        dvars(readtext("../data/docvars/two/*txt", docvarsfrom = "filenames",
                         docvarnames = c("id", "fruit")
        )),
        "Fewer docnames supplied than existing docvars - last 1 docvar given generic names."
    )
    expect_that(
        dvars(suppressWarnings(readtext("../data/docvars/two/*txt", docvarsfrom = "filenames",
                         docvarnames = c("id", "fruit")))),
        equals(data.frame(list(id = c(1,2), fruit = c("apple", "orange")), 
                          docvar3 = c("red", "orange"), stringsAsFactors = FALSE))
    )
    
    expect_warning(
        dvars(readtext("../data/docvars/two/*txt", docvarsfrom = "filenames",
                         docvarnames = c("id")
        )),
        "Fewer docnames supplied than existing docvars - last 2 docvars given generic names."
    )
    
    #TODO: What happens if you supply more docnames?
    
    expect_error(
        dvars(readtext("../data/docvars/two/*txt", docvarsfrom = "nonesuch"))
    )
    
    #  Docvars from both metadata and filename
    expect_equal(
        dvars(readtext("../data/docvars/csv/*", docvarsfrom = c("filenames"), docvarnames = c("id", "fruit"), text_field = "text")),
        data.frame(list(shape = c("round", NA), texture = c(NA, "rough"), id = c(1, 2), fruit = c("apple", "orange")), 
                   stringsAsFactors = FALSE)
    )
    
    # #  Docvars from both metadata and filename
    # expect_equal(
    #     dvars(readtext("../data/docvars/json/*", docvarsfrom = c("filenames", "metadata"), docvarnames = c("id", "fruit"), text_field = "text")),
    #     data.frame(list(id = c(1, 2), fruit = c("apple", "orange"), shape = c("round", NA), texture = c(NA, "rough")), stringsAsFactors = FALSE)
    # )
    
})

test_that("test docvars.readtext warning with field != NULL", {
    expect_identical(
        dvars(readtext("../data/fox/fox.txt")),
        data.frame(x = 1)[, 0, drop=FALSE]
    )
})

test_that("test that readtext encoding argument must be either length 1 or same length as the number of files", {
    expect_error(
        readtext(
            c("../data/fox/fox.txt", "../data/fox/fox.txt", "../data/fox/fox.txt", "../data/fox/fox.txt"),
            encoding = c("utf-8", "utf-8")
        ),
        "Encoding parameter must be length 1, or as long as the number of files"
    )
})

context("Loading a corpus from a zip archive")
test_that("A single-level zip file containing txt files can be loaded",{
    qc <- readtext("../data/zip/inauguralTopLevel.zip")
    expect_equal(nrow(qc), 57)
})




context("Loading an empty gzipped tar archive")
test_that("An empty tar.gz file raises an error",{
    expect_error(
        readtext("../data/empty/test.tar.gz"),
        "File '../data/empty/test.tar.gz' does not exist"
    )
})


test_that("test reading structured text files with different columns", {
    testcorpus <- readtext(
        "../data/fruits/*.csv",
        text_field = "text"
    )
    
    expect_equal(
        dvars(testcorpus),
        data.frame(list(
            color = c("green", "orange", NA, NA), 
            shape = c(NA, NA, "round", "long")
        ),
        stringsAsFactors = FALSE)
    )
    expected_texts <- c("apple", "orange", "apple", "banana")
    names(expected_texts) <- c("1.csv.1", "1.csv.2", "2.csv.1", "2.csv.2")
    expect_equal(
        as.character(testcorpus),
        expected_texts
    )
})




context("Tests of new readtext internals. If these fail, it doesn't necessarily affect the exposed API")

context("Tests for list_files")

test_that("Test function to list files", {
    
    expect_error(
        readtext:::list_files("nonesuch://example.org/test.txt"),
        "Unsupported URL scheme"
    )
    
    testExistingFile <- readtext:::get_temp()
    file.create(testExistingFile)
    expect_equal(readtext:::list_files(testExistingFile), 
                 testExistingFile)
    expect_equal(readtext:::list_files(paste0('file://', testExistingFile)), 
                 testExistingFile)
    
    
    # Test vector of filenames
    testExistingFile2 <- readtext:::get_temp()
    file.create(testExistingFile2)
    expect_equal(
        readtext:::list_files(c(testExistingFile, testExistingFile2)),
        sort(c(testExistingFile, testExistingFile2))
    )
    
    # TODO: Test vector of filename and URL
    expect_equal(
        readtext:::list_files(c(testExistingFile, testExistingFile2)),
        sort(c(testExistingFile, testExistingFile2))
    )
    
    file.remove(testExistingFile)
    expect_error(
        readtext:::list_files(testExistingFile),
        "File '' does not exist"
    )
    expect_equal(
        readtext:::list_files(testExistingFile, ignore_missing = TRUE),
        character(0)
    )
    
    
    #Test globbing
    tempdir <- readtext:::get_temp(directory = TRUE)
    
    file.create(file.path(tempdir, "1.tsv"))
    file.create(file.path(tempdir, "2.tsv"))
    file.create(file.path(tempdir, "10.tsv"))
    
    expect_equal(
        length(readtext:::list_files(paste0(tempdir, "/", "*.tsv" ))),
        3
    )
    
    expect_equal(
        length(readtext:::list_files(paste0(tempdir, "/", "?.tsv" ))),
        2
    )
    
    expect_error(
        length(readtext:::list_files(paste0(tempdir, "/", "?.txt" ))),
        "File '' does not exist"
    )
    
    
    # Test globbing subdir
    
    tempsubdir1 <- readtext:::get_temp(temp_dir = tempdir, directory = TRUE)
    tempsubdir2 <- readtext:::get_temp(temp_dir = tempdir, directory = TRUE)
    
    file.create(file.path(tempsubdir1, "1.tsv"))
    file.create(file.path(tempsubdir1, "2.tsv"))
    file.create(file.path(tempsubdir2, "1.tsv"))
    
    expect_equal(
        length(readtext:::list_files(paste0(tempdir, "/", "*/", "?.tsv" ))),
        3
    )
    
    
    expect_error(
        readtext:::list_files("http://example.org/test.nonesuch"),
        "Remote URL does not end in known extension."
    )
    
})
    
test_that("Test function to list files with remote sources", {
    skip_on_cran()
    expect_error(
      readtext:::list_files("https://www.google.com/404.txt"),
      ".*404.*"
    )
    
    expect_equal(
      dim(readtext("https://www.google.com/404.txt", ignore_missing_files = TRUE)),
      c(1,2)
    )
})


test_that("text vectors have names of the files they come from by default (bug 221)", {

        expect_equal(
            names(as.character(readtext("../data/fox/fox.txt"))),
            "fox.txt"
        )

        actual_names <- names(as.character(readtext(
            "../data/csv/test*.csv", text_field = "text"
        )))
        expect_true(
            setequal(
                c("test.csv.1", "test.csv.2", "test2.csv.1", "test2.csv.2", "test3.csv.1", "test3.csv.2"),
                actual_names
            )
        )

        actual_names <- names(as.character(readtext(
            "../data/glob/*.txt"
        )))
        expect_true(
            setequal(
                c("1.txt", "2.txt", "3.txt", "4.txt", "10.txt"),
                actual_names
            )
        )

        actual_names <- names(as.character(readtext(
            "../data/tar/test.tar"
        )))
        expect_true(
            setequal(
                c("test.txt", "test2.txt", "test3.txt", "test4.txt"),
                actual_names
            )
        )

}) 

test_that("test globbed tar file",{
    skip_on_cran()
    skip_on_os("linux")
    expect_equal(
        unname(as.character(readtext("../data/tar/*"))),
        c("Lorem ipsum", "brown fox", "Dolor sit", "The quick")
    )
})

test_that("test html file",{
    expected <- c("The quick brown fox \njumps over the lazy dog")
    names(expected) <- "html5.html"

    expect_equal(
        as.character(readtext("../data/html/html5.html")),
        expected
    )

})

test_that("test malformed html file",{
    skip_on_os("windows")
    expected <- c("The quick brown fox \n    \njumps over the lazy dog")
    names(expected) <- "malformed_html5.html"
    expect_equal(
        as.character(readtext("../data/html/malformed_html5.html")),
        expected
    )
})

test_that("test for pdf file", {
    skip_on_os("windows")
    expect_output(
        cat(as.character(readtext("../data/pdf/test.pdf"))),
        "The quick brown fox jumps over the lazy dog\n" 
    )
})

test_that("test for odt file", {
	expected <- c("The quick brown fox jumps over the lazy dog")
	names(expected) <- "test.odt"
	expect_equal(
		as.character(readtext("../data/odt/test.odt")),
		expected
	)
	
})

test_that("test for docx file", {
    expected <- c("The quick brown fox jumps over the lazy dog")
    names(expected) <- "test.docx"
    
    expect_equal(
        as.character(readtext("../data/docx/test.docx")),
        expected
    )
})

test_that("test for doc file", {
    skip_on_os("windows")  
    expected <- paste(rep(c("The quick brown fox jumps over the lazy dog."), 10), collapse = " ")
    names(expected) <- "test.doc"

    txts <- as.character(readtext("../data/doc/test.doc"))
    namestmp <- names(txts)
    txts <- stringi::stri_replace_all_regex(txts, "\\n", " ")
    names(txts) <- namestmp

    expect_equal(
        txts,
        expected
    )
})

test_that("test json files", {
    skip_on_cran()
    skip_on_os("linux")
    expect_equal(
        unname(as.character(readtext("../data/json/test*json", text_field = "text"))),
        c("Lorem ipsum", "Dolor sit", "The quick", "brown fox", "Now is the winter")
    )
    
    #  test.json and test2.json are newline-delimited json
    #  test3.json is a single json object
    expected_docvars <- data.frame(list(
        colour = c("green", "red", "orange", "blue", NA), 
        number = c(42, 99, 0, NA, 3)),
        stringsAsFactors = FALSE)
    expected_docvars <- expected_docvars[order(expected_docvars$number),]
    row.names(expected_docvars) <- NULL
    actual_docvars <- dvars(readtext("../data/json/test*json", text_field = "text"))
    actual_docvars <- actual_docvars[order(actual_docvars$number),]
    row.names(actual_docvars) <- NULL
    row.names(actual_docvars)
    row.names(expected_docvars)
    expect_equal(
        actual_docvars,
        expected_docvars
    )
    
    expect_error(
        readtext("../data/json/test*json", text_field = 1),
        "Cannot use numeric text_field with json file"
    )
    
    expect_error(
        readtext("../data/json/test3.json", text_field = "nonesuch"),
        "There is no field called nonesuch in file"
    )
    
    
    # Twitter json files
    tweetSource <- readtext("../data/tweets/stream.json", source = "twitter")
    
    expect_equal(
        as.character(tweetSource),
        c(stream.json.1 = "I jumped over the lazy @dog", stream.json.2 = "Yawn")
    )
    
    expect_equal(
        dvars(tweetSource)$statuses_count,
        c(16204, 200)
    )
    
    expect_equal(
        dvars(tweetSource)$screen_name,
        c("foxxy", "dog")
    )
    
    
})

if (.Platform$OS.type == "unix") {
    test_that("test readtext with folder", {
        expect_equal(
            length(readtext("../data/fruits")$text),
            7
        )
    })
}    



context("Loading a corpus from a tar archive")
test_that("A single-level tar file containing txt files can be loaded",{
    skip_on_cran()
    skip_on_os("linux")
    expect_equal(
        unname(as.character(readtext("../data/tar/test.tar"))),
        c("Lorem ipsum", "brown fox", "Dolor sit", "The quick")
    )
})

context("Loading a corpus from a gzipped tar archive")
test_that("A single-level tar.gz file containing txt files can be loaded",{
    skip_on_cran()
    skip_on_os("linux")
    expect_equal(
        unname(as.character(readtext("../data/targz/test.tar.gz"))),
        c("Lorem ipsum", "brown fox", "Dolor sit", "The quick")
    )
})

context("Loading a corpus from a bzipped tar archive")
test_that("A single-level tar.bz file containing txt files can be loaded",{
    skip_on_cran()
    skip_on_os("linux")
    skip_on_os("windows")
    expect_equal(
        unname(as.character(readtext("../data/tarbz/test.tar.bz"))),
        c("Lorem ipsum", "brown fox", "Dolor sit", "The quick")
    )
})

context("Tests for verbosity argument")
test_that("test warning for unrecognized filetype", {
       expect_warning(
           readtext("../data/empty/empty.nonesuch"),
           paste0("Unsupported extension ", sQuote("nonesuch"), " of file")
       )
       expect_warning(
           readtext("../data/empty/empty.nonesuch", verbosity = 3),
           paste0("Unsupported extension ", sQuote("nonesuch"), " of file")
       )
       expect_warning(
           readtext("../data/empty/empty.nonesuch", verbosity = 2),
           paste0("Unsupported extension ", sQuote("nonesuch"), " of file")
       )
       expect_warning(
           readtext("../data/empty/empty.nonesuch", verbosity = 1),
           paste0("Unsupported extension ", sQuote("nonesuch"), " of file")
       )
       expect_silent(
           readtext("../data/empty/empty.nonesuch", verbosity = 0)
       )
})

test_that("messages from list_file",{
    expect_silent(
        readtext("../data/zip/inauguralTopLevel.zip", verbosity = 0)
    )
    expect_silent(
        readtext("../data/zip/inauguralTopLevel.zip", verbosity = 1)
    )
    expect_message(
        readtext("../data/zip/inauguralTopLevel.zip", verbosity = 2),
        "Reading texts from \\.\\./data/zip/inauguralTopLevel\\.zip"
    )
    expect_message(
        readtext("../data/zip/inauguralTopLevel.zip", verbosity = 3),
        "reading \\(txt\\) file: .*1789-Washington\\.txt"
    )
})

test_that("readtext called with textfield works with deprecation warning", {
    expect_equal(
        nrow(readtext("../data/csv/test*.csv", text_field = "text")),
        6
    )
    expect_equal(
        nrow(dvars(readtext("../data/csv/test*.csv", text_field = "text"))),
        6
    )
    expect_equal(
        length(as.character(readtext("../data/csv/test*.csv", text_field = "text"))),
        6
    )
    expect_warning(
        readtext("../data/csv/test*.csv", textfield = "text"),
        "textfield is deprecated; use text_field instead"
    )
})


test_that("tests for Excel files", {

    expect_equal(unname(as.character(
        readtext("../data/xls/test.xlsx", text_field = "text"))),
        c("The quick", "brown fox", "jumps over", "the lazy dog.")
    )
    expect_that(
        dvars(readtext("../data/xls/test.xlsx", text_field = "text")),
        equals(data.frame(list(
                        colour = c("orange", "blue", "pink", "pink"),
                        number = c(0, NA, NA, NA),
                        taste = c(NA, NA, "sweet", "umami")
                        ), stringsAsFactors = FALSE))
    )


    expect_equal(
        as.character(readtext("../data/xls/test.xls", text_field = "text")),
        c("test.xls.1" = "The quick", "test.xls.2" = "brown fox", 
          "test.xls.3" = "jumps over", "test.xls.4" = "the lazy dog.")
    )
    expect_that(
        dvars(readtext("../data/xls/test.xls", text_field = "text")),
        equals(data.frame(list(
                        colour = c("orange", "blue", "pink", "pink"),
                        number = c(0, NA, NA, NA),
                        taste = c(NA, NA, "sweet", "umami")
                        ), stringsAsFactors = FALSE))
    )


})

test_that("tests for ODS files", {
    expect_equal(unname(as.character(
        readtext("../data/ods/test.ods", text_field = "text"))),
        c("The quick", "brown fox", "jumps over", "the lazy dog.")
    )
    expect_identical(
        readtext("../data/ods/test.ods", text_field = "text"),
        structure(list(doc_id = c("test.ods.1", "test.ods.2", "test.ods.3", 
                                  "test.ods.4"), text = c("The quick", "brown fox", "jumps over", 
                                                          "the lazy dog."), colour = c("orange", "blue", "pink", "pink"
                                                          ), number = c(0L, NA, NA, NA), taste = c(NA, NA, "sweet", "umami"
                                                          )), row.names = c(NA, -4L), class = c("readtext", "data.frame"
                                                          ))
    )

})


test_that("rases error when source is not valid", {
    
    expect_error(
        readtext('../data/nexis/sun_2000-11-01_0001.html', source = 1),
        'source must be a character'
    )
    expect_error(
        readtext('../data/nexis/sun_2000-11-01_0001.html', source = 'something'),
        "'nexis' is the only source type available for HTML."
    )
    expect_silent(
        readtext('../data/nexis/sun_2000-11-01_0001.html', source = 'nexis')
    )
    
    expect_error(
        readtext('../data/tweets/stream.json', source = 1),
        'source must be a character'
    )
    
    expect_error(
        readtext('../data/tweets/stream.json', source = 'something'),
        "'twitter' is the only source type available for json"
    )
    
    expect_silent(
        readtext('../data/tweets/stream.json', source = 'twitter')
    )
    
})

test_that("readtext works with one-column csv files (#138)", {
    expect_equivalent(
        readtext("../data/csv/data_onecol.csv"),
        data.frame(doc_id = paste("data_onecol.csv", 1:2, sep = "."),
                   text = c("foo foo foo foo", "bar bar bar bar"),
                   stringsAsFactors = FALSE)
    )
    expect_equivalent(
        readtext("../data/csv/data_twocol.csv"),
        data.frame(doc_id = paste("data_twocol.csv", 1:2, sep = "."),
                   text = c("foo foo foo foo", "bar bar bar bar"),
                   y = 1:2,
                   stringsAsFactors = FALSE)
    )
    expect_equivalent(
        readtext("../data/csv/data_twocol.csv", text_field = "x", docid_field = "y"),
        data.frame(doc_id = c("1", "2"),
                   text = c("foo foo foo foo", "bar bar bar bar"),
                   stringsAsFactors = FALSE)
    )
})

test_that("tests for ODS files", {
    expect_identical(
        unname(as.character(readtext("../data/rtf/*.rtf"))),
        c("The quick brown fox jumps over the lazy dog",
          "This is an example of “rich text” format.")
    )
})

test_that("tests for files with doc_id", {
    expect_identical(
        as.character(readtext("../data/csv/withdocid.csv", docid_field = "doc_id", text_field = "text")),
        c(doc1 = "The quick", doc2 = "brown fox", doc3 = "jumped over")
    )
    expect_identical(
        as.character(readtext("../data/csv/withdocid.csv", text_field = "text"))[1],
        c(withdocid.csv.1 = "The quick")
    )
    expect_identical(
        as.character(readtext("../data/ods/withdocid.ods", docid_field = "doc_id", text_field = "text")),
        c(doc1 = "The quick", doc2 = "brown fox", doc3 = "jumped over")
    )
    expect_identical(
        as.character(readtext("../data/xls/withdocid.xls", docid_field = "doc_id", text_field = "text")),
        c(doc1 = "The quick", doc2 = "brown fox", doc3 = "jumped over")
    )
    expect_identical(
        as.character(readtext("../data/json/withdocid.json", docid_field = "doc_id", text_field = "text")),
        c(doc1 = "Lorem ipsum", doc2 = "Dolor sit")
    )
    expect_message(
        readtext("../data/csv/withdocid.csv", text_field = "text"),
        'A field called "doc_id" exists in the file. If you intend to use it as a document identifier, use "docid_field" option.')
    expect_error(
        readtext("../data/xls/withdocid.xls", docid_field = "nonesuch"),
        "There is no field called nonesuch"
    )
    expect_error(
        readtext("../data/xls/withdocid.xls", docid_field = 9000),
        "There is no 9000th field"
    )  
})
kbenoit/readtext documentation built on Feb. 27, 2024, 7:45 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
kbenoit/readtext
Import and Handling for Plain and Formatted Text Files

tests/testthat/test-readtext.R
In kbenoit/readtext: Import and Handling for Plain and Formatted Text Files

R Package Documentation

Browse R Packages

We want your feedback!

kbenoit/readtext Import and Handling for Plain and Formatted Text Files

tests/testthat/test-readtext.R In kbenoit/readtext: Import and Handling for Plain and Formatted Text Files

R Package Documentation

Browse R Packages

We want your feedback!

kbenoit/readtext
Import and Handling for Plain and Formatted Text Files

tests/testthat/test-readtext.R
In kbenoit/readtext: Import and Handling for Plain and Formatted Text Files