test-readtext.R
In readtext: Import and Handling for Plain and Formatted Text Files

# TODO: re-do docs
# TODO: Check and remove extraneous code
# TODO: recurse file listing for e.g. remote ZIP file
# TODO: readtext with csv doesn't seem to require text_field

library("quanteda")

test_that("test readtext with single filename", {
    # A single plain-text path yields one text named after the file.
    expected <- c(fox.txt = "The quick brown fox jumps over the lazy dog.")
    actual <- as.character(readtext("../data/fox/fox.txt"))
    expect_equal(actual, expected)
})

test_that("test for deprecated warning for texts()", {
    # Calling texts() on a readtext object must emit the deprecation warning.
    # The message spans two lines and is matched literally (fixed = TRUE).
    expect_warning(
        texts(readtext("../data/fox/fox.txt")),
        "'texts.readtext' is deprecated.\nUse 'Use as.character() instead' instead.",
        fixed = TRUE
    )
})

test_that("test readtext with vector of filenames", {
    # Two paths passed as one character vector should produce two texts.
    fruit_files <- c(
        "../data/fruits/apple.txt",
        "../data/fruits/orange.txt"
    )
    fruit_texts <- as.character(readtext(fruit_files))
    expect_equal(length(fruit_texts), 2)
})

test_that("test readtext with glob-style mask", {
    # Number of texts readtext() returns for a given glob pattern.
    count_texts <- function(pattern) {
        length(as.character(readtext(pattern)))
    }

    # "*" wildcard in the file name
    expect_equal(count_texts("../data/glob/*.txt"), 5)

    # "?" wildcard in the file name
    expect_equal(count_texts("../data/glob/?.txt"), 4)

    # Glob in non-last part of filename
    expect_equal(count_texts("../data/glob/*/test.txt"), 2)

    # It would be nice to test that an escaped glob is not treated as a special
    # character in the filename, but R check won't allow a file of this name
    # to exist in the package. This should still pass if the test is run
    # manually (having created the file, supposing your platform supports it):
    #  expect_equal(
    #      length(as.character(readtext(
    #              "../data/glob/special/\\*.txt"
    #      ))),
    #      1
    #  )
})

test_that("test structured readtext with glob-style mask", {
    # The CSV files matched by the glob contribute six rows in total.
    # (The original repeated this identical expectation twice; once suffices.)
    csv_rows <- readtext("../data/csv/test*.csv", text_field = "text")
    expect_equal(nrow(csv_rows), 6)
})


test_that("test remote text file", {
    # Needs network access, so skip on CRAN as the remote list_files test does.
    skip_on_cran()

    fox_url <- "https://raw.githubusercontent.com/kbenoit/readtext/master/tests/data/fox/fox.txt"
    fox <- c(fox.txt = "The quick brown fox jumps over the lazy dog.")

    expect_equal(as.character(readtext(fox_url)), fox)

    # ignore_missing_files with an existing file should make no difference
    expect_equal(
        as.character(readtext(fox_url, ignore_missing_files = TRUE)),
        fox
    )

    # ignores parameters in URL
    expect_equal(
        as.character(readtext(paste0(fox_url, "?x=1&y=2"),
                              ignore_missing_files = TRUE)),
        fox
    )
})


test_that("test remote csv file", {
    # Needs network access, so skip on CRAN as the remote list_files test does.
    skip_on_cran()

    csv_url <- "https://raw.githubusercontent.com/kbenoit/readtext/master/tests/data/csv/test.csv"
    expect_equal(
        as.character(readtext(csv_url, text_field = "text")),
        c(test.csv.1 = "Lorem ipsum.", test.csv.2 = "Dolor sit")
    )
})


context("test that require recursive invocation of listFileNames (i.e. because a special filename resolves to another special filename)")

test_that("test zip file", {
    skip_on_cran()
    skip_on_os("windows")
    # The archive is looked up in the installed package's extdata directory.
    data_dir <- system.file("extdata/", package = "readtext")
    zip_path <- paste0(data_dir, "/data_files_encodedtexts.zip")
    zip_texts <- as.character(readtext(zip_path))
    expect_equal(length(zip_texts), 36)
})

test_that("test warning for unrecognized filetype", {
    # Builds the expected warning pattern for a given file extension.
    unsupported_msg <- function(ext) {
        paste0("Unsupported extension ", sQuote(ext), " of file *")
    }

    expect_warning(
        readtext("../data/empty/empty.nonesuch"),
        unsupported_msg("nonesuch")
    )

    # A file without any extension warns as well ...
    expect_warning(
        readtext("../data/unknown/unknown"),
        unsupported_msg("")
    )
    # ... but its content is still loaded
    unknown_text <- readtext("../data/unknown/unknown", verbosity = 0)$text
    expect_equal(
        unknown_text,
        "The quick brown fox jumps over the lazy dog."
    )
})


# TODO: Refactor this to loop over filetypes
test_that("test csv files", {
    # Read one CSV file, taking the document text from the "text" column.
    csv_data <- readtext("../data/csv/test.csv", text_field = "text")

    # The remaining columns become document variables.
    expect_equal(
        docvars(csv_data),
        data.frame(list(colour = c("green", "red"), number = c(42, 99)),
                   stringsAsFactors = FALSE)
    )
    expect_equal(
        as.character(csv_data),
        c(test.csv.1 = "Lorem ipsum.", test.csv.2 = "Dolor sit")
    )

    # An unknown text_field, by name or by position, is an error.
    expect_error(
        readtext("../data/csv/*", text_field = "nonesuch"),
        "There is no field called"
    )

    expect_error(
        readtext("../data/csv/*", text_field = 9000),
        "There is no 9000th field"
    )
})

test_that("test tab files", {
    tab_data <- readtext("../data/tab/test.tab", text_field = "text")

    # Document variables survive conversion to a corpus
    expected_docvars <- data.frame(
        list(colour = c("green", "red"), number = c(42, 99)),
        stringsAsFactors = FALSE
    )
    expect_equal(docvars(corpus(tab_data)), expected_docvars)

    # The text column itself carries no names
    expect_equal(tab_data$text, c("Lorem ipsum.", "Dolor sit"))

    expect_error(
        readtext("../data/tab/test.tab", text_field = "nonexistent"),
        "There is no field called nonexistent"
    )
})

test_that("test tsv files", {
    # Read one TSV file, taking the document text from the "text" column.
    tsv_data <- readtext("../data/tsv/test.tsv", text_field = "text")

    expect_equal(
        docvars(tsv_data),
        data.frame(list(colour = c("green", "red"), number = c(42, 99)),
                   stringsAsFactors = FALSE)
    )
    expect_equal(
        as.character(tsv_data),
        c(test.tsv.1 = "Lorem ipsum.", test.tsv.2 = "Dolor sit")
    )

    # An unknown text_field name is an error that names the missing field.
    expect_error(
        readtext("../data/tsv/test.tsv", text_field = "nonexistant"),
        "There is no field called nonexistant"
    )
})


test_that("test xml files", {
    # Read one XML file, taking the document text from the "text" field.
    xml_data <- readtext("../data/xml/test.xml", text_field = "text")

    # Everything after the first two columns holds the document variables.
    expect_equal(
        data.frame(xml_data[, -c(1, 2)]),
        data.frame(list(colour = c("green", "red"), number = c(42, 99)),
                   stringsAsFactors = FALSE)
    )
    expect_equal(
        unname(as.character(xml_data)),
        c("Lorem ipsum.", "Dolor sit")
    )
    expect_equal(
        docnames(xml_data),
        c("test.xml.1", "test.xml.2")
    )

    # A numeric text_field is accepted for XML but triggers a warning.
    expect_warning(
        readtext("../data/xml/test.xml", text_field = 1),
        "You should specify text_field by name.*"
    )
    expect_equal(
        unname(as.character(suppressWarnings(
            readtext("../data/xml/test.xml", text_field = 1)
        ))),
        c("Lorem ipsum.", "Dolor sit")
    )

    # An unknown text_field, by name or by position, is an error.
    expect_error(
        readtext("../data/xml/test.xml", text_field = "nonesuch"),
        "There is no field called"
    )

    expect_error(
        readtext("../data/xml/test.xml", text_field = 9000),
        "There is no 9000th field"
    )
})


test_that("test xml files with XPath", {
    tei_file <- "../data/xml/tei.xml"
    paragraph_xpath <- "/d1:TEI/d1:text/d1:body//d1:p"
    # NOTE: an explicit
    #   namespaces = c(tei = "https://www.tei-c.org/ns/1.0")
    # argument was sketched for these calls but is not passed.

    # XPath as text_field, no collapse argument
    expected <- c(tei.xml = "The quick brown fox")
    actual <- readtext(tei_file, text_field = paragraph_xpath)
    expect_equal(as.character(actual), expected)

    # Matched nodes joined with a custom collapse string
    actual <- readtext(tei_file, collapse = "P", text_field = paragraph_xpath)
    expect_equal(unname(as.character(actual)), "The Pquick Pbrown Pfox")

    # Every text node below the TEI root
    actual <- readtext(tei_file, collapse = "P",
                       text_field = "/d1:TEI//*/text()")
    expect_equal(
        unname(as.character(actual)),
        "Lorem Ipsum 1PSome PlacePAnywhere, USPNopePThe Pquick Pbrown PfoxPNope"
    )
})


test_that("test readtext() with docvarsfrom = filenames", {

    # No dvsep given: filename parts become generically named docvars
    expect_equal(
        docvars(readtext("../data/docvars/one/*", docvarsfrom = "filenames")),
        data.frame(list(docvar1 = c(1L, 2L), docvar2 = c("apple", "orange")),
                   stringsAsFactors = FALSE)
    )

    # Custom separator between filename parts
    expect_equal(
        docvars(readtext("../data/docvars/dash/*", docvarsfrom = "filenames", dvsep = "-")),
        data.frame(list(docvar1 = c(1, 2), docvar2 = c("apple", "orange")),
                   stringsAsFactors = FALSE)
    )

    # Three filename parts
    expect_equal(
        docvars(readtext("../data/docvars/two/*txt", docvarsfrom = "filenames")),
        data.frame(list(docvar1 = c(1, 2), docvar2 = c("apple", "orange")),
                   docvar3 = c("red", "orange"),
                   stringsAsFactors = FALSE)
    )

    expect_error(
        readtext("../data/docvars/two/*json", text_field = "nonesuch", docvarsfrom = "filenames"),
        "There is no field called"
    )

    expect_error(
        readtext("../data/docvars/unequal/*", docvarsfrom = "filenames"),
        "Filename elements are not equal in length."
    )

    # docvarnames replaces the generic names
    expect_equal(
        docvars(readtext("../data/docvars/two/*txt", docvarsfrom = "filenames",
                         docvarnames = c("id", "fruit", "colour"))),
        data.frame(list(id = c(1, 2), fruit = c("apple", "orange")),
                   colour = c("red", "orange"), stringsAsFactors = FALSE)
    )

    # Too few docvarnames: warn and keep generic names for the rest
    expect_warning(
        docvars(readtext("../data/docvars/two/*txt", docvarsfrom = "filenames",
                         docvarnames = c("id", "fruit")
        )),
        "Fewer docnames supplied than existing docvars - last 1 docvar given generic names."
    )
    expect_equal(
        docvars(suppressWarnings(readtext("../data/docvars/two/*txt", docvarsfrom = "filenames",
                                          docvarnames = c("id", "fruit")))),
        data.frame(list(id = c(1, 2), fruit = c("apple", "orange")),
                   docvar3 = c("red", "orange"), stringsAsFactors = FALSE)
    )

    expect_warning(
        docvars(readtext("../data/docvars/two/*txt", docvarsfrom = "filenames",
                         docvarnames = c("id")
        )),
        "Fewer docnames supplied than existing docvars - last 2 docvars given generic names."
    )

    # TODO: What happens if you supply more docnames?

    # An unknown docvarsfrom value is rejected
    expect_error(
        docvars(readtext("../data/docvars/two/*txt", docvarsfrom = "nonesuch"))
    )

    # Docvars from both metadata and filename
    expect_equal(
        docvars(readtext("../data/docvars/csv/*", docvarsfrom = c("filenames"), docvarnames = c("id", "fruit"), text_field = "text")),
        data.frame(list(shape = c("round", NA), texture = c(NA, "rough"), id = c(1, 2), fruit = c("apple", "orange")),
                   stringsAsFactors = FALSE)
    )

    # # Docvars from both metadata and filename
    # expect_equal(
    #     docvars(readtext("../data/docvars/json/*", docvarsfrom = c("filenames", "metadata"), docvarnames = c("id", "fruit"), text_field = "text")),
    #     data.frame(list(id = c(1, 2), fruit = c("apple", "orange"), shape = c("round", NA), texture = c(NA, "rough")), stringsAsFactors = FALSE)
    # )

})

test_that("test docvars.readtext warning with field != NULL", {
    # Expected docvars for a plain-text file: one row, zero columns.
    no_docvars <- data.frame(x = 1)[, 0, drop = FALSE]
    expect_identical(docvars(readtext("../data/fox/fox.txt")), no_docvars)
})

test_that("test that readtext encoding argument must be either length 1 or same length as the number of files", {
    # Four files but only two encodings: neither length 1 nor length 4.
    four_files <- rep("../data/fox/fox.txt", 4)
    two_encodings <- c("utf-8", "utf-8")
    expect_error(
        readtext(four_files, encoding = two_encodings),
        "Encoding parameter must be length 1, or as long as the number of files"
    )
})

context("Loading a corpus from a zip archive")
test_that("A single-level zip file containing txt files can be loaded", {
    # One row is expected per text file in the archive.
    inaugural <- readtext("../data/zip/inauguralTopLevel.zip")
    expect_equal(nrow(inaugural), 57)
})




context("Loading an empty gzipped tar archive")
test_that("An empty tar.gz file raises an error", {
    empty_archive <- "../data/empty/test.tar.gz"
    expect_error(
        readtext(empty_archive),
        "File '../data/empty/test.tar.gz' does not exist"
    )
})


test_that("test reading structured text files with different columns", {
    fruit_data <- readtext("../data/fruits/*.csv", text_field = "text")

    # Columns missing from a file are filled with NA for that file's rows
    expected_docvars <- data.frame(
        list(
            color = c("green", "orange", NA, NA),
            shape = c(NA, NA, "round", "long")
        ),
        stringsAsFactors = FALSE
    )
    expect_equal(docvars(fruit_data), expected_docvars)

    expected_texts <- c(
        "1.csv.1" = "apple", "1.csv.2" = "orange",
        "2.csv.1" = "apple", "2.csv.2" = "banana"
    )
    expect_equal(as.character(fruit_data), expected_texts)
})




context("Tests of new readtext internals. If these fail, it doesn't necessarily affect the exposed API")

context("Tests for list_files")

test_that("Test function to list files", {

    # Unknown URL schemes are rejected outright
    expect_error(
        readtext:::list_files("nonesuch://example.org/test.txt"),
        "Unsupported URL scheme"
    )

    # A single existing file, given as a plain path or as a file:// URL
    # (get_temp() presumably returns a fresh temporary path -- it is an
    # internal readtext helper not visible here)
    existing_file <- readtext:::get_temp()
    file.create(existing_file)
    expect_equal(readtext:::list_files(existing_file),
                 existing_file)
    expect_equal(readtext:::list_files(paste0("file://", existing_file)),
                 existing_file)

    # Test vector of filenames
    existing_file2 <- readtext:::get_temp()
    file.create(existing_file2)
    expect_equal(
        readtext:::list_files(c(existing_file, existing_file2)),
        sort(c(existing_file, existing_file2))
    )

    # TODO: Test vector of filename and URL

    # A missing file is an error unless ignore_missing is set
    file.remove(existing_file)
    expect_error(
        readtext:::list_files(existing_file),
        "File '' does not exist"
    )
    expect_equal(
        readtext:::list_files(existing_file, ignore_missing = TRUE),
        character(0)
    )

    # Test globbing
    # (named temp_root so the local does not shadow base::tempdir())
    temp_root <- readtext:::get_temp(directory = TRUE)

    file.create(file.path(temp_root, "1.tsv"))
    file.create(file.path(temp_root, "2.tsv"))
    file.create(file.path(temp_root, "10.tsv"))

    expect_equal(
        length(readtext:::list_files(file.path(temp_root, "*.tsv"))),
        3
    )

    expect_equal(
        length(readtext:::list_files(file.path(temp_root, "?.tsv"))),
        2
    )

    expect_error(
        length(readtext:::list_files(file.path(temp_root, "?.txt"))),
        "File '' does not exist"
    )

    # Test globbing subdir
    temp_subdir1 <- readtext:::get_temp(temp_dir = temp_root, directory = TRUE)
    temp_subdir2 <- readtext:::get_temp(temp_dir = temp_root, directory = TRUE)

    file.create(file.path(temp_subdir1, "1.tsv"))
    file.create(file.path(temp_subdir1, "2.tsv"))
    file.create(file.path(temp_subdir2, "1.tsv"))

    expect_equal(
        length(readtext:::list_files(file.path(temp_root, "*", "?.tsv"))),
        3
    )

    # Remote URLs must end in a recognised extension
    expect_error(
        readtext:::list_files("http://example.org/test.nonesuch"),
        "Remote URL does not end in known extension."
    )

    # Remove everything this test created; unlink() silently ignores paths
    # that no longer exist.
    unlink(c(existing_file, existing_file2))
    unlink(temp_root, recursive = TRUE)
})
    
test_that("Test function to list files with remote sources", {
    skip_on_cran()
    missing_url <- "https://www.google.com/404.txt"

    # A missing remote file is an error mentioning the 404 ...
    expect_error(readtext:::list_files(missing_url), ".*404.*")

    # ... unless missing files are ignored
    result_dim <- dim(readtext(missing_url, ignore_missing_files = TRUE))
    expect_equal(result_dim, c(1, 2))
})


test_that("text vectors have names of the files they come from by default (bug 221)", {
    # Names of the texts readtext() returns for the given arguments.
    text_names <- function(...) {
        names(as.character(readtext(...)))
    }

    # Single plain-text file
    expect_equal(text_names("../data/fox/fox.txt"), "fox.txt")

    # Structured files: file name plus row index
    csv_names <- text_names("../data/csv/test*.csv", text_field = "text")
    expect_true(
        setequal(
            c("test.csv.1", "test.csv.2", "test2.csv.1", "test2.csv.2", "test3.csv.1", "test3.csv.2"),
            csv_names
        )
    )

    # Globbed plain-text files
    glob_names <- text_names("../data/glob/*.txt")
    expect_true(
        setequal(
            c("1.txt", "2.txt", "3.txt", "4.txt", "10.txt"),
            glob_names
        )
    )

    # Members of a tar archive
    tar_names <- text_names("../data/tar/test.tar")
    expect_true(
        setequal(
            c("test.txt", "test2.txt", "test3.txt", "test4.txt"),
            tar_names
        )
    )
})

test_that("test globbed tar file", {
    skip_on_cran()
    skip_on_os("linux")
    # Glob over the tar directory and compare the texts without their names.
    tar_texts <- unname(as.character(readtext("../data/tar/*")))
    expect_equal(
        tar_texts,
        c("Lorem ipsum", "brown fox", "Dolor sit", "The quick")
    )
})

test_that("test html file", {
    # The extracted text is named after the HTML file.
    expected <- c(html5.html = "The quick brown fox \njumps over the lazy dog")
    actual <- as.character(readtext("../data/html/html5.html"))
    expect_equal(actual, expected)
})

test_that("test malformed html file", {
    skip_on_os("windows")
    # Malformed markup is still read into a single named text.
    expected <- c(
        malformed_html5.html = "The quick brown fox \n    \njumps over the lazy dog"
    )
    actual <- as.character(readtext("../data/html/malformed_html5.html"))
    expect_equal(actual, expected)
})

test_that("test for pdf file", {
    skip_on_os("windows")
    pdf_text <- as.character(readtext("../data/pdf/test.pdf"))
    # Compare via printed output rather than the raw string
    expect_output(
        cat(pdf_text),
        "The quick brown fox jumps over the lazy dog\n"
    )
})

test_that("test for odt file", {
    # The extracted text is named after the ODT file.
    expected <- c(test.odt = "The quick brown fox jumps over the lazy dog")
    actual <- as.character(readtext("../data/odt/test.odt"))
    expect_equal(actual, expected)
})

test_that("test for docx file", {
    # The extracted text is named after the DOCX file.
    expected <- c(test.docx = "The quick brown fox jumps over the lazy dog")
    actual <- as.character(readtext("../data/docx/test.docx"))
    expect_equal(actual, expected)
})

test_that("test for doc file", {
    skip_on_os("windows")

    # The expected text is the same sentence ten times, space-separated
    sentence <- "The quick brown fox jumps over the lazy dog."
    expected <- c(test.doc = paste(rep(sentence, 10), collapse = " "))

    # Replace line breaks with spaces, then restore the names on the result
    raw_txts <- as.character(readtext("../data/doc/test.doc"))
    txts <- setNames(
        stringi::stri_replace_all_regex(raw_txts, "\\n", " "),
        names(raw_txts)
    )

    expect_equal(txts, expected)
})

test_that("test json files", {
    skip_on_cran()
    skip_on_os("linux")

    # Sort a docvars data frame by `number` and reset its row names, so the
    # comparison below does not depend on the order the files were read in.
    sort_by_number <- function(df) {
        df <- df[order(df$number), ]
        row.names(df) <- NULL
        df
    }

    expect_equal(
        unname(as.character(readtext("../data/json/test*json", text_field = "text"))),
        c("Lorem ipsum", "Dolor sit", "The quick", "brown fox", "Now is the winter")
    )

    #  test.json and test2.json are newline-delimited json
    #  test3.json is a single json object
    expected_docvars <- data.frame(list(
        colour = c("green", "red", "orange", "blue", NA),
        number = c(42, 99, 0, NA, 3)),
        stringsAsFactors = FALSE)
    actual_docvars <- docvars(readtext("../data/json/test*json", text_field = "text"))
    expect_equal(
        sort_by_number(actual_docvars),
        sort_by_number(expected_docvars)
    )

    # json fields can only be addressed by name
    expect_error(
        readtext("../data/json/test*json", text_field = 1),
        "Cannot use numeric text_field with json file"
    )

    expect_error(
        readtext("../data/json/test3.json", text_field = "nonesuch"),
        "There is no field called nonesuch in file"
    )

    # Twitter json files
    tweet_source <- readtext("../data/tweets/stream.json", source = "twitter")

    expect_equal(
        as.character(tweet_source),
        c(stream.json.1 = "I jumped over the lazy @dog", stream.json.2 = "Yawn")
    )

    expect_equal(
        docvars(tweet_source)$statuses_count,
        c(16204, 200)
    )

    expect_equal(
        docvars(tweet_source)$screen_name,
        c("foxxy", "dog")
    )
})

# The folder test is only defined on unix-alike platforms.
if (.Platform$OS.type == "unix") {
    test_that("test readtext with folder", {
        # Passing a directory path reads the files it contains
        folder_texts <- readtext("../data/fruits")$text
        expect_equal(length(folder_texts), 7)
    })
}



context("Loading a corpus from a tar archive")
test_that("A single-level tar file containing txt files can be loaded", {
    skip_on_cran()
    skip_on_os("linux")
    # Compare the archive's texts without their names.
    tar_texts <- unname(as.character(readtext("../data/tar/test.tar")))
    expect_equal(
        tar_texts,
        c("Lorem ipsum", "brown fox", "Dolor sit", "The quick")
    )
})

context("Loading a corpus from a gzipped tar archive")
test_that("A single-level tar.gz file containing txt files can be loaded", {
    skip_on_cran()
    skip_on_os("linux")
    # Compare the archive's texts without their names.
    targz_texts <- unname(as.character(readtext("../data/targz/test.tar.gz")))
    expect_equal(
        targz_texts,
        c("Lorem ipsum", "brown fox", "Dolor sit", "The quick")
    )
})

context("Loading a corpus from a bzipped tar archive")
test_that("A single-level tar.bz file containing txt files can be loaded", {
    skip_on_cran()
    skip_on_os("linux")
    skip_on_os("windows")
    # Compare the archive's texts without their names.
    tarbz_texts <- unname(as.character(readtext("../data/tarbz/test.tar.bz")))
    expect_equal(
        tarbz_texts,
        c("Lorem ipsum", "brown fox", "Dolor sit", "The quick")
    )
})

context("Tests for verbosity argument")
test_that("test warning for unrecognized filetype", {
    nonesuch_file <- "../data/empty/empty.nonesuch"
    unsupported <- paste0("Unsupported extension ", sQuote("nonesuch"), " of file")

    # Without an explicit verbosity argument the warning is raised
    expect_warning(readtext(nonesuch_file), unsupported)

    # Verbosity levels 3, 2 and 1 all raise the warning
    for (level in c(3, 2, 1)) {
        expect_warning(readtext(nonesuch_file, verbosity = level), unsupported)
    }

    # Verbosity 0 silences it
    expect_silent(readtext(nonesuch_file, verbosity = 0))
})

test_that("messages from list_file", {
    zip_file <- "../data/zip/inauguralTopLevel.zip"

    # Levels 0 and 1 produce no output
    expect_silent(readtext(zip_file, verbosity = 0))
    expect_silent(readtext(zip_file, verbosity = 1))

    # Level 2 announces the source being read
    expect_message(
        readtext(zip_file, verbosity = 2),
        "Reading texts from \\.\\./data/zip/inauguralTopLevel\\.zip"
    )

    # Level 3 reports individual files
    expect_message(
        readtext(zip_file, verbosity = 3),
        "reading \\(txt\\) file: .*1789-Washington\\.txt"
    )
})

test_that("readtext called with textfield works with deprecation warning", {
    csv_glob <- "../data/csv/test*.csv"

    # With text_field: six rows, six docvars rows and six texts
    expect_equal(nrow(readtext(csv_glob, text_field = "text")), 6)
    expect_equal(nrow(docvars(readtext(csv_glob, text_field = "text"))), 6)
    expect_equal(
        length(as.character(readtext(csv_glob, text_field = "text"))),
        6
    )

    # The old argument spelling warns
    expect_warning(
        readtext(csv_glob, textfield = "text"),
        "textfield is deprecated; use text_field instead"
    )
})


test_that("tests for Excel files", {
    # Both workbooks are expected to yield the same document variables.
    expected_docvars <- data.frame(
        list(
            colour = c("orange", "blue", "pink", "pink"),
            number = c(0, NA, NA, NA),
            taste = c(NA, NA, "sweet", "umami")
        ),
        stringsAsFactors = FALSE
    )

    # .xlsx workbook
    expect_equal(
        unname(as.character(
            readtext("../data/xls/test.xlsx", text_field = "text")
        )),
        c("The quick", "brown fox", "jumps over", "the lazy dog.")
    )
    expect_equal(
        docvars(readtext("../data/xls/test.xlsx", text_field = "text")),
        expected_docvars
    )

    # .xls workbook (document names are checked here as well)
    expect_equal(
        as.character(readtext("../data/xls/test.xls", text_field = "text")),
        c("test.xls.1" = "The quick", "test.xls.2" = "brown fox",
          "test.xls.3" = "jumps over", "test.xls.4" = "the lazy dog.")
    )
    expect_equal(
        docvars(readtext("../data/xls/test.xls", text_field = "text")),
        expected_docvars
    )
})

test_that("tests for ODS files", {
    ods_file <- "../data/ods/test.ods"
    ods_texts <- c("The quick", "brown fox", "jumps over", "the lazy dog.")

    expect_equal(
        unname(as.character(readtext(ods_file, text_field = "text"))),
        ods_texts
    )

    # The full object: doc_id, text, then the document variables
    expected <- structure(
        list(
            doc_id = c("test.ods.1", "test.ods.2", "test.ods.3", "test.ods.4"),
            text = ods_texts,
            colour = c("orange", "blue", "pink", "pink"),
            number = c(0L, NA, NA, NA),
            taste = c(NA, NA, "sweet", "umami")
        ),
        row.names = c(NA, -4L),
        class = c("readtext", "data.frame")
    )
    expect_identical(readtext(ods_file, text_field = "text"), expected)
})


test_that("rases error when source is not valid", {
    nexis_file <- "../data/nexis/sun_2000-11-01_0001.html"
    tweets_file <- "../data/tweets/stream.json"

    # HTML: source must be a character string, and only "nexis" is accepted
    expect_error(
        readtext(nexis_file, source = 1),
        "source must be a character"
    )
    expect_error(
        readtext(nexis_file, source = "something"),
        "'nexis' is the only source type available for HTML."
    )
    expect_silent(readtext(nexis_file, source = "nexis"))

    # json: source must be a character string, and only "twitter" is accepted
    expect_error(
        readtext(tweets_file, source = 1),
        "source must be a character"
    )
    expect_error(
        readtext(tweets_file, source = "something"),
        "'twitter' is the only source type available for json"
    )
    expect_silent(readtext(tweets_file, source = "twitter"))
})

test_that("readtext works with one-column csv files (#138)", {
    foo_bar <- c("foo foo foo foo", "bar bar bar bar")

    # One column, no text_field given
    expect_equivalent(
        readtext("../data/csv/data_onecol.csv"),
        data.frame(doc_id = paste0("data_onecol.csv.", 1:2),
                   text = foo_bar,
                   stringsAsFactors = FALSE)
    )

    # Two columns, no text_field given
    expect_equivalent(
        readtext("../data/csv/data_twocol.csv"),
        data.frame(doc_id = paste0("data_twocol.csv.", 1:2),
                   text = foo_bar,
                   y = 1:2,
                   stringsAsFactors = FALSE)
    )

    # Two columns with explicit text and document-id fields
    expect_equivalent(
        readtext("../data/csv/data_twocol.csv", text_field = "x", docid_field = "y"),
        data.frame(doc_id = c("1", "2"),
                   text = foo_bar,
                   stringsAsFactors = FALSE)
    )
})

test_that("tests for RTF files", {
    # Reads every .rtf file in the directory and compares the texts without
    # their names. (The description previously said "ODS", a copy-paste of
    # the ODS test's name; the body has always exercised RTF files.)
    expect_identical(
        unname(as.character(readtext("../data/rtf/*.rtf"))),
        c("The quick brown fox jumps over the lazy dog",
          "This is an example of “rich text” format.")
    )
})

test_that("tests for files with doc_id", {
    # Texts expected when the csv/ods/xls doc_id column names the documents
    three_docs <- c(doc1 = "The quick", doc2 = "brown fox", doc3 = "jumped over")

    # csv with docid_field
    csv_texts <- as.character(readtext("../data/csv/withdocid.csv",
                                       docid_field = "doc_id", text_field = "text"))
    expect_identical(csv_texts, three_docs)

    # csv without docid_field: names derived from the file name
    expect_identical(
        as.character(readtext("../data/csv/withdocid.csv", text_field = "text"))[1],
        c(withdocid.csv.1 = "The quick")
    )

    # ods and xls with docid_field
    ods_texts <- as.character(readtext("../data/ods/withdocid.ods",
                                       docid_field = "doc_id", text_field = "text"))
    expect_identical(ods_texts, three_docs)

    xls_texts <- as.character(readtext("../data/xls/withdocid.xls",
                                       docid_field = "doc_id", text_field = "text"))
    expect_identical(xls_texts, three_docs)

    # json with docid_field
    json_texts <- as.character(readtext("../data/json/withdocid.json",
                                        docid_field = "doc_id", text_field = "text"))
    expect_identical(json_texts, c(doc1 = "Lorem ipsum", doc2 = "Dolor sit"))

    # A doc_id column that is present but unused triggers a hint
    expect_message(
        readtext("../data/csv/withdocid.csv", text_field = "text"),
        'A field called "doc_id" exists in the file. If you intend to use it as a document identifier, use "docid_field" option.'
    )

    # An unknown docid_field, by name or by position, is an error
    expect_error(
        readtext("../data/xls/withdocid.xls", docid_field = "nonesuch"),
        "There is no field called nonesuch"
    )
    expect_error(
        readtext("../data/xls/withdocid.xls", docid_field = 9000),
        "There is no 9000th field"
    )
})
Any scripts or data that you put into this service are public.
readtext documentation built on June 7, 2023, 5:11 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
readtext
Import and Handling for Plain and Formatted Text Files

tests/testthat/test-readtext.R
In readtext: Import and Handling for Plain and Formatted Text Files

Try the readtext package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

readtext Import and Handling for Plain and Formatted Text Files

tests/testthat/test-readtext.R In readtext: Import and Handling for Plain and Formatted Text Files

Try the readtext package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

readtext
Import and Handling for Plain and Formatted Text Files

tests/testthat/test-readtext.R
In readtext: Import and Handling for Plain and Formatted Text Files