R/get-functions.R
In readtext: Import and Handling for Plain and Formatted Text Files

## txt format
get_txt <- function(path, source, ...) {
    con <- file(path, ...)
    txt <- paste(readLines(con, warn = FALSE), collapse = "\n")
    close(con)
    data.frame(text = txt, stringsAsFactors = FALSE)
}

## csv format
get_csv <- function(path, text_field, docid_field, encoding, source, ...) {

    # Replace native.enc with UTF-8 if that's what it is
    # http://r.789695.n4.nabble.com/Find-out-what-quot-native-enc-quot-corresponds-to-td4639208.html
    if (encoding == "native.enc")
        encoding <- strsplit(Sys.getlocale("LC_CTYPE"), "\\.")[[1]][2]
    if (!encoding %in% c("Latin-1", "UTF-8")) {
        # If the encoding is not one fread supports, open the file using R's native function
        #  Use the encoding arg to open the file, pass all other args to fread
        con <- file(path)
        txt <- paste(readLines(con, encoding = encoding, warn = FALSE), collapse = "\n")
        close(con)
        result <- data.table::fread(input = txt, data.table = FALSE, stringsAsFactors = FALSE, ...)
    } else {
        result <- data.table::fread(input = path, data.table = FALSE, stringsAsFactors = FALSE, encoding = encoding, ...)
    }
    result <- sort_fields(result, path, text_field)
    result <- add_docid(result, path, docid_field)
    return(result)
}

#  Dispatch to get_json_object or get_json_tweets depending on whether 
#  it looks like a twitter json file
get_json <- function(path, text_field, docid_field, encoding, source, verbosity = 1, ...) {

    if (!source %in% c("auto", "twitter"))
        stop("'twitter' is the only source type available for json")
    
    if (source == "twitter") {
        return(get_json_tweets(path, verbosity, ...))
    } else {
        if (is.numeric(text_field))
            stop("Cannot use numeric text_field with json file")
        result <- get_json_object(path, verbosity, ...)
        if (is.null(result)){
            result <- get_json_lines(path, verbosity, ...)
        } 
        if (!is.null(result)){
            result <- sort_fields(result, path, text_field)
            result <- add_docid(result, path, docid_field)
            return(result)
        }
        stop("This JSON file format is not supported.", call. = FALSE)
    }
}

## Twitter json
get_json_tweets <- function(path, verbosity = 1, ...) {
    # if (!requireNamespace("streamR", quietly = TRUE))
    #     stop("You must have streamR installed to read Twitter json files.")
    # read raw json data
    #txt <- readLines(path, warn = FALSE, ...)
    tryCatch({
        streamR::parseTweets(path, verbose = FALSE, ...)
    },
        error = function(e) {
        if (verbosity >= 1)
            stop("Doesn't look like Tweets JSON file, trying general JSON.")
        return(NULL)
    })
}

## general json
#' @importFrom data.table setDT
get_json_object <- function(path, verbosity = 1, ...) {
    # if (!requireNamespace("jsonlite", quietly = TRUE))
    #     stop("You must have jsonlite installed to read json files.")
    #as.data.frame(jsonlite::fromJSON(path, flatten = TRUE, ...), stringsAsFactors = FALSE)
    tryCatch({
        data.table::setDT(jsonlite::read_json(path, simplifyVector = TRUE))
    },
    error = function(e) {
        if (verbosity >= 1)
            message("File doesn't contain a single valid JSON object.")
        return(NULL)
    })
}

#' @importFrom data.table rbindlist
get_json_lines <- function(path, verbosity = 1, ...) {
    # if (!requireNamespace("jsonlite", quietly = TRUE))
    #     stop("You must have jsonlite installed to read json files.")

    tryCatch({
        lines <- readLines(path, warn = FALSE)
        #jsonlite::fromJSON(lines[1], flatten = TRUE, ...) # this line seems not doing anything...
        data.table::rbindlist(
            lapply(lines, function(x) jsonlite::fromJSON(stri_trim(x), flatten = TRUE, ...)),
            use.names = TRUE, fill = TRUE
        )
    },
    error = function(e) {
        if (verbosity >= 1)
            stop("This JSON file format is not supported.", call. = FALSE)
    })
}


## flat xml format
get_xml <- function(path, text_field, encoding, source, collapse = "", verbosity = 1, 
                    ...) {
    # TODO: encoding param is ignored
    # if (!requireNamespace("XML", quietly = TRUE))
    #     stop("You must have XML installed to read XML files.")

    if (is_probably_xpath(text_field)) {
        xml <- xml2::read_xml(path)
        txt <- xml2::xml_text(xml2::xml_find_all(xml, text_field), ...)
        txt <- paste0(txt, collapse = collapse)
        return(data.frame(text = txt, stringsAsFactors = FALSE))
    } else {
        xml <- xml2::read_xml(path)
        result <- xml2_to_dataframe(xml)
        if (is.numeric(text_field)) {
            if (text_field > ncol(result)) 
                stop(paste0("There is no ", text_field, "th field in file ", path))
            if (verbosity >= 1) {
                warning(paste("You should specify text_field by name rather than by index, unless",
                              "you're certain that your XML file's fields are always in the same order."))
            }
        }

        # Because XML::xmlToDataFrame doesn't impute column types, we have to do it
        # ourselves, to match get_csv's behaviour
        sort_fields(result, path, text_field, impute_types = TRUE)
    }
}

get_html <- function(path, encoding, source, verbosity = 1, ...) {

    if (!source %in% c("auto", "nexis"))
        stop("'nexis' is the only source type available for HTML.")
    
    if (source == "nexis") {
        return(get_nexis_html(path, verbosity = verbosity, ...))
        tryCatch({
            #return(get_nexis_html(path, ...))
        },
        error = function(e) {
            if (verbosity >= 1) stop("Doesn't look like Nexis HTML file")
        })
    } else {
        # http://stackoverflow.com/a/3195926
        # html <- XML::htmlTreeParse(path, useInternal = TRUE)
        # txt <- XML::xpathApply(html, "//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]",
        #                        XML::xmlValue)
        html <- xml2::read_html(path)
        txt <- xml2::xml_text(xml2::xml_find_all(html, "//body//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)]"))
        txt <- txt[stri_trim(txt) != ""]
        txt <- paste0(txt, collapse = "\n")

        data.frame(text = txt, stringsAsFactors = FALSE)
    }
}


get_pdf <- function(path, source, ...) {

    txt <- pdftools::pdf_text(as.character(path))
    txt <- paste0(txt, collapse = "\n")
    Encoding(txt) <- "UTF-8"
    data.frame(text = txt, stringsAsFactors = FALSE)
}

get_odt <- function(path, source, ...) {
	path <- extract_archive(path, ignore_missing = FALSE)
	path <- sub("/\\*$", "", path)
	path <- file.path(path, "content.xml")
	
	xml <- xml2::read_xml(path)
	txt <- xml2::xml_text(xml2::xml_find_all(xml, "//text:p"))
	
	txt <- txt[!grepl("^\\s*$", txt)] # Remove text which is just whitespace
	txt <- paste0(txt, collapse = "\n")
	
	data.frame(text = txt, stringsAsFactors = FALSE)
}

get_docx <- function(path, source, ...) {
    path <- extract_archive(path, ignore_missing = FALSE)
    path <- sub("/\\*$", "", path)
    path <- file.path(path, "word", "document.xml")

    xml <- xml2::read_xml(path)
    txt <- xml2::xml_text(xml2::xml_find_all(xml, "//w:p"))
    
    # xml <- XML::xmlTreeParse(path, useInternalNodes = TRUE)
    # txt <- XML::xpathApply(xml, "//w:p", XML::xmlValue)
    txt <- txt[!grepl("^\\s*$", txt)] # Remove text which is just whitespace
    txt <- paste0(txt, collapse = "\n")

    data.frame(text = txt, stringsAsFactors = FALSE)
}

get_doc <- function(path, source, ...) {

    path <- normalizePath(path)
    txt <- antiword::antiword(as.character(path))

    # tryCatch({
    #     txt <- system2("antiword", shQuote(normalizePath(f)), stdout = TRUE)
    # },
    # error = function(e) {
    #     if (grepl("error in running command", e)) {
    #         stop(e, "Please check whether antiword is installed. You can download it from http://www.winfield.demon.nl/")
    #     } else {
    #     stop(e)
    #     }
    # })
    txt <- paste0(txt, collapse = "\n")
    txt <- trimws(txt)
    data.frame(text = txt, stringsAsFactors = FALSE)
}

get_rtf <- function(path, source, ...) {
    path <- normalizePath(path)
    txt <- striprtf::read_rtf(as.character(path))
    txt <- paste0(txt, collapse = "\n")
    txt <- trimws(txt)
    data.frame(text = txt, stringsAsFactors = FALSE)
}

get_excel <- function(path, text_field, docid_field, source, ...) {
    sheet_names <- readxl::excel_sheets(path)
    sheets <- lapply(sheet_names, function(x, ...) readxl::read_excel(path, sheet = x, ...))

    if (length(unique(sapply(sheets, ncol))) != 1) {
        warning("Not all worksheets in file \"", path, "\" have the same number of columns.")
    }

    result <- data.table::rbindlist(sheets, fill = TRUE)
    result <- sort_fields(result, path, text_field, impute_types = TRUE)
    result <- add_docid(result, path, docid_field)
    return(result)
}


get_ods <- function(path, text_field, docid_field, source, ...) {
    sheet_names <- readODS::list_ods_sheets(path)
    suppressMessages(
        sheets <- lapply(sheet_names, function(x, ...) readODS::read_ods(path, sheet = x, ...))
    )

    if (length(unique(sapply(sheets, ncol))) != 1)
        warning("Not all worksheets in file \"", path, "\" have the same number of columns.")

    result <- data.table::rbindlist(sheets, fill = TRUE)
    result <- sort_fields(result, path, text_field, impute_types = TRUE)
    result <- add_docid(result, path, docid_field)
    return(result)
}


xml2_to_dataframe <- function(xml) {
    xml_list <- xml2::as_list(xml)
    depth_check <- function(this, thisdepth = 0) {
        if (!is.list(this)) {
            return(thisdepth)
        } else if (thisdepth > 3) {
            return(thisdepth)
        } else {
            return(max(unlist(lapply(this, depth_check, thisdepth = thisdepth + 1))))    
        }
    }
    if (depth_check(xml_list[[1]]) != 3) {
        stop("The xml format does not fit for the extraction without xPath\n  Use xPath method instead")
    }
    ret <- data.table::rbindlist(xml_list[[1]], fill = TRUE)
    data.table::setDF(ret)
    return(ret)
}

Any scripts or data that you put into this service are public.

readtext documentation built on May 29, 2024, 5:13 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

readtext
Import and Handling for Plain and Formatted Text Files

R/get-functions.R
In readtext: Import and Handling for Plain and Formatted Text Files

Defines functions xml2_to_dataframe get_ods get_excel get_rtf get_doc get_docx get_odt get_pdf get_html get_xml get_json_lines get_json_object get_json_tweets get_json get_csv get_txt

Try the readtext package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

readtext Import and Handling for Plain and Formatted Text Files

R/get-functions.R In readtext: Import and Handling for Plain and Formatted Text Files

Defines functions xml2_to_dataframe get_ods get_excel get_rtf get_doc get_docx get_odt get_pdf get_html get_xml get_json_lines get_json_object get_json_tweets get_json get_csv get_txt

Try the readtext package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

readtext
Import and Handling for Plain and Formatted Text Files

R/get-functions.R
In readtext: Import and Handling for Plain and Formatted Text Files