#' Read WhatsApp history into R
#'
#' Takes a history file from the ``WhatsApp'' messenger app (txt or zip) and
#' returns a formatted data.frame with descriptions of the used emojis.
#'
#' @details The history can be obtained going to the menu in a chat on the
#' ``WhatsApp'' app, choosing "more", then "Export chat".
#'
#' @param x Path to a txt or zip file of a WhatsApp history or the history
#' itself as character object.
#' @param tz A time zone for date conversion. Set NULL or "" for the default
#' time zone or a single string with a timezone identifier, see
#' \link[stringi]{stri_timezone_list}.
#' @param format Most formats are automatically detected. If you encounter
#' problems you can provide a custom format here. Refer to
#' \link[stringi]{stri_datetime_parse} for guidance.
#' @param verbose A logical flag indicating whether information should be
#' printed to the screen.
#' @param encoding Input encoding. Should usually be "UTF-8" if files haven't
#' changed since export from WhatsApp.
#' @param ... Further arguments passed to \link[base]{readLines}.
#'
#' @return A tibble with the information parsed from the history file.
#' @export
#' @import stringi
#' @importFrom tibble tibble
#' @importFrom utils head unzip
#'
#' @examples
#' history <- system.file("extdata", "sample.txt", package = "rwhatsapp")
#' df <- rwa_read(history)
rwa_read <- function(x,
tz = NULL,
format = NULL,
verbose = FALSE,
encoding = "UTF-8",
...) {
if (verbose) {
start_time <- status("Reading chat history from",
appendLF = FALSE, ppfix = "")
} else {
start_time <- NULL
}
chat_raw <- rwa_read_lines(x, verbose, start_time, encoding, ...)
chat_raw <- chat_raw[!chat_raw == ""]
formats <- c(
"^\\d{2,4}.\\d{2}.\\d{2,4} - \\d{2}:\\d{2}[^;]+;|^\\d{2,4}-\\d{2}-\\d{2,4}[^-]+ -",
"[^-]+ - ",
"[^]]+] ",
"^[^A-z]*\\d{1,2}:\\d{1,2}(\\sAM|\\sPM){0,1}"
)
time <- lapply(formats, function(f) {
stri_extract_first_regex(
str = chat_raw,
pattern = f
)
})
nnas <- vapply(time, function(t) sum(is.na(t)), FUN.VALUE = integer(1))
time <- time[[which.min(nnas)]]
proper_time <- stri_detect_regex(
str = time,
pattern = "\\d{1,4}.\\d{1,2}.\\d{2,4}|\\d{1,2}:\\d{1,2} [APM]"
)
time[!proper_time] <- NA
for (l in rev(which(is.na(time)))) {
chat_raw[l - 1] <- stri_paste(chat_raw[l - 1], chat_raw[l],
sep = "\n")
}
chat_raw <- chat_raw[!is.na(time)]
time <- time[!is.na(time)]
if (verbose) status("timestamps extracted")
source <- names(chat_raw)
chat_raw <- stri_replace_first_fixed(str = chat_raw,
pattern = time,
replacement = "")
time <- stri_replace_all_regex(
str = time,
pattern = c("\\[", "\\]", "-$", "- $"),
replacement = c("", "", "", ""),
vectorize_all = FALSE
)
time <- rwa_parse_time(time, format, tz)
if (verbose) status("timestamps converted")
if (sum(is.na(time)) > (length(time) / 10)) {
warning("Time conversion did not work correctly. Provide a custom format",
" or add an issue at www.github.com/JBGruber/rwhatsapp.")
}
author <- stri_extract_first_regex(str = chat_raw,
pattern = "[^:]+: ")
chat_raw[!is.na(author)] <- stri_replace_first_fixed(
str = chat_raw[!is.na(author)],
pattern = author[!is.na(author)],
replacement = ""
)
author <- stri_replace_last_fixed(str = author,
pattern = ": ",
replacement = "")
if (isTRUE(any(stri_detect_regex(head(author, 10), "^ - ")))) {
author <- stri_replace_first_regex(str = author,
pattern = "^ - ",
replacement = "")
}
if (verbose) status("author extracted")
tbl <- tibble::tibble(
time = time,
author = as.factor(stri_trim_both(author)),
text = chat_raw,
source = source
)
tbl <- lookup_emoji(tbl)
if (verbose) {
status("emoji extracted")
status(nrow(tbl),
" messages from ",
length(unique(tbl$author)),
" authors extracted. ",
"Elapsed time:",
ppfix = "", indent = "")
}
return(tbl)
}
#' Read in files from supported formats
#'
#' @param start_time For verbose messages.
#' @inherit rwa_read
#' @import stringi
#' @noRd
rwa_read_lines <- function(x, verbose, start_time = NULL, encoding, ...) {
# get files
zps <- grep(".zip$", x, ignore.case = TRUE)
temp <- NULL
src <- NULL
if (length(zps) > 0) {
src <- x[zps]
x[zps] <- vapply(x[zps], FUN.VALUE = character(1), FUN = function(x) {
content <- unzip(x, list = TRUE)
content <- content[grepl(".txt$", content$Name, ignore.case = TRUE), ]
temp <- paste0(tempdir(), "/whatsapp")
unzip(x, files = content$Name, overwrite = TRUE, exdir = temp)
return(list.files(temp, pattern = content$Name, full.names = TRUE))
})
}
if (f_exist_s(x)) {
if (length(x) == 1) {
chat_raw <- readLines(x, encoding = encoding, ...)
names(chat_raw) <- rep(x, length(chat_raw))
if (verbose) {
message(" one log file...")
status("one log file loaded")
}
} else {
chat_raw <- unlist(lapply(x, function(t) {
cr <- readLines(t, encoding = encoding, ...)
names(cr) <- rep(t, length(cr))
return(cr)
}))
if (verbose) {
message(" ", length(x), " log files...")
status("files loaded ")
}
}
} else if (is.character(x) && length(x) > 1) {
chat_raw <- x
names(chat_raw) <- rep("text input", length(chat_raw))
if (verbose) {
message(" character object...")
status("object loaded ")
}
} else {
stop("Provide either a path to one or multiple txt or zip files of a ",
"WhatsApp history or the history itself as character object ",
"(length > 1).")
}
if (length(zps) > 0) {
names(chat_raw) <- stri_replace_last_fixed(names(chat_raw), x[zps], src)
unlink(temp, recursive = TRUE)
}
return(chat_raw)
}
#' Parse time
#'
#' @param time A character object with times to parse.
#' @inherit rwa_read
#' @import stringi
#' @noRd
rwa_parse_time <- function(time, format, tz) {
if (is.null(format)) {
formats <- c(
"dd.MM.yyyy hh:mm:ss a",
"dd.MM.yyyy hh:mm a",
"dd.MM.yyyy HH:mm:ss",
"dd.MM.yyyy HH:mm",
"MM.dd.yyyy hh:mm:ss a",
"MM.dd.yyyy hh:mm a",
"MM.dd.yyyy HH:mm:ss",
"MM.dd.yyyy HH:mm"
)
time <- stri_replace_all_fixed(
time,
c("a.m.", "p.m.", "a. m.", "p. m."),
c("AM", "PM"),
vectorize_all = FALSE
)
time <- stri_replace_all_regex(
time,
c("[^[0-9.:/\\-APM]]", "\\s+"),
c(" ", " "),
vectorize_all = FALSE
)
if (any(stri_detect_fixed(time, "."))) {
if (sum(stri_detect_regex(time, "\\d+.\\d+.\\d{1,2}")) >
(length(time) * 0.9)) {
formats <- stri_replace_all_fixed(
formats,
"yyyy",
"yy"
)
}
} else if (any(stri_detect_fixed(time, "/"))) {
formats <- stri_replace_all_fixed(
formats,
".",
"/"
)
if (sum(stri_detect_regex(time, "\\b\\d{1,2}/\\d{1,2}/\\d{2}")) >
(length(time) * 0.9)) {
formats <- stri_replace_all_fixed(
formats,
"yyyy",
"yy"
)
} else if (sum(stri_detect_regex(time, "\\b\\d{4}/\\d+/\\d{1,2}")) >
(length(time) * 0.9)) {
formats <- stri_replace_all_fixed(
formats,
"dd/MM/yyyy",
"yyyy/MM/dd",
vectorize_all = FALSE
)
}
} else if (any(stri_detect_fixed(time, "-"))) {
formats <- stri_replace_all_fixed(
formats,
".",
"-"
)
if (sum(stri_detect_regex(time, "\\b\\d{1,2}-\\d{1,2}-\\d+")) >
(length(time) * 0.9)) {
formats <- stri_replace_all_fixed(
formats,
"yyyy",
"yy"
)
} else if (sum(stri_detect_regex(time, "\\b\\d{4}-\\d+-\\d{1,2}")) >
(length(time) * 0.9)) {
formats <- stri_replace_all_fixed(
formats,
"dd-MM-yyyy",
"yyyy-MM-dd",
vectorize_all = FALSE
)
}
} else {
formats <- c(
"hh:mma, MM dd"
)
}
test <- sapply(formats, function(f) {
test <- stri_datetime_parse(str = head(time, n = 1000),
format = f,
lenient = FALSE,
tz = tz)
sum(is.na(test))
})
format <- names(which.min(test))
}
time <- stri_datetime_parse(str = time,
format = format,
tz = tz)
return(time)
}
# creates status message and exports start_time if not in parent environment yet
status <- function(...,
sep = "",
appendLF = TRUE,
ppfix = "...",
indent = "\t") {
if (exists("start_time", envir = parent.frame())) {
start_time <- mget("start_time", envir = parent.frame())[[1]]
diff <- format((Sys.time() - start_time), digits = 2, nsmall = 2)
message(paste(indent, ppfix, ..., " [", diff, "]", sep = sep),
appendLF = appendLF)
} else {
export <- Sys.time()
start_time <- export
message(paste(..., ppfix, sep = sep), appendLF = appendLF)
}
if (exists("export")) {
return(export)
}
}
# safely test if files exist
f_exist_s <- function(x) {
url_exist <- isTRUE(any(
tryCatch(sapply(x, url),
error = function(e) {
})
))
files_exist <- isTRUE(any(
tryCatch(file.exists(x),
error = function(e) {
})
))
url_exist || files_exist
}
#' List of emojis and corresponding descriptions.
#'
#' A dataset containing emojis and corresponding descriptions. This dataset was
#' collected by combining the data from the emo package with newly scraped
#' emojis from unicode.org.
#'
#' @format A tibble with 4085 rows and 3 columns: \itemize{ \item emoji
#' character representation of the emoji \item name of the emoji \item
#' hex_runes hexadecimal representations of emoji }
#' @details \code{hex_runes} can be used to easily look up image files of
#' emojis.
#' @source \url{https://github.com/hadley/emo/} and
#' \url{https://unicode.org/emoji/charts/}
"emojis"
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.