Nothing
#' Read WhatsApp files
#'
#' Reads HTML-files from WhatsApp and separates the text and meta data.
#'
#' @param path Character: string with path where the data files are.
#' If only \code{path} is given, \code{file} will be determined by searching
#' for html files with \code{\link[base]{list.files}} and recursion.
#' @param file Character: string with names of the HTML files.
#' @return \code{\link{textmeta}} object.
#' @author Jonas Rieger (<jonas.rieger@@tu-dortmund.de>)
#' @keywords manip
#'
#' @export readWhatsApp
readWhatsApp = function(path, file){
if(missing(file)){
if(missing(path)) path = getwd()
if(any(grepl(pattern = "\\.html$", x = path, ignore.case = TRUE))){
return(readWhatsApp.file(file = path))
}else{
return(readWhatsApp.file(file = list.files(path = path, pattern = "*.html$",
full.names = TRUE, recursive = TRUE, ignore.case = TRUE)))
}
}
if(missing(path)){
return(readWhatsApp.file(file = file))
}
return(readWhatsApp(file = file.path(path, file)))
}
readWhatsApp.file = function(file){
stopifnot(is.character(file), all(file.access(file, mode = 4) == 0))
text = NULL
meta = NULL
filenames = gsub(x = gsub(x = file, pattern = "\\.html$", replacement = ""),
pattern = ".*[/\\]+", replacement = "")
readTime = Sys.time()
for (i in seq_along(file)) {
#message("read WhatsApp-Chat \"", filenames[i], "\" ...")
article = readLines(con = file[i], encoding = "UTF-8")
lines = grep(pattern = "<div class=\"vW7d1((.){7})?\">", x = article)
articlespan = article[min(lines):length(article)]
obs = tail(
unlist(strsplit(paste(articlespan, collapse = " "),
"<div class=\"vW7d1((.){7})?\">")), n = -1)
# jeweils erster Wert des Splits keine Beobachtung!
mData = data.frame(id = paste(filenames[i], seq_along(obs), sep = "."), title = NA_character_,
stringsAsFactors = FALSE)
# IDs noch genauer setzen. Iwie den Namen des Chats einbringen,
# siehe dafuer article[lines] vor obs
newtext = character(length(obs))
mData$systemActivity = grepl(pattern = "<div class=\"_3_7SH Zq3Mc tail\">", obs)
mData$systemDate = grepl(pattern = "<div class=\"_3_7SH Zq3Mc\">", obs)
mData$systemSecurity = grepl(pattern = "<div class=\"_3_7SH _14b5J Zq3Mc tail\">", obs)
mData$systemInfo = grepl(pattern = "<div class=\"_3_7SH _3DFk6 _2pwyf( message-(in|out))?( tail)?\">", obs)
mData$systemAccount = grepl(pattern = "<div class=\"_3_7SH _3T8jk Zq3Mc tail\">", obs)
systemMessage = mData$systemActivity | mData$systemDate | mData$systemSecurity | mData$systemInfo | mData$systemAccount
newtext[systemMessage] = removeXML(stringr::str_extract(obs[systemMessage],
"<div class=\"_3_7SH( _14b5J)? Zq3Mc( tail)?\">(.*?)</div>"))
mData$userMessage = !systemMessage
mData$textMessage = grepl(pattern = "<div class=\"_3_7SH _3DFk6( message-(in|out))?( tail)?\">", obs)
mData$imageMessage = grepl(pattern = "<div class=\"_3_7SH _3qMSo( message-(in|out))?( tail)?\">", obs)
mData$audioMessage = grepl(pattern = "<div class=\"_3_7SH _1gqYh( message-(in|out))?( tail)?\">", obs) |
grepl(pattern = "<div class=\"_3_7SH _17oKL( message-(in|out))?( tail)?\">", obs)
mData$videoMessage = grepl(pattern = "<div class=\"_3_7SH _3In2e( message-(in|out))?( tail)?\">", obs)
mData$stickerMessage = grepl(pattern = "<div class=\"_3_7SH _1rK-b( message-(in|out))?( tail)?\">", obs)
mData$gifMessage = grepl(pattern = "<div class=\"_3_7SH _2hOiI( message-(in|out))?( tail)?\">", obs)
header = stringr::str_extract(obs[mData$userMessage],
"<div class=\"(.*?)copyable-text\" data-pre-plain-text=\"(.*?)\"><div")
header = stringr::str_extract(header,
"copyable-text\" data-pre-plain-text=\"(.*?)\"><div")
mData$posix[mData$userMessage] = substr(header, 38, 54)
mData$posix = trimws(gsub(pattern = "\\]", x = mData$posix, replacement = ""))
mData$posix = as.POSIXct(mData$posix, format = "%H:%M, %d.%m.%Y", "CET")
mData$date = as.Date(mData$posix)
mData$author[mData$userMessage] = substr(header, 55, nchar(header) - 8)
mData$author = trimws(gsub(pattern = "\\]", x = mData$author, replacement = ""))
textandemojis = stringr::str_extract(obs[mData$userMessage],
"<((span dir=\"ltr\" class=\")|(div class=\"_2x9bY ))selectable-text invisible-space copyable-text\">(.*?)</div>")
# emojji detection and extraction
newtext[mData$userMessage] = removeXML(.extractEmojis(textandemojis))
mData$forward[mData$userMessage] = grepl(
pattern = "<span data-icon=\"forward-indicator\" class=\"ySrGA\">",
x = obs[mData$userMessage])
mData$cited[mData$userMessage] = grepl(pattern = "<span dir=\"auto\" class=\"quoted-mention\">",
x = obs[mData$userMessage])
citedtmp = mData$cited
citedtmp[is.na(citedtmp)] = FALSE
if(any(citedtmp)){
mData$citedAuthor[citedtmp] = removeXML(stringr::str_extract(obs[citedtmp],
"<span dir=\"auto\" class=\"((_3Ye_R _1wjpf)|(_2a1Yw))\">(.*?)</div>"))
mData$citedText[citedtmp] = removeXML(.extractEmojis2(stringr::str_extract(obs[citedtmp],
"<span dir=\"auto\" class=\"quoted-mention\">(.*?)</div>")))
# Zitat koennte Emojis enthalten?!
}
else{
mData$citedAuthor = NA
mData$citedText = NA
}
mData$time[mData$userMessage] = removeXML(stringr::str_extract(obs[mData$userMessage],
"<span class=\"_3EFt_\">(.*?)</span>"))
meta = rbind(meta, mData)
names(newtext) = mData$id
text = c(text, newtext)
}
res = textmeta(meta = meta, text = as.list(text))
readTime = difftime(Sys.time(), readTime)
cat("Time for Reading Data:", round(as.numeric(readTime), 2), attributes(readTime)$units,
"\n----------------------------------------------------------------------\n")
summary(res)
}
.extractEmojis = function(textandemojis){
# emojji detection and extraction with "data-plain-text" Flag
textandemojis = strsplit(textandemojis, "<img crossorigin")
tmp = cumsum(lengths(textandemojis))
indsplit = rbind(c(1, head(tmp, -1)+1), tmp)
textandemojis = unlist(textandemojis)
indemoji = grepl(x = textandemojis, pattern = "data-plain-text=\"(.*?)\">")
tmp = stringr::str_extract(string = textandemojis[indemoji],
pattern = "data-plain-text=\"(.*?)\">")
tmp = stringr::str_extract(string = tmp, pattern = "data-plain-text=\"(.*?)\"")
textandemojis[indemoji] = paste(substr(tmp, 18, nchar(tmp)-1),
removeXML(paste("<", textandemojis[indemoji])))
textandemojis = sapply(seq_len(ncol(indsplit)), function(x)
paste(textandemojis[indsplit[1,x]:indsplit[2,x]], collapse = " "))
return(textandemojis)
}
.extractEmojis2 = function(textandemojis){
# emojji detection and extraction with "alt" Flag
textandemojis = strsplit(textandemojis, "<img crossorigin")
tmp = cumsum(lengths(textandemojis))
indsplit = rbind(c(1, head(tmp, -1)+1), tmp)
textandemojis = unlist(textandemojis)
indemoji = grepl(x = textandemojis, pattern = "alt=\"(.*?)\">")
tmp = stringr::str_extract(string = textandemojis[indemoji],
pattern = "alt=\"(.*?)\">")
tmp = stringr::str_extract(string = tmp, pattern = "alt=\"(.*?)\"")
textandemojis[indemoji] = paste(substr(tmp, 6, nchar(tmp)-1),
removeXML(paste("<", textandemojis[indemoji])))
textandemojis = sapply(seq_len(ncol(indsplit)), function(x)
paste(textandemojis[indsplit[1,x]:indsplit[2,x]], collapse = " "))
return(textandemojis)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.