R/geo-tagging.R

Defines functions update_forced_geo_codes update_forced_geo update_topic_keywords retrain_languages update_geotraining_df updated_geotraining_df get_geotraining_df geolocate_text update_languages update_geonames get_country_index_map get_regions_df get_country_codes_by_name get_country_code_map get_country_items get_raw_countries get_todays_sample_tweets get_user_location_columns get_user_location_var get_tweet_location_columns get_tweet_location_var

Documented in geolocate_text get_todays_sample_tweets update_geonames update_languages

# Get the SQL-like expression to extract tweet geolocation variables and apply prioritisation
# This function is used when aggregating tweets to translate variable names into valid SQL column expressions over geotagged tweets
get_tweet_location_var <- function(varname) {
  if(varname == "longitude" || varname == "latitude")
    paste("coalesce("
      , "text_loc.geo_", varname
      , ", linked_text_loc.geo_", varname
      , ")"
      , sep = ""
    )
  else
    paste("coalesce("
      , "text_loc.", varname
      , ", linked_text_loc.", varname
      , ")"
      , sep = ""
    )
}
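
# Example (illustrative, not run): the generated expression coalesces the tweet text location with the linked text location, e.g.
# get_tweet_location_var("longitude")
# # returns "coalesce(text_loc.geo_longitude, linked_text_loc.geo_longitude)"
# get_tweet_location_var("geo_country_code")
# # returns "coalesce(text_loc.geo_country_code, linked_text_loc.geo_country_code)"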

# Get the columns used for tweet geolocation. This is used to limit the columns extracted from the JSON files
get_tweet_location_columns <- function(table) {
  if(table == "tweet")
    list(
    )
  else
    list(
      "text_loc"
      ,"linked_text_loc"
    )
}
# Get the SQL-like expression to extract user geolocation variables and apply prioritisation
# This function is used when aggregating tweets to translate variable names into valid SQL column expressions over geotagged tweets
get_user_location_var <- function(varname) {
  if(varname == "longitude" || varname == "latitude")
    paste("coalesce("
      , "tweet_", varname
      , ", place_", varname
      , ", place_full_name_loc.geo_",varname
      , ", linked_place_", varname
      , ", linked_place_full_name_loc.geo_",varname
      , ", user_location_loc.geo_", varname
      , ", user_description_loc.geo_", varname
      , ")"
      , sep = ""
    )
  else
    paste("coalesce("
      , "place_full_name_loc.",varname
      , ", linked_place_full_name_loc.",varname
      , ", user_location_loc.", varname
      , ", user_description_loc.", varname
      , ")"
      , sep = ""
    )
}
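
# Example (illustrative, not run), showing the prioritisation applied to user coordinates:
# get_user_location_var("latitude")
# # returns "coalesce(tweet_latitude, place_latitude, place_full_name_loc.geo_latitude, linked_place_latitude,
# #          linked_place_full_name_loc.geo_latitude, user_location_loc.geo_latitude, user_description_loc.geo_latitude)"
# # (a single string, wrapped here for readability)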

# Get the columns used for user geolocation. This is used to limit the columns extracted from the JSON files
get_user_location_columns <- function(table) {
  if(table == "tweet")
    list(
      "tweet_longitude"
      , "tweet_latitude"
      , "place_longitude"
      , "place_latitude"
      , "linked_place_longitude"
      , "linked_place_latitude"
    )
  else
    list(
      "user_location_loc"
      , "user_description_loc"
      , "place_full_name_loc"
      , "linked_place_full_name_loc"
      , "place_full_name_loc"
      , "linked_place_full_name_loc"
      )
}

#' @title Get a sample of latest tweet geolocations (deprecated)
#' @description This function was removed from epitweetr v1.0.1. Please use search_tweets instead
#' @param limit Size of the sample, default: 1000
#' @param text_col Name of the tweet field to geolocate; it should be one of the following ("text", "linked_text", "user_description", "user_location", "place_full_name", "linked_place_full_name"),
#' default: 'text'
#' @param lang_col Name of the tweet variable containing the language to evaluate. It should be one of the following ("lang", "linked_lang", NA), default: "lang"
#' @return Data frame containing the sampled tweets and the geolocation metrics
#' @details This function was removed from epitweetr v1.0.1. Please use search_tweets instead.
#' @examples 
#' if(FALSE){
#'    library(epitweetr)
#'    # setting up the data folder
#'    message('Please choose the epitweetr data directory')
#'    setup_config(file.choose())
#'
#'    # geolocating today's tweets
#'    show(get_todays_sample_tweets())
#' }
#' @rdname get_todays_sample_tweets
#' @seealso
#'  \code{\link{download_dependencies}}
#'
#'  \code{\link{update_geonames}}
#'
#'  \code{\link{update_languages}}
#'
#' @export 
get_todays_sample_tweets <- function(limit = 1000, text_col = "text", lang_col = "lang") {
  lifecycle::deprecate_stop("1.0.1", "get_todays_sample_tweets()", "search_tweets()")
}


# Get raw countries data frame from Excel file
get_raw_countries <- function() {
  # obtained from https://gist.github.com/AdrianBinDC/621321d6083b176b3a3215c0b75f8146#file-country-bounding-boxes-md but it has been manually updated since
  df <- readxl::read_excel(get_countries_path()) 
  names(df)<-make.names(names(df),unique = TRUE)
  df$minLat <- as.numeric(df$minLat)
  df$maxLat <- as.numeric(df$maxLat)
  df$minLong <- as.numeric(df$minLong)
  df$maxLong <- as.numeric(df$maxLong)
  df
}
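
# Quick sanity check (illustrative, not run): the rest of this file expects the workbook to provide at least the columns
# region, sub.region, intermediate.region, alpha.2, name, minLat, maxLat, minLong, maxLong, EU.EEA and WHO.Region
# stopifnot(all(
#   c("region", "sub.region", "intermediate.region", "alpha.2", "name",
#     "minLat", "maxLat", "minLong", "maxLong", "EU.EEA", "WHO.Region") %in% names(get_raw_countries())
# ))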


# Get a list of regions, sub-regions and countries for use as a select input
get_country_items <- function(order = "level") {
  `%>%` <- magrittr::`%>%`
  # If the countries are already in the cache we return them, otherwise we compute them
  if(exists(paste("country_items", order, sep = "_"), where = cached)) {
    return (cached[[paste("country_items", order, sep = "_")]])
  }
  else {
    # Getting country data from package embedded excel, ignoring Antarctica
    countries <- get_raw_countries() %>% dplyr::filter(.data$region != "")

    # using intermediate region when available
    countries$sub.region <- mapply(function(int, sub) {if(is.na(int)) sub else int}, countries$intermediate.region, countries$sub.region)
    #getting regions and subregions
    regions <- (
      countries 
        %>% dplyr::group_by(.data$region) 
        %>% dplyr::summarize(minLat = min(.data$minLat), minLong = min(.data$minLong), maxLat = max(.data$maxLat), maxLong = max(.data$maxLong))
        %>% dplyr::ungroup()
    )   
    subregions <- (
      countries 
        %>% dplyr::group_by(.data$region, .data$sub.region) 
        %>% dplyr::summarize(minLat = min(.data$minLat), minLong = min(.data$minLong), maxLat = max(.data$maxLat), maxLong = max(.data$maxLong))
        %>% dplyr::ungroup()
    )   

    # creating list containing results
    items <- list()
    # adding hardcoded world items  
    row <- 1
    items[[row]] <- list(name = "World (all)", codes = list(), level = 0, minLat = -90, maxLat = 90, minLong = -180, maxLong = 180 )
    row <- 2
    items[[row]] <- list(name = "World (geolocated)", codes = as.list(countries$alpha.2), level = 0, minLat = -90, maxLat = 90, minLong = -180, maxLong = 180 )
    for(r in 1:nrow(regions)) {
      row <- row + 1
      region <- regions$region[[r]]
      items[[row]] = list(name = region, codes = list(), level = 1, minLat = 90, maxLat = -90, minLong = 180, maxLong = -180)
      # filling region item
      for(c in 1:nrow(countries)) {
        if(countries$region[[c]] == region) {
          # adding country codes for region
          items[[row]]$codes[[length(items[[row]]$codes)+1]] <- countries$alpha.2[[c]]
          # calculating region bounding box 
          if(countries$minLat[[c]] < items[[row]]$minLat) items[[row]]$minLat <- countries$minLat[[c]]
          if(countries$maxLat[[c]] > items[[row]]$maxLat) items[[row]]$maxLat <- countries$maxLat[[c]]
          if(countries$minLong[[c]] < items[[row]]$minLong) items[[row]]$minLong <- countries$minLong[[c]]
          if(countries$maxLong[[c]] > items[[row]]$maxLong) items[[row]]$maxLong <- countries$maxLong[[c]]
        }
      }
      # Adding elements for subregions
      for(s in 1:nrow(subregions)) {
        if(subregions$region[[s]] == region ) {       
          row <- row + 1
          subregion <- subregions$sub.region[[s]]
          items[[row]] = list(name = subregion, codes = list(), level = 2, minLat = 90, maxLat = -90, minLong = 180, maxLong = -180)
          # filling sub region item
          for(c in 1:nrow(countries)) {
            if(countries$region[[c]] == region && countries$sub.region[[c]] == subregion) {
              # adding country codes for region
              items[[row]]$codes[[length(items[[row]]$codes)+1]] <- countries$alpha.2[[c]]
              # calculating region bounding box 
              if(countries$minLat[[c]] < items[[row]]$minLat) items[[row]]$minLat <- countries$minLat[[c]]
              if(countries$maxLat[[c]] > items[[row]]$maxLat) items[[row]]$maxLat <- countries$maxLat[[c]]
              if(countries$minLong[[c]] < items[[row]]$minLong) items[[row]]$minLong <- countries$minLong[[c]]
              if(countries$maxLong[[c]] > items[[row]]$maxLong) items[[row]]$maxLong <- countries$maxLong[[c]]
            }
          }
          # adding country items
          for(c in 1:nrow(countries)) {
            if(countries$region[[c]] == region && countries$sub.region[[c]] == subregion) {
              row <- row + 1
              country <- countries$name[[c]]
              items[[row]] <- list(
                name = country
                , codes = list(countries$alpha.2[[c]])
                , level = 3
                , minLat = countries$minLat[[c]]
                , maxLat = countries$maxLat[[c]]
                , minLong = countries$minLong[[c]]
                , maxLong = countries$maxLong[[c]]
              )
            }
          }
        }
      }
    }
    
    # Getting EU classification
    eu_zones <- countries$EU.EEA[!is.na(countries$EU.EEA)] %>% unique
    eu_codes <- lapply(eu_zones, function(r) {list(name = r, codes = countries$alpha.2[!is.na(countries$EU.EEA) & countries$EU.EEA == r], level = 0.5)}) 
    eu_codes <- (
      if(length(eu_codes)==0) eu_codes
      else c(eu_codes, list(list(name = paste(eu_zones, collapse = "+"), codes = countries$alpha.2[!is.na(countries$EU.EEA)], level = 0.5)))
    )

    # Getting WHO classification
    who_zones <- countries$WHO.Region[!is.na(countries$WHO.Region)] %>% unique
    who_codes <- lapply(who_zones, function(r) {list(name = r, codes = countries$alpha.2[!is.na(countries$WHO.Region) & countries$WHO.Region == r], level = 0.6)}) 
    specials <- c(eu_codes, who_codes)
    for(s in 1:length(specials)) {
      row <- row + 1
      items[[row]] = list(name = specials[[s]]$name, codes = list(), level = specials[[s]]$level, minLat = 90, maxLat = -90, minLong = 180, maxLong = -180) 
      for(c in 1:nrow(countries)) {
        if(countries$alpha.2[[c]] %in% specials[[s]]$codes) {
          # adding country codes for region
          items[[row]]$codes[[length(items[[row]]$codes)+1]] <- countries$alpha.2[[c]]
          # calculating region bounding box 
          if(countries$minLat[[c]] < items[[row]]$minLat) items[[row]]$minLat <- countries$minLat[[c]]
          if(countries$maxLat[[c]] > items[[row]]$maxLat) items[[row]]$maxLat <- countries$maxLat[[c]]
          if(countries$minLong[[c]] < items[[row]]$minLong) items[[row]]$minLong <- countries$minLong[[c]]
          if(countries$maxLong[[c]] > items[[row]]$maxLong) items[[row]]$maxLong <- countries$maxLong[[c]]
        }
      }

    }
    # Items are natively ordered as a hierarchy, but the default of this function is to order by level and name
    if(order == "level")
      items <- items[order(sapply(items,function(i) {paste(i$level, i$name)}))]

    # Setting the cache
    cached[[paste("country_items", order, sep="_")]] <- items    
    return(items)
  }
}
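
# usage (illustrative, not run)
# items <- get_country_items()
# # each item is a list with a name, the alpha-2 country codes it covers, a level
# # (0 = world, 0.5 = EU/EEA groupings, 0.6 = WHO regions, 1 = region, 2 = sub-region, 3 = country)
# # and a bounding box (minLat, maxLat, minLong, maxLong)
# items[[1]]$name
# sapply(items, function(i) i$level)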

# Get the map from country codes to country names
# usage 
# map <- get_country_code_map()
# map[["FR"]] 
get_country_code_map <- function() {
  regions <- get_country_items()  
  countries <- regions[sapply(regions, function(i) i$level == 3)]
  return(setNames(sapply(countries, function(r) r$name), sapply(countries, function(r) r$codes[[1]])))
}

# Get country codes by name
# usage 
# map <- get_country_codes_by_name()
# map[["Asia"]] 
get_country_codes_by_name <- function() {
  regions <- get_country_items()  
  return(setNames(lapply(regions, function(r) r$codes), sapply(regions, function(r) r$name)))
}

# Get regions data as data frame
get_regions_df <- function() {
  regions <- get_country_items()
  data.frame(
    Name = sapply(regions, function(r) r$name) , 
    Codes = sapply(regions, function(r) paste(r$codes, collapse = ", ")), 
    Level = sapply(regions, function(r) r$level), 
    MinLatitude = as.numeric(sapply(regions, function(r) r$minLat)), 
    MaxLatitude = as.numeric(sapply(regions, function(r) r$maxLat)), 
    MinLongitude = as.numeric(sapply(regions, function(r) r$minLong)), 
    MaxLongitude = as.numeric(sapply(regions, function(r) r$maxLong))
  ) 
}
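
# usage (illustrative, not run)
# df <- get_regions_df()
# # regions only (level 1) with the country codes they aggregate
# df[df$Level == 1, c("Name", "Codes")]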

# Get country indexes
# usage 
# map <- get_country_index_map()
# map[["FR"]] 
get_country_index_map <- function() {
  regions <- get_country_items()
  all_index <- 1:length(regions) 
  indexes <- all_index[sapply(all_index, function(i) regions[[i]]$level == 3)]
  return(setNames(indexes, sapply(indexes, function(i) regions[[i]]$codes[[1]])))
}

#' @title Updates the local copy of the GeoNames database
#' @description Downloading and indexing a fresh version of the GeoNames database from the provided URL
#' @param tasks Tasks object for reporting progress and error messages, default: get_tasks()
#' @return The list of tasks updated with produced messages
#' @details Run a one shot task to download and index a local copy of the \href{http://www.geonames.org/}{GeoNames database}. 
#' The GeoNames geographical database covers all countries and contains over eleven million place names, available under a Creative Commons Attribution 4.0 License. 
#'
#' The URL to download the database from is set on the configuration tab of the Shiny app, in case it changes.
#'
#' The indexing is implemented in Spark and Lucene.
#'
#' A prerequisite to this function is that the \code{\link{search_loop}} must already have stored collected tweets in the search folder and that the task \code{\link{download_dependencies}}
#' has been successfully run.
#'
#' Normally this function is not called directly by the user but from the \code{\link{detect_loop}} function.
#' @examples 
#' if(FALSE){
#'    library(epitweetr)
#'    # setting up the data folder
#'    message('Please choose the epitweetr data directory')
#'    setup_config(file.choose())
#'
#'    # geolocating last tweets
#'    tasks <- update_geonames()
#' }
#' @rdname update_geonames
#' @seealso
#'  \code{\link{download_dependencies}}
#'
#'  \code{\link{detect_loop}}
#'
#'  \code{\link{get_tasks}}
#'  
#' @export 
#' @importFrom utils download.file unzip 
update_geonames <- function(tasks = get_tasks()) {
  stop_if_no_config()
  tasks <- tryCatch({
    tasks <- update_geonames_task(tasks, "running", "downloading", start = TRUE)
    # Create geo folder if it does not exist
    if(!file.exists(file.path(conf$data_dir, "geo"))) {
      dir.create(file.path(conf$data_dir, "geo"))   
    }
    # Downloading GeoNames
    temp <- tempfile()
    temp_dir <- tempdir()
    download.file(tasks$geonames$url, temp, mode="wb")
    
    # Decompressing file
    tasks <- update_geonames_task(tasks, "running", "decompressing")
    unzip(zipfile = temp, files = "allCountries.txt", exdir = temp_dir)
    file.remove(temp)
    file.rename(from = file.path(temp_dir, "allCountries.txt"), to = get_geonames_txt_path())
    
    # Assembling GeoNames datasets 
    # using Spark job with entry point updateGeonames part 'ASSEMBLE'
    tasks <- update_geonames_task(tasks, "running", "assembling")
    spark_job(
       paste(
        "updateGeonames"
        , conf_geonames_as_arg()
        , "assemble", paste("\"TRUE\"",sep="") 
        , "index", paste("\"FALSE\"",sep="") 
        , "parallelism", conf$spark_cores 
      )
    )
    
    # Indexing GeoNames datasets
    tasks <- update_geonames_task(tasks, "running", "indexing")
    # using spark job with entry point updateGeonames part 'INDEX'
    spark_job(
      paste(
        "updateGeonames"
        , conf_geonames_as_arg()
        , "assemble", paste("\"FALSE\"",sep="") 
        , "index", paste("\"TRUE\"",sep="") 
        , "parallelism", conf$spark_cores 
      )
    )
    
    # Setting status to success
    tasks <- update_geonames_task(tasks, "success", "", end = TRUE)
    tasks
  }, warning = function(error_condition) {
    # Setting status to failed
    tasks <- update_geonames_task(tasks, "failed", paste("failed while", tasks$geonames$message," geonames", error_condition))
    tasks
  }, error = function(error_condition) {
    # Setting status to failed
    tasks <- update_geonames_task(tasks, "failed", paste("failed while", tasks$geonames$message," geonames", error_condition))
    tasks
  })
  return(tasks)
}

 
#' @title Updates local copies of languages
#' @description Downloading and indexing a fresh version of language models tagged for update on the Shiny app configuration tab
#' @param tasks Tasks object for reporting progress and error messages, default: get_tasks()
#' @return The list of tasks updated with produced messages
#' @details Run a one shot task to download and index local copies of the fastText \href{https://fasttext.cc/docs/en/crawl-vectors.html}{pretrained models}. 
#' A fastText model is a collection of word vectors for a language, automatically produced from a large text corpus, that can be used to capture the semantics of a word.
#'
#' The URLs to download the vectors from are set on the configuration tab of the Shiny app.
#'
#' This task will also update SVM models to predict whether a word is a location that will be used in the geolocation process.
#'
#' The indexing is implemented in Spark and Lucene.
#'
#' A prerequisite to this function is that the \code{\link{search_loop}} must already have stored collected tweets in the search folder and that the tasks \code{\link{download_dependencies}}
#' and \code{\link{update_geonames}} have been run successfully.
#'
#' Normally this function is not called directly by the user but from the \code{\link{detect_loop}} function.
#' @examples 
#' if(FALSE){
#'    library(epitweetr)
#'    # setting up the data folder
#'    message('Please choose the epitweetr data directory')
#'    setup_config(file.choose())
#'
#'    # updating language tasks
#'    tasks <- update_languages()
#' }
#' @rdname update_languages
#' @seealso
#'  \code{\link{download_dependencies}}
#'
#'  \code{\link{update_geonames}}
#'
#'  \code{\link{detect_loop}}
#' 
#'  \code{\link{get_tasks}}
#'  
#' @export 
#' @importFrom utils download.file 
update_languages <- function(tasks = get_tasks()) {
  stop_if_no_config()
  tasks <- tryCatch({
    tasks <- update_languages_task(tasks, "running", "downloading", start = TRUE)

    # Create languages folder if it does not exist
    if(!file.exists(file.path(conf$data_dir, "languages"))) {
      dir.create(file.path(conf$data_dir, "languages"))   
    }

    # Executing actions per language
    i <- 1
    while(i <= length(tasks$languages$statuses)) {
      # downloading when necessary
      if(tasks$languages$statuses[[i]] %in% c("to add", "to update")) {
        tasks <- update_languages_task(tasks, "running", paste("downloading", tasks$languages$name[[i]]), lang_code = tasks$languages$codes[[i]], lang_start = TRUE)
        temp <- tempfile()
        # downloading the file
        download.file(tasks$languages$url[[i]],temp,mode="wb")
        file.rename(from = file.path(temp), to = tasks$languages$vectors[[i]])
        tasks <- update_languages_task(tasks, "running", paste("downloaded", tasks$languages$name[[i]]), lang_code = tasks$languages$codes[[i]], lang_done = TRUE)
      } else if(tasks$languages$statuses[[i]] %in% c("to remove")) {
        # requesting to delete language data
        # deleting language files
        code <- tasks$languages$codes[[i]] 
        if(file.exists(get_lang_vectors_path(code))) file.remove(get_lang_vectors_path(code))
        if(dir.exists(get_lang_model_path(code))) unlink(get_lang_model_path(code), recursive = TRUE)
        if(file.exists(get_lang_stamp_path(code))) file.remove(get_lang_stamp_path(code))
        tasks <- update_languages_task(tasks, "running", paste("deleted", tasks$languages$name[[i]]), lang_code = code, lang_removed = TRUE)
        i <- i - 1
      }
      i <- i + 1
    }
    # Indexing languages after changes
    # using the Spark job with entry point updateLanguages 
    update_languages_task(tasks, "running", "updating training set & indexing")
    update_geotraining_df()
    retrain_languages()
    update_geotraining_df()

    # Setting status to success
    tasks <- update_languages_task(tasks, "success", "", end = TRUE)
    tasks
  }, warning = function(error_condition) {
    # Setting status to failed
    tasks <- update_languages_task(tasks, "failed", paste("failed while", tasks$languages$message," languages", error_condition))
    tasks
  }, error = function(error_condition) {
    # Setting status to failed
    tasks <- update_languages_task(tasks, "failed", paste("failed while", tasks$languages$message," languages", error_condition))
    tasks
  })
  return(tasks)
}

 
#' @title Geolocate texts in a data frame given a text column and optionally a language column
#' @description Extracts geolocation information from the texts in a column of the provided data frame and returns a new data frame with geolocation information
#' @param df A data frame containing at least a character column with text; a column with the language can also be provided to improve geolocation quality
#' @param text_col character, name of the column of the data frame containing the text to geolocate, default: 'text'
#' @param lang_col character, name of the column of the data frame containing the language of the texts, default: NULL
#' @param min_score numeric, the minimum score obtained on the Lucene scoring function to accept matches on GeoNames. It has to be set empirically, default: NULL
#' @return A new data frame containing the following geolocation columns: geo_code, geo_country_code, geo_country, geo_name, tags
#' @details This function performs a call to the epitweetr database, which includes functionality for geolocating texts in the languages activated and successfully processed on the Shiny app.
#' 
#' The geolocation process tries to find the best match in GeoNames database \url{https://www.geonames.org/} including all local aliases for words.
#'
#' If no language is associated to the text, all tokens will be sent as a query to the indexed GeoNames database.
#'
#' If a language code is associated to the text and this language is trained on epitweetr, entity recognition techniques will be used to identify the best candidate in text to contain a location
#' and only these tokens will be sent to the GeoNames query.
#'
#' A custom scoring function is implemented, granting more weight to cities (increasing with population) in order to help disambiguation.
#'
#' Rules for forcing the geolocation choices of the algorithms and for tuning performance with manual annotations can be performed on the geotag tab of the Shiny app.
#'
#' A prerequisite to this function is that the tasks \code{\link{download_dependencies}}, \code{\link{update_geonames}} and \code{\link{update_languages}} have been run successfully.
#'
#' This function is called from the Shiny app on the geolocation evaluation tab, but it can also be used to manually evaluate the epitweetr geolocation algorithm.
#' @examples 
#' if(FALSE) {
#'    library(epitweetr)
#'    # setting up the data folder
#'    message('Please choose the epitweetr data directory')
#'    setup_config(file.choose())
#'
#'    # creating a test dataframe
#'    df <- data.frame(text = c("Me gusta Santiago de Chile es una linda ciudad"), lang = c("es"))
#'    geo <- geolocate_text(df = df, text_col = "text", lang_col="lang") 
#'    
#' }
#' @rdname geolocate_text
#' @seealso
#'  \code{\link{download_dependencies}}
#'
#'  \code{\link{update_geonames}}
#'
#'  \code{\link{detect_loop}}
#' 
#' @export 
#' @importFrom utils download.file 
geolocate_text <- function(df, text_col = "text", lang_col=NULL, min_score = NULL) {
  stop_if_no_config()
  stop_if_no_fs()
  
  `%>%` <- magrittr::`%>%`
  # when no language column is provided, add a dummy NA language column so all tokens are sent to the query
  if(is.null(lang_col)) {
    lang_col <- "lang"
    df[[lang_col]] <- NA
  }
  to_geolocate = df %>% 
    dplyr::transmute(id = as.character(dplyr::row_number()), text = .data[[text_col]], lang =.data[[lang_col]] ) %>%
    jsonlite::toJSON()
  geo_uri = paste(get_scala_geolocate_text_url(), "?jsonnl=true", sep = "")
  if(!is.null(min_score)) {
    geo_uri <- paste(geo_uri,"&minScore=", min_score, sep = "")
  }
  ret = stream_post(uri = geo_uri, body = to_geolocate) %>%
    dplyr::arrange(as.integer(.data$id))
  for(col in c("geo_code", "geo_country_code", "geo_country", "geo_name", "tags"))
    if(!exists(col, where = ret))
      ret[[col]] <- NA
  dplyr::select(ret, -which(names(ret) %in% c("text", "id", "lang")))
}

# returns the current geotraining annotations as a data frame
get_geotraining_df <- function() {
  `%>%` <- magrittr::`%>%`
  data_types<-c("text","text", "text", "text", "text", "text", "text", "text", "text", "text", "text", "text", "text")
  current <- readxl::read_excel(get_geotraining_path(), col_types = data_types)
  current$Text <- stringr::str_trim(current$Text)
  current <- current %>% dplyr::distinct(.data$`Text`, .data$Lang, .data$`Location in text`, .keep_all = T)
  current
}

# returns the current geotraining annotations augmented with tweets_to_add tweets to annotate
updated_geotraining_df <- function(tweets_to_add = 100, progress = function(a, b) {}) {
  `%>%` <- magrittr::`%>%`
  progress(0.1, "getting current training file")
  current <- get_geotraining_df()
  locations <- current %>% dplyr::filter(.data$Type == "Location" & .data$`Location yes/no` %in% c("yes", "OK"))
  add_locations <- if(nrow(locations) == 0) "&locationSamples=true" else "&locationSamples=false" 
  non_locations <- current %>% dplyr::filter(.data$Type == "Location" & .data$`Location yes/no` %in% c("KO", "no"))
  non_location_langs <- unique(non_locations$Lang)
  excl_langs <- if(length(non_location_langs) > 0) paste("&excludedLangs=", paste(non_location_langs, collapse = "&excludedLangs="), sep = "") else "" 

  progress(0.3, "obtaining locations from downloaded models and geonames")
  geo_training_uri <- paste(get_scala_geotraining_url(), "?jsonnl=true", add_locations, excl_langs, sep = "")
  geo_training_url <- url(geo_training_uri)
  geo_training <- jsonlite::stream_in(geo_training_url) 
  if(nrow(geo_training) > 0) {
    geo_training <- geo_training %>%
      dplyr::transmute(
        Type = "Location", 
        `Text` = .data$word, 
        `Location in text` = NA, 
        `Location yes/no` = ifelse(.data$isLocation, "yes", "no"), 
        `Associate country code` = NA, 
        `Associate with`=NA, 
        `Source` = "Epitweetr model", 
        `Tweet Id` = NA, 
        `Lang` = .data$lang,
        `Tweet part` = NA, 
        `Epitweetr match` = NA,
        `Epitweetr country match` = NA,
        `Epitweetr country code match` = NA
      )
    geo_training$Text <- stringr::str_trim(geo_training$Text)
    geo_training <- geo_training %>%  dplyr::distinct(.data$`Text`, .data$Lang, .keep_all = T)
  }
  # Adding new tweets
  untagged <- length(unique((current %>% dplyr::filter(.data$`Tweet part` == 'text' & .data$`Location yes/no` == '?'))$`Tweet Id`))
  to_add <- if(tweets_to_add > untagged) tweets_to_add else 0


  progress(0.6, "adding new tweets")
  to_tag <- search_tweets(query = paste("lang:", lapply(conf$languages, function(l) l$code), collapse = " ", sep = ""), max = to_add)
  if(nrow(to_tag) > 0) {
    to_tag$text <- stringr::str_trim(to_tag$text)
    to_tag$user_description <- stringr::str_trim(to_tag$user_description)
    to_tag$user_location <- stringr::str_trim(to_tag$user_location)
    texts <- to_tag %>%
      dplyr::transmute(
        Type = "Text", 
        `Text`=.data$text,
        `Location in text` = NA, 
        `Location yes/no` = "?", 
        `Associate country code` = NA, 
        `Associate with`=NA, 
        `Source` = "Tweet", 
        `Tweet Id` = .data$tweet_id, 
        `Lang` = .data$lang,
        `Tweet part` = "text", 
        `Epitweetr match` = NA,
        `Epitweetr country match` = NA,
        `Epitweetr country code match` = NA
      ) %>%
      dplyr::filter(!.data$`Text` %in% current$`Text`) %>%
      dplyr::distinct(.data$`Text`, .data$Lang, .keep_all = T)
    
    user_desc <- to_tag %>%
      dplyr::filter(!is.na(.data$user_description)) %>%
      dplyr::transmute(
        Type = "Text", 
        `Text`=.data$user_description,
        `Location in text` = NA, 
        `Location yes/no` = "?", 
        `Associate country code` = NA, 
        `Associate with`= NA, 
        `Source` = "Tweet", 
        `Tweet Id` = .data$tweet_id, 
        `Lang` = .data$lang,
        `Tweet part` = "user description", 
        `Epitweetr match` = NA,
        `Epitweetr country match` = NA,
        `Epitweetr country code match` = NA,
      ) %>%
      dplyr::filter(!.data$`Text` %in% current$`Text`) %>%
      dplyr::distinct(.data$`Text`, .data$Lang, .keep_all = T)
  
    user_loc <- to_tag %>%
      dplyr::filter(!is.na(.data$user_location)) %>%
      dplyr::transmute(
        Type = "Location",
        `Text`=.data$user_location,
        `Location in text` = NA, 
        `Location yes/no` = "?", 
        `Associate country code` = NA, 
        `Associate with`=NA, 
        `Source` = "Tweet", 
        `Tweet Id` = .data$tweet_id, 
        `Lang` = .data$lang,
        `Tweet part` = "user location", 
        `Epitweetr match` = NA,
        `Epitweetr country match` = NA,
        `Epitweetr country code match` = NA
      ) %>%
      dplyr::filter(!.data$`Text` %in% current$`Text`) %>%
      dplyr::distinct(.data$`Text`, .data$Lang, .keep_all = TRUE)

      ret <- jsonlite::rbind_pages(list(
        current,
        geo_training,
        texts,
        user_desc,
        user_loc
      ))
    } else {
      ret <- jsonlite::rbind_pages(list(
        current,
        geo_training
      ))
    }

  ret <- ret %>% dplyr::filter(!is.na(.data$Text) & !.data$Text == "") %>%
    dplyr::distinct(.data$`Text`, .data$Lang, .keep_all = TRUE)

  text_togeo <- ret %>% dplyr::transmute(
    Text = ifelse(!is.na(.data$`Associate with`), .data$`Associate with`, .data$Text), 
    Lang = ifelse(!is.na(.data$`Associate with`), "all", .data$Lang)
  ) 
  #getting current geolocation evaluation
  progress(0.7, "geolocating all items")
  tryCatch({
      geoloc = geolocate_text(df = text_togeo, text_col="Text", lang_col = "Lang")
      ret$`Epitweetr match` <- geoloc$geo_name
      ret$`Epitweetr country match` <- geoloc$geo_country
      ret$`Epitweetr country code match` <- geoloc$geo_country_code
      ret$`Location in text` <- ifelse((is.na(ret$`Location yes/no`) | ret$`Location yes/no`=="?") & ret$Type == "Text", geoloc$tags, ret$`Location in text`)
    }
    ,error = function(e) {
      message("Models are not trained, getting geotraining dataset without evaluation")
      # use <<- so the reset is applied to ret in the enclosing environment, not to a local copy
      ret$`Epitweetr match` <<- NA
      ret$`Epitweetr country match` <<- NA
      ret$`Epitweetr country code match` <<- NA
    }
  )
  ret %>% dplyr::arrange(dplyr::desc(.data$Type), .data$`Tweet part`)
}

# returns the current geotraining annotations augmented with tweets_to_add tweets to annotate and writes the result to the geotraining spreadsheet
update_geotraining_df <- function(tweets_to_add = 100, progress = function(a, b) {}) {
  `%>%` <- magrittr::`%>%`
  training_df <- updated_geotraining_df(tweets_to_add = tweets_to_add, progress = progress)
  update_topic_keywords()
  update_forced_geo()
  update_forced_geo_codes()
  if(is.function(progress)) progress(0.9, "writing result file")
  wb <- openxlsx::createWorkbook()
  openxlsx::addWorksheet(wb, "geolocation")
  # writing data to the worksheet
  openxlsx::writeDataTable(wb, sheet = "geolocation", training_df, colNames = TRUE, startRow = 1, startCol = "A")
  # setting some minimal formatting
  openxlsx::setColWidths(wb, "geolocation", cols = c(2), widths = c(70))
  openxlsx::setColWidths(wb, "geolocation", cols = c(3, 11, 12, 13), widths = c(25))
  openxlsx::setColWidths(wb, "geolocation", cols = c(4, 5, 6, 10), widths = c(17))
  openxlsx::setRowHeights(wb, "geolocation", rows = 1, heights = 20)
  openxlsx::addStyle(
    wb, 
    sheet = "geolocation", 
    style = openxlsx:: createStyle(fontSize = 10, halign = "center", fgFill = "#ff860d", border = c("top", "bottom", "left", "right"), textDecoration="bold", wrapText = T, fontColour = "#222222"), 
    rows = 1, 
    cols = 1, 
    gridExpand = FALSE
  )
  openxlsx::addStyle(
    wb, 
    sheet = "geolocation", 
    style = openxlsx:: createStyle(fontSize = 10, halign = "center", fgFill = "#ffde59", border = c("top", "bottom", "left", "right"), textDecoration="bold", wrapText = T, fontColour = "#222222"), 
    rows = 1, 
    cols = 2:6, 
    gridExpand = FALSE
  )
  openxlsx::addStyle(
    wb, 
    sheet = "geolocation", 
    style = openxlsx:: createStyle(fontSize = 10, halign = "center", fgFill = "#f7d1d5", border = c("top", "bottom", "left", "right"), textDecoration="bold", wrapText = T, fontColour = "#222222"), 
    rows = 1, 
    cols = 7:10, 
    gridExpand = FALSE
  )
  openxlsx::addStyle(
    wb, 
    sheet = "geolocation", 
    style = openxlsx:: createStyle(fontSize = 10, halign = "center", fgFill = "#dedce6", border = c("top", "bottom", "left", "right"), textDecoration="bold", wrapText = T, fontColour = "#222222"), 
    rows = 1, 
    cols = 11:13, 
    gridExpand = FALSE
  )
  openxlsx::addStyle(
    wb, 
    sheet = "geolocation", 
    style = openxlsx:: createStyle(fontSize = 10, border = c("top", "bottom", "left", "right")), 
    rows = 1:nrow(training_df)+1, 
    cols = 1:13, 
    gridExpand = TRUE
  )
  openxlsx::addStyle(
    wb, 
    sheet = "geolocation", 
    style = openxlsx:: createStyle(fontSize = 10, border = c("top", "bottom", "left", "right"), wrapText=T), 
    rows = 2:nrow(training_df)+1, 
    cols = 2, 
    gridExpand = TRUE
  )
  openxlsx::freezePane(wb, "geolocation",firstActiveRow = 2)
  openxlsx::saveWorkbook(wb, get_user_geotraining_path() ,overwrite = T) 
}

# retrain the language classifiers for entity recognition with the current set of annotations
retrain_languages <- function() {
  `%>%` <- magrittr::`%>%`
  body <- get_geotraining_df() %>% jsonlite::toJSON()
  post_result <- httr::POST(url=get_scala_geotraining_url(), httr::content_type_json(), body=body, encode = "raw", encoding = "UTF-8")
  if(httr::status_code(post_result) != 200) {
    stop(paste("retrain web service failed with the following output: ", substring(httr::content(post_result, "text", encoding = "UTF-8"), 1, 100), sep  = "\n"))
  } else {
    fileConn<-file(get_geotraining_evaluation_path(), encoding = "UTF-8")
    writeLines(httr::content(post_result, "text", encoding = "UTF-8"), fileConn)
    close(fileConn)
  }
}

# update the topic keywords json file; when several location entities are found in a text, the one closest to a topic keyword is chosen
update_topic_keywords <- function() {
  `%>%` <- magrittr::`%>%`
   keywords <- lapply(strsplit(gsub("\\-\\w*\\b|\"", "", unlist(lapply(conf$topics, function(t) t$query))), "OR|AND|NOT|\\(|\\)"), function(t) {x <- gsub("^ *| *$", "", t); x[x != ""]} )
   topicKeywords <- setNames(keywords, sapply(conf$topics, function(t) t$topic))
   ret <- list()
   for(i in 1:length(conf$topics)) 
     if(exists(conf$topics[[i]]$topic, where = ret)) ret[[conf$topics[[i]]$topic]] <- unique(c(ret[[conf$topics[[i]]$topic]], topicKeywords[[i]])) 
     else ret[[conf$topics[[i]]$topic]] <- unique(topicKeywords[[i]])
   write_json_atomic(ret, get_topic_keywords_path(), pretty = TRUE, force = TRUE, auto_unbox = TRUE)
}
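
# Illustration (not run) of the keyword extraction above, on a hypothetical topic query:
# q <- 'dengue OR "dengue fever" -vaccine'
# lapply(strsplit(gsub("\\-\\w*\\b|\"", "", q), "OR|AND|NOT|\\(|\\)"), function(t) {x <- gsub("^ *| *$", "", t); x[x != ""]})
# # returns list(c("dengue", "dengue fever")): quotes and negated terms are dropped,
# # boolean operators and parentheses act as keyword separators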

# update the forced geo json file listing words that will be associated with particular location names, overriding the geolocation algorithm
update_forced_geo <- function() {
  `%>%` <- magrittr::`%>%`
  df <- get_geotraining_df() %>% dplyr::transmute(from = ifelse(is.na(.data$`Location in text`), .data$`Text`, .data$`Location in text`), to = .data$`Associate with`) %>% dplyr::filter(!is.na(.data$to))
  ret <- list()
  if(nrow(df) > 0) {
    for(i in 1:nrow(df)) {
      toReplace <- sapply(strsplit(df$from[[i]], ",")[[1]], function(t) {x <- gsub("^ *| *$", "", t); x[x != ""]})
      for(j in 1:length(toReplace)) ret[[toReplace[[j]]]] <- df$to[[i]]
    }
    write_json_atomic(ret, get_forced_geo_path(), pretty = TRUE, force = TRUE, auto_unbox = TRUE)
  } else {
    if(file.exists(get_forced_geo_path()))
      file.remove(get_forced_geo_path())
  }
}
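
# Illustration (not run), assuming a hypothetical annotation row with `Location in text` = "NYC, Big Apple"
# and `Associate with` = "New York City": the resulting forced geo file would contain
# # {
# #   "NYC": "New York City",
# #   "Big Apple": "New York City"
# # }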

# update the forced geo codes json file listing words that will be associated with particular country codes, overriding the geolocation algorithm
update_forced_geo_codes <- function() {
  `%>%` <- magrittr::`%>%`
  df <- get_geotraining_df() %>% dplyr::transmute(from = ifelse(is.na(.data$`Location in text`), .data$`Text`, .data$`Location in text`), to = .data$`Associate country code`) %>% dplyr::filter(!is.na(.data$to))
  ret <- list()
  if(nrow(df) > 0) {
    for(i in 1:nrow(df)) {
      toReplace <- sapply(strsplit(df$from[[i]], ",")[[1]], function(t) {x <- gsub("^ *| *$", "", t); x[x != ""]})
      for(j in 1:length(toReplace)) ret[[toReplace[[j]]]] <- df$to[[i]]
    }
    write_json_atomic(ret, get_forced_geo_codes_path(), pretty = TRUE, force = TRUE, auto_unbox = TRUE)
  } else {
    if(file.exists(get_forced_geo_codes_path()))
      file.remove(get_forced_geo_codes_path())
  }
}
