amcat-r: R bindings for the AmCAT API

Documented in amcat.add.articles.to.set amcat.articles amcat.connect amcat.flush amcat.getobjects amcat.getpages amcat.getURL amcat.runaction amcat.save.password amcat.upload.articles publisher.column quanteda.corpus scroll title.column

#' Connect to the AmCAT API
#'
#' Connect to the AmCAT API and requests a temporary (24h) authentication token that will be stored in the output
#' The host should be known in the ~/.amcatauth file, you can use amcat.save.password to add a password to this file
#' 
#' @param host the hostname, e.g. http://amcat.vu.nl or http://localhost:8000
#' @param token an existing token to authenticate with. If given, username and password are not used and the token is not tested
#' @param disable_ipv6. If True, only use ipv4 resolving (faster if ipv6 causes timeout). Defaults to true, but this may change in the future.
#' @return A list with authentication information that is used by the other functions in this package
#' @param passwordfile optionally, specify a different password file
#' @export
amcat.connect <- function(host,token=NULL, disable_ipv6=TRUE, ssl.verifypeer=FALSE,  passwordfile="~/.amcatauth") {
  opts = list(ssl.verifypeer = ssl.verifypeer)
  if (disable_ipv6) opts = c(opts, list(ipresolve=1))
  
  path= '/api/v4/get_token'
  if (is.null(token)) {
      a = tryCatch(.readauth(host, passwordfile=passwordfile), error=function(e) warning("Could not read ", passwordfile))
      if (is.null(a)) stop("Cannot find password in ", passwordfile, ", please add an entry to this file by using amcat.save.password!")
      username = a$username
      passwd = a$password
      
    # get auth token
    url = paste(host, path, sep='')
    form = list(username=username, password=passwd)
    r = httr::POST(url, body=form)
    stop_for_status(r, task="login")
    } else {
     conn = list(host=host, token=token, opts=opts)
     res = amcat.getURL(conn, "/api/v4/get_token", post=T, filters=list(dummy=1)) # dummy to prevent warning msg, sorry
    }
  
  content = httr::content(r, as="parsed")
  token = content$token
  version = content$version
  if (is.null(version)) version = "0"
  
  list(host=host, token=token, version=version, opts=opts)
}

# Get authentication info for a host from password file
.readauth <- function(host, passwordfile="~/.amcatauth") {
  if (!file.exists(passwordfile)) return()
  rows = read.csv(passwordfile, header=F, stringsAsFactors=F)
  colnames(rows) <- c("host", "username", "password")
  r = rows[rows$host == '*' | rows$host == host,]
  if (nrow(r) > 0) list(username=r$username[1], password=r$password[1]) 
}

#' Add or change an entry in the cached authentication file
#' 
#' amcat-r uses the file '~/.amcatauth' to read credentials for connecting to servers
#' to prevent passwords from appearing in script files. This function will update
#' the .amcatauth file (if present) to add or change the password for the given host
#' 
#' @param host the host, e.g. "https://amcat.nl"
#' @param username the AmCAT username to use
#' @param password the password for the AmCAT user
#' @param passwordfile optionally, specify a different password file
#' @export
amcat.save.password <- function(host, username, password, passwordfile="~/.amcatauth") {
  existing = if (file.exists(passwordfile)) read.csv(passwordfile, header=F, stringsAsFactors=F) else data.frame(V1=character(0),V2=character(0),V3=character(0))
  if (host %in% existing$V1) {
    existing$V2[existing$V1 == host] = username
    existing$V3[existing$V1 == host] = password
  } else {
    existing = rbind(existing, data.frame(V1=host, V2=username, V3=password, stringsAsFactors = F))
  }
  write.table(existing, file=passwordfile, sep=",",  col.names=FALSE, row.names=F)
}
#' Retrieve a single URL from AmCAT with authentication and specified filters (GET or POST) 
#' 
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @param path the path of the url to retrieve (using the host from conn)
#' @param filters a named vector of filters, e.g. c(project=2, articleset=3)
#' @param post use HTTP POST instead of GET
#' @return the raw result
#' @export
amcat.getURL <- function(conn, path, filters=NULL, post=FALSE, verbose=TRUE) {
  httpheader = c(Authorization=paste("Token", conn$token))
  url = httr::parse_url(conn$host)
  url$path = paste(path, sep="/")
  # strip NULL filters
  for (n in names(filters)) if (is.null(filters[[n]])) filters[[n]] <- NULL
  if (!post) {
    # convert list(a=c(1,2)) to list(a=1, a=2). From: http://stackoverflow.com/a/22346656
    url$query = structure(do.call(c, lapply(filters, function(z) as.list(z))), names=rep(names(filters), sapply(filters, length)))
    # build GET url query
    url = httr::build_url(url)
    if (verbose) message("GET ", url)
    result = httr::GET(url, httr::add_headers(httpheader))
    stop_with_message(result)
    result
  } else {  
    result = httr::POST(url, config=httr::add_headers(httpheader), body=filters)
    stop_with_message(result)
    result
  }
}

stop_with_message= function(httr_result) {
  if (httr::http_error(httr_result)) {
    error = httr::content(httr_result, as="raw")
    if (str_starts(httr_result$headers$`content-type`, 'application/x-r-rda')) {
      error = .load.rda(error)
    } 
    error_type = floor(httr_result$status_code / 100)
    if (class(error) == "list" & !is.null(error$detail)) error=error$detail
    if (error_type == 5) msg = "This seems to be an AmCAT server error. Please see the server logs or create an issue at http://github.com/amcat/amcat/issues"
    if (error_type == 4) msg = "It looks like you did something wrong, or there is something wrong in the amcat-r library. Please check your command and the error message below, or create an issue at http://github.com/amcat/amcat-r/issues"
    stop(paste0("Unexpected Response Code: ", httr_result$status_code, "\n", toString(error), "\n", msg))
  }
}

#' Get and rbind pages from the AmCAT API
#' 
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @param path the path of the url to retrieve (using the host from conn)
#' @param filters a named vector of filters, e.g. c(project=2, articleset=3)
#' @param post use HTTP POST instead of GET
#' @param page the page number to start retrieving
#' @param page_size the number of rows per page
#' @param max_page the page number to stop retrieving at, if given
#' @return dataframe 
#' @export
amcat.getpages <- function(conn, path, format=NULL, page=1, page_size=1000, filters=NULL, 
                           post=FALSE,  max_page=NULL, rbind_results=format != "json",
                           verbose=TRUE) {

  if (is.null(format)) format = if (.has.version(conn$version, "3.4.2")) "rda" else "csv" 
  filters = c(filters, page_size=page_size, format=format)
  result = list()
  npages = "?"
  while (TRUE) {
    if (!is.null(max_page)) if (page > max_page) break
    page_filters = c(filters, page=page)
    subresult = amcat.getURL(conn, path, page_filters, post=post, verbose=verbose)
    if (format == "rda") {
      res = .load.rda(httr::content(subresult))
      npages = res$pages
      subresult = res$result
      result = c(result, list(subresult))
      if (page >= npages) break
    } else if (format == "json") {
      subresult = httr::content(subresult)
      npages = subresult$pages
      result = c(result, subresult$results)
      if (page >= npages) break
    } else  {
      if (is.null(subresult) || subresult == "") break
      subresult = .amcat.readoutput(subresult, format=format)
      result = c(result, list(subresult))
      if(nrow(subresult) < page_size) break
    }
    if (verbose) message("Retrieved page ",page,"/",npages, "; last page had ", nrow(subresult), " result rows")
    page = page + 1
  }
  if (rbind_results) result = dplyr::bind_rows(result)
  result
}

.load.rda <- function(bytes) {
  e = new.env()
  c = rawConnection(bytes)
  load(c, envir = e)
  close(c)
  as.list(e)
}
  
.read.version <- function(vstr) {
  if (!is.null(vstr)) {
    
    m = stringr::str_match(vstr, "(\\d+)\\.(\\d+)(\\.(\\w+))?\\s*")
    if (!is.na(m[[1]]))  {
      return(list(major=as.numeric(m[[2]]), minor=as.numeric(m[[3]]), patch=m[[4]]))
    }
  }
  list(major=0, minor=0, patch=0)
}

.has.version <- function(actual, required) {
  actual = .read.version(actual)
  required = .read.version(required)
  
  if (actual[["major"]] > required[["major"]]) return(T)
  if (actual[["major"]] < required[["major"]]) return(F)
  return(actual[["minor"]] >= required[["minor"]])
}



#' Get objects from the AmCAT API
#'
#' Get a table of objects from the AmCAT API, e.g. projects, sets etc.
#' 
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @param resource the name of the resource, e.g. 'projects'. If it is of length>1, a path a/b/c/ will be created (e.g. c("projects",1,"articlesets"))
#' @param ... Other options to pass to \code{\link{amcat.getpages}}, e.g. page_size, format, and filters
#'
#' @return A dataframe of objects (rows) by properties (columns)
#' @export
amcat.getobjects <- function(conn, resource, ...) {
  if (length(resource) > 1) resource = paste(c(resource, ""), collapse="/")
  path = paste('api', 'v4', resource, sep='/')
  amcat.getpages(conn, path, ...)
}

# Internal call to check GET results and parse as csv or json
.amcat.readoutput <- function(result, format){
  if (result == '401 Unauthorized')
    stop("401 Unauthorized")
  if (format == 'json') {
    result = fromJSON(result)
    
  } else  if (format == 'csv') {
    con <- textConnection(result)
    result = tryCatch(read.csv(con), 
                      error=function(e) data.frame())
  }
  result
}

#' Run an action from the AmCAT API
#'
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @param action the name of the action
#' @param format the format to request, e.g. csv or json
#' @param ... any additional (e.g. action-specific) arguments to pass to the action API
#' @return A dataframe containing the result of the action [WvA: shouldn't that depend on the action??]
#' @export
amcat.runaction <- function(conn, action, format='csv', ...) {
  resource = 'api/action'
  url = paste(conn$host, resource, action, sep="/")
  url = paste(url, '?format=', format, sep="")
  message("Running action at ", url)
  httpheader = c(Authorization=paste("Token", conn$token))
  result = postForm(url, ..., .opts=list(httpheader=httpheader))
  
  if (result == '401 Unauthorized')
    stop("401 Unauthorized")
  if (format == 'json') {
    result = fromJSON(result)
  } else  if (format == 'csv') {
    con <- textConnection(result)
    result = read.csv2(con)
  }
  result
}


#' Get article metadata from AmCAT
#'
#' Uses the \code{\link{amcat.getobjects}} function to retrieve article metadata, and applies some
#' additional postprocessing, e.g. to convert the data to Date objects.
#'
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @param project the project of a set to retrieve metadata from
#' @param articleset the article set id to retrieve - provide either this or articleset
#' @param articles the article ids to retrieve - provide either this or articleset
#' @param columns the names of columns to retrieve, e.g. date, medium, text, headline
#' @param time if true, parse the date as POSIXct datetime instead of Date
#' @param dateparts if true, add date parts (year, month, week)
#' @param medium_names if true, retrieve medium names and turn medium column into a factor
#' @return A dataframe containing the articles and the selected columns
#' @export
amcat.articles <- function(conn, project, articleset=NULL, articles=NULL, uuid=NULL, columns=c('date','title'), time=F, dateparts=F, page_size=10000, ...) {
  if (is.null(articleset) & is.null(articles) & is.null(uuid)) stop("Provide either articleset or articles (ids)/uuids")
  
  if (!is.null(articleset)) {
    path = paste("api", "v4", "projects", project, "articlesets", articleset,  "meta", sep="/")
    result = scroll(conn, path, page_size=page_size, columns=paste(columns, collapse=","), ...)
  } else {
    path = paste("api", "v4", "meta", sep="/")
    if (!is.null(articles)) {
      articles = paste(articles, collapse=",")
      result = scroll(conn, path, id=articles, page_size=page_size, columns=paste(columns, collapse=","), ...)
    } else {
      uuid = paste(uuid, collapse=",")
      result = scroll(conn, path, uuid=uuid, page_size=page_size, columns=paste(columns, collapse=","), ...)
    }
  }
  
  if ("date" %in% colnames(result)) {
    result$date = (if(time == T) as.POSIXct(result$date, format='%Y-%m-%dT%H:%M:%S') 
                   else as.Date(result$date, format='%Y-%m-%d'))
  
    if (dateparts) {
      result$year = as.Date(cut(result$date, "year"))
      result$month = as.Date(cut(result$date, "month"))
      result$week = as.Date(cut(result$date, "week"))
      columns = c(columns, "year", "month", "week")
    }
  }
  columns = c('id', columns)
  if (nrow(result) > 0) {
    for(missing in setdiff(columns, colnames(result))) result[[missing]] <- NA
    result = result[columns]
  }
  result$id = as.numeric(result$id)
  
  tibble::as_tibble(result)
}
#' Get article metadata from AmCAT
#'
#' Uses the \code{\link{amcat.getobjects}} function to retrieve article metadata, and applies some
#' additional postprocessing, e.g. to convert the data to Date objects.
#'
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @param project the project of a set to retrieve metadata from
#' @param articleset the article set id to retrieve
#' @param columns the names of columns to retrieve, e.g. date, medium, text, headline
#' @param time if true, parse the date as POSIXct datetime instead of Date
#' @param dateparts if true, add date parts (year, month, week)
#' @param medium_names if true, retrieve medium names and turn medium column into a factor
#' @return A dataframe containing the articles and the selected columns
#' @export
amcat.getarticlemeta <- amcat.articles


#' Add articles to an article set
#' 
#' Add the given article ids to a new or existing article set
#' 
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @param project the project to add the articles to
#' @param articles a vector of article ids
#' @param articleset the article set id of an existing set
#' @param articleset.name the name for a new article set
#' @param articleset.provenance a provenance text for a new article set
#' @return The articleset id of the new or existing article set
#' @export
amcat.add.articles.to.set <- function(conn, project, articles, articleset=NULL,
                                      articleset.name=NULL, articleset.provenance=NULL) {
  if (is.null(articleset)) {
    if (is.null(articleset.name)) 
      stop("Provide articleset or articleset.name")
    path = paste("api", "v4", "projects",project, "articlesets", "?format=json", sep="/")
    if (is.null(articleset.provenance)) 
      articleset.provenance=paste("Uploaded", length(articles), "articles from R on", format(Sys.time(), "%FT%T"))
    r = amcat.getURL(conn, path, filters=list(name=articleset.name, provenance=articleset.provenance), post=TRUE) 
    
    articleset = fromJSON(r)$id
    message("Created articleset ", articleset, ": ", articleset.name," in project ", project)
  }
  if (!is.null(articles)) {
    message("Adding ",length(articles), " articles to set ", articleset)
    #idlist = lapply(articles, function(x) list(id=x))
    idlist = articles
    url = paste(conn$host, "api", "v4", "projects",project, "articlesets", articleset, "articles", "", sep="/")
    
    
    chunks = split(articles, ceiling(seq_along(articles)/1000))
    for (i in seq_along(chunks)) {
      chunk = chunks[[i]]
      if (length(chunks) > 1) message("  [", i, "/", length(chunks), "] Adding ", length(chunk), " articles")
      resp = POST(url, body=toJSON(chunk), content_type_json(), accept_json(), add_headers(Authorization=paste("Token", conn$token)))
      if (resp$status_code != 201) stop("Unexpected status: ", resp$status_code, "\n", httr::content(resp, type="text/plain"))
    }
  }
  invisible(articleset)
}

#' Add new articles to AmCAT
#' 
#' Upload articles into a given project and article set, or into a new article set if the articleset argument is character
#' All arguments title, medium etc. should be either of the same length as text, or of length 1
#' All factor arguments will be converted to character using as.character
#' For date, please provide either a string in ISO notatoin (i.e. "2010-12-31" or "2010-12-31T23:59:00")
#' or a variable that can be converted to string using format(), e.g. Date, POSIXct or POSIXlt. 
#' The articles will be uploaded in batches of 100. 
#' 
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @param project the project to add the articles to
#' @param articleset the article set id of an existing set, or the name of a new set to create
#' @param text the text of the articles to upload
#' @param title the headlines/title of the articles to upload
#' @param provenance if articleset is character, an optional provenance string to store with the new set
#' @param ... and additional fields to upload, e.g. author, byline etc. 
#' @export

amcat.upload.articles <- function(conn, project, articleset, text, title, date, provenance=NULL, ...) {
  require(jsonlite)
  n = length(text)
  if (is.character(articleset)) {
    if (is.null(provenance)) provenance=paste("Uploaded", n, "articles using R function amcat.upload.articles")
    articleset = amcat.add.articles.to.set(conn, project, articles=NULL, articleset.name=articleset, articleset.provenance=provenance) 
  }
  
  if (is.factor(date)) date=as.character(date)
  if (!is.character(date)) date = format(date, "%Y-%m-%dT%H:%M:%S")
  fields = data.frame(title=title, text=text, date=date, ...)
  # make sure all fields have correct length
  for (f in names(fields)) {
    if (is.factor(fields[[f]])) fields[[f]] = as.character(fields[[f]])
    if (length(fields[[f]]) == 1) fields[[f]] = rep(fields[[f]], n)
    if (length(fields[[f]]) != n) stop(paste("Field", f, "has incorrect length:", length(fields[[f]]), "should be 1 or ", n))
  }
  
  # not very efficient, but probably not the bottleneck
  chunks = split(fields, ceiling((1:n)/100))

  i <- 0 # primitive, but works
  for(chunk in chunks) {
    # this package can also transfrom df to json directly, less code required
    json_data = jsonlite::toJSON(chunk, dataframe = 'row')
    
    # provide better overview for upload progress
    i <- i + 1
    message("Uploading chunk ", i, "/", length(chunks), ". With ", nrow(chunk), " articles to set ", articleset)
    
    url = paste(conn$host, "api", "v4", "projects",project, "articlesets", articleset, "articles", "", sep="/")
    
    resp = POST(url, body=json_data, content_type_json(), accept_json(), add_headers(Authorization=paste("Token", conn$token)))
    if (resp$status_code != 201) stop("Unexpected status: ", resp$status_code, "\n", content(resp, type="text/plain"))
  }
  invisible(articleset)
}

#' Alias for amcat.upload.articles
amcat.upload.articles.jsonlite = amcat.upload.articles

#' Scroll an amcat API page with 'next' link
#'
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @param path the path to scroll
#' @param page_size the amount of pages per request
#' @param ... additional query arguments
#'
#' @return a data frame of all returned rows
#' @export
scroll <- function(conn, path, page_size=100, ...) {
  if (!.has.version(conn$version, "3.4.2")) stop("Scrolling only possible on AmCAT >= 3.4.2")
  result = list()
  httpheader = c(Authorization=paste("Token", conn$token))
  url = httr::parse_url(conn$host)
  url$path = path
  url$query = list(page_size=page_size, format="rda", ...)
  url = httr::build_url(url)
  n = 0
  while(!is.na(url)) {
    message(url)
    r = httr::GET(url, httr::add_headers(httpheader))
    if (httr::http_error(r)) stop("Error on scrolling: ", httr::content(r))
    res = .load.rda(httr::content(r))
    subresult = res$results
    n = n + nrow(subresult)
    result = c(result, list(subresult))
    message("Got ", nrow(subresult), " rows (total: ",n," / ", res$total,")")
    url = res$`next`
    if (nrow(subresult) < page_size) break
  }
  dplyr::bind_rows(result)
}


#' Ask AmCAT to flush the elasticsearch
#'
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @export
amcat.flush <- function(conn) {
  invisible(amcat.getURL(conn, "api/v4/flush/", filters=list(format="json")))
}


#' Download articles from AmCAT into a Quanteda corpus object
#'
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @param project The project ID
#' @param articleset  The Articleset ID
#' @param textcolumns Which columns contain text (default: headline, text)
#' @param metacolumns Which columns to include as docvars (deafult: date, medium)
#' @param headline.as.docvar Should headline be a docvar (as well as a text column)
#' @param ... Other arguments to pass to amcat.getarticlemeta, i.e. dateparts=T
#'
#' @return a quanteda corpus object
#' @export
quanteda.corpus <- function(conn, project, articleset, textcolumns=c("headline", "text"), 
                            metacolumns=c("date", "medium"), headline.as.docvar=T, ...) {
  articles = amcat.getarticlemeta(conn, project, articleset, columns=c(textcolumns, metacolumns), ... )
  
  if (headline.as.docvar) textcolumns = setdiff(textcolumns, "headline")
  metavars = setdiff(colnames(articles), textcolumns)
  x = apply(articles[textcolumns], 1, paste, collapse='\n')
  texts = apply(articles[textcolumns], 1, paste, collapse='\n')
  quanteda::corpus(texts, docnames=articles$id, docvars=articles[metavars])
}



#' Return the column for title/headline depending om AmCAT version
#'
#' @param conn the connection object from \code{\link{amcat.connect}}
#' @export
title.column <- function(conn) {
  if (.has.version(conn$version, "3.5")) "title" else "headline"
}

#' Return the column for publisher/medium depending om AmCAT version
#' @export
publisher.column <- function(conn) {
  if (.has.version(conn$version, "3.5")) "publisher" else "medium"
}