R/transkribus.R

#' @title Transkribus API
#' @description Connect to Transkribus, inspect collections, documents, pages, perform handwritten text recognition
#' @field JSESSIONID character string with the JSESSIONID to use once logged in
#' @param user character string with your Transkribus user in order to connect
#' @param password character string with your Transkribus password in order to connect
#' @param url character string with the url to use in the call to the Transkribus API
#' @param collection id of the collection
#' @param document id of the document
#' @export
#' @examples 
#' library(madoc.utils)
#' api <- Transkribus$new(user     = "jan.wijffels@vub.ac.be", 
#'                        password = Sys.getenv("TRANSKRIBUS_PWD"))
#' 
#' ## Get pages of a collection
#' collections   <- api$list_collections()
#' collections
#' id_collection <- sample(collections$colId, size = 1)
#' documents     <- api$list_collection(collection = id_collection)
#' documents
#' id_document   <- sample(documents$docId, size = 1)
#' pages         <- api$list_document(collection = id_collection, document = id_document)
#' pages
#' 
#' ## Create a collection, upload some images to the collection, delete it again
#' id  <- api$create_collection(label = "example-collection")
#' img <- c(system.file(package = "madoc.utils", "extdata", "alto-example.jpg"),
#'          system.file(package = "madoc.utils", "extdata", "example.png"))
#' api$upload(data = img, collection = id, document = paste("Upload", Sys.time()), author = "R-API")
#' api$list_collection(collection = id)           
#' api$delete_collection(collection = id)
#' 
#' ## Look at relevant models and dictionaries
#' dicts    <- api$list_dictionaries()
#' grep(dicts, pattern = "Dutch", ignore.case = TRUE, value = TRUE)
#' models   <- api$list_models(collection = id_collection)
#' str(models)
#' dutch    <- grep(models$language, pattern = "Dutch", ignore.case = TRUE, value = TRUE)
#' dutch    <- subset(models, language %in% dutch)
#' dutch    <- c("Dutch Mountains (18th Century)", "IJsberg", "Dutch Notarial Model 18th Century")
#' dutch    <- subset(models, name %in% dutch)
#' dutch    <- subset(models, name %in% "Dutch Mountains (18th Century)" & provider == "CITlabPlus")
#' str(dutch)
#' id_model <- dutch$htrId
#' 
#' ## Inspect jobs
#' jobs   <- api$list_job()
#' jobs
#' 
#' 
#' \dontrun{
#' id  <- api$create_collection(label = "test-collection")
#' img <- c(system.file(package = "madoc.utils", "extdata", "example.png"),
#'          system.file(package = "madoc.utils", "extdata", "alto-example.jpg"))
#' api$upload(data = img, collection = id, document = "Doc with 2 images", author = "R-API")
#' 
#' ##
#' ## This section shows how to transcribe using the API
#' ##    >> note that this consumes Transkribus credits
#' 
#' ##
#' ## Inspect one image and transcribe it
#' ##
#' ##  - id_model      <- 21683  ## Dutch Mountains HTR+
#' id_collection <- id
#' id_document   <- docs$docId
#' docs          <- api$list_collection(collection = id_collection)
#' pages         <- api$list_document(collection = id_collection, document = id_document)
#' page          <- head(pages, n = 1)
#' id_job        <- api$transcribe(collection = id_collection, document = id_document, 
#'                                 page = 1,
#'                                 model = 21683, 
#'                                 dictionary = "Combined_Dutch_Model_M1.dict")
#' x             <- read_pagexml(page$page_xml) 
#' 
#' ##
#' ## A random document from a collection
#' ##
#' library(magick)
#' pages  <- api$list_document(collection = id_collection, document = id_document)
#' page   <- tail(pages, n = 1)
#' page
#' img    <- image_read(page$thumbUrl)
#' img    <- image_read(page$url)
#' image_resize(img, "x600")
#' id_job <- api$transcribe(collection = id_collection, document = id_document, page = page$pageNr, 
#'                          model = id_model, dictionary = "Combined_Dutch_Model_M1.dict")
#' api$list_job(job = id_job)   
#'  
#' ## After the job has finished, we have a Page-XML file which we can read in
#' pages  <- api$list_document(collection = id_collection, document = id_document)  
#' page   <- tail(pages, n = 1)
#' img    <- image_read(page$url)
#' x      <- read_pagexml(page$page_xml)  
#' bl     <- image_draw_baselines(img, x = x$baseline, col = "darkgreen", lwd = 4)
#' image_resize(bl, "x900")
#' bl    <- image_crop_baselineareas(img, 
#'                                   x = setNames(x$baseline, x$id), 
#'                                   textregion = x$points, 
#'                                   extend = FALSE, overview = FALSE)
#' bl    <- image_rbind(bl, color = "red", geometry = "2x2")   
#' image_resize(bl, "x900")
#' 
#' api$delete_collection(collection = id)
#' }
Transkribus <- R6Class("Transkribus",
                  public = list(
                    JSESSIONID = NULL,
                    #' @description Log in with your Transkribus user and password
                    initialize = function(url = "https://transkribus.eu/TrpServer/rest/auth/login", user, password) {
                      res  <- httr::POST(url = url, body = list(user = user, pw = password), encode = "form")
                      msg  <- httr::content(res, as = "text")
                      info <- jsonlite::fromJSON(msg)
                      self$JSESSIONID <- info$sessionId
                      invisible(info)
                    },
                    #' @description Map labels to identifiers
                    #' @param name character string with the label to map to the identifier
                    #' @param type type of mapping: either one of 'collection' of 'document'
                    map_name_to_id = function(name, type = c("collection", "document", "htr-model"), collection){
                      if(!is.character(name)){
                        return(name)
                      }
                      type <- match.arg(type)
                      if(type == "collection"){
                        mappings      <- self$list_collections() 
                        id_collection <- head(mappings$colId[which(mappings$colName %in% name)], n = 1)
                        id_collection
                      }else if(type == "document"){
                        mappings      <- self$list_collection(collection = collection) 
                        id_document   <- head(mappings$docId[which(mappings$title %in% name)], n = 1)
                        id_document
                      }else if(type == "htr-model"){
                        mappings      <- self$list_models(collection = collection) 
                        id_model      <- head(mappings$htrId[which(mappings$name %in% name)], n = 1)
                        id_model
                      }
                    },
                    #' @description List all collections you have access to
                    list_collections = function(url = "https://transkribus.eu/TrpServer/rest/collections/list"){
                      res  <- httr::GET(url = url, httr::add_headers(JSESSIONID = self$JSESSIONID))
                      msg  <- httr::content(res, as = "text")
                      info <- jsonlite::fromJSON(msg)
                      info
                    },
                    #' @description List the content (the documents) of a collection
                    list_collection = function(url = "https://transkribus.eu/TrpServer/rest/collections/%s/list", collection){
                      collection <- self$map_name_to_id(collection, type = "collection")
                      res  <- httr::GET(url = sprintf(url, collection), httr::add_headers(JSESSIONID = self$JSESSIONID))
                      msg  <- httr::content(res, as = "text")
                      info <- jsonlite::fromJSON(msg)
                      if("uploadTimestamp" %in% names(info)){
                        info$uploadTimestamp <- as.POSIXct(info$uploadTimestamp / 1000, origin = "1970-01-01")
                      }
                      info
                    },
                    #' @description Create a collection
                    #' @param label character string with the name of the collection to create
                    create_collection = function(url = "https://transkribus.eu/TrpServer/rest/collections/createCollection", label){
                      res  <- httr::POST(url = url, query  = list(collName = label), 
                                         httr::add_headers(JSESSIONID = self$JSESSIONID), 
                                         encode = "form")
                      msg  <- httr::content(res, as = "text")
                      info <- jsonlite::fromJSON(msg)
                      invisible(info)
                    },
                    #' @description Delete a collection
                    delete_collection = function(url = "https://transkribus.eu/TrpServer/rest/collections/{collection}", collection){
                      collection <- self$map_name_to_id(collection, type = "collection")
                      qry  <- glue(url)
                      res  <- httr::DELETE(url = qry, httr::add_headers(JSESSIONID = self$JSESSIONID))
                      msg  <- httr::content(res, as = "text")
                      invisible(msg)
                    },
                    #' @description List the content (the pages) of a document
                    #' @param type character string with the type of extraction, either 'pages' or 'raw'. Defaults to 'pages'
                    list_document = function(url = "https://transkribus.eu/TrpServer/rest/collections/%s/%s/fulldoc", collection, document, type = c("pages", "raw")){
                      collection <- self$map_name_to_id(collection, type = "collection")
                      document   <- self$map_name_to_id(document, type = "document", collection = collection)
                      type <- match.arg(type)
                      res  <- httr::GET(url = sprintf(url, collection, document), httr::add_headers(JSESSIONID = self$JSESSIONID))
                      msg  <- httr::content(res, as = "text")
                      info <- jsonlite::fromJSON(msg)
                      if(type == "pages"){
                        info <- info$pageList$pages
                        
                        try({
                          page_xml <- lapply(info$tsList$transcripts, FUN = function(x){
                            x$timestamp <- as.POSIXct(x$timestamp / 1000, origin = "1970-01-01")
                            x <- x[order(x$timestamp, decreasing = TRUE), ]
                            x <- head(x, n = 1)
                            x
                          })
                          info$page_xml <- udpipe::txt_collapse(lapply(page_xml, FUN = function(x) x$url))
                        })
                      }
                      info
                    },
                    #' @description Retrieve the set of dictionaries containing possible letters as output
                    list_dictionaries = function(url = "https://transkribus.eu/TrpServer/rest/recognition/dicts"){
                      res  <- httr::GET(url = "https://transkribus.eu/TrpServer/rest/recognition/dicts", httr::add_headers(JSESSIONID = self$JSESSIONID))
                      msg  <- httr::content(res, as = "text")
                      info <- strsplit(msg, "\n")
                      info <- unlist(info)
                      info
                    },
                    #' @description Retrieve all HTR/OCR models you have access to within a collection
                    list_models = function(url = "https://transkribus.eu/TrpServer/rest/recognition/%s/list", collection){
                      collection <- self$map_name_to_id(collection, type = "collection")
                      res  <- httr::GET(url = sprintf(url, collection), httr::add_headers(JSESSIONID = self$JSESSIONID), httr::timeout(5*60))
                      msg  <- httr::content(res, as = "text")
                      info <- jsonlite::fromJSON(msg)
                      info
                    },
                    #' @description List all jobs or get the information of one specific job
                    #' @param job id of the job
                    list_job = function(url = "https://transkribus.eu/TrpServer/rest/jobs/list", job){
                      if(!missing(job)){
                        url <- gsub(pattern = "list$", replacement = job, url)
                      }
                      res  <- httr::GET(url = url, httr::add_headers(JSESSIONID = self$JSESSIONID), httr::timeout(5*60))
                      msg  <- httr::content(res, as = "text")
                      info <- jsonlite::fromJSON(msg)
                      if("createTime" %in% names(info)){
                        info$createTime <- as.POSIXct(info$createTime / 1000, origin = "1970-01-01")
                        if(is.data.frame(info)){
                          info <- info[order(info$createTime, decreasing = TRUE), ]  
                        }
                      }
                      if("startTime" %in% names(info)){
                        info$startTime <- as.POSIXct(info$startTime / 1000, origin = "1970-01-01")
                      }
                      if("endTime" %in% names(info)){
                        info$endTime <- as.POSIXct(info$endTime / 1000, origin = "1970-01-01")
                      }
                      info
                    },
                    #' @description Transcribe a set of pages with a model
                    #' @param page id of the page to transcribe
                    #' @param model id of the Transkribus model to use
                    #' @param dictionary character string with the dictionary (set of letters) to use
                    transcribe = function(url = "https://transkribus.eu/TrpServer/rest/recognition/{collection}/{model}/htrCITlab?id={document}&pages={page}&dict={dictionary}",
                                          collection,
                                          document,
                                          page,
                                          model,
                                          dictionary){
                      collection <- self$map_name_to_id(collection, type = "collection")
                      document   <- self$map_name_to_id(document, type = "document", collection = collection)
                      model      <- self$map_name_to_id(model, type = "htr-model", collection = collection)
                      
                      if(missing(page)){
                        url <- gsub(url, pattern = "&pages=\\{page\\}", replacement = "")
                      }
                      #collection_id <- 123580
                      #htr_id <- 37851
                      #htr_id <- 15708
                      #doc_id <- 817369
                      #dictionary_filename <- "Combined_Dutch_Model_M1.dict"
                      #page_string <- "30554336"
                      #page_string <- 1
                      #qry <- glue("https://transkribus.eu/TrpServer/rest/recognition/{collection_id}/{htr_id}/htrCITlab?id={doc_id}&pages={page_string}&dict={dictionary_filename}")
                      qry  <- glue(url)
                      res  <- httr::POST(url = qry, httr::add_headers(JSESSIONID = self$JSESSIONID))
                      msg  <- httr::content(res, as = "text")
                      invisible(msg)
                    },
                    #' @description Upload a set of images in a collection
                    #' @param data a character vector with the full path(s) to the image files on disk
                    #' @param document the title of the document
                    #' @param author the author of the document
                    #' @param trace logical indicating to show progress
                    upload = function(url = c("https://transkribus.eu/TrpServer/rest/uploads?collId={collection}",
                                              "https://transkribus.eu/TrpServer/rest/uploads/{uploadId}"),
                                      collection, 
                                      data, 
                                      document, 
                                      author = "R-API", 
                                      trace = TRUE){
                      collection <- self$map_name_to_id(collection, type = "collection")

                      # From the Transkribus documentation
                      #The Content-Type of each request has to be multipart/form-data and it must include the complete data for one page, i.e. if a pageXmlName was set in the given structure object, then the image as well as the XML have to be delivered. It depends on the used library whether the Content-Type has to be set explicitly. Please refer to the respective documentation on multipart requests.
                      #The body part names to be used are img and xml respectively and both should be sent as application/octet-stream.
                      qry   <- glue(url[1])
                      stopifnot(all(file.exists(data)))
                      pages       <- list(md       = list(title = document, author = author),
                                          pageList = list(pages = data.frame(fileName = basename(data), 
                                                                             pageNr   = seq_along(data), 
                                                                             stringsAsFactors = FALSE)))
                      pages_json  <- jsonlite::toJSON(pages, pretty = T, auto_unbox = T)
                      res         <- httr::POST(url = qry, 
                                                body = pages_json,
                                                httr::content_type_json(),
                                                httr::add_headers(JSESSIONID = self$JSESSIONID), 
                                                encode = "raw")
                      msg          <- httr::content(res, as = "text")
                      req          <- jsonlite::fromJSON(msg)
                      uploadId     <- req$uploadId
                      qry_upload   <- glue(url[2])
                      for(i in seq_along(data)){
                        pa      <- data[i]
                        if(trace){
                          cat(sprintf("%s uploading to %s file %s", Sys.time(), qry_upload, pa), sep = "\n")
                        }
                        p       <- httr::upload_file(path = pa)
                        req_put <- httr::PUT(url = qry_upload, 
                                             body = list(img = p, page = i),
                                             httr::add_headers(JSESSIONID = self$JSESSIONID),
                                             encode = "multipart")
                        msg  <- httr::content(req_put, as = "text")
                      }
                      info <- jsonlite::fromJSON(msg)
                      invisible(info)
                    },
                    #' @description Perform layout analysis on all pages of a document in a collection
                    #' @param doBlockSeg if TRUE, existing layout will be deleted, if FALSE keep existing text block regions
                    #' @param doLineSeg if TRUE, detect lines in text blocks, if FALSE keep existing lines
                    #' @param doPolygonToBaseline if TRUE, inspect line polygons and add baselines, if FALSE keep existing baselines
                    #' @param doBaselineToPolygon if TRUE, extrapolate new line polygons from baselines, if FALSE do not extrapolate
                    layout = function(url = "https://transkribus.eu/TrpServer/rest/LA?collId={collection}", collection, document,
                                      doBlockSeg = FALSE, doLineSeg = TRUE, doPolygonToBaseline = FALSE, doBaselineToPolygon = FALSE){
                      collection <- self$map_name_to_id(collection, type = "collection")
                      document   <- self$map_name_to_id(document, type = "document", collection = collection)
                      
                      qry    <- glue(url)
                      params <- list(
                                     doBlockSeg = doBlockSeg,
                                     doLineSeg = doLineSeg,
                                     doPolygonToBaseline = doPolygonToBaseline,
                                     doBaselineToPolygon = doBaselineToPolygon)
                      pages       <- list(docList = list(docs = list(docId = document, pageList = list(pages = character()))))
                      pages_json  <- jsonlite::toJSON(pages, pretty = T, auto_unbox = T)
                      
                      res         <- httr::POST(url = qry,
                                                body = pages_json,
                                                query = params,
                                                httr::content_type_json(),
                                                httr::add_headers(JSESSIONID = self$JSESSIONID),
                                                encode = "raw")
                      msg  <- httr::content(res, as = "text")
                      info <- jsonlite::fromJSON(msg)
                      info
                    }
                  )
)
DIGI-VUB/madoc.utils documentation built on Sept. 14, 2022, 3:03 p.m.