Transkribus | R Documentation |
Connect to Transkribus, inspect collections, documents and pages, and perform handwritten text recognition
JSESSIONID
character string with the JSESSIONID to use once logged in
new()
Log in with your Transkribus user and password
Transkribus$new( url = "https://transkribus.eu/TrpServer/rest/auth/login", user, password )
url
character string with the url to use in the call to the Transkribus API
user
character string with your Transkribus user in order to connect
password
character string with your Transkribus password in order to connect
map_name_to_id()
Map labels to identifiers
Transkribus$map_name_to_id( name, type = c("collection", "document", "htr-model"), collection )
name
character string with the label to map to the identifier
type
type of mapping: one of 'collection', 'document' or 'htr-model'
collection
id of the collection
list_collections()
List all collections you have access to
Transkribus$list_collections( url = "https://transkribus.eu/TrpServer/rest/collections/list" )
url
character string with the url to use in the call to the Transkribus API
list_collection()
List the content (the documents) of a collection
Transkribus$list_collection( url = "https://transkribus.eu/TrpServer/rest/collections/%s/list", collection )
url
character string with the url to use in the call to the Transkribus API
collection
id of the collection
create_collection()
Create a collection
Transkribus$create_collection( url = "https://transkribus.eu/TrpServer/rest/collections/createCollection", label )
url
character string with the url to use in the call to the Transkribus API
label
character string with the name of the collection to create
delete_collection()
Delete a collection
Transkribus$delete_collection( url = "https://transkribus.eu/TrpServer/rest/collections/{collection}", collection )
url
character string with the url to use in the call to the Transkribus API
collection
id of the collection
list_document()
List the content (the pages) of a document
Transkribus$list_document( url = "https://transkribus.eu/TrpServer/rest/collections/%s/%s/fulldoc", collection, document, type = c("pages", "raw") )
url
character string with the url to use in the call to the Transkribus API
collection
id of the collection
document
id of the document
type
character string with the type of extraction, either 'pages' or 'raw'. Defaults to 'pages'
list_dictionaries()
Retrieve the set of available dictionaries (sets of possible output letters)
Transkribus$list_dictionaries( url = "https://transkribus.eu/TrpServer/rest/recognition/dicts" )
url
character string with the url to use in the call to the Transkribus API
list_models()
Retrieve all HTR/OCR models you have access to within a collection
Transkribus$list_models( url = "https://transkribus.eu/TrpServer/rest/recognition/%s/list", collection )
url
character string with the url to use in the call to the Transkribus API
collection
id of the collection
list_job()
List all jobs or get the information of one specific job
Transkribus$list_job( url = "https://transkribus.eu/TrpServer/rest/jobs/list", job )
url
character string with the url to use in the call to the Transkribus API
job
id of the job; if omitted, all jobs are listed
transcribe()
Transcribe a set of pages with a model
Transkribus$transcribe( url = "https://transkribus.eu/TrpServer/rest/recognition/{collection}/{model}/htrCITlab?id={document}&pages={page}&dict={dictionary}", collection, document, page, model, dictionary )
url
character string with the url to use in the call to the Transkribus API
collection
id of the collection
document
id of the document
page
id of the page to transcribe
model
id of the Transkribus model to use
dictionary
character string with the dictionary (set of letters) to use
upload()
Upload a set of images in a collection
Transkribus$upload( url = c("https://transkribus.eu/TrpServer/rest/uploads?collId={collection}", "https://transkribus.eu/TrpServer/rest/uploads/{uploadId}"), collection, data, document, author = "R-API", trace = TRUE )
url
character vector of length 2 with the urls to use in the calls to the Transkribus API (see the defaults in the usage above)
collection
id of the collection
data
a character vector with the full path(s) to the image files on disk
document
the title of the document
author
the author of the document
trace
logical indicating to show progress
layout()
Perform layout analysis on all pages of a document in a collection
Transkribus$layout( url = "https://transkribus.eu/TrpServer/rest/LA?collId={collection}", collection, document, doBlockSeg = FALSE, doLineSeg = TRUE, doPolygonToBaseline = FALSE, doBaselineToPolygon = FALSE )
url
character string with the url to use in the call to the Transkribus API
collection
id of the collection
document
id of the document
doBlockSeg
if TRUE, existing layout will be deleted, if FALSE keep existing text block regions
doLineSeg
if TRUE, detect lines in text blocks, if FALSE keep existing lines
doPolygonToBaseline
if TRUE, inspect line polygons and add baselines, if FALSE keep existing baselines
doBaselineToPolygon
if TRUE, extrapolate new line polygons from baselines, if FALSE do not extrapolate
clone()
The objects of this class are cloneable with this method.
Transkribus$clone(deep = FALSE)
deep
Whether to make a deep clone.
## Example session for the Transkribus R6 client.
## Needs a Transkribus account; the password is read from the
## TRANSKRIBUS_PWD environment variable.
library(madoc.utils)
api <- Transkribus$new(user = "jan.wijffels@vub.ac.be",
                       password = Sys.getenv("TRANSKRIBUS_PWD"))

## Get pages of a collection
collections <- api$list_collections()
collections
id_collection <- sample(collections$colId, size = 1)
documents <- api$list_collection(collection = id_collection)
documents
id_document <- sample(documents$docId, size = 1)
pages <- api$list_document(collection = id_collection, document = id_document)
pages

## Create a collection, upload some images to the collection, delete it again
id <- api$create_collection(label = "example-collection")
img <- c(system.file(package = "madoc.utils", "extdata", "alto-example.jpg"),
         system.file(package = "madoc.utils", "extdata", "example.png"))
api$upload(data = img, collection = id,
           document = paste("Upload", Sys.time()), author = "R-API")
api$list_collection(collection = id)
api$delete_collection(collection = id)

## Look at relevant models and dictionaries
dicts <- api$list_dictionaries()
grep(dicts, pattern = "Dutch", ignore.case = TRUE, value = TRUE)
models <- api$list_models(collection = id_collection)
str(models)
## Several alternative ways of selecting Dutch models; the last one is kept
dutch <- grep(models$language, pattern = "Dutch",
              ignore.case = TRUE, value = TRUE)
dutch <- subset(models, language %in% dutch)
dutch <- c("Dutch Mountains (18th Century)", "IJsberg",
           "Dutch Notarial Model 18th Century")
dutch <- subset(models, name %in% dutch)
dutch <- subset(models, name %in% "Dutch Mountains (18th Century)" &
                  provider == "CITlabPlus")
str(dutch)
id_model <- dutch$htrId

## Inspect jobs
jobs <- api$list_job()
jobs

## Not run:
id <- api$create_collection(label = "test-collection")
img <- c(system.file(package = "madoc.utils", "extdata", "example.png"),
         system.file(package = "madoc.utils", "extdata", "alto-example.jpg"))
api$upload(data = img, collection = id,
           document = "Doc with 2 images", author = "R-API")

##
## This section shows how to transcribe using the API
##  >> note that this consumes Transkribus credits
##
## Inspect one image and transcribe it
##
id_model <- 21683 ## Dutch Mountains HTR+
id_collection <- id
## The documents of the collection have to be listed before their ids
## can be extracted
docs <- api$list_collection(collection = id_collection)
id_document <- docs$docId
pages <- api$list_document(collection = id_collection, document = id_document)
page <- head(pages, n = 1)
id_job <- api$transcribe(collection = id_collection, document = id_document,
                         page = 1, model = id_model,
                         dictionary = "Combined_Dutch_Model_M1.dict")
x <- read_pagexml(page$page_xml)

##
## A random document from a collection
##
library(magick)
pages <- api$list_document(collection = id_collection, document = id_document)
page <- tail(pages, n = 1)
page
img <- image_read(page$thumbUrl)
img <- image_read(page$url)
image_resize(img, "x600")
id_job <- api$transcribe(collection = id_collection, document = id_document,
                         page = page$pageNr, model = id_model,
                         dictionary = "Combined_Dutch_Model_M1.dict")
api$list_job(job = id_job)

## After the job has finished, we have a Page-XML file which we can read in
pages <- api$list_document(collection = id_collection, document = id_document)
page <- tail(pages, n = 1)
img <- image_read(page$url)
x <- read_pagexml(page$page_xml)
bl <- image_draw_baselines(img, x = x$baseline, col = "darkgreen", lwd = 4)
image_resize(bl, "x900")
bl <- image_crop_baselineareas(img, x = setNames(x$baseline, x$id),
                               textregion = x$points,
                               extend = FALSE, overview = FALSE)
bl <- image_rbind(bl, color = "red", geometry = "2x2")
image_resize(bl, "x900")
api$delete_collection(collection = id)
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.