# Chunk defaults for this document: prefix printed output lines with "#>"
# and write generated figure files under the tools/ path prefix.
knitr::opts_chunk$set(
  comment = "#>",
  fig.path = "tools/"
)
This repository contains an R package for:

- extracting data from Madoc
- uploading images and transcribing them using the Transkribus API
'Madoc' is an 'Omeka S' based platform for the display, enrichment, and curation of digital objects in 'IIIF' format. The platform can be used for all kinds of crowdsourcing activities in the domain of digital humanities.
# Install the development version of the package from GitHub
remotes::install_github(repo = "DIGI-VUB/madoc.utils")
library(madoc.utils)
library(magick)

# Two example images that ship with the package
img <- c(
  system.file("extdata", "example.png", package = "madoc.utils"),
  system.file("extdata", "alto-example.jpg", package = "madoc.utils")
)

# Connect to Transkribus; the password is read from the TRANSKRIBUS_PWD
# environment variable so it never appears in the script
api <- Transkribus$new(
  user = "jan.wijffels@vub.ac.be",
  password = Sys.getenv("TRANSKRIBUS_PWD")
)

# Create a collection, then list the available collections and models
msg <- api$create_collection(label = "test-collection")
msg <- api$list_collections()
msg <- api$list_models(collection = "test-collection")

# Upload the images as one document, run layout analysis and
# request a transcription of the first page
msg <- api$upload(
  collection = "test-collection",
  document = "Example document",
  data = img
)
msg <- api$layout(
  collection = "test-collection",
  document = "Example document"
)
msg <- api$transcribe(
  collection = "test-collection",
  document = "Example document",
  page = 1,
  model = "IJsberg",
  dictionary = "Combined_Dutch_Model_M1.dict"
)

# Look up the job returned by the transcription request
api$list_job(job = msg)

# Get all documents in a collection, then the pages of one document.
# Once the job has finished, import the PageXML file of the first page.
msg <- api$list_collection(collection = "test-collection")
msg <- api$list_document(
  collection = "test-collection",
  document = "Example document"
)
pages <- head(msg, n = 1)
x <- read_pagexml(pages$page_xml)
View(x)
image_read(pages$url)

# Delete the collection if it is no longer needed
msg <- api$delete_collection(collection = "test-collection")
library(madoc.utils)

site <- "https://www.madoc.ugent.be/s/brugse-vrije"

# Get all projects on that Madoc site and keep the one of interest
projects <- madoc_projects(site)
projects <- subset(projects, slug == "brugse-vrije-gebruikerstest")
projects

# Get all manifests and canvasses of the project's collection
manifests <- madoc_collection(
  site = site,
  id = projects$collection_id,
  tidy_metadata = TRUE
)
canvasses <- madoc_manifest(site = site, id = manifests$manifest_id)

# Get the annotations on the canvasses and keep the non-empty ones
annotations <- madoc_canvas_model(site = site, id = canvasses$canvas_id)
anno <- subset(annotations, nchar(value) > 0)

# Get the image URL of each canvas on which volunteers performed an
# annotation, and combine it with the canvas information
images <- madoc_canvas_image(site = site, id = sort(unique(anno$canvas_id)))
images <- merge(images, canvasses, by = "canvas_id")
images <- images[, c("manifest_id", "canvas_id", "height", "width", "image_url")]
# Attach the annotations to every image (left join on the canvas) ...
anno <- merge(
  images, anno,
  by = "canvas_id",
  all.x = TRUE, all.y = FALSE
)

# ... and add the manifest information; manifest columns whose names clash
# get the "_manifest" suffix
anno <- merge(
  anno, manifests,
  by = "manifest_id",
  all.x = TRUE, all.y = FALSE,
  suffixes = c("", "_manifest")
)

# Keep only rows that carry a non-empty annotation value
anno <- subset(anno, !is.na(value) & nchar(value) > 0)
str(anno)

# Export the result to an Excel file
library(writexl)
write_xlsx(anno, path = "brugse-vrije.xlsx")
library(magick)

# Image URL of the first annotated canvas
url <- anno$image_url[[1]]
url

# Read the image and resize it using the "x800" geometry
img <- image_resize(image_read(url), "x800")

# Blank canvas with the same dimensions as the resized image,
# holding the annotation text of the first row
txt <- anno$value[[1]]
dims <- image_info(img)
trans <- image_blank(width = dims$width, height = dims$height)
trans <- image_annotate(trans, txt, size = 10, color = "green")

# Show the image and its transcription together
image_append(c(img, trans))
By DIGI: Brussels Platform for Digital Humanities: https://digi.research.vub.be
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.