knitr::opts_chunk$set(comment = "#>", fig.path = "tools/")

madoc.utils

This repository contains an R package for - extracting data from Madoc - uploading images and transcribing images using the Transkribus API

'Madoc' is an 'Omeka S' based platform for the display, enrichment, and curation of digital objects in 'IIIF' format. The platform can be used for all kinds of crowdsourcing activities in the domain of digital humanities.

Installation

Example on Transkribus

library(madoc.utils)
library(magick)
img <- c(system.file(package = "madoc.utils", "extdata", "example.png"),
         system.file(package = "madoc.utils", "extdata", "alto-example.jpg"))
api <- Transkribus$new(user = "jan.wijffels@vub.ac.be", password = Sys.getenv("TRANSKRIBUS_PWD"))

msg <- api$create_collection(label = "test-collection")
msg <- api$list_collections()
msg <- api$list_models(collection  = "test-collection")
msg <- api$upload(collection       = "test-collection", document = "Example document", data = img)
msg <- api$layout(collection       = "test-collection", document = "Example document")
msg <- api$transcribe(collection   = "test-collection", document = "Example document", page = 1, 
                      model = "IJsberg", dictionary = "Combined_Dutch_Model_M1.dict")
api$list_job(job = msg)

## Get all documents in a collection, get pages of a document
## Once your job finished, import the PageXML file  
msg   <- api$list_collection(collection = "test-collection")
msg   <- api$list_document(collection = "test-collection", document = "Example document")

pages <- head(msg, n = 1)
x     <- read_pagexml(pages$page_xml) 
View(x)
image_read(pages$url)

## Delete the collection if no longer needed
msg <- api$delete_collection(collection = "test-collection")

Example on Madoc

library(madoc.utils)
site         <- "https://www.madoc.ugent.be/s/brugse-vrije"

## Get all projects on that madoc site
projects     <- madoc_projects(site)
projects     <- subset(projects, slug == "brugse-vrije-gebruikerstest")
projects

## Get all manifests and canvasses of a collection
manifests <- list()
manifests <- madoc_collection(site = site, id = projects$collection_id, tidy_metadata = TRUE)
canvasses <- madoc_manifest(site = site,   id = manifests$manifest_id)

## Get annotations on a canvas or several canvasses
annotations  <- madoc_canvas_model(site = site, id = canvasses$canvas_id)
anno         <- subset(annotations, nchar(value) > 0)

## Get URL of canvas for which volunteers performed an annotation
images <- madoc_canvas_image(site = site, id = sort(unique(anno$canvas_id)))
images <- merge(images, canvasses, by = "canvas_id")
images <- images[, c("manifest_id", "canvas_id", "height", "width", "image_url")]
anno <- merge(images, anno, by = "canvas_id", all.x = TRUE, all.y = FALSE)
anno <- merge(anno, manifests, by = "manifest_id", all.x = TRUE, all.y = FALSE, suffixes = c("", "_manifest"))
anno <- subset(anno, !is.na(value) & nchar(value) > 0)
str(anno)

library(writexl)
write_xlsx(anno, "brugse-vrije.xlsx")
library(magick)
url  <- anno$image_url[[1]]
url
img  <- image_read(url)
img  <- image_resize(img, "x800")
txt  <- anno$value[[1]]

trans <- image_blank(width = image_info(img)$width, height = image_info(img)$height)
trans <- image_annotate(trans, txt, size = 10, color = "green")
image_append(c(img, trans))

DIGI

By DIGI: Brussels Platform for Digital Humanities: https://digi.research.vub.be



DIGI-VUB/madoc.utils documentation built on Sept. 14, 2022, 3:03 p.m.