Nothing
#' Import a single 'ELAN' '*.eaf' file
#'
#' Advice: In most situations it is more convenient to use \code{act::corpus_new}, \code{act::corpus_import} for importing annotation files.
#'
#' Imports the contents of a 'ELAN' .eaf file and returns a transcript object.
#' The input to this function is either the path to an .eaf file or the contents of a .eaf file obtained from the \code{@file.content} of an existing transcript object by \code{readLines()}.
#' If you pass 'fileContent' you need to pass 'transcriptName' as parameter, too.
#'
#' Please note:
#' - 'ELAN' offers a variety of tier types, some including dependencies from other tiers. Therefore not all annotations do actually have a time value. Missing values will be detected in the superordinate tier or will be interpolated. You will not be able to recognize interpolated values in the annotations.
#' - Please also note that dependencies between tiers in you .eaf file are not reflected in the transcript object within the act package.
#'
#' @param filePath Character string; input path of a single 'ELAN' .eaf file.
#' @param fileContent Vector of character strings; contents of an 'ELAN' .eaf file read by \code{readLines()}.
#' @param transcriptName Character string; name of the transcript.
#'
#' @return Transcript object.
#'
#' @seealso \code{corpus_import}, \code{corpus_new}, \code{import}, \code{import_exb}, \code{import_rpraat}, \code{import_textgrid}
#'
#' @export
#'
#' @example inst/examples/import_eaf.R
#'
import_eaf <- function(filePath=NULL,
fileContent=NULL,
transcriptName=NULL) {
# filePath<- rstudioapi::selectFile()
# fileContent<-NULL
# transcriptName<-NULL
if (is.null(filePath) & is.null(fileContent)) {
stop("You need to pass as parameter eiter a file path to a TextGrid file (filePath) or the contents of a TextGrid file (fileContent) as parameter.")
}
if (!is.null(filePath) & !is.null(fileContent)) {
stop("Please pass only filePath or fileContent as parameter, not both.")
}
if (!is.null(fileContent) & is.null(transcriptName)) {
stop("If you pass 'fileContent' you need to pass 'transcriptName' as parameter, too.")
}
#--- get transcript name
t <- methods::new("transcript")
t@file.path <- if(is.null(filePath)) {""} else {filePath}
#--- get transcript name
if (!is.null(transcriptName)) {
t@name <- transcriptName
} else {
if(!is.null(filePath)) {
t@name <- tools::file_path_sans_ext(basename(filePath))
} else {
t@name <- "imported transcript"
}
}
t@file.type <- "eaf"
t@file.encoding <- "UTF-8"
t@import.result <- "ok"
t@load.message <- ""
t@modification.systime <- character()
#t@annotations <- .emptyAnnotations
if (!is.null(filePath)) {
#--- check if file exists
if (!file.exists(filePath)) {
t@import.result <- "error"
t@load.message <- "File does not exist."
return(t)
}
#=== READ lines
myCon <- file(filePath, encoding = "UTF-8")
myeaf <- readLines(myCon, warn=FALSE)
close(myCon)
t@file.encoding <- "UTF-8"
if (!length(options()$act.import.storeFileContentInTranscript)==0) {
if (options()$act.import.storeFileContentInTranscript==TRUE) {
t@file.content <- myeaf
}
}
}
if (!is.null(fileContent)) {
myeaf <- fileContent
}
if(is.null(myeaf)) {
t@import.result <- "error"
t@load.message <- "File not read."
return(t)
}
#=== merge lines into a long text
myeaf.merge <- stringr::str_c(myeaf, collapse = "\n")
if (!stringr::str_detect(myeaf.merge, pattern="http://www.mpi.nl/tools/elan/EAFv3.0.xsd" )) {
t@import.result <- "error"
t@load.message <- "File not recognized as eaf file version EAFv3.0.xsd"
return(t)
}
#remove a strange unicode character
myeaf.merge <- stringr::str_replace_all(myeaf.merge, pattern='\\x1B', replacement='')
#== extract media links
regex_media <- '(?s)(?<=\\bMEDIA_URL=").+?(?=")'
medialinks <- stringr::str_match_all(myeaf.merge, regex_media)
medialinks <- unique(unlist(medialinks))
medialinks <- stringr::str_replace_all(string=medialinks, pattern="file:///", replacement="")
#=== extract linguistic_types and constraints
regex_constraints <- '(?s)(<LINGUISTIC_TYPE)(.*?CONSTRAINTS="(.*?)")(.*?LINGUISTIC_TYPE_ID="(.*?)")(.*?/>)'
constraints <- stringr::str_match_all(myeaf.merge, regex_constraints)
constraints <- do.call(rbind, lapply(constraints, data.frame, stringsAsFactors=FALSE))
constraints <- data.frame(cbind(type.id=constraints[,6],
constraint=constraints[,4]),
stringsAsFactors=FALSE)
if (nrow(constraints)==0) {
constraints <- data.frame(type.id=as.character("no-constraint"), constraint=as.character("No_Constraint"), stringsAsFactors=FALSE)
} else {
constraints <- rbind(constraints, c(type.id="no-constraint",constraint="No_Constraint"), stringsAsFactors=FALSE)
}
#View(constraints)
#== extract time slots
regex_timeorder <- '(?s)<TIME_ORDER.+?/TIME_ORDER>'
timeorder <- unlist(unlist(stringr::str_match(myeaf.merge, regex_timeorder)))
regex_timeslots <- '(?s)(<TIME_SLOT.*?TIME_SLOT_ID="(.*?)")(.*?)(TIME_VALUE="(.*?)")?(/>)'
timeslots <- stringr::str_match_all(timeorder, regex_timeslots)
timeslots <- do.call(rbind, lapply(timeslots, data.frame, stringsAsFactors=FALSE))
#View(timeslots)
timeslots <- data.frame(cbind(timeSlotID=timeslots[,3], value=timeslots[,6]), stringsAsFactors=FALSE)
timeslots$value <- as.double(timeslots$value)/1000
#View(timeslots)
#=== extract annotations
#Split the entire eaf
#get position of beginning of "<tier"
pos.tier <- data.frame(stringr::str_locate_all(myeaf.merge, pattern="<TIER")[[1]],stringsAsFactors=FALSE )
annotations.block <- c()
if (nrow(pos.tier)>0) {
pos.lingtype <- stringr::str_locate(myeaf.merge, pattern="<LINGUISTIC_TYPE")[1,1]
if (nrow(pos.tier)==1) {
annotations.block <- unlist(stringr::str_sub(myeaf.merge, pos.tier$start, pos.lingtype-1))
} else {
repl <- c(pos.tier$start[2:nrow(pos.tier)], pos.lingtype)
pos.tier$end <- repl-1
annotations.block <- unlist(stringr::str_sub(myeaf.merge, pos.tier$start, pos.tier$end))
}
#print(stringr::str_sub(annotations.block,1,200))
#length(annotations.block)
#cat(annotations.block)
}
#=== extract tier info
tier.ling_type_ref <- '(<TIER)(.*LINGUISTIC_TYPE_REF="(.*?)")?'
tier.ling_type_ref <- stringr::str_match_all(annotations.block, tier.ling_type_ref)
tier.ling_type_ref <- do.call(rbind, lapply(tier.ling_type_ref, data.frame, stringsAsFactors=FALSE))
tier.ling_type_ref <- tier.ling_type_ref[,4]
#View(tier.ling_type_ref)
regex_name <- '(?s)(<TIER)(.*TIER_ID="(.*?)")?'
tier.name <- stringr::str_match_all(annotations.block, regex_name)
tier.name <- do.call(rbind, lapply(tier.name, data.frame, stringsAsFactors=FALSE))
tier.name <- tier.name[,4]
#View(tier.name)
regex_tierparent_ref <- '(?s)(<TIER)(.*PARENT_REF="(.*?)")?'
tier.parent_ref <- stringr::str_match_all(annotations.block, regex_tierparent_ref)
tier.parent_ref <- do.call(rbind, lapply(tier.parent_ref, data.frame, stringsAsFactors=FALSE))
tier.parent_ref <- tier.parent_ref[,4]
#View(tier.parent_ref)
if (is.null (tier.name)) {
#if there are no tiers in the file
tiers <- data.frame(tier.name=as.character(),
tier.type.ref=as.character(),
tier.parent.ref=as.character(),
tier.hierarchy=as.character(),
stringsAsFactors = FALSE)
} else {
tiers <- data.frame(cbind(tier.name=tier.name,
tier.type.ref=tier.ling_type_ref,
tier.parent.ref=tier.parent_ref,
tier.hierarchy=NA),
stringsAsFactors = FALSE)
#View(tiers)
#construct the hierarchy of the tiers
tiers$tier.hierarchy[is.na(tiers$tier.parent.ref)]<- 1
hierarchy <- 0
for (i in 1:nrow(tiers)) {
#exit loop if all tiers have already a place in the hierarcy
if (!any(is.na(tiers$tier.hierarchy))) {
break
}
hierarchy <- hierarchy+1
tiernames.currentlevel <- tiers$tier.name[which(tiers$tier.hierarchy==hierarchy)]
if (length(tiernames.currentlevel)!=0) {
for (j in 1:length(tiernames.currentlevel)) {
tiernames.currentlevel
tiers$tier.hierarchy[which(tiers$tier.parent.ref==tiernames.currentlevel[j])] <- hierarchy+1
}
}
}
#View(tiers)
#=get the actual constraint of the tier
#remember the order
tiers$order <- 1:nrow(tiers)
#merge with constraints
tiers <- merge(x = tiers, y = constraints, by.x = "tier.type.ref", by.y="type.id", all.x = TRUE)
#restore initial order
tiers <- tiers[order(tiers$order),]
tiers$constraint[is.na(tiers$constraint)]<- "No_Constraint"
#View(constraints)
#View(tiers.m)
#---create unique tierNames
# while (length(tiers$tier.name[duplicated(tiers$tier.name)])>0) {
# #get multiple tierNames
# multiple <- tiers$tier.name[duplicated(tiers$tier.name)]
# for (myName in multiple) {
# tiers$tier.name[tiers$tier.name==myName]<- paste(tiers$tier.name[tiers$tier.name==myName],1:length(tiers$tier.name[tiers$tier.name==myName]),sep="-")
# }
# t@import.result <- "ok"
# t@load.message <- "Some tiers have been renamed since their names were not unique."
#}
#---create unique tierNames
if (length(tiers$tier.name[duplicated(tiers$tier.name)])>0) {
tiers$tier.name <- make.unique(tiers$tier.name)
t@import.result <- "ok"
t@load.message <- "Some tiers have been renamed since their names were not unique."
}
annotations <- data.frame()
if (nrow(tiers)>0) {
#--- iterate through tiers and extract info
# Those are the difficult referring types:
regex_ann_referring <- '(?s)(<REF_ANNOTATION)(.ANNOTATION_ID="(.*?)")(.ANNOTATION_REF="(.*?)")(.PREVIOUS_ANNOTATION="(.*?)")?(.*?<ANNOTATION_VALUE>(.*?)</ANNOTATION_VALUE>)'
constraints_ann_referring <- c("Symbolic_Association", "Symbolic_Subdivision")
#These are the other types (there might be more), but this is not checked:
regex_ann_alignable <- '(?s)(<ALIGNABLE_ANNOTATION)(.*?ANNOTATION_ID="(.*?)")(.*?TIME_SLOT_REF1="(.*?)")(.*?TIME_SLOT_REF2="(.*?)">)(.*?<ANNOTATION_VALUE>(.*?)</ANNOTATION_VALUE>)'
constraints_ann_alignable <- c("Time_Subdivision", "Included_In", "No_Constraint")
#i <- 1
#tiers[i,]
#annotations.block[i]
#i <- 3
#cat(annotations.block[i])
#View(tiers)
for (i in 1:nrow(tiers)) {
#get the values of the annotations, using different regular expressions
if (tiers$constraint[i] %in% constraints_ann_referring) {
myRegEx <- regex_ann_referring
annotations.tier <- stringr::str_match_all(annotations.block[i], myRegEx)
annotations.tier <- do.call(rbind, lapply(annotations.tier, data.frame, stringsAsFactors=FALSE))
#View(annotations.tier)
if (nrow(annotations.tier)>0) {
annotations.tier <- data.frame(cbind( tier=tiers$tier.name[i],
type=tiers$tier.type[i],
hierarchy=tiers$tier.hierarchy[i],
id=annotations.tier[, 4],
content=annotations.tier[, 10],
ts1=NA,
ts2=NA,
annotation.ref=annotations.tier[, 6],
annotation.previous=annotations.tier[, 8]),
stringsAsFactors=FALSE)
}
} else {
myRegEx <- regex_ann_alignable
annotations.tier <- stringr::str_match_all(annotations.block[i], myRegEx)
annotations.tier <- do.call(rbind, lapply(annotations.tier, data.frame, stringsAsFactors=FALSE))
#View(annotations.tier)
if (nrow(annotations.tier)>0) {
annotations.tier <- data.frame(cbind( tier=tiers$tier.name[i],
type=tiers$tier.type[i],
hierarchy=tiers$tier.hierarchy[i],
id=annotations.tier[, 4],
content=annotations.tier[, 10],
ts1=annotations.tier[, 6],
ts2=annotations.tier[, 8],
annotation.ref=NA,
annotation.previous=NA),
stringsAsFactors=FALSE)
}
}
if (nrow(annotations.tier)>0) {
if (nrow(annotations)>0) {
annotations <- rbind(annotations, annotations.tier)
} else {
annotations <- annotations.tier
}
}
}
#View(annotations)
#View(annotations.tier)
#there are no annotations
if (nrow(annotations)==0) {
} else {
#===replace ts IDs by real numbers
# remember the order of the annotations
annotations$order <- 1:nrow(annotations)
#ts1
annotations <- merge(x = annotations, y = timeslots, by.x = "ts1", by.y="timeSlotID", all.x = TRUE)
mycolnames <- colnames(annotations)
mycolnames[length(mycolnames)] <- "startSec"
colnames(annotations) <- mycolnames
#ts2
annotations <- merge(x = annotations, y = timeslots, by.x = "ts2", by.y="timeSlotID", all.x = TRUE)
mycolnames <- colnames(annotations)
mycolnames[length(mycolnames)] <- "endSec"
colnames(annotations) <- mycolnames
#restore order!
annotations <- annotations[order(annotations$order),]
#View(annotations)
annotations.safe <- annotations
#=== resolve referring annotations
allrefids <- unique(annotations$annotation.ref)
allrefids <- allrefids[!is.na(allrefids)]
tiers.copy <- tiers[order(tiers$tier.hierarchy), ]
all_tiers <- tiers.copy$tier.name
#all_tiers <- unique(annotations$tier[which(annotations$annotation.ref!="")])
#=mark which are the first and the last annotations in a group
annotations <- cbind(annotations, position=NA)
if (nrow(annotations)==1) {
annotations$position[1] <- 1 # "first"
} else {
annotations$position[!is.na(annotations$annotation.ref) & is.na(annotations$annotation.previous)] <- 1# "first"
}
firstannotations <- which(annotations$position=="1") #"first"
for (i in firstannotations) {
counter <- 1
lastannotation <- NA
#print ("..")
#print(i)
#take their id and search for the next
search <- annotations$id[i]
#do a long loop
for (j in 1:10) {
#print(j)
#get the annotation that is referred to
nextannotation <- which(annotations$annotation.previous==search)
if (length(nextannotation)==0) {
if (!is.na(lastannotation)) {
# annotations$position[lastannotation]<-"last"
}
break
} else {
annotations$annotation.previous[nextannotation]<-"found"
counter <- counter+1
annotations$position[nextannotation] <- counter
search <- annotations$id[nextannotation]
#remember the last annotation
lastannotation <- nextannotation
}
}
}
#View(annotations)
annotations.safe <- annotations
#iterate through all tiers with referring annotations tiers
i <- 2
#all_tiers[i]
j <- 1
#allrefids[j]
#order / sort by tiers, annotarion referred to and Position()
annotations <- annotations[order(annotations$tier, annotations$annotation.ref, annotations$position),]
#View(annotations)
for (i in 1:length(all_tiers)) {
for (j in 1:length(allrefids)) {
#cat ("i",i, " j",j, "\n")
#cat(all_tiers[i],"\n" )
#cat(allrefids[j],"\n" )
ids <- which(annotations$tier==all_tiers[i] & annotations$annotation.ref==allrefids[j])
if (length(ids)==1) {
#if there is only one referring annotation
#--> simply take the time values of the annotation referred to
ann.referred.to <- annotations[which(annotations$id==allrefids[j])[1], ]
annotations[ids, ]$startSec <- ann.referred.to$startSec
annotations[ids, ]$endSec <- ann.referred.to$endSec
annotations[ids, ]$ts1 <- "fromsuperordinated"
}
if (length(ids)>1) {
#-- first annotation in group
#--> split up the time values
ann.referred.to <- annotations[which(annotations$id==allrefids[j])[1], ]
minSec <- ann.referred.to$startSec
maxSec <- ann.referred.to$endSec
newTimes <- seq(from=minSec, to=maxSec, length.out=length(ids)+1)
newTimes.start <- newTimes[1:length(newTimes)-1]
newTimes.end <- newTimes[2:length(newTimes)]
annotations[ids, ]$startSec <- newTimes.start
annotations[ids, ]$endSec <- newTimes.end
annotations[ids, ]$ts1 <- "interpolated"
}
}
}
#restore order()
annotations <- annotations[order(annotations$order),]
#View(annotations)
#View(tiers)
#=== resolve empty time slots of annotations
#run through all annotations
i <- 4
for (m in 1:nrow(tiers)) {
if (nrow(annotations)>1) {
for (i in 1:(nrow(annotations)-1)) {
#print(i)
#check if the end value is empty
if (is.na(annotations$endSec[i])) {
#if we are lucky, the following annotation has a start value and we are done
if (!is.na(annotations$startSec[i+1])) {
annotations$endSec[i] <- annotations$startSec[i+1]
}
#otherwise search for the next end value
counter <- 0
for (j in i:nrow(annotations)) {
if (!is.na(annotations$endSec[j])) {
break
}
}
#the span that we need to calculate is this from: i startsec to j endsec
#how many annotations are spanned: j-i+1
# we this need even one more timevalue for the sequence: j-i+2
if (!is.na(annotations$startSec[i]) & !is.na(annotations$endSec[j])) {
newTimes <- seq(from=annotations$startSec[i], to=annotations$endSec[j], length.out=j-i+2)
newTimes.start <- newTimes[1:length(newTimes)-1]
newTimes.end <- newTimes[2:length(newTimes)]
#assign new time values to annotations
l <- 0
for (k in i:j) {
l <- l+1
annotations$startSec[k] <- newTimes.start[l]
annotations$endSec[k] <- newTimes.end[l]
annotations[k, ]$ts1 <- "interpolated"
annotations[k, ]$ts2 <- "interpolated"
}
}
}
}
}
if (!any(is.na(c(annotations$startSec, annotations$endSec)))) {
break
}
}
}
}
#View(annotations)
if (nrow(annotations)==0) {
#empty myAnnotations is already set at the beginning
myAnnotations <- data.frame()
} else {
annotationID <- c(1:nrow(annotations))
myAnnotations <- data.frame(
annotationID = as.integer(annotationID),
tier.name = annotations$tier,
startSec = as.double(annotations$startSec),
endSec = as.double(annotations$endSec),
content = as.character(annotations$content),
content.norm = as.character(""),
char.orig.bytime.start = rep(as.integer(NA),nrow(annotations)),
char.orig.bytime.end = rep(as.integer(NA),nrow(annotations)),
char.norm.bytime.start = rep(as.integer(NA),nrow(annotations)),
char.norm.bytime.end = rep(as.integer(NA),nrow(annotations)),
char.orig.bytier.start = rep(as.integer(NA),nrow(annotations)),
char.orig.bytier.end = rep(as.integer(NA),nrow(annotations)),
char.norm.bytier.start = rep(as.integer(NA),nrow(annotations)),
char.norm.bytier.end = rep(as.integer(NA),nrow(annotations)),
row.names = annotationID,
stringsAsFactors = FALSE)
rownames(myAnnotations) <- myAnnotations$annotationID
#===set correct column format
t@annotations$annotationID <- as.integer(t@annotations$annotationID)
t@annotations$startSec <- as.double(t@annotations$startSec)
t@annotations$endSec <- as.double(t@annotations$endSec)
t@annotations$content <- as.character(t@annotations$content)
}
#===set transcript length
t@length.sec <- 0
#=== if it is not a completely empty transcript
if (!nrow(myAnnotations)==0) {
#=== get rid of empty intervals
if (options()$act.import.readEmptyIntervals==FALSE) {
myAnnotations <- myAnnotations[myAnnotations$content!="",]
}
myAnnotations <- myAnnotations[is.na(myAnnotations["content"])==FALSE,]
if (nrow(myAnnotations)>0) {
#=== sort transcript by start times
myAnnotations <- myAnnotations[order(myAnnotations$startSec, myAnnotations$tier.name), ]
#=== set endSec of points to startSec
myAnnotations$endSec[is.na(myAnnotations$endSec)] <- myAnnotations$startSec[is.na(myAnnotations$endSec)]
#=== set annotations.id again
myAnnotations$annotationID <- c(1:nrow(myAnnotations))
#=== set the new row names
rownames(myAnnotations) <- myAnnotations$annotationID
#View(myAnnotations)
}
#make transcript 1 sec longer than last annotation
t@length.sec <- max( t@length.sec, as.double(annotations$startSec)+1, as.double(annotations$endSec)+1)
#=== html conversion
myAnnotations$content <- textutils::HTMLdecode(myAnnotations$content)
myAnnotations$tier.name <- textutils::HTMLdecode(myAnnotations$tier.name)
}
#=== html conversion
tiers$tier.name <- textutils::HTMLdecode(tiers$tier.name)
}
#=== assign other values to object
#annotations
t@annotations <- myAnnotations
#media
t@media.path <- medialinks
#tiers: all are interval tiers
t@tiers <- act::helper_tiers_new_table(tierNames=tiers$tier.name)
t@history <- list(
list(modification = "import_eaf",
systime = Sys.time()
)
)
return(t)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.