###########################################################################
# Joshua C. Fjelstul, Ph.D.
# rgenea R Package
###########################################################################
#################################################
# function to read in a GEDCOM file
#################################################
parse_GEDCOM <- function(file) {
# define pipe locally
`%>%` <- magrittr::`%>%`
#################################################
# read in data
#################################################
# read in data
lines <- readLines(file)
# make a data frame
lines <- dplyr::tibble(text = lines)
#################################################
# lines
#################################################
# clean white space
lines$text <- stringr::str_squish(lines$text)
# line level
lines$level <- stringr::str_extract(lines$text, "^[0-9]+")
# line tag
lines$tag <- stringr::str_extract(lines$text, "[_A-Z]{2,}")
# drop program-specific tags
lines <- dplyr::filter(lines, !stringr::str_detect(tag, "^_"))
# value
lines$value <- lines$text
lines$value <- stringr::str_replace(lines$value, "[0-9]+ *[_A-Z]+ *", "")
lines$value <- stringr::str_squish(lines$value)
# drop text variable
lines <- dplyr::select(lines, -text)
# make group
lines$group <- lines$tag
lines$group[stringr::str_detect(lines$tag, "^(CONT|CONC)$")] <- NA
lines$group[!is.na(lines$group)] <- 1:sum(!is.na(lines$group))
# id
lines$id <- 1:nrow(lines)
# collapse by group
lines <- lines %>%
dplyr::group_by(group) %>%
dplyr::summarize(
id = id[1],
level = level[1],
tag = tag[1],
value = stringr::str_trim(stringr::str_c(value, collapse = " "))
) %>%
dplyr::ungroup() %>%
dplyr::arrange(id)
#################################################
# ID numbers
#################################################
make_ID_table <- function(x, regex, prefix) {
x <- stringr::str_extract_all(x, regex)
x <- unlist(x)
x <- unique(x)
id <- sample(1:99999, size = length(x), replace = FALSE)
id <- stringr::str_pad(id, side = "left", width = 5, pad = "0")
id <- stringr::str_c(prefix, id, sep = ":")
out <- dplyr::tibble(
GEDCOM_ID = x,
key_ID = id
)
return(out)
}
# person IDs
person_table <- make_ID_table(lines$value, "@P[0-9]+@", "P")
document_table <- make_ID_table(lines$value, "@S[-0-9]+@", "S")
family_table <- make_ID_table(lines$value, "@F[-0-9]+@", "F")
#################################################
# lines
#################################################
# initialize variables
lines$level_0_tag <- NA
lines$level_0_ID <- NA
lines$level_1_tag <- NA
lines$level_1_ID <- NA
lines$level_2_tag <- NA
lines$level_2_ID <- NA
lines$level_3_tag <- NA
lines$level_3_ID <- NA
lines$level_4_tag <- NA
lines$level_4_ID <- NA
# level 0
lines$level_0_tag[lines$level == 0] <- lines$tag[lines$level == 0]
lines$level_0_ID[lines$level == 0] <- 1:sum(lines$level == 0)
# level 1
lines$level_1_tag[lines$level == 1] <- lines$tag[lines$level == 1]
lines$level_1_ID[lines$level == 1] <- 1:sum(lines$level == 1)
# lines$level_1_tag[!is.na(lines$level_0_tag)] <- ""
# lines$level_1_ID[!is.na(lines$level_0_tag)] <- ""
# level 2
lines$level_2_tag[lines$level == 2] <- lines$tag[lines$level == 2]
lines$level_2_ID[lines$level == 2] <- 1:sum(lines$level == 2)
# lines$level_2_tag[!is.na(lines$level_1_tag)] <- ""
# lines$level_2_ID[!is.na(lines$level_1_tag)] <- ""
# level 3
lines$level_3_tag[lines$level == 3] <- lines$tag[lines$level == 3]
lines$level_3_ID[lines$level == 3] <- 1:sum(lines$level == 3)
# lines$level_3_tag[!is.na(lines$level_2_tag)] <- ""
# lines$level_3_ID[!is.na(lines$level_2_tag)] <- ""
# level 4
lines$level_4_tag[lines$level == 4] <- lines$tag[lines$level == 4]
lines$level_4_ID[lines$level == 4] <- 1:sum(lines$level == 4)
# lines$level_4_tag[!is.na(lines$level_3_tag)] <- ""
# lines$level_4_ID[!is.na(lines$level_3_tag)] <- ""
# fill level 0
lines <- lines %>%
tidyr::fill(level_0_tag, .direction = "down") %>%
tidyr::fill(level_0_ID, .direction = "down")
# fill level 1
lines <- lines %>%
dplyr::group_by(level_0_ID) %>%
tidyr::fill(level_1_tag, .direction = "down") %>%
tidyr::fill(level_1_ID, .direction = "down") %>%
dplyr::ungroup()
# fill level 2
lines <- lines %>%
dplyr::group_by(level_1_ID) %>%
tidyr::fill(level_2_tag, .direction = "down") %>%
tidyr::fill(level_2_ID, .direction = "down") %>%
dplyr::ungroup()
# fill level 3
lines <- lines %>%
dplyr::group_by(level_2_ID) %>%
tidyr::fill(level_3_tag, .direction = "down") %>%
tidyr::fill(level_3_ID, .direction = "down") %>%
dplyr::ungroup()
# fill level 4
lines <- lines %>%
dplyr::group_by(level_3_ID) %>%
tidyr::fill(level_4_tag, .direction = "down") %>%
tidyr::fill(level_4_ID, .direction = "down") %>%
dplyr::ungroup()
# drop header and footer
lines <- dplyr::filter(lines, level_0_tag %in% c("FAM", "INDI", "SOUR"))
# path
lines$path <- stringr::str_c(
stringr::str_replace_na(lines$level_1_tag, ""),
stringr::str_replace_na(lines$level_2_tag, ""),
stringr::str_replace_na(lines$level_3_tag, ""),
stringr::str_replace_na(lines$level_4_tag, ""),
sep = "_")
lines$path <- stringr::str_replace(lines$path, "[_]+$", "")
# person ID
lines$person_ID <- stringr::str_extract(lines$value, "@P(-)?[0-9]+@")
lines <- lines %>% tidyr::fill(person_ID)
lines$person_ID[lines$level_0_tag != "INDI"] <- NA
# family ID
lines$family_ID <- stringr::str_extract(lines$value, "@F(-)?[0-9]+@")
lines <- lines %>% tidyr::fill(family_ID)
lines$family_ID[lines$level_0_tag != "FAM"] <- NA
# source ID
lines$source_ID <- stringr::str_extract(lines$value, "@S(-)?[0-9]+@")
lines <- lines %>% tidyr::fill(source_ID)
lines$source_ID[lines$level_0_tag != "SOUR"] <- NA
# ID number
lines$ID_number <- NA
lines$ID_number[lines$level_0_tag == "INDI"] <- lines$person_ID[lines$level_0_tag == "INDI"]
lines$ID_number[lines$level_0_tag == "FAM"] <- lines$family_ID[lines$level_0_tag == "FAM"]
lines$ID_number[lines$level_0_tag == "SOUR"] <- lines$source_ID[lines$level_0_tag == "SOUR"]
# drop header and footer
lines <- dplyr::select(lines, -c(person_ID, family_ID, source_ID))
##################################################
# people
##################################################
# filter
people <- dplyr::filter(lines, level_0_tag == "INDI")
# select variables
people <- dplyr::select(people, level, tag, value, level_1_ID, level_2_ID, path, ID_number)
# rename variable
people <- dplyr::rename(people, person_ID = ID_number)
# drop level 0 rows
people <- dplyr::filter(people, level != 0)
# select tags for person-level data
people <- dplyr::filter(people, stringr::str_detect(path, "(NAME|SEX|RELI|OCCU|FAMC|FAMS|ALIA)(SOUR)?"))
# attribute
people$attribute <- "missing"
people$attribute[stringr::str_detect(people$path, "NAME")] <- "name"
people$attribute[stringr::str_detect(people$path, "SEX")] <- "sex"
people$attribute[stringr::str_detect(people$path, "RELI")] <- "religion"
people$attribute[stringr::str_detect(people$path, "OCCU")] <- "occupation"
people$attribute[stringr::str_detect(people$path, "FAMC")] <- "family (child)"
people$attribute[stringr::str_detect(people$path, "FAMS")] <- "family (partner)"
people$attribute[stringr::str_detect(people$path, "ALIA")] <- "alias"
# drop source data
people <- dplyr::filter(people, !stringr::str_detect(people$path, "SOUR_DATA$"))
# variable
people$variable <- "value"
people$variable[stringr::str_detect(people$path, "SOUR$")] <- "source"
people$variable[stringr::str_detect(people$path, "SOUR_DATA_TEXT$")] <- "source_text"
people$variable[stringr::str_detect(people$path, "SOUR_PAGE$")] <- "source_page"
people$variable[stringr::str_detect(people$path, "SOUR_NOTE$")] <- "source_note"
# fill up level 2 ID to attach sources to data
people <- people %>%
dplyr::group_by(person_ID, level_1_ID) %>%
tidyr::fill(level_2_ID, .direction = "up") %>%
dplyr::ungroup()
# observation ID
people$observation_ID <- stringr::str_c(people$person_ID, people$attribute, people$variable)
# convert to wide format
people <- tidyr::pivot_wider(data = people, id_cols = c(person_ID, level_1_ID, level_2_ID, attribute), names_from = variable, values_from = value)
# fill in values
people <- tidyr::fill(people, value, .direction = "down")
# select variables
people <- dplyr::select(people, person_ID, attribute, value, source, source_page, source_text, source_note)
##################################################
# events
##################################################
# filter
events <- dplyr::filter(
lines, path %in% c(
"BIRT", "BIRT_DATE", "BIRT_PLAC", "BIRT_SOUR", "BIRT_SOUR_PAGE", "BIRT_SOUR_NOTE",
"BAPM", "BAPM_DATE", "BAPM_PLAC", "BAPM_SOUR", "BAPM_SOUR_PAGE", "BAPM_SOUR_NOTE",
"MARR", "MARR_DATE", "MARR_PLAC", "MARR_SOUR", "MARR_SOUR_PAGE", "MARR_SOUR_NOTE",
"DIV", "DIV_DATE", "DIV_PLAC", "DIV_SOUR", "DIV_SOUR_PAGE", "DIV_SOUR_NOTE",
"EMIG", "EMIG_DATE", "EMIG_PLAC", "EMIG_SOUR", "EMIG_SOUR_PAGE", "EMIG_SOUR_NOTE",
"NATU", "NATU_DATE", "NATU_PLAC", "NATU_SOUR", "NATU_SOUR_PAGE", "NATU_SOUR_NOTE",
"GRAD", "GRAD_DATE", "GRAD_PLAC", "GRAD_SOUR", "GRAD_SOUR_PAGE", "GRAD_SOUR_NOTE",
"RESI", "RESI_DATE", "RESI_PLAC", "RESI_SOUR", "RESI_SOUR_PAGE", "RESI_SOUR_NOTE",
"PROB", "PROB_DATE", "PROB_PLAC", "PROB_SOUR", "PROB_SOUR_PAGE", "PROB_SOUR_NOTE",
"DEAT", "DEAT_DATE", "DEAT_PLAC", "DEAT_SOUR", "DEAT_SOUR_PAGE", "DEAT_SOUR_NOTE",
"BURI", "BURI_DATE", "BURI_PLAC", "BURI_SOUR", "BURI_SOUR_PAGE", "BURI_SOUR_NOTE",
"EVEN", "EVEN_DATE", "EVEN_PLAC", "EVEN_SOUR", "EVEN_SOUR_PAGE", "EVEN_SOUR_NOTE"
)
)
# rename variables
events <- dplyr::rename(events, person_ID = ID_number)
# type
events$type <- "text"
events$type[stringr::str_detect(events$path, "DATE$")] <- "date"
events$type[stringr::str_detect(events$path, "PLAC$")] <- "place"
events$type[stringr::str_detect(events$path, "SOUR$")] <- "source"
events$type[stringr::str_detect(events$path, "PAGE$")] <- "page"
events$type[stringr::str_detect(events$path, "NOTE$")] <- "note"
# drop text
# events <- dplyr::filter(events, type != "text")
events$level_2_ID[events$level_2_tag == "DATE"] <- NA
events$level_2_ID[events$level_2_tag == "PLAC"] <- NA
# fill up level 2 ID to attach sources to data
events <- events %>%
dplyr::group_by(person_ID, level_1_ID) %>%
tidyr::fill(level_2_ID, .direction = "up") %>%
dplyr::ungroup()
# convert to wide format
events <- tidyr::pivot_wider(data = events, id_cols = c(person_ID, level_1_tag, level_1_ID, level_2_ID), names_from = type, values_from = value)
# type
events$type <- "missing"
events$type[events$level_1_tag == "BAPM"] <- "baptism"
events$type[events$level_1_tag == "BIRT"] <- "birth"
events$type[events$level_1_tag == "BURI"] <- "burial"
events$type[events$level_1_tag == "DEAT"] <- "death"
events$type[events$level_1_tag == "DIV"] <- "divorce"
events$type[events$level_1_tag == "EMIG"] <- "emigration"
events$type[events$level_1_tag == "EVEN"] <- "event"
events$type[events$level_1_tag == "PROB"] <- "probate"
events$type[events$level_1_tag == "GRAD"] <- "graduation"
events$type[events$level_1_tag == "MARR"] <- "marriage"
events$type[events$level_1_tag == "NATU"] <- "naturialization"
events$type[events$level_1_tag == "RESI"] <- "residence"
# fill in variables
events <- events %>%
dplyr::group_by(person_ID, type) %>%
tidyr::fill(text, .direction = "down") %>%
tidyr::fill(date, .direction = "down") %>%
tidyr::fill(place, .direction = "down") %>%
dplyr::ungroup()
# select variables
events <- dplyr::select(events, person_ID, type, date, place, source, note, page)
##################################################
# marriages
##################################################
# marriages
marriages <- dplyr::filter(lines, level_1_tag %in% c("HUSB", "WIFE"))
# count
marriages <- marriages %>%
dplyr::group_by(ID_number) %>%
dplyr::mutate(count = dplyr::n()) %>%
dplyr::filter(count == 2)
# collapse by family
marriages <- marriages %>%
dplyr::group_by(ID_number) %>%
dplyr::summarize(
wife = value[tag == "WIFE"],
husband = value[tag == "HUSB"]
)
# rename variable
marriages <- dplyr::rename(marriages, family_ID = ID_number)
# marriage ID
id <- sample(1:99999, size = nrow(marriages), replace = FALSE)
id <- stringr::str_pad(id, side = "left", width = 5, pad = "0")
id <- stringr::str_c("M", id, sep = ":")
marriages$relationship_ID <- id
##################################################
# children
##################################################
# children
children <- dplyr::filter(lines, level_1_tag == "CHIL")
parents <- dplyr::filter(lines, level_1_tag %in% c("HUSB", "WIFE"))
# select variables
children <- dplyr::select(children, ID_number, value)
parents <- dplyr::select(parents, ID_number, value)
# rename variables
children <- dplyr::rename(children, child = value)
parents <- dplyr::rename(parents, parent = value)
# merge
children <- dplyr::left_join(children, parents, by = "ID_number")
# rename variable
children <- dplyr::rename(children, family_ID = ID_number)
# child ID
id <- sample(1:99999, size = nrow(children), replace = FALSE)
id <- stringr::str_pad(id, side = "left", width = 5, pad = "0")
id <- stringr::str_c("C", id, sep = ":")
children$relationship_ID <- id
##################################################
# relationships
##################################################
# make two versions
children <- dplyr::select(children, family_ID, child, parent, relationship_ID)
# edge type
children$type <- "child-parent"
# rename variables
names(children) <- c("family_ID", "person_A", "person_B", "relationship_ID", "type")
# make two versions
marriages <- dplyr::select(marriages, family_ID, wife, husband, relationship_ID)
# edge type
marriages$type <- "marriage"
# rename variables
names(marriages) <- c("family_ID", "person_A", "person_B", "relationship_ID", "type")
# edge
relationships <- rbind(marriages, children)
# organize variables
relationships <- dplyr::select(relationships, family_ID, relationship_ID, person_A, person_B, type)
# arrange
relationships <- dplyr::arrange(relationships, family_ID, relationship_ID, type)
##################################################
# documents
##################################################
# filter
documents <- dplyr::filter(lines, level_0_tag == "SOUR")
# drop level 0 rows
documents <- dplyr::filter(documents, level != 0)
# recode notes
documents$path[documents$path == "NOTE_CONC"] <- "NOTE"
documents$path[documents$path == "NOTE_CONT"] <- "NOTE"
documents$path[documents$path == "PUBL_CONC"] <- "PUBL"
# function to aggregate values
aggregate <- function(x) {
stringr::str_c(unique(x), collapse = " ")
}
# convert to wide format
documents <- tidyr::pivot_wider(data = documents, id_cols = c(ID_number), names_from = path, values_from = value, values_fn = list(value = aggregate))
# rename variables
names(documents) <- c("document_ID", "note", "title", "publication", "author", "repository_ID")
# organize variables
documents <- dplyr::select(documents, document_ID, title, publication, author, note)
# reformat missing
documents$note <- stringr::str_squish(documents$note)
documents$note[documents$note == ""] <- NA
##################################################
# clean ID numbers
##################################################
# person ID
people$person_ID <- plyr::mapvalues(people$person_ID, from = person_table$GEDCOM_ID, to = person_table$key_ID, warn_missing = FALSE)
people$value <- plyr::mapvalues(people$value, from = family_table$GEDCOM_ID, to = family_table$key_ID, warn_missing = FALSE)
events$person_ID <- plyr::mapvalues(events$person_ID, from = person_table$GEDCOM_ID, to = person_table$key_ID, warn_missing = FALSE)
events$person_ID <- plyr::mapvalues(events$person_ID, from = family_table$GEDCOM_ID, to = family_table$key_ID, warn_missing = FALSE)
relationships$person_A <- plyr::mapvalues(relationships$person_A, from = person_table$GEDCOM_ID, to = person_table$key_ID, warn_missing = FALSE)
relationships$person_B <- plyr::mapvalues(relationships$person_B, from = person_table$GEDCOM_ID, to = person_table$key_ID, warn_missing = FALSE)
# document ID
documents$document_ID <- plyr::mapvalues(documents$document_ID, from = document_table$GEDCOM_ID, to = document_table$key_ID, warn_missing = FALSE)
people$source <- plyr::mapvalues(people$source, from = document_table$GEDCOM_ID, to = document_table$key_ID, warn_missing = FALSE)
events$source <- plyr::mapvalues(events$source, from = document_table$GEDCOM_ID, to = document_table$key_ID, warn_missing = FALSE)
# family ID
relationships$family_ID <- plyr::mapvalues(relationships$family_ID, from = family_table$GEDCOM_ID, to = family_table$key_ID, warn_missing = FALSE)
# arrange
relationships <- dplyr::arrange(relationships, family_ID, relationship_ID, person_A, person_B)
documents <- dplyr::arrange(documents, document_ID)
events <- dplyr::arrange(events, person_ID, type)
people <- dplyr::arrange(people, person_ID, attribute)
##################################################
# list to return
##################################################
# make a list of tibbles
out <- list(
attributes = people,
events = events,
relationships = relationships,
sources = documents
)
# return list
return(out)
}
###########################################################################
# end R script
###########################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.