R/GEDCOM_parser.R

Defines functions parse_GEDCOM

###########################################################################
# Joshua C. Fjelstul, Ph.D.
# rgenea R Package
###########################################################################

#################################################
# function to read in a GEDCOM file
#################################################

parse_GEDCOM <- function(file) {

  # define pipe locally
  `%>%` <- magrittr::`%>%`

  #################################################
  # read in data
  #################################################

  # read in data
  lines <- readLines(file)

  # make a data frame
  lines <- dplyr::tibble(text = lines)

  #################################################
  # lines
  #################################################

  # clean white space
  lines$text <- stringr::str_squish(lines$text)

  # line level
  lines$level <- stringr::str_extract(lines$text, "^[0-9]+")

  # line tag
  lines$tag <- stringr::str_extract(lines$text, "[_A-Z]{2,}")

  # drop program-specific tags
  lines <- dplyr::filter(lines, !stringr::str_detect(tag, "^_"))

  # value
  lines$value <- lines$text
  lines$value <- stringr::str_replace(lines$value, "[0-9]+ *[_A-Z]+ *", "")
  lines$value <- stringr::str_squish(lines$value)

  # drop text variable
  lines <- dplyr::select(lines, -text)

  # make group
  lines$group <- lines$tag
  lines$group[stringr::str_detect(lines$tag, "^(CONT|CONC)$")] <- NA
  lines$group[!is.na(lines$group)] <- 1:sum(!is.na(lines$group))

  # id
  lines$id <- 1:nrow(lines)

  # collapse by group
  lines <- lines %>%
    dplyr::group_by(group) %>%
    dplyr::summarize(
      id = id[1],
      level = level[1],
      tag = tag[1],
      value = stringr::str_trim(stringr::str_c(value, collapse = " "))
    ) %>%
    dplyr::ungroup() %>%
    dplyr::arrange(id)

  #################################################
  # ID numbers
  #################################################

  make_ID_table <- function(x, regex, prefix) {
    x <- stringr::str_extract_all(x, regex)
    x <- unlist(x)
    x <- unique(x)
    id <- sample(1:99999, size = length(x), replace = FALSE)
    id <- stringr::str_pad(id, side = "left", width = 5, pad = "0")
    id <- stringr::str_c(prefix, id, sep = ":")
    out <- dplyr::tibble(
      GEDCOM_ID = x,
      key_ID = id
    )
    return(out)
  }

  # person IDs
  person_table <- make_ID_table(lines$value, "@P[0-9]+@", "P")
  document_table <- make_ID_table(lines$value, "@S[-0-9]+@", "S")
  family_table <- make_ID_table(lines$value, "@F[-0-9]+@", "F")

  #################################################
  # lines
  #################################################

  # initialize variables
  lines$level_0_tag <- NA
  lines$level_0_ID <- NA
  lines$level_1_tag <- NA
  lines$level_1_ID <- NA
  lines$level_2_tag <- NA
  lines$level_2_ID <- NA
  lines$level_3_tag <- NA
  lines$level_3_ID <- NA
  lines$level_4_tag <- NA
  lines$level_4_ID <- NA

  # level 0
  lines$level_0_tag[lines$level == 0] <- lines$tag[lines$level == 0]
  lines$level_0_ID[lines$level == 0] <- 1:sum(lines$level == 0)

  # level 1
  lines$level_1_tag[lines$level == 1] <- lines$tag[lines$level == 1]
  lines$level_1_ID[lines$level == 1] <- 1:sum(lines$level == 1)
  # lines$level_1_tag[!is.na(lines$level_0_tag)] <- ""
  # lines$level_1_ID[!is.na(lines$level_0_tag)] <- ""

  # level 2
  lines$level_2_tag[lines$level == 2] <- lines$tag[lines$level == 2]
  lines$level_2_ID[lines$level == 2] <- 1:sum(lines$level == 2)
  # lines$level_2_tag[!is.na(lines$level_1_tag)] <- ""
  # lines$level_2_ID[!is.na(lines$level_1_tag)] <- ""

  # level 3
  lines$level_3_tag[lines$level == 3] <- lines$tag[lines$level == 3]
  lines$level_3_ID[lines$level == 3] <- 1:sum(lines$level == 3)
  # lines$level_3_tag[!is.na(lines$level_2_tag)] <- ""
  # lines$level_3_ID[!is.na(lines$level_2_tag)] <- ""

  # level 4
  lines$level_4_tag[lines$level == 4] <- lines$tag[lines$level == 4]
  lines$level_4_ID[lines$level == 4] <- 1:sum(lines$level == 4)
  # lines$level_4_tag[!is.na(lines$level_3_tag)] <- ""
  # lines$level_4_ID[!is.na(lines$level_3_tag)] <- ""

  # fill level 0
  lines <- lines %>%
    tidyr::fill(level_0_tag, .direction = "down") %>%
    tidyr::fill(level_0_ID, .direction = "down")

  # fill level 1
  lines <- lines %>%
    dplyr::group_by(level_0_ID) %>%
    tidyr::fill(level_1_tag, .direction = "down") %>%
    tidyr::fill(level_1_ID, .direction = "down") %>%
    dplyr::ungroup()

  # fill level 2
  lines <- lines %>%
    dplyr::group_by(level_1_ID) %>%
    tidyr::fill(level_2_tag, .direction = "down") %>%
    tidyr::fill(level_2_ID, .direction = "down") %>%
    dplyr::ungroup()

  # fill level 3
  lines <- lines %>%
    dplyr::group_by(level_2_ID) %>%
    tidyr::fill(level_3_tag, .direction = "down") %>%
    tidyr::fill(level_3_ID, .direction = "down") %>%
    dplyr::ungroup()

  # fill level 4
  lines <- lines %>%
    dplyr::group_by(level_3_ID) %>%
    tidyr::fill(level_4_tag, .direction = "down") %>%
    tidyr::fill(level_4_ID, .direction = "down") %>%
    dplyr::ungroup()

  # drop header and footer
  lines <- dplyr::filter(lines, level_0_tag %in% c("FAM", "INDI", "SOUR"))

  # path
  lines$path <- stringr::str_c(
    stringr::str_replace_na(lines$level_1_tag, ""),
    stringr::str_replace_na(lines$level_2_tag, ""),
    stringr::str_replace_na(lines$level_3_tag, ""),
    stringr::str_replace_na(lines$level_4_tag, ""),
    sep = "_")
  lines$path <- stringr::str_replace(lines$path, "[_]+$", "")

  # person ID
  lines$person_ID <- stringr::str_extract(lines$value, "@P(-)?[0-9]+@")
  lines <- lines %>% tidyr::fill(person_ID)
  lines$person_ID[lines$level_0_tag != "INDI"] <- NA

  # family ID
  lines$family_ID <- stringr::str_extract(lines$value, "@F(-)?[0-9]+@")
  lines <- lines %>% tidyr::fill(family_ID)
  lines$family_ID[lines$level_0_tag != "FAM"] <- NA

  # source ID
  lines$source_ID <- stringr::str_extract(lines$value, "@S(-)?[0-9]+@")
  lines <- lines %>% tidyr::fill(source_ID)
  lines$source_ID[lines$level_0_tag != "SOUR"] <- NA

  # ID number
  lines$ID_number <- NA
  lines$ID_number[lines$level_0_tag == "INDI"] <- lines$person_ID[lines$level_0_tag == "INDI"]
  lines$ID_number[lines$level_0_tag == "FAM"] <- lines$family_ID[lines$level_0_tag == "FAM"]
  lines$ID_number[lines$level_0_tag == "SOUR"] <- lines$source_ID[lines$level_0_tag == "SOUR"]

  # drop header and footer
  lines <- dplyr::select(lines, -c(person_ID, family_ID, source_ID))

  ##################################################
  # people
  ##################################################

  # filter
  people <- dplyr::filter(lines, level_0_tag == "INDI")

  # select variables
  people <- dplyr::select(people, level, tag, value, level_1_ID, level_2_ID, path, ID_number)

  # rename variable
  people <- dplyr::rename(people, person_ID = ID_number)

  # drop level 0 rows
  people <- dplyr::filter(people, level != 0)

  # select tags for person-level data
  people <- dplyr::filter(people, stringr::str_detect(path, "(NAME|SEX|RELI|OCCU|FAMC|FAMS|ALIA)(SOUR)?"))

  # attribute
  people$attribute <- "missing"
  people$attribute[stringr::str_detect(people$path, "NAME")] <- "name"
  people$attribute[stringr::str_detect(people$path, "SEX")] <- "sex"
  people$attribute[stringr::str_detect(people$path, "RELI")] <- "religion"
  people$attribute[stringr::str_detect(people$path, "OCCU")] <- "occupation"
  people$attribute[stringr::str_detect(people$path, "FAMC")] <- "family (child)"
  people$attribute[stringr::str_detect(people$path, "FAMS")] <- "family (partner)"
  people$attribute[stringr::str_detect(people$path, "ALIA")] <- "alias"

  # drop source data
  people <- dplyr::filter(people, !stringr::str_detect(people$path, "SOUR_DATA$"))

  # variable
  people$variable <- "value"
  people$variable[stringr::str_detect(people$path, "SOUR$")] <- "source"
  people$variable[stringr::str_detect(people$path, "SOUR_DATA_TEXT$")] <- "source_text"
  people$variable[stringr::str_detect(people$path, "SOUR_PAGE$")] <- "source_page"
  people$variable[stringr::str_detect(people$path, "SOUR_NOTE$")] <- "source_note"

  # fill up level 2 ID to attach sources to data
  people <- people %>%
    dplyr::group_by(person_ID, level_1_ID) %>%
    tidyr::fill(level_2_ID, .direction = "up") %>%
    dplyr::ungroup()

  # observation ID
  people$observation_ID <- stringr::str_c(people$person_ID, people$attribute, people$variable)

  # convert to wide format
  people <- tidyr::pivot_wider(data = people, id_cols = c(person_ID, level_1_ID, level_2_ID, attribute), names_from = variable, values_from = value)

  # fill in values
  people <- tidyr::fill(people, value, .direction = "down")

  # select variables
  people <- dplyr::select(people, person_ID, attribute, value, source, source_page, source_text, source_note)

  ##################################################
  # events
  ##################################################

  # filter
  events <- dplyr::filter(
    lines, path %in% c(
      "BIRT", "BIRT_DATE", "BIRT_PLAC", "BIRT_SOUR", "BIRT_SOUR_PAGE", "BIRT_SOUR_NOTE",
      "BAPM", "BAPM_DATE", "BAPM_PLAC", "BAPM_SOUR", "BAPM_SOUR_PAGE", "BAPM_SOUR_NOTE",
      "MARR", "MARR_DATE", "MARR_PLAC", "MARR_SOUR", "MARR_SOUR_PAGE", "MARR_SOUR_NOTE",
      "DIV", "DIV_DATE", "DIV_PLAC", "DIV_SOUR", "DIV_SOUR_PAGE", "DIV_SOUR_NOTE",
      "EMIG", "EMIG_DATE", "EMIG_PLAC", "EMIG_SOUR", "EMIG_SOUR_PAGE", "EMIG_SOUR_NOTE",
      "NATU", "NATU_DATE", "NATU_PLAC", "NATU_SOUR", "NATU_SOUR_PAGE", "NATU_SOUR_NOTE",
      "GRAD", "GRAD_DATE", "GRAD_PLAC", "GRAD_SOUR", "GRAD_SOUR_PAGE", "GRAD_SOUR_NOTE",
      "RESI", "RESI_DATE", "RESI_PLAC", "RESI_SOUR", "RESI_SOUR_PAGE", "RESI_SOUR_NOTE",
      "PROB", "PROB_DATE", "PROB_PLAC", "PROB_SOUR", "PROB_SOUR_PAGE", "PROB_SOUR_NOTE",
      "DEAT", "DEAT_DATE", "DEAT_PLAC", "DEAT_SOUR", "DEAT_SOUR_PAGE", "DEAT_SOUR_NOTE",
      "BURI", "BURI_DATE", "BURI_PLAC", "BURI_SOUR", "BURI_SOUR_PAGE", "BURI_SOUR_NOTE",
      "EVEN", "EVEN_DATE", "EVEN_PLAC", "EVEN_SOUR", "EVEN_SOUR_PAGE", "EVEN_SOUR_NOTE"
    )
  )

  # rename variables
  events <- dplyr::rename(events, person_ID = ID_number)

  # type
  events$type <- "text"
  events$type[stringr::str_detect(events$path, "DATE$")] <- "date"
  events$type[stringr::str_detect(events$path, "PLAC$")] <- "place"
  events$type[stringr::str_detect(events$path, "SOUR$")] <- "source"
  events$type[stringr::str_detect(events$path, "PAGE$")] <- "page"
  events$type[stringr::str_detect(events$path, "NOTE$")] <- "note"

  # drop text
  # events <- dplyr::filter(events, type != "text")

  events$level_2_ID[events$level_2_tag == "DATE"] <- NA
  events$level_2_ID[events$level_2_tag == "PLAC"] <- NA

  # fill up level 2 ID to attach sources to data
  events <- events %>%
    dplyr::group_by(person_ID, level_1_ID) %>%
    tidyr::fill(level_2_ID, .direction = "up") %>%
    dplyr::ungroup()

  # convert to wide format
  events <- tidyr::pivot_wider(data = events, id_cols = c(person_ID, level_1_tag, level_1_ID, level_2_ID), names_from = type, values_from = value)

  # type
  events$type <- "missing"
  events$type[events$level_1_tag == "BAPM"] <- "baptism"
  events$type[events$level_1_tag == "BIRT"] <- "birth"
  events$type[events$level_1_tag == "BURI"] <- "burial"
  events$type[events$level_1_tag == "DEAT"] <- "death"
  events$type[events$level_1_tag == "DIV"] <- "divorce"
  events$type[events$level_1_tag == "EMIG"] <- "emigration"
  events$type[events$level_1_tag == "EVEN"] <- "event"
  events$type[events$level_1_tag == "PROB"] <- "probate"
  events$type[events$level_1_tag == "GRAD"] <- "graduation"
  events$type[events$level_1_tag == "MARR"] <- "marriage"
  events$type[events$level_1_tag == "NATU"] <- "naturialization"
  events$type[events$level_1_tag == "RESI"] <- "residence"

  # fill in variables
  events <- events %>%
    dplyr::group_by(person_ID, type) %>%
    tidyr::fill(text, .direction = "down") %>%
    tidyr::fill(date, .direction = "down") %>%
    tidyr::fill(place, .direction = "down") %>%
    dplyr::ungroup()

  # select variables
  events <- dplyr::select(events, person_ID, type, date, place, source, note, page)

  ##################################################
  # marriages
  ##################################################

  # marriages
  marriages <- dplyr::filter(lines, level_1_tag %in% c("HUSB", "WIFE"))

  # count
  marriages <- marriages %>%
    dplyr::group_by(ID_number) %>%
    dplyr::mutate(count = dplyr::n()) %>%
    dplyr::filter(count == 2)

  # collapse by family
  marriages <- marriages %>%
    dplyr::group_by(ID_number) %>%
    dplyr::summarize(
      wife = value[tag == "WIFE"],
      husband = value[tag == "HUSB"]
    )

  # rename variable
  marriages <- dplyr::rename(marriages, family_ID = ID_number)

  # marriage ID
  id <- sample(1:99999, size = nrow(marriages), replace = FALSE)
  id <- stringr::str_pad(id, side = "left", width = 5, pad = "0")
  id <- stringr::str_c("M", id, sep = ":")
  marriages$relationship_ID <- id

  ##################################################
  # children
  ##################################################

  # children
  children <- dplyr::filter(lines, level_1_tag == "CHIL")
  parents <- dplyr::filter(lines, level_1_tag %in% c("HUSB", "WIFE"))

  # select variables
  children <- dplyr::select(children, ID_number, value)
  parents <- dplyr::select(parents, ID_number, value)

  # rename variables
  children <- dplyr::rename(children, child = value)
  parents <- dplyr::rename(parents, parent = value)

  # merge
  children <- dplyr::left_join(children, parents, by = "ID_number")

  # rename variable
  children <- dplyr::rename(children, family_ID = ID_number)

  # child ID
  id <- sample(1:99999, size = nrow(children), replace = FALSE)
  id <- stringr::str_pad(id, side = "left", width = 5, pad = "0")
  id <- stringr::str_c("C", id, sep = ":")
  children$relationship_ID <- id

  ##################################################
  # relationships
  ##################################################

  # make two versions
  children <- dplyr::select(children, family_ID, child, parent, relationship_ID)

  # edge type
  children$type <- "child-parent"

  # rename variables
  names(children) <- c("family_ID", "person_A", "person_B", "relationship_ID", "type")

  # make two versions
  marriages <- dplyr::select(marriages, family_ID, wife, husband, relationship_ID)

  # edge type
  marriages$type <- "marriage"

  # rename variables
  names(marriages) <- c("family_ID", "person_A", "person_B", "relationship_ID", "type")

  # edge
  relationships <- rbind(marriages, children)

  # organize variables
  relationships <- dplyr::select(relationships, family_ID, relationship_ID, person_A, person_B, type)

  # arrange
  relationships <- dplyr::arrange(relationships, family_ID, relationship_ID, type)

  ##################################################
  # documents
  ##################################################

  # filter
  documents <- dplyr::filter(lines, level_0_tag == "SOUR")

  # drop level 0 rows
  documents <- dplyr::filter(documents, level != 0)

  # recode notes
  documents$path[documents$path == "NOTE_CONC"] <- "NOTE"
  documents$path[documents$path == "NOTE_CONT"] <- "NOTE"
  documents$path[documents$path == "PUBL_CONC"] <- "PUBL"

  # function to aggregate values
  aggregate <- function(x) {
    stringr::str_c(unique(x), collapse = " ")
  }

  # convert to wide format
  documents <- tidyr::pivot_wider(data = documents, id_cols = c(ID_number), names_from = path, values_from = value, values_fn = list(value = aggregate))

  # rename variables
  names(documents) <- c("document_ID", "note", "title", "publication", "author", "repository_ID")

  # organize variables
  documents <- dplyr::select(documents, document_ID, title, publication, author, note)

  # reformat missing
  documents$note <- stringr::str_squish(documents$note)
  documents$note[documents$note == ""] <- NA

  ##################################################
  # clean ID numbers
  ##################################################

  # person ID
  people$person_ID <- plyr::mapvalues(people$person_ID, from = person_table$GEDCOM_ID, to = person_table$key_ID, warn_missing = FALSE)
  people$value <- plyr::mapvalues(people$value, from = family_table$GEDCOM_ID, to = family_table$key_ID, warn_missing = FALSE)
  events$person_ID <- plyr::mapvalues(events$person_ID, from = person_table$GEDCOM_ID, to = person_table$key_ID, warn_missing = FALSE)
  events$person_ID <- plyr::mapvalues(events$person_ID, from = family_table$GEDCOM_ID, to = family_table$key_ID, warn_missing = FALSE)
  relationships$person_A <- plyr::mapvalues(relationships$person_A, from = person_table$GEDCOM_ID, to = person_table$key_ID, warn_missing = FALSE)
  relationships$person_B <- plyr::mapvalues(relationships$person_B, from = person_table$GEDCOM_ID, to = person_table$key_ID, warn_missing = FALSE)

  # document ID
  documents$document_ID <- plyr::mapvalues(documents$document_ID, from = document_table$GEDCOM_ID, to = document_table$key_ID, warn_missing = FALSE)
  people$source <- plyr::mapvalues(people$source, from = document_table$GEDCOM_ID, to = document_table$key_ID, warn_missing = FALSE)
  events$source <- plyr::mapvalues(events$source, from = document_table$GEDCOM_ID, to = document_table$key_ID, warn_missing = FALSE)

  # family ID
  relationships$family_ID <- plyr::mapvalues(relationships$family_ID, from = family_table$GEDCOM_ID, to = family_table$key_ID, warn_missing = FALSE)

  # arrange
  relationships <- dplyr::arrange(relationships, family_ID, relationship_ID, person_A, person_B)
  documents <- dplyr::arrange(documents, document_ID)
  events <- dplyr::arrange(events, person_ID, type)
  people <- dplyr::arrange(people, person_ID, attribute)

  ##################################################
  # list to return
  ##################################################

  # make a list of tibbles
  out <- list(
    attributes = people,
    events = events,
    relationships = relationships,
    sources = documents
  )

  # return list
  return(out)
}

###########################################################################
# end R script
###########################################################################
jfjelstul/rgenea documentation built on July 6, 2020, 12:27 a.m.