# utility/incorporate-outside-datasets-79.R

#This isn't part of the build process.  It should be executed infrequently, not for every build.
#   Run it when there's a chance the extract data is different, or there's been a new version from NlsyLinksDetermination

# Clear the variables from previous runs.  `all.names = TRUE` also removes
# objects whose names start with a dot.
rm(list = ls(all.names = TRUE))

# Detach NlsyLinks if it is currently attached, so the lazy-loaded datasets
# aren't available.
if ("package:NlsyLinks" %in% search()) {
  detach("package:NlsyLinks")
}

# ---- load-sources ------------------------------------------------------------
# if( any(.packages(all.available=TRUE) == "NlsyLinks") ) remove.packages("NlsyLinks") #system("R CMD REMOVE NlsyLinks") #This shouldn't be necessary.
library(magrittr)
# require(NlsyLinks) #Don't load it; the lazy-loaded datasets shouldn't be accessible.

requireNamespace("readr"        )
requireNamespace("tidyr"        )
requireNamespace("dplyr"        )

# ---- declare-globals ---------------------------------------------------------
# Source CSVs live in the repository, but are not part of the build.
directoryDatasetsCsv <- "./outside-data/nlsy-79"
# RDAs are derived from the CSVs and are included in the build as compressed binaries.
directoryDatasetsRda <- "./data"
# Only referenced by the commented-out versioned subject-details path below.
algorithmVersion     <- 85L

# Input CSV locations.
pathInputLinks           <- file.path(directoryDatasetsCsv, "links-2017-79.csv")
pathInputSubjectDetails  <- file.path(directoryDatasetsCsv, "subject-details.csv")
# pathInputSubjectDetails     <- file.path(directoryDatasetsCsv, paste0("subject-details-v", algorithmVersion, ".csv"))
pathInputSurvey79        <- file.path(directoryDatasetsCsv, "survey-79.csv")
pathInputExtraOutcomes79 <- file.path(directoryDatasetsCsv, "extra-outcomes-79.csv")

# Output RDA locations, one file per saved dataset.
pathOutputExtraOutcomes  <- file.path(directoryDatasetsRda, "ExtraOutcomes79.rda")
pathOutputLinkTrim       <- file.path(directoryDatasetsRda, "Links79Pair.rda")
pathOutputLinkExpanded   <- file.path(directoryDatasetsRda, "Links79PairExpanded.rda")
pathOutputSubjectDetails <- file.path(directoryDatasetsRda, "SubjectDetails79.rda")
pathOutputSurvey79       <- file.path(directoryDatasetsRda, "Survey79.rda")

# Column specification for the links CSV.  `cols_only()` reads just the
# columns named here; the short aliases stay private to this `local()` block.
col_types_pair <- local({
  int <- readr::col_integer
  dbl <- readr::col_double
  lgl <- readr::col_logical

  readr::cols_only(
    ExtendedID                 = int(),
    SubjectTag_S1              = int(),
    SubjectTag_S2              = int(),
    RelationshipPath           = int(),
    EverSharedHouse            = lgl(),
    R                          = dbl(),
    RFull                      = dbl(),
    MultipleBirthIfSameSex     = int(),
    IsMz                       = int(),
    LastSurvey_S1              = int(),
    LastSurvey_S2              = int(),
    RImplicitPass1             = dbl(),
    RImplicit                  = dbl(),
    RImplicit2004              = dbl(),
    RImplicitDifference        = dbl(),
    RExplicit                  = dbl(),
    RExplicitPass1             = dbl(),
    RPass1                     = dbl(),
    RExplicitOlderSibVersion   = dbl(),
    RExplicitYoungerSibVersion = dbl(),
    RImplicitSubject           = dbl(),
    RImplicitMother            = dbl(),
    Generation_S1              = int(),
    Generation_S2              = int(),
    SubjectID_S1               = int(),
    SubjectID_S2               = int()
  )
})
# Column specification for the extra-outcomes CSV; only the listed columns are read.
col_types_outcomes <- local({
  int <- readr::col_integer
  dbl <- readr::col_double

  readr::cols_only(
    SubjectTag                 = int(),
    SubjectID                  = int(),
    Generation                 = int(),
    HeightZGenderAge           = dbl(),
    WeightZGenderAge           = dbl(),
    AfqtRescaled2006Gaussified = dbl(),
    Afi                        = int(),
    Afm                        = int(),
    MathStandardized           = dbl()
  )
})

# Column specification for the subject-details CSV; only the listed columns are read.
col_types_subject_details <- local({
  int  <- readr::col_integer
  dbl  <- readr::col_double
  date <- function() readr::col_date(format = "")

  readr::cols_only(
    SubjectTag              = int(),
    ExtendedID              = int(),
    Generation              = int(),
    Gender                  = int(),
    RaceCohort              = int(),
    SiblingCountInNls       = int(),
    BirthOrderInNls         = int(),
    SimilarAgeCount         = int(),
    HasMzPossibly           = int(),
    KidCountBio             = int(),
    KidCountInNls           = int(),
    Mob                     = date(),
    LastSurveyYearCompleted = int(),
    AgeAtLastSurvey         = dbl(),
    IsDead                  = int(),
    DeathDate               = date()
  )
})

# Column specification for the survey CSV; only the listed columns are read.
col_types_survey <- local({
  int <- readr::col_integer
  dbl <- readr::col_double

  readr::cols_only(
    SubjectTag         = int(),
    SurveySource       = int(),
    SurveyYear         = int(),
    SurveyDate         = readr::col_date(format = ""),
    AgeSelfReportYears = dbl(),
    AgeCalculateYears  = dbl()
  )
})

# ---- load-data ---------------------------------------------------------------
# Read each input CSV with its explicit column specification.
dsLinks79PairWithoutOutcomes <- readr::read_csv(
  file      = pathInputLinks,
  col_types = col_types_pair
)
ExtraOutcomes79 <- readr::read_csv(
  file      = pathInputExtraOutcomes79,
  col_types = col_types_outcomes
)
SubjectDetails79 <- readr::read_csv(
  file      = pathInputSubjectDetails,
  col_types = col_types_subject_details
)
Survey79 <- readr::read_csv(
  file      = pathInputSurvey79,
  col_types = col_types_survey
)

# ---- tweak-data --------------------------------------------------------------

# ---- Groom ExtraOutcomes79 ---------------------------------------------------------
# Convert the object returned by readr into a plain data.frame before it is saved.
ExtraOutcomes79 <- as.data.frame(ExtraOutcomes79)

# ---- Groom Links79PairExpanded and Links79Pair -------------------------------------
# Drop three link columns that are read from the CSV but not carried forward.
dsLinks79PairWithoutOutcomes <- dplyr::select(
  dsLinks79PairWithoutOutcomes,
  -c(MultipleBirthIfSameSex, RImplicitSubject, RImplicitMother)
)

# Copy of the outcomes whose SubjectTag is recomputed from SubjectID and
# Generation; used as the outcome dataset when building the pair links.
ExtraOutcomes79WithTags <- as.data.frame(
  dplyr::mutate(
    ExtraOutcomes79,
    SubjectTag = NlsyLinks::CreateSubjectTag(subjectID = SubjectID, generation = Generation)
  )
)

# Every links column except the two subject tags; passed below as `linksNames`.
remaining <- setdiff(
  colnames(dsLinks79PairWithoutOutcomes),
  c("SubjectTag_S1", "SubjectTag_S2")
)
# Labels for the integer RelationshipPath codes 1 through 5, in order.
relationshipLabels <- c("Gen1Housemates", "Gen2Siblings", "Gen2Cousins", "ParentChild", "AuntNiece")

Links79PairExpanded <-
  NlsyLinks::CreatePairLinksSingleEntered(
    outcomeNames     = c("MathStandardized", "HeightZGenderAge"),
    outcomeDataset   = ExtraOutcomes79WithTags,
    linksPairDataset = dsLinks79PairWithoutOutcomes,
    linksNames       = remaining
  ) %>%
  # Keep only the rows whose first subject tag is the smaller of the two.
  dplyr::filter(SubjectTag_S1 < SubjectTag_S2) %>%
  dplyr::mutate(
    RelationshipPath = factor(
      RelationshipPath,
      levels = seq_along(relationshipLabels),
      labels = relationshipLabels
    ),
    IsMz = factor(
      IsMz,
      levels = c(0, 1, 255),
      labels = c("No", "Yes", "DoNotKnow")
    ),
    EverSharedHouse = as.logical(EverSharedHouse)
  ) %>%
  dplyr::select(-RImplicitDifference) %>%
  dplyr::arrange(ExtendedID, SubjectTag_S1, SubjectTag_S2) %>%
  as.data.frame()

# multipleBirthLabels <- c("No", "Twin", "Triplet", "DoNotKnow")
# Links79PairExpanded$MultipleBirth <- factor(Links79PairExpanded$MultipleBirth, levels=c(0, 2, 3, 255), labels=multipleBirthLabels)

# Trimmed version of the expanded links: identifiers, R, and the relationship path only.
Links79Pair <- as.data.frame(
  dplyr::select(
    Links79PairExpanded,
    ExtendedID, SubjectTag_S1, SubjectTag_S2, R, RelationshipPath
  )
)

# ---- Groom SubjectDetails ----------------------------------------------------------
# Sorted, de-duplicated subject tags that appear on either side of a pair flagged IsMz == "Yes".
vectorOfTwins <-
  Links79PairExpanded[Links79PairExpanded$IsMz == "Yes", c("SubjectTag_S1", "SubjectTag_S2")] %>%
  unlist() %>%
  unique() %>%
  sort()

SubjectDetails79 <- as.data.frame(
  dplyr::select(
    dplyr::mutate(
      SubjectDetails79,
      Gender     = factor(Gender, levels = 1:2, labels = c("Male", "Female")),
      RaceCohort = factor(RaceCohort, levels = 1:3, labels = c("Hispanic", "Black", "Nbnh")), #R02147.00 $ C00053.00
      IsMz       = (SubjectTag %in% vectorOfTwins),
      Mob        = as.Date(as.character(Mob))
    ),
    # The death-related columns aren't finished yet, so they are left out.
    -c(IsDead, DeathDate)
  )
)

# ---- Groom Survey79 --------------------------------------------------------------
Survey79 <-
  Survey79 |>
  dplyr::mutate(
    SurveySource = factor(
      SurveySource,
      levels = 0:3,
      labels = c("NoInterview", "Gen1", "Gen2C", "Gen2YA")
    ),
    # SurveyDate is converted here but is not among the columns selected below.
    SurveyDate = as.Date(SurveyDate),
    # Prefer the calculated age and fall back to the self-reported one; round to one decimal.
    Age = round(dplyr::coalesce(AgeCalculateYears, AgeSelfReportYears), 1)
  ) |>
  dplyr::select(SubjectTag, SurveySource, SurveyYear, Age) |>
  dplyr::arrange(SubjectTag, SurveySource, SurveyYear) |>
  as.data.frame()
  # object.size() |>
  # print(units = "MB")

# ---- verify-values -----------------------------------------------------------

# Each dataset must be a data.frame with at least 100 rows before it is saved.
# The calls are kept separate so a failure names the offending dataset.
checkmate::assert_data_frame(x = ExtraOutcomes79, min.rows = 100)
checkmate::assert_data_frame(x = Links79Pair, min.rows = 100)
checkmate::assert_data_frame(x = Links79PairExpanded, min.rows = 100)
checkmate::assert_data_frame(x = SubjectDetails79, min.rows = 100)
checkmate::assert_data_frame(x = Survey79, min.rows = 100)

# ---- save-to-disk ------------------------------------------------------------
# Write each dataset to its own xz-compressed .rda file.
save(
  ExtraOutcomes79,
  file     = pathOutputExtraOutcomes,
  compress = "xz"
)
save(
  Links79Pair,
  file     = pathOutputLinkTrim,
  compress = "xz"
)
save(
  Links79PairExpanded,
  file     = pathOutputLinkExpanded,
  compress = "xz"
)
save(
  SubjectDetails79,
  file     = pathOutputSubjectDetails,
  compress = "xz"
)
save(
  Survey79,
  file     = pathOutputSurvey79,
  compress = "xz"
)
# nlsy-links/NlsyLinks documentation built on March 13, 2024, 4:05 a.m.