R/nasa.R

Defines functions .munge_nasa_names .dictionary_names_nasa

.dictionary_names_nasa <-
  function() {
    tibble(
      name_nasa = c(
        "access_level",
        "landing_page",
        "issued",
        "type",
        "modified",
        "identifier",
        "description",
        "title",
        "accrual_periodicity",
        "temporal",
        "spatial",
        "citation",
        "data_presentation_form",
        "release_place",
        "series_name",
        "creator",
        "graphic_preview_description",
        "graphic_preview_file",
        "editor",
        "issue_identification",
        "described_by",
        "described_by_type",
        "data_quality",
        "rights",
        "license"
      ),
      name_actual = c(
        "type_access_level",
        "url_landing_page",
        "date_issued",
        "remove_type",
        "date_modified",
        "id_nasa",
        "description_nasa",
        "title_nasa",
        "period_accrual",
        "datestime_nasa",
        "lat_lon_nasa",
        "description_citation",
        "type_data_presentation_form",
        "location_nasa_released",
        "series_name",
        "name_creator",
        "description_graphic",
        "url_graphic",
        "names_editor",
        "slug_issue",
        "url_description",
        "typefile_description",
        "has_data_quality",
        "type_rights",
        "type_license"
      )


    )
  }

.munge_nasa_names <-
  function(data) {
    names_dict <- names(data)

    dict <- .dictionary_names_nasa()
    actual_names <-
      names_dict %>%
      map_chr(function(name) {
        df_row <-
          dict %>% filter(name_nasa == name)
        if (nrow(df_row) == 0) {
          glue::glue("Missing {name}") %>% message()
          return(name)
        }

        df_row$name_actual
      })

    data %>%
      set_names(actual_names)
  }


#' NASA Data Catalog
#'
#' Acquires NASA data catalog.
#'
#'
#' @return
#' @export
#'
#' @examples
nasa_catalog <-
  memoise::memoise(function() {
    json <- fromJSON("https://data.nasa.gov/data.json")

    tbl_nasa <- json$dataset
    tbl_nasa <- tbl_nasa %>% janitor::clean_names()
    ids <- tbl_nasa$identifier %>% str_to_lower()
    tbl_offices <-
      tbl_nasa$publisher %>% as_tibble() %>%
      setNames(c("type_organization", "name_office")) %>%
      mutate(id_nasa = ids) %>%
      select(id_nasa, everything()) %>%
      select(-type_organization) %>%
      munge_data()

    tbl_contacts <- tbl_nasa$contact_point %>% as_tibble() %>%
      select(2:3) %>%
      setNames(c("name_contact", "email_contact")) %>%
      mutate(
        email_contact = email_contact %>% str_remove_all("mailto:"),
        name_contact = str_to_upper(name_contact)
      ) %>%
      mutate(id_nasa = ids) %>%
      select(id_nasa, everything()) %>%
      munge_data()

    tbl_references <- 1:length(tbl_nasa$references) %>%
      map_dfr(
        function(x){
          urls <- tbl_nasa$references[[x]]

          if (length(urls) == 0) {
            return(tibble())
          }
          tibble(id_nasa = ids[[x]], url_reference = urls)
        }) %>%
      group_by(id_nasa) %>%
      nest() %>%
      rename(data_references = data) %>%
      ungroup() %>%
      mutate(has_references = T) %>%
      select(-data_references, everything())



    tbl_keyword <- 1:length(tbl_nasa$keyword) %>%
      map_dfr(
        function(x){
          urls <- tbl_nasa$keyword[[x]]

          if (length(urls) == 0)
          {
            return(tibble())
          }
          tibble(id_nasa = ids[[x]], keyword_nasa = str_to_upper(urls))
        }) %>%
      munge_data() %>%
      group_by(id_nasa) %>%
      nest() %>%
      rename(data_keywords = data) %>%
      ungroup() %>%
      mutate(has_keywords = T) %>%
      select(-data_keywords, everything())

    tbl_themes <- 1:length(tbl_nasa$theme) %>%
      map_dfr(
        function(x){
          urls <- tbl_nasa$theme[[x]]

          if (length(urls) == 0)
          {
            return(tibble())
          }
          tibble(id_nasa = ids[[x]], theme_nasa = str_to_upper(urls))
        }) %>%
      munge_data() %>%
      group_by(id_nasa) %>%
      nest() %>%
      rename(data_theme = data) %>%
      ungroup() %>%
      mutate(has_theme = T) %>%
      select(-data_theme, everything())

    tbl_distribution <- 1:length(tbl_nasa$distribution) %>%
      map_dfr(
        function(x){
          tbl_dist <-
            tbl_nasa$distribution[[x]] %>% as_tibble()

          if (length(tbl_dist) == 0)
          {
            return(tibble())
          }
          tbl_dist %>%
            setNames(
              c(
                "type_media",
                "url_distribution",
                "description_distribution",
                "type_distribution",
                "title_distribution"
              )
            ) %>%
            mutate(id_nasa = ids[[x]])
        }) %>%
      munge_data() %>%
      group_by(id_nasa) %>%
      nest() %>%
      rename(data_distribution = data) %>%
      ungroup() %>%
      mutate(
        has_distribution = T,
        count_distribution = data_distribution %>% map_dbl(nrow)
      ) %>%
      select(-data_distribution, everything())

    tbl_programs <-
      1:length(tbl_nasa$program_code) %>%
      map_dfr(
        function(x){
          urls <- tbl_nasa$program_code[[x]]

          if (length(urls) == 0)
          {
            return(tibble())
          }
          tibble(id_nasa = ids[[x]], code_program = str_to_upper(urls))
        })  %>%
      munge_data() %>%
      separate(
        code_program,
        into = c("number_parent", "number_program"),
        convert = T,
        remove = F
      ) %>%
      group_by(id_nasa) %>%
      nest() %>%
      rename(data_program_codes = data) %>%
      ungroup() %>%
      mutate(has_program_codes = T) %>%
      select(-data_program_codes, everything())

    select_cols <- tbl_nasa %>%
      map_df(class) %>%
      gather(column, class) %>%
      filter(!class %>% str_detect("list|data")) %>%
      pull(column)

    data <-
      tbl_nasa %>% select(one_of(select_cols)) %>%
      as_tibble() %>%
      .munge_nasa_names() %>%
      munge_data() %>%
      mutate(id_nasa = str_to_lower(id_nasa))

    data <- data %>%
      separate(
        datestime_nasa,
        into = c("datetime_start", "datetime_end"),
        fill = "right",
        extra = "merge",
        sep = "\\/"
      ) %>%
      munge_data()




    data <- list(
      data,
      tbl_contacts,
      tbl_distribution,
      tbl_keyword,
      tbl_offices,
      tbl_references,
      tbl_themes
    ) %>%
      reduce(left_join, by = "id_nasa") %>%
      select(-matches("data"), everything())

    data
  })
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.