Introduction to rdpla

rdpla: R client for Digital Public Library of America

The Digital Public Library of America (DPLA) brings together metadata from libraries, archives, and museums in the US, and makes it freely available via their web portal as well as an API. DPLA's portal and API don't provide the items themselves from contributing institutions, but they provide links to make it easy to find things. The kinds of things DPLA holds metadata for include images of works held in museums, photographs from various photographic collections, texts, sounds, and moving images.

DPLA has a great API with good documentation - a rare thing in this world. Further documentation on their API can be found on their pages describing the search fields and giving examples of queries. Metadata schema information is also available in the DPLA documentation.

The DPLA API has two main services:

- items: individual records, searched with dpla_items()
- collections: groupings of items, searched with dpla_collections()

rdpla also has an interface (dpla_bulk) to download bulk and compressed JSON data.

Note that you can only run the examples, vignette, and tests if you have an API key. See the "API key" section below for how to get one.

Installation

Install from CRAN

install.packages("rdpla")

Development version

if (!requireNamespace("devtools")) {
  install.packages("devtools")
}
devtools::install_github("ropensci/rdpla")

Load rdpla

library("rdpla")

API key

If you already have a DPLA API key, make sure it's in your .Renviron or .Rprofile file.

If you don't have a DPLA API key, use the dpla_get_key() function to get a key. You only need a valid email address to get a key, for example:

dpla_get_key(email = "foo@bar.com")
#> API key created and sent via email. Be sure to check your Spam folder, too.

Search - items

Note: the fields returned are limited in these examples for brevity.

Basic search

dpla_items(q="fruit", page_size=5, fields=c("provider","creator"))
#> $meta
#> # A tibble: 1 x 3
#>   found start returned
#>   <int> <int>    <int>
#> 1 40007     0        5
#> 
#> $data
#> # A tibble: 5 x 2
#>                        provider                         creator
#>                           <chr>                           <chr>
#> 1 Mountain West Digital Library                      no content
#> 2 Mountain West Digital Library                      no content
#> 3 Mountain West Digital Library                      no content
#> 4 Mountain West Digital Library                      no content
#> 5   The New York Public Library Anderson, Alexander (1775-1870)
#> 
#> $facets
#> list()

Limit fields returned

dpla_items(q="fruit", page_size = 10, fields=c("publisher","format"))
#> $meta
#> # A tibble: 1 x 3
#>   found start returned
#>   <int> <int>    <int>
#> 1 40007     0       10
#> 
#> $data
#> # A tibble: 10 x 2
#>                                    format
#>                                     <chr>
#>  1                             no content
#>  2                             no content
#>  3                             no content
#>  4                             no content
#>  5                             no content
#>  6                             no content
#>  7                Gum bichromate on vinyl
#>  8                      1 b 10 x 12.5 cm.
#>  9 Woodblock print;Ink and color on paper
#> 10                             no content
#> # ... with 1 more variables: publisher <chr>
#> 
#> $facets
#> list()

Limit records returned

dpla_items(q="fruit", page_size=2, fields=c("provider","title"))
#> $meta
#> # A tibble: 1 x 3
#>   found start returned
#>   <int> <int>    <int>
#> 1 40007     0        2
#> 
#> $data
#> # A tibble: 2 x 2
#>   title                      provider
#>   <chr>                         <chr>
#> 1 Fruit Mountain West Digital Library
#> 2 Fruit Mountain West Digital Library
#> 
#> $facets
#> list()

Search by date

dpla_items(q="science", date_before=1900, page_size=10, fields=c("id","date"))
#> $meta
#> # A tibble: 1 x 3
#>   found start returned
#>   <int> <int>    <int>
#> 1 57622     0       10
#> 
#> $data
#> # A tibble: 10 x 2
#>                                  id      date
#>                               <chr>     <chr>
#>  1 9cfe90e850b13bc1854f3e40223529c8 1881-1882
#>  2 9d008b592ad35eaa1e4dbff8aa976318      1884
#>  3 268fb8978bbab523ec1ad48ee72e7464      1892
#>  4 7f25fff59b55bd99df3a864e514c3d1d      1893
#>  5 0457c88ca237cec73ce2876f91d56572      1893
#>  6 19bdb84f833b28cb36207d02c38cfc69      1883
#>  7 e93faad718b9d63c2c8dd8725edadb93      1891
#>  8 9f79e6f53dfd2f31a17d756a90f22e0b      1883
#>  9 e3f11047a57f18f8a21baf5d6ff3c4dd      1886
#> 10 e8f0ed10dbdcd0ffd6f504e1892515da      1885
#> 
#> $facets
#> list()

Search on specific fields

dpla_items(description="obituaries", page_size=2, fields="description")
#> $meta
#> # A tibble: 1 x 3
#>   found start returned
#>   <int> <int>    <int>
#> 1 50777     0        2
#> 
#> $data
#> # A tibble: 2 x 1
#>                          description
#>                                <chr>
#> 1              Obituaries of members
#> 2 Pages from the complied obituaries
#> 
#> $facets
#> list()
dpla_items(subject="yodeling", page_size=2, fields="subject")
#> $meta
#> # A tibble: 1 x 3
#>   found start returned
#>   <int> <int>    <int>
#> 1    54     0        2
#> 
#> $data
#> # A tibble: 2 x 1
#>                                                subject
#>                                                  <chr>
#> 1 Yodel & yodeling;Humorous songs;Musicals;Sheet music
#> 2 Yodel & yodeling;Humorous songs;Musicals;Sheet music
#> 
#> $facets
#> list()
dpla_items(provider="HathiTrust", page_size=2, fields="provider")
#> $meta
#> # A tibble: 1 x 3
#>     found start returned
#>     <int> <int>    <int>
#> 1 2647621     0        2
#> 
#> $data
#> # A tibble: 2 x 1
#>     provider
#>        <chr>
#> 1 HathiTrust
#> 2 HathiTrust
#> 
#> $facets
#> list()

Spatial search, across all spatial fields

dpla_items(sp='Boston', page_size=2, fields=c("id","provider"))
#> $meta
#> # A tibble: 1 x 3
#>   found start returned
#>   <int> <int>    <int>
#> 1 97974     0        2
#> 
#> $data
#> # A tibble: 2 x 2
#>                                 id                provider
#>                              <chr>                   <chr>
#> 1 337556aaa3096bd77e462d898b70c9d7 Smithsonian Institution
#> 2 41aa36a38d69f5247529505a55528b5d Smithsonian Institution
#> 
#> $facets
#> list()

Spatial search, by states

dpla_items(sp_state='Massachusetts OR Hawaii', page_size=2, fields=c("id","provider"))
#> $meta
#> # A tibble: 1 x 3
#>    found start returned
#>    <int> <int>    <int>
#> 1 235411     0        2
#> 
#> $data
#> # A tibble: 2 x 2
#>                                 id
#>                              <chr>
#> 1 3d3fba16636ab5211a10ff0b0bf44ae6
#> 2 0c0b0cc05188d33b63fc6adc14774250
#> # ... with 1 more variables: provider <chr>
#> 
#> $facets
#> list()

Faceted search

dpla_items(facets=c("sourceResource.spatial.state","sourceResource.spatial.country"),
      page_size=0, facet_size=5)
#> $meta
#> # A tibble: 1 x 3
#>      found start returned
#>      <int> <int>    <int>
#> 1 17104849     0        0
#> 
#> $data
#> # A tibble: 0 x 0
#> 
#> $facets
#> $facets$sourceResource.spatial.state
#> $facets$sourceResource.spatial.state$meta
#> # A tibble: 1 x 4
#>    type   total  missing   other
#>   <chr>   <int>    <int>   <int>
#> 1 terms 6249159 11599925 3632477
#> 
#> $facets$sourceResource.spatial.state$data
#> # A tibble: 5 x 2
#>            term  count
#>           <chr>  <int>
#> 1         Texas 882954
#> 2    California 636851
#> 3       Georgia 472738
#> 4      New York 397295
#> 5 Massachusetts 226844
#> 
#> 
#> $facets$sourceResource.spatial.country
#> $facets$sourceResource.spatial.country$meta
#> # A tibble: 1 x 4
#>    type   total  missing   other
#>   <chr>   <int>    <int>   <int>
#> 1 terms 7786409 10212531 1818325
#> 
#> $facets$sourceResource.spatial.country$data
#> # A tibble: 5 x 2
#>             term   count
#>            <chr>   <int>
#> 1  United States 5327273
#> 2         Russia  172146
#> 3 United Kingdom  169379
#> 4         Mexico  167957
#> 5         France  131329

Search - collections

Search for collections with the words university of texas

dpla_collections(q="university of texas", page_size=2)
#> $meta
#> # A tibble: 1 x 2
#>   found returned
#>   <int>    <int>
#> 1    20        2
#> 
#> $data
#> # A tibble: 2 x 14
#>                                `_rev`                  ingestDate
#>                                 <chr>                       <chr>
#> 1 14-bccf34a900456b064086f20da68b0f89 2017-08-08T02:55:37.637978Z
#> 2 13-e91ba552cf695a88c3f285266a272ca8 2017-08-08T02:55:47.403457Z
#> # ... with 12 more variables: `@context` <chr>, id <chr>, title <chr>,
#> #   `_id` <chr>, description <chr>, `@type` <chr>, ingestType <chr>,
#> #   `@id` <chr>, ingestionSequence <int>, score <dbl>,
#> #   validation_message <lgl>, valid_after_enrich <lgl>

You can also search in the title and description fields

dpla_collections(description="east")
#> $meta
#> # A tibble: 1 x 2
#>   found returned
#>   <int>    <int>
#> 1     3       10
#> 
#> $data
#> # A tibble: 3 x 14
#>                               `_rev`                  ingestDate
#>                                <chr>                       <chr>
#> 1 8-6b723068e71b40c6d9b64b0c14f80e20 2017-05-23T02:22:47.507183Z
#> 2 3-388428340432e8ff676cd8d10f9d02b0 2017-07-31T17:06:05.782685Z
#> 3 3-0318d8a1af2907653ac3a11fb9a5bd5b 2017-07-31T17:05:59.746631Z
#> # ... with 12 more variables: `@context` <chr>, id <chr>, title <chr>,
#> #   `_id` <chr>, description <chr>, `@type` <chr>, ingestType <chr>,
#> #   `@id` <chr>, ingestionSequence <int>, score <dbl>,
#> #   validation_message <lgl>, valid_after_enrich <lgl>

Visualize

Visualize metadata from the DPLA - bar chart of the number of records per state (includes states outside the US)

out <- dpla_items(facets="sourceResource.spatial.state", page_size=0, facet_size=25)
library("ggplot2")
library("scales")
ggplot(out$facets$sourceResource.spatial.state$data, aes(reorder(term, count), count)) +
  geom_bar(stat="identity") +
  coord_flip() +
  theme_grey(base_size = 16) +
  scale_y_continuous(labels = comma) +
  labs(x="State", y="Records")

plot of chunk unnamed-chunk-18



ropensci/rdpla documentation built on May 18, 2022, 6:32 p.m.