library(rvest)
library(xml2)
library(dplyr)
library(readr)
library(purrr)

Get Names of toolboxes and their URLs

Get Toolboxes

Go to https://pro.arcgis.com/en/pro-app/latest/tool-reference/main/arcgis-pro-tool-reference.htm and copy the inner HTML of accordion-content. I can't scrape this because there seems to be JS involved.

# Hand-copied markup of the tool-reference side navigation: one <a> element
# per toolbox. The link text is the toolbox name and the href is a
# site-relative path under /en/pro-app/2.7/tool-reference/.
toolbox_html <- '
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/3d-analyst">3D Analyst toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/analysis">Analysis toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/aviation">Aviation toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/business-analyst">Business Analyst toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/cartography">Cartography toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/conversion">Conversion toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/crime-analysis">Crime Analysis and Safety toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/data-interoperability">Data Interoperability toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/data-management">Data Management toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/data-reviewer">Data Reviewer toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/defense">Defense  toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/editing">Editing toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/feature-analysis">Feature Analysis toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/geoanalytics-desktop">GeoAnalytics Desktop toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/big-data-analytics">GeoAnalytics Server toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/geocoding">Geocoding toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/geostatistical-analyst">Geostatistical Analyst toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/image-analyst">Image Analyst toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/indoors">Indoors toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/intelligence">Intelligence toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/linear-referencing">Linear Referencing toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/location-referencing">Location Referencing toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/maritime">Maritime toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/multidimension">Multidimension toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/network-analyst">Network Analyst toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/network-diagram">Network Diagram toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/parcel">Parcels toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/raster-analysis">Raster Analysis toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/ready-to-use">Ready To Use toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/server">Server toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/space-time-pattern-mining">Space Time Pattern Mining toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/spatial-analyst">Spatial Analyst toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/spatial-statistics">Spatial Statistics toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/territory-design">Territory Design toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/topographic-production">Topographic Production toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/trace-network">Trace Network toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/utility-networks">Utility Network toolbox</a>
<a class="side-nav-link icon-ui-right" data-collapsed="true" href="/en/pro-app/2.7/tool-reference/workflow-manager">Workflow Manager toolbox</a>
'

Create a dataframe with Toolbox name and URL

# Parse the pasted navigation markup and turn every <a> element into one row
# holding its link text (toolbox) and its href attribute (url).
toolboxes <- toolbox_html %>%
  read_html() %>%
  html_nodes("a") %>%
  map(~ tibble(toolbox = html_text(.x), url = html_attr(.x, "href"))) %>%
  bind_rows()

toolboxes

Adjust the URL

# Build the absolute URL of each toolbox's overview page (url_adjusted).
#
# The overview page normally lives at
#   <href>/an-overview-of-the-<slug>-toolbox.htm
# where <slug> is the last path segment of the href. A handful of toolboxes
# use a different slug, and Workflow Manager uses a different file name
# altogether. The "/2.7/" version segment is swapped for "/latest/" and the
# host is prepended.
#
# Only base R string helpers are used here because stringr is not attached by
# this script.
toolboxes <- toolboxes %>%
  mutate(
    # Last path segment of the href, e.g. "3d-analyst".
    slug = basename(url),
    # There are some cases where the pattern is a bit different
    slug = case_when(
      slug == "parcel" ~ "parcels",
      slug == "utility-networks" ~ "utility-network",
      slug == "crime-analysis" ~ "crime-analysis-and-safety",
      slug == "defense" ~ "defense-tools",
      slug == "feature-analysis" ~ "standard-feature-analysis",
      TRUE ~ slug
    ),
    url_adjusted = case_when(
      # the pattern is still different for one toolbox
      slug == "workflow-manager" ~ paste0(url, "/overview-of-wmx-tools.htm"),
      TRUE ~ paste0(url, "/an-overview-of-the-", slug, "-toolbox.htm")
    ),
    # fixed = TRUE so that "." is matched literally rather than as a regex
    # wildcard.
    url_adjusted = sub("/2.7/", "/latest/", url_adjusted, fixed = TRUE),
    url_adjusted = paste0("https://pro.arcgis.com", url_adjusted)
  ) %>%
  select(-slug)

toolboxes

Get Toolbox description (only the first `<p>` element is scraped from the website, which might also just hold licensing information).

# Read the page at `url` and return the text of the first <p> element found
# in it.
get_toolboxdesc <- function(url) {
  page <- xml2::read_html(url)
  first_paragraph <- html_node(page, "p")
  html_text(first_paragraph)
}

# Fetch the description of every toolbox, waiting half a second before each
# request. When a page cannot be read, the toolbox name is printed and NA is
# stored as its description.
toolbox_desc <- pmap_dfr(
  toolboxes,
  function(toolbox, url, url_adjusted) {
    Sys.sleep(0.5)

    report_failure <- function(e) {
      print(paste("Toolbox", toolbox, "is causing an error"))
      NA
    }

    tibble(
      toolbox = toolbox,
      description = tryCatch(
        get_toolboxdesc(url_adjusted),
        error = report_failure
      )
    )
  }
)

# Attach the scraped descriptions to the toolbox table and strip surrounding
# whitespace. trimws() is used because stringr is not attached by this
# script; "[\\h\\v]" makes it strip every horizontal and vertical whitespace
# character rather than just space, tab, CR and LF.
toolboxes <- left_join(toolboxes, toolbox_desc, by = "toolbox") %>%
  mutate(description = trimws(description, whitespace = "[\\h\\v]"))

write_csv(toolboxes, "auxiliary/toolboxes.csv")

Get Toolsets

# This function takes the URL of a toolbox and returns a table of all toolsets
# from this toolbox.
#
# The first ".tablexyz" element of the page is parsed with html_table(). Its
# first column is renamed to "Toolset" and every row is matched, by link text,
# with the href of the corresponding <a> element inside the same table.
#
# Returns the parsed table with an extra `url` column (NA where no link text
# equals the toolset name). Errors from reading or parsing the page are not
# caught here.
grab_toolsets <- function(url) {
  tab <- url %>%
    xml2::read_html() %>%
    html_node(".tablexyz")

  # The join below and the later processing of the result both refer to the
  # column as "Toolset", so the first column is normalised to exactly that
  # name.
  toolset_table <- html_table(tab) %>%
    rename("Toolset" = 1)

  # Built in one vectorised call so that the `url` and `text` columns exist
  # even when the table contains no links at all.
  links <- html_nodes(tab, "a")
  link_table <- tibble(
    url = html_attr(links, "href"),
    text = html_text(links)
  )

  left_join(toolset_table, link_table, by = c("Toolset" = "text"))
}
# Collect the toolsets of every toolbox into one table, waiting half a second
# before each request. A toolbox whose page cannot be processed is printed and
# contributes a single row that carries only the toolbox name.
toolsets <- pmap_dfr(
  select(toolboxes, toolbox, url_adjusted),
  function(toolbox, url_adjusted) {
    Sys.sleep(0.5)

    toolset_table <- tryCatch(
      grab_toolsets(url_adjusted),
      error = function(e) {
        print(paste("toolbox", toolbox, "is causing an error"))
        tibble(.rows = 1)
      }
    )

    mutate(toolset_table, toolbox = toolbox)
  }
)

# Turn site-relative toolset hrefs into absolute URLs of the latest
# documentation.
#
# Arguments:
#   url  character vector of hrefs such as "/en/pro-app/2.7/tool-reference/...".
#
# Returns a character vector of the same length as `url`: the first "/2.7/"
# in each element is replaced by "/latest/" and "https://pro.arcgis.com" is
# prepended. NA stays NA instead of becoming the string
# "https://pro.arcgis.comNA", and a zero-length input gives a zero-length
# result.
#
# Base sub() is used because stringr is not attached by this script;
# fixed = TRUE makes the "." match literally.
toolset_url_adjuster <- function(url) {
  adjusted <- rep(NA_character_, length(url))
  has_url <- !is.na(url)

  # Guarded because paste0() with a zero-length argument would still return
  # one string (the bare host).
  if (any(has_url)) {
    adjusted[has_url] <- paste0(
      "https://pro.arcgis.com",
      sub("/2.7/", "/latest/", url[has_url], fixed = TRUE)
    )
  }

  adjusted
}

# Derive the absolute URL of every toolset, then replace it with a hard-coded
# address for two specific toolbox/toolset combinations.
toolsets <- mutate(
  toolsets,
  url_adjusted = toolset_url_adjuster(url),
  url_adjusted = case_when(
    toolbox == "Defense  toolbox" & Toolset == "Gridded Reference Graphic" ~
      "https://pro.arcgis.com/en/pro-app/latest/tool-reference/defense/an-overview-of-the-gridded-reference-graphic-toolset.htm",
    toolbox == "GeoAnalytics Desktop toolbox" & Toolset == "Analyze Patterns" ~
      "https://pro.arcgis.com/en/pro-app/latest/tool-reference/geoanalytics-desktop/an-overview-of-the-analyze-patterns-toolset.htm",
    TRUE ~ url_adjusted
  )
)

# Interactive sanity checks on the scraped toolset table.

# Rows without a scraped href
filter(toolsets, is.na(url))
# Does the number of distinct toolboxes equal the number of toolboxes?
n_distinct(toolsets$toolbox) == nrow(toolboxes)

# Is any toolset name missing? Ideally FALSE
anyNA(toolsets$Toolset)


# Lower-case both scraped column names. The tool-scraping step further down
# selects the column as `toolset`, so renaming only `Description` would leave
# that step without the column it needs.
toolsets <- rename(toolsets, toolset = Toolset, description = Description)

# Persist the table and read it back from disk.
write_csv(toolsets, "auxiliary/toolsets.csv")
toolsets <- read_csv("auxiliary/toolsets.csv")


# This function grabs all tools from a given toolset URL.
#
# Arguments:
#   input_url      URL of the toolset page to read.
#   input_toolbox  value stored in the `toolbox` column of the result.
#   input_toolset  value stored in the `toolset` column of the result.
#
# Every ".tablexyz" table on the page is parsed. Depending on the header of
# its first column a table is treated as
#   * a tool table       -> columns `tool`, `description`, plus `href`
#   * a sub-toolset table -> columns `sub_toolset`, `toolset_description`,
#                            plus `href`
#   * anything else      -> one placeholder row without table content.
# `href` is looked up by matching the first column against the text of the
# links found inside the tables. All tables are row-bound and tagged with
# `toolset` and `toolbox`.
grab_tools <- function(input_url, input_toolbox, input_toolset) {
  url_html <- input_url %>%
    read_html() %>%
    html_nodes(".tablexyz")

  my_tables <- html_table(url_html)

  a_nodes <- html_nodes(url_html, "a")

  # Keep one href per link text: the same text can be linked more than once
  # inside the tables, and duplicate keys would multiply rows in the joins
  # below.
  url_table <- tibble(
    tool = html_text(a_nodes),
    href = html_attr(a_nodes, "href")
  ) %>%
    distinct(tool, .keep_all = TRUE)

  df <- map_dfr(my_tables, function(my_table) {
    first_header <- colnames(my_table)[1]

    if (first_header %in% c("Tool", "Tool name", "Geoprocessing tools", "tool")) {
      colnames(my_table)[1] <- "tool"
      colnames(my_table)[2] <- "description"
      left_join(my_table, url_table, by = "tool")
    } else if (first_header %in% c("Toolsets", "Toolset")) {
      colnames(my_table)[1] <- "sub_toolset"
      colnames(my_table)[2] <- "toolset_description"
      # The key must be the column as it is named after the rename above;
      # joining on "toolset" fails because no such column exists here.
      left_join(my_table, url_table, by = c("sub_toolset" = "tool"))
    } else {
      tibble(.rows = 1)
    }
  })

  df %>%
    mutate(
      toolset = input_toolset,
      toolbox = input_toolbox
    )
}

# Number the toolsets so that progress can be reported while scraping.
toolsets <- toolsets %>% mutate(row = row_number())
maxi <- max(toolsets$row)

# Scrape the tools of every toolset. The result is a list with one element per
# toolset: the table returned by grab_tools(), or the string "ERROR" when the
# call failed.
tools <- pmap(
  select(toolsets, toolbox, toolset, url_adjusted, row),
  function(toolbox, toolset, url_adjusted, row) {
    # Sys.sleep(0.5)
    progress <- paste(
      row, toolbox, toolset, round(row / maxi, 2), " (", url_adjusted, ")",
      sep = " "
    )
    print(progress)

    tryCatch(
      grab_tools(url_adjusted, toolbox, toolset),
      error = function(e) {
        print("ERROR")
      }
    )
  }
)


# A toolset counts as scraped successfully when its list element is a data
# frame with at least one row (failed calls left a string behind instead).
success <- map_lgl(tools, ~ is.data.frame(.x) && nrow(.x) > 0)

# Toolboxes containing at least one toolset that was not successful
toolsets %>%
  filter(!success) %>%
  group_by(toolbox) %>%
  summarise()

# Stack the successfully scraped tables into one data frame.
tools_df <- bind_rows(tools[success])

# Inspect rows that have a missing value in any column.
tools_df %>%
  filter_all(any_vars(is.na(.))) %>%
  View()

# Write the combined data frame; `tools` itself is a list and cannot be
# written as CSV.
write_csv(tools_df, "auxiliary/tools.csv")


arc2r/book documentation built on March 5, 2021, 2:10 p.m.