Goal: present step by step how the biome graph was imported in R from a PDF document.
This document explains how the Whittaker_biomes_poly
SpatialPolygonsDataFrame and the Whittaker_biomes
data.frame objects were constructed.
The original graph is Figure 5.5 in Ricklefs, R. E. (2008), The economy of nature. W. H. Freeman and Company. (Chapter 5, Biological Communities, The biome concept).
A PDF page containing the Whittaker biomes graph was imported in Inkscape. Text and extra graphic layers where removed and the remaining layers were exported as PostScipt file format (File > Save As > Save as type: > PostScript (*.ps)
). Note that a multi-page PDF document can be split into component pages with PDFTK Builder application.
This uses the functionality of grImport
R package.
The main steps of importing PostsScipt in R are described in the Importing vector graphics vignette: Murrell, P. (2009). Importing vector graphics: The grImport package for R. Journal of Statistical Software, 30(4), 1-37.
In addition to installing package grImport
with the usual install.packages("grImport")
, Ghostscript needs to be installed as well.
Note: to avoid the ghostscript error 'status 127', the path to ghostscript executable file was given as suggested here
library(grImport) Sys.setenv(R_GSCMD = normalizePath("C:/Program Files/gs/gs9.22/bin/gswin64c.exe")) # Path to example PostScript file path <- system.file("extdata", "graph_ps.ps", package = "plotbiomes", mustWork = TRUE) # Path to output xml RGML file RGML_xml_path <- gsub(pattern = "graph_ps.ps", replacement = "graph_ps.xml", x = path, fixed = TRUE) # Converts a PostScript file into an RGML file PostScriptTrace(file = path, outfilename = RGML_xml_path, setflat = 1) # setflat = 1 assures a visual smooth effect (feel free to experiment) # Reads in the RGML file and creates a "Picture" object my_rgml <- readPicture(rgmlFile = RGML_xml_path)
The created picture is messy and the noise (extra elements) needs to be excluded. This is done by indexing, searching for the meaningful parts.
# Draws the picture plot.new(); grid.picture(my_rgml) # Draws each path composing the picture. This helps selecting desired paths. plot.new(); picturePaths(my_rgml)
By using picturePaths
one can identify the desired biome lines and the filled polygons. From polygons one can get the colors for further use.
# Selects desired paths from the picture object. # These are the path corresponding to the filled polygons. my_fills <- my_rgml[c(3,9,14,16,18,21,23,25,27)] # Gets the colors colors <- vector(mode = "character", length = length(my_fills@paths)) for (i in 1:length(my_fills@paths)){ colors[i] <- my_fills@paths[[i]]@rgb } # These are the path corresponding to the biome lines (polygon boundaries) my_rgml_clean <- my_rgml[c(4,10,15,17,19,22,24,26,28)] # Assigns colors to the lines for (i in 1:length(my_rgml_clean@paths)){ my_rgml_clean@paths[[i]]@rgb <- colors[i] } plot.new(); grid.picture(my_fills) plot.new(); grid.picture(my_rgml_clean)
Get the PostScript coordinates from the cleaned "Picture" object.
x_lst <- vector(mode = "list", length = length(my_rgml_clean@paths)) y_lst <- vector(mode = "list", length = length(my_rgml_clean@paths)) for (i in 1:length(my_rgml_clean@paths)){ x_lst[[i]] <- my_rgml_clean@paths[[i]]@x names(x_lst[[i]]) <- rep(i, length(x_lst[[i]])) y_lst[[i]] <- my_rgml_clean@paths[[i]]@y names(y_lst[[i]]) <- rep(i, length(x_lst[[i]])) } x <- unlist(x_lst) y <- unlist(y_lst) par(mar = c(4, 4, 1, 1)) plot(x,y)
a) Convert PostScript coordinates into meaningful coordinates
This varies from graph to graph, but in this case luckily there is a grid and axis on the graph that can be used to transform from PostScript coordinates to meaningful coordinates (temperature and precipitations).
my_grid <- my_rgml[1] my_axis <- my_rgml[30] x_grid <- my_grid@paths[[1]]@x y_grid <- my_grid@paths[[1]]@y x_grid_unq <- unique(sort(x_grid)) y_grid_unq <- unique(sort(y_grid)) # Plot original grid and axis plot.new() grid.picture(my_grid) grid.picture(my_axis) # Plot in PostScript coordinates par(mar = c(4, 4, 1, 1)) plot(x_grid, y_grid, pch = 16, col = "red") points(x, y, cex = 0.1) abline(v = x_grid_unq[2], col = "red") # X min ~ -10 abline(v = max(my_axis@summary@xscale), col = "red") # X max ~ 30 abline(h = min(my_axis@summary@yscale), col = "red") # Y min ~ 0 abline(h = y_grid_unq[6], col = "red") # Y max ~ 400
b) Convert between coordinates systems
Need to identify the corresponding min & max values on OX and OY axis on both coordinates systems. The conversion equations are those exemplified here.
# Notations: # scs - source coordinate system # rcs - result coordinate system x_min_scs <- x_grid_unq[2] # is the X of the left most vertical grid line x_min_rcs <- -10 # corresponds to -10 x_max_scs <- max(my_axis@summary@xscale) # is the X max of OX axis line x_max_rcs <- 30 # corresponds to 30 y_min_scs <- min(my_axis@summary@yscale) # is the Y min of OY axis line y_min_rcs <- 0 # corresponds to 0 y_max_scs <- y_grid_unq[6] # is the Y of the upper most horizontal grid line y_max_rcs <- 400 # Apply conversion xx <- (x - x_min_scs)/(x_max_scs - x_min_scs) * (x_max_rcs - x_min_rcs) + x_min_rcs yy <- (y - y_min_scs)/(y_max_scs - y_min_scs) * (y_max_rcs - y_min_rcs) + y_min_rcs # There should not be negative values on OY. # This happens because the original PDF has some artefacts, # including also some overlapping polygons sections. min(yy) yy[yy < 0] <- 0 # Plot in original coordinates par(mar = c(4, 4, 1, 1)) plot(x_grid, y_grid, pch = 16, col = "red", xlab = "X - original coordinates", ylab = "Y - original coordinates") points(x, y, cex = 0.1) abline(v = x_grid_unq[2], col = "red") # X min ~ -10 abline(v = max(my_axis@summary@xscale), col = "red") # X max ~ 30 abline(h = min(my_axis@summary@yscale), col = "red") # Y min ~ 0 abline(h = y_grid_unq[6], col = "red") # Y max ~ 400 # Plot in converted coordinates plot(xx, yy, cex = 0.1, xlab = "X - converted coordinates", ylab = "Y - converted coordinates") abline(v = -10, col = "red") # X min abline(v = 30, col = "red") # X max abline(h = 0, col = "red") # Y min abline(h = 400, col = "red") # Y max
# Bind the coordinates and the biome ids in one data frame object Whittaker_biomes_raw <- data.frame(temp_c = xx, precp_cm = yy, biome_id = as.numeric(names(xx))) # Assigns the biome names biomes_tbl <- data.frame(biome_id = 1:9, biome = c("Tropical seasonal forest/savanna", "Subtropical desert", "Temperate rain forest", "Tropical rain forest", "Woodland/shrubland", "Tundra", "Boreal forest", "Temperate grassland/desert", "Temperate seasonal forest")) Whittaker_biomes_raw <- merge(x = Whittaker_biomes_raw, y = biomes_tbl, by = "biome_id", sort = FALSE)
The colors where already extracted from PostScript.
In the help of ggplot2::scale_fill_manual
it is recommended to use a named vector.
Therefore, colors were stored in the named character vector Ricklefs_colors
below:
colors # hexadecimal color codes extracted from PostScript Ricklefs_colors <- colors names(Ricklefs_colors) <- biomes_tbl$biome # Sets a desired order for the names. This will affect the order in legend. biomes_order <- c("Tundra", "Boreal forest", "Temperate seasonal forest", "Temperate rain forest", "Tropical rain forest", "Tropical seasonal forest/savanna", "Subtropical desert", "Temperate grassland/desert", "Woodland/shrubland") Ricklefs_colors <- Ricklefs_colors[order(factor(names(Ricklefs_colors), levels = biomes_order))] Ricklefs_colors
Note that the raw coordinates have some artefacts - neighboring biomes display some undesired overlapping.
library(ggplot2) ggplot(data = Whittaker_biomes_raw, aes(x = temp_c, y = precp_cm, color = biome), alpha = 0.5) + geom_point()
In the figure above one can see some obvious artefacts like the intersection between the coordinates of Tropical rain forest and Temperate rain forest biomes or between those of Tropical seasonal forest/savanna and Temperate seasonal forest. A less obvious issue is the fact that neighboring biomes should share identical coordinates along their shared borders.
One way to address these issues is to use the advanced digitization capabilities of a GIS software (e.g. QGIS).
Save the coordinates as shapefile to be further digitized in QGIS.
Note that the digitization can be done directly on a georeferenced version of the graph, without painfully going through the PDF processing and all the steps to retrieve the coordinates form the PDF. The approach presented in this document is far more complex, i.e. digitizing into polygons the points obtained from importing the PostScript image. Why is this complex process presented then? Well, because it is a more secure way to reduce any digitization mistake, but that is debatable. For other graphs might be difficult to georeference them in a GIS application that requires precise identification of at least 4 points distributed across the graph (preferably the corners). Having all the points obtained from the PostScript file eases digitization and reduces mistakes.
library(sp) library(rgdal) # Divide precipitation by 10 so that it scales better with temperature # in the QGIS canvas. This gives a better visual during digitization. # Also reduces the chances of distortions due to spatial projection. Whittaker_biomes_raw$precp_cm <- Whittaker_biomes_raw$precp_cm / 10 # Make SpatialPointsDataFrame biomes_pts <- sp::SpatialPointsDataFrame( coords = Whittaker_biomes_raw[, c("temp_c", "precp_cm")], data = Whittaker_biomes_raw, proj4string = CRS("+proj=eck4 +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs") ) # Save as ESRI Shapefile rgdal::writeOGR(obj = biomes_pts, dsn = "../_NotInclude/digitization", layer = "biomes_pts", driver = "ESRI Shapefile", overwrite_layer = TRUE)
A good resource for digitizing with QGIS is here. The CRS used was Eckert IV, EPSG 54012 (was deleted for the final product Whittaker_biomes_poly
). This places the graph on a world map somewhere in the Atlantic, south of Ghana. To distinguish between the points, the scale during digitization varied from 1:1 to 1:24.
Whittaker_biomes_poly
library(sf) # Read the digitized polygons Whittaker_biomes_poly <- sf::st_read(dsn = "../_NotInclude/digitization/Whittaker_biomes_poly.shp", stringsAsFactors = FALSE) # Convert back precipitation to cm. # This is done by updating the polygons coordinates. for (i in 1:nrow(Whittaker_biomes_poly)){ st_geometry(Whittaker_biomes_poly)[[i]][[1]][,2] <- 10 * st_geometry(Whittaker_biomes_poly)[[i]][[1]][,2] } # Delete the CRS used during digitization in QGIS. There is no need of spatial locations, # because the coordinates are temperature and precipitation. st_crs(Whittaker_biomes_poly) <- NA # Convert from sf to sp object Whittaker_biomes_poly <- as(Whittaker_biomes_poly, "Spatial") # save(Whittaker_biomes_poly, file = "../data/Whittaker_biomes_poly.rda")
Whittaker_biomes
We avoid applying ggplot2::fortify
or broom::tidy
directly on the Whittaker_biomes_poly
object (which is a SpatialPolygonsDataFrame
) because this loses the attribute table of the polygons (so the biome names and IDs). One way around is to convert from SpatialPolygonsDataFrame
to SpatialLinesDataFrame
, then to SpatialPointsDataFrame
and then finally to data.frame
(see this link).
library(dplyr) Whittaker_biomes <- Whittaker_biomes_poly %>% as("SpatialLinesDataFrame") %>% as("SpatialPointsDataFrame") %>% as.data.frame %>% select(temp_c = coords.x1, precp_cm = coords.x2, biome_id, biome) # save(Whittaker_biomes, file = "../data/Whittaker_biomes.rda")
Now the dataset is ready for plotting with ggplot.
library(ggplot2) ggplot() + geom_polygon(data = Whittaker_biomes, aes(x = temp_c, y = precp_cm, fill = biome), colour = "gray98", # colour of polygon border size = 0.5) + # thickness of polygon border # fill polygons with predefined colors (as in Ricklefs, 2008) scale_fill_manual(name = "Whittaker biomes", breaks = names(Ricklefs_colors), labels = names(Ricklefs_colors), values = Ricklefs_colors) + theme_bw()
Check examples at Whittaker_biomes_examples.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.