Ported Steven Bedrick’s lesson https://github.com/stevenbedrick/march_datajamboree/blob/master/Arboretum%20Scraping.ipynb to R for scraping Portland’s Hoyt Arboretum. See that python notebook for much more detailed notes.
We’ll be using the following packages:
crul
xml2
rvest
jsonlite
# Install the required packages (run once), then load them for this session.
install.packages(c("xml2", "crul", "jsonlite", "rvest"))
library("xml2")
library("rvest")
library("jsonlite")
library("crul")
Get vector of URL suffixes for individual pages
# Fetch and parse the taxa index page; the outer parentheses print the result.
(index <- xml2::read_html("https://hoytarboretum.gardenexplorer.org/taxalist.aspx"))
## {xml_document}
## <html id="ctl00_html" xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" dir="ltr">
## [1] <head id="ctl00_Head1">\n<title>\r\n\tNames | Hoyt Arboretum, Portl ...
## [2] <body id="ctl00_body">\r\n\t<form name="aspnetForm" method="post" ac ...
# Select the per-letter links inside the content div and pull out their
# href attributes (relative URLs like "taxalist-A.aspx").
res <- rvest::html_nodes(index, "div.content a")
out <- html_attr(res, "href")
out
## [1] "taxalist-A.aspx" "taxalist-B.aspx" "taxalist-C.aspx"
## [4] "taxalist-D.aspx" "taxalist-E.aspx" "taxalist-F.aspx"
## [7] "taxalist-G.aspx" "taxalist-H.aspx" "taxalist-I.aspx"
## [10] "taxalist-J.aspx" "taxalist-K.aspx" "taxalist-L.aspx"
## [13] "taxalist-M.aspx" "taxalist-N.aspx" "taxalist-O.aspx"
## [16] "taxalist-P.aspx" "taxalist-Q.aspx" "taxalist-R.aspx"
## [19] "taxalist-S.aspx" "taxalist-T.aspx" "taxalist-U.aspx"
## [22] "taxalist-V.aspx" "taxalist-W.aspx" "taxalist-X.aspx"
## [25] "taxalist-Y.aspx" "taxalist-Z.aspx"
Define function to get taxon names
# Fetch one per-letter taxa page and return the taxon names listed on it.
#
# @param x URL suffix for a letter page, e.g. "taxalist-A.aspx"
# @return character vector of taxon names (text of the bolded links)
process_letter_index <- function(x) {
  page_url <- paste0("https://hoytarboretum.gardenexplorer.org/", x)
  client <- crul::HttpClient$new(url = page_url)
  response <- client$get()
  # stop with an error for any non-success HTTP status
  response$raise_for_status()
  html <- xml2::read_html(response$parse("UTF-8"))
  name_nodes <- rvest::html_nodes(html, "ul.taxalist li span a b")
  rvest::html_text(name_nodes)
}
Do one at a time
# Scrape the first letter page and peek at the first five names.
process_letter_index(out[1])[1:5]
## [1] "Abies alba" "Abies alba 'Argau'"
## [3] "Abies alba 'Badenweiler'" "Abies alba 'Green Spiral'"
## [5] "Abies alba 'Pendula'"
Or do many
# Scrape the first two letter pages, keeping five names from each.
lapply(out[1:2], function(z) process_letter_index(z)[1:5])
## [[1]]
## [1] "Abies alba" "Abies alba 'Argau'"
## [3] "Abies alba 'Badenweiler'" "Abies alba 'Green Spiral'"
## [5] "Abies alba 'Pendula'"
##
## [[2]]
## [1] "Baccharis pilularis" "Barbarea orthoceras" "Beckmannia syzigachne"
## [4] "Berberis 'Ace'" "Berberis aquifolium"
Define function to do more detailed scraping of name and url part.
# Fetch one per-letter taxa page and return a data.frame pairing each
# taxon's latin name with the relative URL of its detail page.
#
# @param x URL suffix for a letter page, e.g. "taxalist-A.aspx"
# @return data.frame with character columns `latin_name` and `url`
process_letter_index_detail <- function(x) {
  page_url <- paste0("https://hoytarboretum.gardenexplorer.org/", x)
  client <- crul::HttpClient$new(url = page_url)
  response <- client$get()
  # stop with an error for any non-success HTTP status
  response$raise_for_status()
  html <- xml2::read_html(response$parse("UTF-8"))
  items <- rvest::html_nodes(html, "ul.taxalist li")
  # one single-row data.frame per list item, then stack them
  rows <- lapply(items, function(item) {
    data.frame(
      latin_name = rvest::html_text(rvest::html_nodes(item, "span a")),
      url = rvest::html_attr(rvest::html_nodes(item, "a"), "href"),
      stringsAsFactors = FALSE
    )
  })
  do.call("rbind.data.frame", rows)
}
Do one at a time
# Scrape name/URL pairs for the first letter page.
head(process_letter_index_detail(out[1]))
## latin_name url
## 1 Abies alba taxon-312.aspx
## 2 Abies alba 'Argau' taxon-3693.aspx
## 3 Abies alba 'Badenweiler' taxon-3699.aspx
## 4 Abies alba 'Green Spiral' taxon-3698.aspx
## 5 Abies alba 'Pendula' taxon-945.aspx
## 6 Abies amabilis taxon-316.aspx
Or do many
# Scrape name/URL pairs for the first two letter pages.
lapply(out[1:2], function(z) head(process_letter_index_detail(z)))
## [[1]]
## latin_name url
## 1 Abies alba taxon-312.aspx
## 2 Abies alba 'Argau' taxon-3693.aspx
## 3 Abies alba 'Badenweiler' taxon-3699.aspx
## 4 Abies alba 'Green Spiral' taxon-3698.aspx
## 5 Abies alba 'Pendula' taxon-945.aspx
## 6 Abies amabilis taxon-316.aspx
##
## [[2]]
## latin_name url
## 1 Baccharis pilularis taxon-1123.aspx
## 2 Barbarea orthoceras taxon-3775.aspx
## 3 Beckmannia syzigachne taxon-3585.aspx
## 4 Berberis 'Ace' taxon-2415.aspx
## 5 Berberis aquifolium taxon-307.aspx
## 6 Berberis delavayi taxon-3754.aspx
Apply the function process_letter_index_detail
across all pages. Then make a single data.frame
from the output.
# Scrape every letter page (one element of `out` per letter) and stack the
# resulting data.frames into a single table.
all_data <- lapply(out, process_letter_index_detail)
all_data_df <- do.call("rbind.data.frame", all_data)
head(all_data_df)
## latin_name url
## 1 Abies alba taxon-312.aspx
## 2 Abies alba 'Argau' taxon-3693.aspx
## 3 Abies alba 'Badenweiler' taxon-3699.aspx
## 4 Abies alba 'Green Spiral' taxon-3698.aspx
## 5 Abies alba 'Pendula' taxon-945.aspx
## 6 Abies amabilis taxon-316.aspx
Write the results to disk as CSV.
# Save the combined table as a CSV file (no row-number column).
write.csv(all_data_df, "species_names.csv", row.names = FALSE)
Or as newline-delimited JSON (one JSON object per row):
# Stream the data.frame out as newline-delimited JSON to a temp file;
# stream_out() opens and closes the connection itself.
tfile <- tempfile(fileext = ".json")
jsonlite::stream_out(all_data_df, file(tfile))
## opening file output connection.
##
Processed 500 rows...
Processed 1000 rows...
Processed 1500 rows...
Processed 2000 rows...
Complete! Processed total of 2015 rows.
## closing file output connection.
# Inspect the first ten NDJSON lines.
readLines(tfile, n = 10)
## [1] "{\"latin_name\":\"Abies alba\",\"url\":\"taxon-312.aspx\"}"
## [2] "{\"latin_name\":\"Abies alba 'Argau'\",\"url\":\"taxon-3693.aspx\"}"
## [3] "{\"latin_name\":\"Abies alba 'Badenweiler'\",\"url\":\"taxon-3699.aspx\"}"
## [4] "{\"latin_name\":\"Abies alba 'Green Spiral'\",\"url\":\"taxon-3698.aspx\"}"
## [5] "{\"latin_name\":\"Abies alba 'Pendula'\",\"url\":\"taxon-945.aspx\"}"
## [6] "{\"latin_name\":\"Abies amabilis\",\"url\":\"taxon-316.aspx\"}"
## [7] "{\"latin_name\":\"Abies balsamea\",\"url\":\"taxon-726.aspx\"}"
## [8] "{\"latin_name\":\"Abies balsamea 'Nana'\",\"url\":\"taxon-810.aspx\"}"
## [9] "{\"latin_name\":\"Abies balsamea var. phanerolepis\",\"url\":\"taxon-1738.aspx\"}"
## [10] "{\"latin_name\":\"Abies bracteata\",\"url\":\"taxon-314.aspx\"}"
# Round-trip: read the NDJSON back in as a data.frame.
head( jsonlite::stream_in(file(tfile)) )
## opening file input connection.
##
Found 500 records...
Found 1000 records...
Found 1500 records...
Found 2000 records...
Found 2015 records...
Imported 2015 records. Simplifying...
## closing file input connection.
## latin_name url
## 1 Abies alba taxon-312.aspx
## 2 Abies alba 'Argau' taxon-3693.aspx
## 3 Abies alba 'Badenweiler' taxon-3699.aspx
## 4 Abies alba 'Green Spiral' taxon-3698.aspx
## 5 Abies alba 'Pendula' taxon-945.aspx
## 6 Abies amabilis taxon-316.aspx
Or write everything as one big JSON array:
# Write the whole data.frame as a single JSON array. write_json() takes a
# file path, so pass the path directly — wrapping it in file() creates a
# connection object that is never explicitly closed.
onebigfile <- "onebig.json"
jsonlite::write_json(all_data_df, onebigfile)