# Scrape the top 100 books from the past 30 days.
#
# Checks robots.txt for the Project Gutenberg "top" page, reads the list items
# that follow the "#books-last30" heading, and splits each entry into a title
# and an author around the text " by ".
#
# Side effect: on success the result is assigned with `<<-` to `df_books30`,
# a tibble with columns `title`, `author` and `popularity` (the 1-based
# position of the entry in the scraped list).
#
# Returns the tibble invisibly, or NULL invisibly when scraping is not allowed.
get_gutenberg_top100 <- function() {
  top_url <- "https://www.gutenberg.org/browse/scores/top#books-last30"

  # isTRUE() treats anything other than a single TRUE (including NA) as
  # "not allowed" instead of erroring inside `if`.
  if (!isTRUE(robotstxt::paths_allowed(top_url))) {
    return(invisible(NULL))
  }

  gutenberg_page <- read_html(top_url)
  books_last30 <- html_nodes(gutenberg_page, "#books-last30+ ol li")
  books_last30_text <- html_text(books_last30)

  books_last30_tbl <- tibble(
    # Greedy match: everything from the first " by" onward is removed.
    title = gsub(" by.*", "", books_last30_text),
    # Greedy match: everything up to and including the last " by " is removed.
    author = gsub(".* by ", "", books_last30_text),
    # Rank follows list order; sized from the data so a list that is not
    # exactly 100 entries long does not make tibble() fail.
    popularity = seq_along(books_last30_text)
  )

  # Kept as a global assignment because later code reads `df_books30` directly.
  df_books30 <<- books_last30_tbl
  invisible(books_last30_tbl)
}
# Run the scraper; it assigns its result to `df_books30` via `<<-`.
get_gutenberg_top100()
Gutenberg Data
text
SDS 264: Data Science 2
Acquiring Text Data from Gutenberg.org with Theresa Worden
# Keep only the Gutenberg works whose title appears in the scraped list.
top_100_books <- gutenberg_works() |>
  filter(title %in% df_books30$title)

# IDs to download, one per matching work.
book_ids <- top_100_books$gutenberg_id

# Accumulators filled in by download_book() through `<<-`.
books_with_warnings <- list()
books_with_warningsb <- list()
top_100_data <- data.frame()
# Download one Project Gutenberg book and append it to the running results.
#
# book_id: Gutenberg ID passed to gutenberg_download().
# mirror:  mirror URL passed through to gutenberg_download().
#
# Side effects (via `<<-`): on success the downloaded rows are appended to
# `top_100_data`. If the download raises a warning, the book is skipped,
# `book_id` is appended to `books_with_warnings`, and a message is emitted.
#
# Returns the updated `top_100_data` on success, or NULL when a warning was
# raised. Errors are not caught and propagate to the caller.
download_book <- function(book_id, mirror = "http://aleph.gutenberg.org") {
  tryCatch(
    {
      book_text <- gutenberg_download(
        book_id,
        meta_fields = c("title", "author"),
        mirror = mirror
      )
      top_100_data <<- bind_rows(top_100_data, book_text)
    },
    warning = function(w) {
      books_with_warnings <<- c(books_with_warnings, book_id)
      message("Warning for book ID: ", book_id)
      NULL
    }
  )
}
# Download every book; the text accumulates in `top_100_data` as a side
# effect of download_book(), and `book_data` holds each call's return value.
book_data <- lapply(book_ids, download_book)
print("Books with warnings:")
[1] "Books with warnings:"
# Show the IDs that download_book() recorded after a download warning.
print(books_with_warnings)
[[1]]
[1] 11
[[2]]
[1] 46
[[3]]
[1] 100
[[4]]
[1] 120
[[5]]
[1] 145
[[6]]
[1] 174
[[7]]
[1] 345
[[8]]
[1] 730
[[9]]
[1] 1342
[[10]]
[1] 1513
[[11]]
[1] 2148
[[12]]
[1] 2199
[[13]]
[1] 2489
[[14]]
[1] 2641
[[15]]
[1] 19508
[[16]]
[1] 19640
[[17]]
[1] 41445
[[18]]
[1] 43936
[[19]]
[1] 51713
[[20]]
[1] 64317
[[21]]
[1] 67098
[[22]]
[1] 67979
# Drop empty lines and number the remaining lines within each title.
top_100_data <- top_100_data |>
  mutate(text = na_if(text, "")) |> # changing empty lines to NA
  # Filter on `text` only: na.omit() would also drop every row of a book
  # whose other columns (e.g. author) are NA.
  filter(!is.na(text)) |> # removing those empty (NA) lines
  group_by(title) |>
  mutate(line_num = row_number()) |> # line number (starts at 1 for each title)
  ungroup()
#print(top_100_data)
#Note: this dataset is way too long to render to pdf.
# For viewing: keep only the first non-empty line of each title as a preview.
top_100_line_1 <- top_100_data |>
  filter(line_num == 1)
head(top_100_line_1)
# A tibble: 6 × 5
gutenberg_id text title author line_num
<int> <chr> <chr> <chr> <int>
1 33 The Scarlet Letter The … Hawth… 1
2 41 The Legend of Sleepy Hollow The … Irvin… 1
3 42 [Illustration] The … Steve… 1
4 45 ANNE OF GREEN GABLES Anne… Montg… 1
5 150 ******************************************… The … Plato 1
6 244 A STUDY IN SCARLET A St… Doyle… 1