Gutenberg Data

text

SDS 264: Data Science 2

Acquiring Text Data from Gutenberg.org with Theresa Worden

# Scrape the top 100 books from the past 30 days.
#
# Checks robots.txt for the Gutenberg "top" page, reads the list that follows
# the "#books-last30" anchor and splits every entry into a title and an author
# at the last " by " in the entry text.
#
# Side effect: stores the result in the global `df_books30`, which the code
# after this function reads.
#
# Returns (invisibly) a tibble with the columns `title`, `author` and
# `popularity` (the position of the entry in the list), or NULL, with a
# warning, when robots.txt does not allow the page.
get_gutenberg_top100 <- function() {
  top_url <- "https://www.gutenberg.org/browse/scores/top#books-last30"

  # Test the logical result directly instead of comparing it to the string
  # "TRUE"; isTRUE() also treats NA as "not allowed".
  if (!isTRUE(robotstxt::paths_allowed(top_url))) {
    warning("robots.txt does not allow scraping ", top_url, call. = FALSE)
    return(invisible(NULL))
  }

  gutenberg_page <- read_html(top_url)
  books_last30 <- html_nodes(gutenberg_page, "#books-last30+ ol li")
  books_last30_text <- html_text(books_last30)

  books_last30_tbl <- tibble(
    # Greedy match: cut at the LAST " by " so a title that itself contains
    # "by" stays whole and agrees with the author split below. Entries
    # without " by " are left unchanged in both columns.
    title = sub("^(.*) by .*$", "\\1", books_last30_text),
    author = sub(".* by ", "", books_last30_text),
    # Rank by position; sized from the scraped list so the tibble is still
    # built when the page does not return exactly 100 entries.
    popularity = seq_along(books_last30_text)
  )

  df_books30 <<- books_last30_tbl
  invisible(books_last30_tbl)
}

# Run the scraper; it stores its result in the global df_books30.
get_gutenberg_top100()

# Keep only the catalogue entries whose title appears in the scraped list.
top_100_books <- filter(gutenberg_works(), title %in% df_books30[["title"]])

# Ids to download, followed by the empty accumulators used further down.
# top_100_data and books_with_warnings are filled in by download_book().
book_ids <- top_100_books[["gutenberg_id"]]
top_100_data <- data.frame()
books_with_warnings <- list()
books_with_warningsb <- list()

# Download one Gutenberg book and append its rows to the global top_100_data.
#
# book_id: Gutenberg id passed to gutenberg_download().
# mirror:  mirror URL passed to gutenberg_download().
#
# Side effects: on success the downloaded rows (with the "title" and "author"
# meta fields) are appended to the global `top_100_data`. If the download
# signals a warning or an error, nothing is appended, the id is added to the
# global `books_with_warnings`, and a message names the id and the reason.
#
# Returns (invisibly) the rows of this one book, or NULL when the download
# failed. Returning only this book's rows keeps the list built by
# lapply(book_ids, download_book) from holding a growing copy of the whole
# accumulated table for every book.
download_book <- function(book_id, mirror = "http://aleph.gutenberg.org") {
  # Shared by both handlers: remember the id, report why, and yield NULL.
  record_failure <- function(kind, cond) {
    books_with_warnings <<- c(books_with_warnings, book_id)
    message(kind, " for book ID: ", book_id, " (", conditionMessage(cond), ")")
    NULL
  }

  tryCatch(
    {
      book <- gutenberg_download(
        book_id,
        meta_fields = c("title", "author"),
        mirror = mirror
      )
      top_100_data <<- bind_rows(top_100_data, book)
      invisible(book)
    },
    warning = function(w) record_failure("Warning", w),
    # Without this handler a single failed download would abort the whole
    # lapply() over book_ids.
    error = function(e) record_failure("Error", e)
  )
}

# Call download_book() once per id in book_ids. As side effects it appends the
# downloaded rows to top_100_data and records problem ids in
# books_with_warnings; book_data collects the per-call return values.
book_data <- lapply(book_ids, download_book)

# Heading for the list of problem ids printed next.
print("Books with warnings:")

[1] "Books with warnings:"

# Show the ids that download_book() recorded in books_with_warnings.
print(books_with_warnings)

[[1]]
[1] 11

[[2]]
[1] 46

[[3]]
[1] 100

[[4]]
[1] 120

[[5]]
[1] 145

[[6]]
[1] 174

[[7]]
[1] 345

[[8]]
[1] 730

[[9]]
[1] 1342

[[10]]
[1] 1513

[[11]]
[1] 2148

[[12]]
[1] 2199

[[13]]
[1] 2489

[[14]]
[1] 2641

[[15]]
[1] 19508

[[16]]
[1] 19640

[[17]]
[1] 41445

[[18]]
[1] 43936

[[19]]
[1] 51713

[[20]]
[1] 64317

[[21]]
[1] 67098

[[22]]
[1] 67979

# Clean the downloaded text and number the remaining lines within each book.
top_100_data <- top_100_data |>
  mutate(text = na_if(text, "")) |> # changing empty lines to NA
  # Remove only the rows whose text is missing. na.omit() would also drop
  # every line of a book that has NA in another column (e.g. no author).
  filter(!is.na(text)) |>
  # Group by Gutenberg id rather than title so that two downloads sharing a
  # title are not numbered as one continuous book.
  group_by(gutenberg_id) |>
  mutate(line_num = row_number()) |> # line number (starts at 1 for each book)
  ungroup()


# print(top_100_data)
# Note: this dataset is way too long to render to pdf.

# For viewing: the first remaining line of every book.
top_100_line_1 <- top_100_data |>
  filter(line_num == 1)
head(top_100_line_1)

# A tibble: 6 × 5
  gutenberg_id text                                        title author line_num
         <int> <chr>                                       <chr> <chr>     <int>
1           33 The Scarlet Letter                          The … Hawth…        1
2           41 The Legend of Sleepy Hollow                 The … Irvin…        1
3           42 [Illustration]                              The … Steve…        1
4           45 ANNE OF GREEN GABLES                        Anne… Montg…        1
5          150 ******************************************… The … Plato         1
6          244 A STUDY IN SCARLET                          A St… Doyle…        1