LibraryCheckouts/bubble_chart.R at main · broyce24/LibraryCheckouts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# setwd("~/INFO 201 Code/exploratory-analysis-broyce24")
# Load the raw checkout records; the path is relative to the working directory
book_data <- read.csv(file = "checkouts_by_title_2020.csv")
library(dplyr)
library(tidyr)
library(ggplot2)
library(plotly)
library(tools)
library(tidyverse)
library(stringr)

# Normalise the Title column so the same work lines up across formats.
# Step 1: strip the edition markers one after another, in this order.
book_data$Title <- Reduce(
  function(titles, marker) gsub(marker, "", titles),
  c("\\(Unabridged\\)", "\\(Abridged\\)", "\\(unabridged\\)"),
  book_data$Title
)
# Step 2: cut everything from the first "/" onward (the "/author" suffix),
# trim surrounding whitespace, then convert to title case for better merging.
book_data$Title <- str_to_title(trimws(gsub("\\/.*", "", book_data$Title)))

# Exclude classic literature from dataframe entries.
# Subjects is split into its first three comma-separated entries
# (Subjects_1..Subjects_3); any further entries are dropped.
# Rows with fewer than three subjects get NA in the missing columns
# (fill = "right" makes that padding explicit instead of a warning).
# The comparison uses %in% rather than != because `NA != "..."` is NA and
# filter() drops NA rows, which would silently discard every book that has
# only one or two subjects.
# Entries after the first may carry the space that follows the comma, so
# values are trimmed before comparing; the stored columns are left untouched.
exclude_classic_literature <- book_data %>%
  separate("Subjects", paste("Subjects", 1:3, sep = "_"), sep = ",",
           extra = "drop", fill = "right") %>%
  filter(!(trimws(Subjects_1) %in% "Classic Literature"),
         !(trimws(Subjects_2) %in% "Classic Literature"),
         !(trimws(Subjects_3) %in% "Classic Literature"))

# Drop the columns that are not needed from here on. MaterialType in
# particular has to go because some books were checked out in both ebook
# and audiobook formats.
short_title <- exclude_classic_literature %>%
  dplyr::select(-c(UsageClass, CheckoutType, MaterialType, CheckoutYear,
                   Publisher))

# Sum checkouts by title, across all material types.
# Produces exactly one row per (CheckoutMonth, Title) pair:
#   Checkouts  - total checkouts of that title in that month
#   Creator    - Creator of the first row of the group
#   Subjects_1 - Subjects_1 of the first row of the group
# Returning whole columns from summarise() (one output row per input row)
# is deprecated in recent dplyr and only produced duplicate rows, and the
# grouping column Title does not need to be re-assigned. Taking the first
# value per group yields the same rows that remain once duplicates are
# removed.
sum_title <- short_title %>%
  group_by(CheckoutMonth, Title) %>%
  dplyr::summarise(Checkouts = sum(Checkouts),
                   Creator = dplyr::first(Creator),
                   Subjects_1 = dplyr::first(Subjects_1),
                   .groups = "drop") %>%
  as.data.frame()

# Rank titles within each month by total checkouts
sorted_by_title <- sum_title %>%
  # Group per month; the grouping is kept on the result
  group_by(CheckoutMonth) %>%
  # Months ascending, most checked-out titles first within a month
  arrange(CheckoutMonth, desc(Checkouts)) %>%
  # Keep only the first row for each title within a month
  distinct(Title, .keep_all = TRUE)

# Keep the 5 highest-checkout titles. sorted_by_title is still grouped by
# CheckoutMonth, so the cut is applied per month; slice_max() keeps ties by
# default, so a month can yield more than 5 rows.
sorted_by_title_short <- slice_max(sorted_by_title, order_by = Checkouts, n = 5)

# Shorten titles: split Title at ":" into Title_1 (main title) and Title_2
# (subtitle); anything after a second colon is dropped.
short_title <- separate(
  sorted_by_title_short,
  col = "Title",
  into = c("Title_1", "Title_2"),
  sep = ":",
  extra = "drop"
)

# Keep only the text before the second comma in Creator; values with fewer
# than two commas do not match the pattern and are left as they are.
short_title$Creator <- sub(
  pattern = "^([^,]*,[^,]*),.*",
  replacement = "\\1",
  x = short_title$Creator
)

# Add a Month column holding the English month name, looked up from the
# numeric CheckoutMonth in base R's month.name vector
added_month_names <- mutate(short_title, Month = month.name[CheckoutMonth])

# Save the cleaned data frame as a CSV file (row names are written too,
# since write.csv() includes them by default)
write.csv(x = added_month_names, file = "clean_book_data.csv")

# # Read in clean csv file for bubble chart
# clean_book_data <- read.csv("clean_book_data.csv")
#
# # Filter by title, based on user input
# filter_by_title <- clean_book_data %>%
#   filter(Title_1 %in% input$title)
#
# # Create scatter plot
# most_pop_titles <- ggplot(data = filter_by_title) +
#   geom_point(aes(x = CheckoutMonth,
#                  y = Checkouts,
#                  colour = Title_1,
#                  label = Creator,
#                  # Make it a bubble plot
#                  size = Checkouts,
#                  # text attribute for tooltip didn't work
#                  text = paste("Title:", Title_1,
#                               "\nAuthor/Creator:", Creator, "\nMonth:", Month,
#                               "\nNumber of checkouts:", Checkouts),
#                  # Remind graph to group by title
#                  group = Title_1)) +
#   labs(title = "Most Checked Out Titles of 2020, Excluding Classic Literature",
#        # Set legend title
#        color = "Titles", size = "", x = "Month", y = "Total Monthly Checkouts")
#
# # Make interactive
# scatter_titles <- ggplotly(p = most_pop_titles,
#                            dynamicTicks = TRUE,
#                            # group = Title_1,
#                            # Call the custom text for the tooltip
#                            tooltip = c("text"))


# Filter by title, based on user input
# filter_by_title <- added_month_names %>%
# filter(Title_1 %in% "input$title")

# Create scatter plot
# most_pop_titles <- ggplot(data = filter_by_title,
  # text attribute for tooltip didn't work
#  text = paste("Title:", c(Title_1,": ", Title_2), "<br>", "Author/Creator:",
             #  Creator, "<br>", "Month:", Month, "<br>",
            #  "Number of checkouts:", Checkouts)) +
 # geom_point(aes(x = CheckoutMonth,
             #  y = Checkouts,
             #  colour = Title_1,
#                label = Creator,
#                # Make it a bubble plot
#                size = Checkouts,
#                # Remind graph to group by title
#                group = Title_1)) +
#   labs(title = "Most Checked Out Titles of 2020, Excluding Classic Literature",
#        # Set legend title
#        color = "Titles", size = "", x = "Month", y = "Total Monthly Checkouts")
#
# # Make interactive
# scatter_titles <- ggplotly(p = most_pop_titles,
#                            dynamicTicks = TRUE,
#                            group = Title_1,
#                            # Call the custom text for the tooltip
#                            tooltip = c("Month" = "x", "Number of checkouts" =
#                            "y", "Title" = "colour", "Creator" = "label"))