-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbubble_chart.R
More file actions
129 lines (115 loc) · 5.01 KB
/
bubble_chart.R
File metadata and controls
129 lines (115 loc) · 5.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# setwd("~/INFO 201 Code/exploratory-analysis-broyce24")
# Load the raw checkout records; the path is relative to the working directory.
book_data <- read.csv(file = "checkouts_by_title_2020.csv")
library(dplyr)
library(tidyr)
library(ggplot2)
library(plotly)
library(tools)
library(tidyverse)
library(stringr)
# Normalise the Title column in a single pass:
#   1. strip the "(Unabridged)" / "(Abridged)" / "(unabridged)" markers,
#   2. drop everything from the first "/" onwards (the "/author" suffix),
#   3. trim surrounding whitespace,
#   4. convert to title case so the same book merges across records.
book_data$Title <- book_data$Title %>%
  gsub("\\(Unabridged\\)", "", .) %>%
  gsub("\\(Abridged\\)", "", .) %>%
  gsub("\\(unabridged\\)", "", .) %>%
  gsub("\\/.*", "", .) %>%
  trimws() %>%
  str_to_title()
# Exclude classic literature from dataframe entries.
# Subjects is split on commas into its first three entries (Subjects_1..3);
# any further entries are dropped and are not checked.
# - The separator also swallows whitespace after each comma, so an entry such
#   as " Classic Literature" is still compared as "Classic Literature".
# - Records with fewer than three subjects get NA in the unused columns
#   (fill = "right"). `%in%` never yields NA, so those records are kept;
#   a plain `!=` comparison against NA would make filter() drop them.
exclude_classic_literature <- book_data %>%
  separate(
    "Subjects",
    paste("Subjects", 1:3, sep = "_"),
    sep = ",\\s*",
    extra = "drop",
    fill = "right"
  ) %>%
  filter(
    !(Subjects_1 %in% "Classic Literature"),
    !(Subjects_2 %in% "Classic Literature"),
    !(Subjects_3 %in% "Classic Literature")
  )
# Drop the columns that are not needed downstream. MaterialType in particular
# has to go because some books were checked out in both ebook and audiobook
# formats, and those records are combined per title later on.
short_title <- exclude_classic_literature %>%
  dplyr::select(
    -c(UsageClass, CheckoutType, MaterialType, CheckoutYear, Publisher)
  )
# Sum checkouts by month and title, across all material types.
# Each month/title pair collapses to exactly one row: Checkouts is the group
# total, while Creator and Subjects_1 are taken from the group's first record.
# .groups = "drop" plus as.data.frame() returns a plain, ungrouped data frame.
sum_title <- short_title %>%
  group_by(CheckoutMonth, Title) %>%
  dplyr::summarise(
    Checkouts = sum(Checkouts),
    Creator = dplyr::first(Creator),
    Subjects_1 = dplyr::first(Subjects_1),
    .groups = "drop"
  ) %>%
  as.data.frame()
# Rank titles within each month.
# Order by month, then by checkouts from highest to lowest.
sorted_by_title <- arrange(sum_title, CheckoutMonth, desc(Checkouts))
# Group by month so the following steps operate per month.
sorted_by_title <- group_by(sorted_by_title, CheckoutMonth)
# Keep only the first row for each title within a month (removes duplicates)
# while retaining every other column.
sorted_by_title <- distinct(sorted_by_title, Title, .keep_all = TRUE)
# Keep the 5 most checked-out titles. The input is still grouped by
# CheckoutMonth, so the selection is made separately for every month.
sorted_by_title_short <- slice_max(
  sorted_by_title,
  order_by = Checkouts,
  n = 5
)
# Shorten long titles: split Title at ":" into Title_1 (main title) and
# Title_2 (subtitle); anything after a second colon is dropped.
# Note: this reuses (overwrites) the earlier `short_title` variable.
short_title <- separate(
  sorted_by_title_short,
  "Title",
  into = paste("Title", 1:2, sep = "_"),
  sep = ":",
  extra = "drop"
)
# Keep only the text before the 2nd comma in the Creator column.
short_title[["Creator"]] <- sub(
  pattern = "^([^,]*,[^,]*),.*",
  replacement = "\\1",
  x = short_title[["Creator"]]
)
# Add a Month column holding the month's name, looked up from the numeric
# CheckoutMonth via the built-in month.name vector.
added_month_names <- short_title
added_month_names$Month <- month.name[added_month_names$CheckoutMonth]
# Write the cleaned dataframe out as a csv
write.csv(x = added_month_names, file = "clean_book_data.csv")
# # Read in clean csv file for bubble chart
# clean_book_data <- read.csv("clean_book_data.csv")
#
# # Filter by title- based on user input
# filter_by_title <- clean_book_data %>%
# filter(Title_1 %in% input$title)
#
# # Create scatter plot
# most_pop_titles <- ggplot(data = filter_by_title) +
# geom_point(aes(x = CheckoutMonth,
# y = Checkouts,
# colour = Title_1,
# label = Creator,
# # Make it a bubble plot
# size = Checkouts,
# # text attribute for tooltip didn't work
# text = paste("Title:", Title_1,
# "\nAuthor/Creator:", Creator, "\nMonth:", Month,
# "\nNumber of checkouts:", Checkouts),
# # Remind graph to group by title
# group = Title_1)) +
# labs(title = "Most Checked Out Titles of 2020, Excluding Classic Literature",
# # Set legend title
# color = "Titles", size = "", x = "Month", y = "Total Monthly Checkouts")
#
# # Make interactive
# scatter_titles <- ggplotly(p = most_pop_titles,
# dynamicTicks = TRUE,
# # group = Title_1,
# # Call the custom text for the tooltip
# tooltip = c("text"))
# Filter by title- based on user input
# filter_by_title <- added_month_names %>%
# filter(Title_1 %in% "input$title")
# Create scatter plot
# most_pop_titles <- ggplot(data = filter_by_title,
# text attribute for tooltip didn't work
# text = paste("Title:", c(Title_1,": ", Title_2), "<br>", "Author/Creator:",
# Creator, "<br>", "Month:", Month, "<br>",
# "Number of checkouts:", Checkouts)) +
# geom_point(aes(x = CheckoutMonth,
# y = Checkouts,
# colour = Title_1,
# label = Creator,
# # Make it a bubble plot
# size = Checkouts,
# # Remind graph to group by title
# group = Title_1)) +
# labs(title = "Most Checked Out Titles of 2020, Excluding Classic Literature",
# # Set legend title
# color = "Titles", size = "", x = "Month", y = "Total Monthly Checkouts")
#
# # Make interactive
# scatter_titles <- ggplotly(p = most_pop_titles,
# dynamicTicks = TRUE,
# group = Title_1,
# # Call the custom text for the tooltip
# tooltip = c("Month" = "x", "Number of checkouts" =
# "y", "Title" = "colour", "Creator" = "label"))