This repository was archived by the owner on Dec 29, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl.rb
More file actions
102 lines (76 loc) · 2.34 KB
/
crawl.rb
File metadata and controls
102 lines (76 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
require "nokogiri"
require "open-uri"
require "set"
require "pry"
require "graphviz"
require "selenium-webdriver"
# Discovers every page reachable from ENTRY_POINT by following the links each
# Page reports via #links_to.
#
# The whole crawl runs inside #initialize, so constructing a Crawler performs
# whatever work Page.new does for every reachable page.
class Crawler
  ENTRY_POINT = "/".freeze

  # Hash of path => Page for every page discovered so far, in discovery order.
  attr_accessor :pages

  # Builds the entry-point page, registers it, and crawls outward from it.
  def initialize
    initial_page = Page.new(ENTRY_POINT)
    self.pages = { ENTRY_POINT => initial_page }
    crawl(initial_page)
  end

  # Recursively visits (depth-first) every link of current_page that has not
  # been registered yet, registering each new page before descending into it.
  #
  # @param current_page [Page] page whose outgoing links should be followed
  def crawl(current_page)
    current_page.links_to.each do |href|
      # Membership must be checked at visit time, not up front: a deeper
      # recursive call may already have registered this href, and filtering
      # the list before iterating would fetch and crawl that page again.
      next if pages.key?(href)

      page = Page.new(href)
      pages[href] = page
      crawl(page)
    end
  end

  # @return [Hash] each discovered page's path mapped to the list of hrefs
  #   that page links to
  def links_graph
    pages
      .values
      .each_with_object({}) { |page, graph| graph[page.path] = page.links_to }
  end
end
# A single page of the site served on localhost:PORT, fetched and parsed at
# construction time.
class Page
  PORT = 3001

  # path: the path this page was requested with.
  # body: the document Nokogiri parsed from the response.
  attr_reader :path, :body

  # Logs the path to stdout, then fetches and parses the page immediately.
  #
  # @param path [String] path of the page relative to the local server root
  def initialize(path)
    puts "Parsing #{path}"
    @path = path
    @body = read
  end

  # @return [Array<String>] href of every `a.govuk-button` anchor in the body.
  #   Anchors without an href attribute are skipped: a nil entry cannot be
  #   turned into a URI and would raise in #uri for the page built from it.
  def links_to
    body
      .css("a.govuk-button")
      .map { |link| link["href"] }
      .compact
  end

  # @return [String] absolute URL of this page on the local server
  def uri
    File.join("http://localhost:#{PORT}", path)
  end

  private

  # Downloads the page and parses it as HTML. The block form of #open is used
  # so the response IO is closed as soon as parsing has consumed it instead of
  # being left open for every crawled page.
  def read
    URI.parse(uri).open { |response| Nokogiri::HTML.parse(response) }
  end
end
# Script entry point: crawl the locally served site, screenshot every
# discovered page in headless Firefox, then render the link graph with each
# node displaying its page's screenshot.
options = Selenium::WebDriver::Firefox::Options.new(args: ["-headless"])
driver = Selenium::WebDriver.for(:firefox, options: options)

begin
  # Initial viewport. resize_to is used instead of assigning an OpenStruct to
  # window.size because this file never requires "ostruct".
  driver.manage.window.resize_to(800, 600)

  # Screenshots are written to, and later read by Graphviz from, tmp/.
  Dir.mkdir("tmp") unless Dir.exist?("tmp")

  crawler = Crawler.new

  crawler.pages.values.each_with_index do |page, i|
    driver.get(page.uri)

    # Measure the full document so the screenshot covers the whole page
    # rather than just the initial viewport.
    width = driver.execute_script("return Math.max(document.body.scrollWidth, document.body.offsetWidth, document.documentElement.clientWidth, document.documentElement.scrollWidth, document.documentElement.offsetWidth);")
    height = driver.execute_script("return Math.max(document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);")

    # NOTE(review): the extra 100px of width presumably leaves room for
    # scrollbars/window chrome — confirm.
    driver.manage.window.resize_to(width + 100, height)

    puts driver.title
    driver.save_screenshot("tmp/#{i}.png")
  end

  # Built once and reused for both passes. Its key order follows
  # crawler.pages, so index i here matches the screenshot index above.
  links_graph = crawler.links_graph

  graph = Graphviz::Graph.new(rankdir: "LR", ranksep: 3).tap do |g|
    # First pass: every node must exist before edges can reference it.
    links_graph.each_key.with_index do |path, i|
      g.add_node(path, shape: "box", imagescale: true, image: "tmp/#{i}.png")
    end

    # Second pass: one edge per link.
    links_graph.each do |path, links|
      links.each { |link| g.nodes[path].connect(g.nodes[link], arrowsize: 2, penwidth: 7) }
    end
  end

  puts graph.to_dot
  Graphviz.output(graph, path: "test.png", format: "png")
ensure
  # Always shut the browser down, even when crawling or rendering fails, so
  # no headless Firefox process is left running.
  driver.quit
end