
Commit c8464e8

Merge pull request #1154 from kennethrioja/github-ingestor
Material Ingestor – GitHub
2 parents 910b325 + fe1a4fd commit c8464e8

File tree

16 files changed: +940 -0 lines changed


lib/ingestors/github_ingestor.rb

Lines changed: 243 additions & 0 deletions
@@ -0,0 +1,243 @@
# frozen_string_literal: true

require 'open-uri'
require 'json'
require 'httparty'
require 'nokogiri'

module Ingestors
  # GithubIngestor fetches repository information from GitHub to populate the materials' metadata.
  # API requests made per repository:
  # 1. Get the repo's general metadata: #{GITHUB_API_BASE}/#{full_name}
  #    and keys: name, full_name, owner.login, html_url, description,
  #    homepage, topics, license.{key, spdx_id}, archived,
  #    created_at, pushed_at, updated_at, contributors_url
  # 2. Get the DOI: #{GITHUB_API_BASE}/#{full_name}/contents/README.md
  #    and key: content
  # 3. Get the version/release: #{GITHUB_API_BASE}/#{full_name}/releases
  #    and key: tag_name (first)
  # 4. Get the contributors' list: #{GITHUB_API_BASE}/#{full_name}/contributors
  #    and key: login (from all entries)
  class GithubIngestor < Ingestor # rubocop:disable Metrics/ClassLength
    include Ingestors::Concerns::SitemapHelpers

    GITHUB_API_BASE = 'https://api.github.com/repos'
    CACHE_PREFIX = 'github_ingestor_'
    TTL = 1.week # cache expiration time (time to live before the cache expires)

    def self.config
      {
        key: 'github',
        title: 'GitHub Repository or Page',
        category: :materials,
        user_agent: 'TeSS Github ingestor'
      }
    end

    # Reads from direct GitHub URLs, .xml sitemaps, and .txt sitemaps.
    # Fetches repository metadata, contributors, releases, and DOIs (from README.md).
    # It automatically handles GitHub Pages URLs (github.io) as well as standard github.com URLs.
    # It caches API requests to avoid repeated calls.
    def read(source_url)
      @verbose = false
      # Returns either a map of unique URL entries, or the URL itself
      sources = parse_sitemap(source_url)

      sources.each do |url|
        # Reads each source; if github.{com|io}, gets the repo's API URL, otherwise skips it
        repo_api_url = to_github_api(url)
        next unless repo_api_url

        # Gets the cached repo data, or reads and caches it
        key = "#{CACHE_PREFIX}#{repo_api_url.gsub(%r{https?://}, '').gsub('/', '_')}"
        repo_data = cache_fetch(key, repo_api_url)
        next unless repo_data

        # Add to material
        add_material to_material(repo_data)
      end
    rescue StandardError => e
      Rails.logger.error("#{e.class}: read() failed, #{e.message}")
    end

    private

    # Takes a github.{com|io} URL and returns its api.github.com URL
    def to_github_api(url)
      uri = URI(url)
      parts = uri.path.split('/') # 'example.com/foo/bar' has path == '/foo/bar', so three parts

      # http(s)://github.com/<username>/<repo> is the strict form accepted here
      if uri.host&.downcase == 'github.com' && (uri.host.count('.') == 1) && parts.size == 3
        github_api_from_com(parts)
      # http(s)://<username>.github.io/<repo> is the strict form accepted here
      elsif uri.host&.downcase&.end_with?('.github.io') && (uri.host.count('.') == 2) && parts.size >= 2
        github_api_from_io(uri, parts)
      end
    end

    def github_api_from_com(parts)
      "#{GITHUB_API_BASE}/#{parts[1]}/#{parts[2]}"
    end

    def github_api_from_io(uri, parts)
      repo = parts[1]
      owner = uri.host.split('.').first
      "#{GITHUB_API_BASE}/#{owner}/#{repo}"
    end

    # Fetches cached data, or opens the webpage/API and caches the result.
    # I chose to cache because GitHub limits unauthenticated users to 60 requests per hour
    # https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28#primary-rate-limit-for-unauthenticated-users
    # One GitHub URL amounts to 4 GitHub API requests.
    # key: string key for the cache
    # url: URL to open
    # Entries expire after TTL (1 week by default)
    def cache_fetch(key, url)
      Rails.cache.fetch(key, expires_in: TTL, skip_nil: true) do
        JSON.parse(open_url(url).read)
      end
    end

    # Sets material hash keys and values and adds them to the material
    def to_material(repo_data) # rubocop:disable Metrics/AbcSize
      github_io_homepage = github_io_homepage? repo_data['homepage']
      url = github_io_homepage ? repo_data['homepage'] : repo_data['html_url']
      redirected_url = get_redirected_url(url)
      html = get_html(redirected_url)

      material = OpenStruct.new
      material.title = repo_data['name'].titleize
      material.url = url
      material.description = github_io_homepage ? fetch_definition(html, redirected_url) : repo_data['description']
      material.keywords = repo_data['topics']
      material.licence = fetch_licence(repo_data['license'])
      material.status = repo_data['archived'] ? 'Archived' : 'Active'
      material.doi = fetch_doi(repo_data['full_name'])
      material.version = fetch_latest_release(repo_data['full_name'])
      material.date_created = repo_data['created_at']
      material.date_published = repo_data['pushed_at']
      material.date_modified = repo_data['updated_at']
      material.contributors = fetch_contributors(repo_data['contributors_url'], repo_data['full_name'])
      material.resource_type = github_io_homepage ? ['Github Page'] : ['Github Repository']
      material.prerequisites = fetch_prerequisites(html)
      material
    end

    def github_io_homepage?(homepage)
      return false if homepage.nil? || homepage.empty?

      url = URI(homepage)
      url.host&.downcase&.end_with?('.github.io')
    end

    def get_html(url)
      response = HTTParty.get(url, follow_redirects: true, headers: { 'User-Agent' => config[:user_agent] })
      Nokogiri::HTML(response.body)
    end

    # DEFINITION – Opens the GitHub Pages homepage, fetches the text of the first three
    # <p> tags longer than 50 characters, and joins them, appending a 'Read more...'
    # link at the end of the description.
    # Some of the first <p> tags were not descriptive, hence the length threshold.
    def fetch_definition(html, url)
      desc = ''
      round = 3
      html.css('p').each do |p|
        p_txt = p&.text&.strip&.gsub(/\s+/, ' ') || ''
        next if p_txt.length < 50 || round.zero?

        desc = "#{desc}\n#{p_txt}"
        round -= 1
      end
      "#{desc}\n(...) [Read more...](#{url})"
    end

    # LICENCE – Gets the proper licence identifier;
    # it must match the format of config/dictionaries/licences.yml
    def fetch_licence(licence)
      return 'notspecified' if licence.nil? || licence == 'null'
      return 'other-at' if licence['key'] == 'other'

      licence['spdx_id']
    end

    # DOI – Fetches the DOI from the repo's README.md.
    # I chose to only read the README.md as it carries the DOI badge almost every time,
    # whereas also fetching CITATION.cff or CITATION.md would increase
    # the number of API requests.
    def fetch_doi(full_name)
      filename = 'README.md'
      url = "#{GITHUB_API_BASE}/#{full_name}/contents/#{filename}"
      data = cache_fetch("#{CACHE_PREFIX}doi_#{full_name.gsub('/', '_')}_#{filename.downcase}", url)
      return nil unless data && data['content']

      decoded = Base64.decode64(data['content'])
      doi_match = decoded.match(%r{doi.org/\s*([^\s,)]+)}i)
      doi_match ? "https://doi.org/#{doi_match[1]}" : nil
    end

    # RELEASE – Opens the releases API address and returns the latest release tag
    def fetch_latest_release(full_name)
      url = "#{GITHUB_API_BASE}/#{full_name}/releases"
      releases = cache_fetch("#{CACHE_PREFIX}releases_#{full_name.gsub('/', '_')}", url)
      releases.is_a?(Array) && releases.first ? releases.first['tag_name'] : nil
    end

    # CONTRIBUTORS – Opens the contributors API address and returns the list of contributor logins
    def fetch_contributors(contributors_url, full_name)
      contributors = cache_fetch("#{CACHE_PREFIX}contributors_#{full_name.gsub('/', '_')}", contributors_url)
      return [] unless contributors

      contributors.map { |c| c['login'] }
    end

    # PREREQUISITES – From the homepage HTML, looks for <p> tags which are children of ...
    def fetch_prerequisites(html)
      prereq_paragraphs = []

      # ... any heading tag (h1–h6) or span tag with text "prereq" (EN) or "prerreq" (ES)
      prereq_paragraphs = fetch_prerequisites_from_h(html, prereq_paragraphs)

      # ... any tag with an id or class containing "prereq" (EN) or "prerreq" (ES)
      prereq_paragraphs = fetch_prerequisites_from_id_or_class(html, prereq_paragraphs) if prereq_paragraphs.empty?

      prereq_paragraphs&.join("\n")&.gsub(/\n\n+/, "\n")&.strip || ''
    end

    def fetch_prerequisites_from_h(html, prereq_paragraphs)
      html.xpath('//h1|//h2|//h3|//h4|//h5|//h6|//span').each do |h|
        next unless h.text =~ /prereq|prerreq/i # if "prereq" is in the text

        paragraph = h.xpath('following-sibling::*')
                     .take_while { |sib| %w[p ul ol].include?(sib.name) } # take p, ul or ol siblings
        prereq_paragraphs.concat(paragraph) if paragraph
      end
      prereq_paragraphs
    end

    def fetch_prerequisites_from_id_or_class(html, prereq_paragraphs)
      html.xpath('//*[@id]').each do |node|
        next unless prereq_node?(node)

        extract_following_paragraphs(node, prereq_paragraphs)
        extract_nested_paragraphs(node, prereq_paragraphs) if prereq_paragraphs.empty?
      end
      prereq_paragraphs
    end

    def prereq_node?(node)
      [node['id'], node['class']].compact.any? { |attr| attr =~ /prereq|prerreq/i }
    end

    def extract_following_paragraphs(node, prereq_paragraphs)
      paragraphs = node.xpath('following-sibling::*')
                       .take_while { |sib| %w[p ul ol].include?(sib.name) }
      prereq_paragraphs.concat(paragraphs) if paragraphs
    end

    def extract_nested_paragraphs(node, prereq_paragraphs)
      paragraphs = node.xpath('.//p | .//ul | .//ol')
      prereq_paragraphs.concat(paragraphs) if paragraphs.any?
    end
  end
end
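
For illustration, a minimal standalone sketch of the URL resolution done by to_github_api above — the method body mirrors the diff, but the plain-Ruby harness (no Rails, no Ingestor superclass) is an assumption for demonstration only:

require 'uri'

GITHUB_API_BASE = 'https://api.github.com/repos'

# Same strict checks as GithubIngestor#to_github_api: only bare
# github.com/<owner>/<repo> and <owner>.github.io/<repo> URLs resolve.
def to_github_api(url)
  uri = URI(url)
  parts = uri.path.split('/')
  if uri.host&.downcase == 'github.com' && uri.host.count('.') == 1 && parts.size == 3
    "#{GITHUB_API_BASE}/#{parts[1]}/#{parts[2]}"
  elsif uri.host&.downcase&.end_with?('.github.io') && uri.host.count('.') == 2 && parts.size >= 2
    "#{GITHUB_API_BASE}/#{uri.host.split('.').first}/#{parts[1]}"
  end
end

to_github_api('https://github.com/swcarpentry/python-novice-inflammation')
# => "https://api.github.com/repos/swcarpentry/python-novice-inflammation"
to_github_api('https://swcarpentry.github.io/python-novice-inflammation/')
# => "https://api.github.com/repos/swcarpentry/python-novice-inflammation"
to_github_api('https://example.com/foo/bar')
# => nil — neither host form matches, so the ingestor skips the URL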

lib/ingestors/ingestor.rb

Lines changed: 28 additions & 0 deletions
@@ -72,6 +72,34 @@ def open_url(url, raise: false, token: nil)
    end
  end

  # Some URLs automatically redirect the user to another webpage.
  # This method takes a URL and returns the final redirected URL (as indicated by a 30X response or a `meta[http-equiv="Refresh"]` tag)
  def get_redirected_url(url, limit = 5) # rubocop:disable Metrics/AbcSize
    raise 'Too many redirects' if limit.zero?

    https_url = to_https(url) # some `homepage` values were http
    response = HTTParty.get(https_url, follow_redirects: true, headers: { 'User-Agent' => config[:user_agent] || 'TeSS Bot' })
    return https_url unless response.headers['content-type']&.include?('html')

    doc = Nokogiri::HTML(response.body)
    meta = doc.at('meta[http-equiv="Refresh"]')
    if meta && meta.to_s =~ /url=(.+)/i
      content = meta['content']
      relative_path = content[/url=(.+)/i, 1]
      base = https_url.end_with?('/') ? https_url : "#{https_url}/"
      escaped_path = URI::DEFAULT_PARSER.escape(relative_path).to_s
      new_url = "#{base}#{escaped_path}"
      return get_redirected_url(new_url, limit - 1)
    end
    https_url
  end

  def to_https(url)
    uri = URI.parse(url)
    uri.scheme = 'https'
    uri.to_s
  end

  def convert_description(input)
    return input if input.nil?
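
As a sketch of the meta-refresh branch above, here is the same Nokogiri extraction run against a canned HTML body — the page URL and body are made-up examples, and the real method additionally follows 30X redirects via HTTParty and recurses up to the limit:

require 'nokogiri'
require 'uri'

html = '<html><head><meta http-equiv="Refresh" content="0; url=latest/index.html"></head></html>'
base = 'https://example.github.io/project/' # hypothetical self-redirecting page

doc = Nokogiri::HTML(html)
meta = doc.at('meta[http-equiv="Refresh"]')
if meta && meta.to_s =~ /url=(.+)/i
  relative_path = meta['content'][/url=(.+)/i, 1] # => "latest/index.html"
  puts "#{base}#{URI::DEFAULT_PARSER.escape(relative_path)}"
  # => https://example.github.io/project/latest/index.html
end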

lib/ingestors/ingestor_factory.rb

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ def self.ingestors
      Ingestors::MaterialCsvIngestor,
      Ingestors::TessEventIngestor,
      Ingestors::ZenodoIngestor,
      Ingestors::GithubIngestor,
    ] + taxila_ingestors + llm_ingestors
  end

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
{
  "name": "cpluspluscourse",
  "full_name": "hsf-training/cpluspluscourse",
  "owner": {
    "login": "hsf-training"
  },
  "html_url": "https://github.com/hsf-training/cpluspluscourse",
  "description": "C++ Course Taught at CERN",
  "homepage": "",
  "topics": [
    "those",
    "are",
    "keywords"
  ],
  "license": {
    "key": "apache-2.0",
    "name": "Apache License 2.0",
    "spdx_id": "Apache-2.0"
  },
  "archived": true,
  "created_at": "2025-09-29T14:38:38Z",
  "updated_at": "2025-09-30T14:38:38Z",
  "pushed_at": "2025-09-28T14:38:38Z",
  "contributors_url": "https://api.github.com/repos/hsf-training/cpluspluscourse/contributors"
}
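
Run through to_material, this fixture exercises the plain-repository branch; a sketch of the field values it should yield, using a subset of the JSON above (the mapping is derived from the ingestor code, not asserted by the commit):

repo_data = {
  'name' => 'cpluspluscourse', 'homepage' => '', 'archived' => true,
  'license' => { 'key' => 'apache-2.0', 'spdx_id' => 'Apache-2.0' }
} # subset of the fixture above

repo_data['homepage'].empty?        # => true — not a github.io homepage, so
                                    #    material.url falls back to html_url
repo_data['archived']               # => true — material.status becomes 'Archived'
repo_data['license']['spdx_id']     # => "Apache-2.0" — material.licence via fetch_licence
# repo_data['name'].titleize (Rails) # => "Cpluspluscourse" — material.title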
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
{
  "name": "python-novice-inflammation",
  "full_name": "swcarpentry/python-novice-inflammation",
  "owner": {
    "login": "swcarpentry"
  },
  "html_url": "https://github.com/swcarpentry/python-novice-inflammation",
  "description": "This is not going to be read",
  "homepage": "https://swcarpentry.github.io/python-novice-inflammation/",
  "topics": [
    "key",
    "words",
    "in topics"
  ],
  "license": {
    "key": "apache-2.0",
    "name": "Apache License 2.0",
    "spdx_id": "Apache-2.0"
  },
  "archived": false,
  "created_at": "2025-09-29T14:38:38Z",
  "updated_at": "2025-09-30T14:38:38Z",
  "pushed_at": "2025-09-28T14:38:38Z",
  "contributors_url": "https://api.github.com/repos/swcarpentry/python-novice-inflammation/contributors"
}
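
This fixture takes the GitHub Pages branch instead: github_io_homepage? is true, so the description is scraped from the homepage rather than read from the JSON — which is why the fixture's description says "This is not going to be read". A sketch of the check:

require 'uri'

homepage = 'https://swcarpentry.github.io/python-novice-inflammation/'
URI(homepage).host&.downcase&.end_with?('.github.io')
# => true — material.url becomes the homepage and the description
#    comes from fetch_definition, not repo_data['description']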
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
{
  "name": "hsf-training-scikit-hep-webpage",
  "full_name": "hsf-training/hsf-training-scikit-hep-webpage",
  "owner": {
    "login": "hsf-training"
  },
  "html_url": "https://github.com/hsf-training/hsf-training-scikit-hep-webpage",
  "description": null,
  "homepage": "https://hsf-training.github.io/hsf-training-scikit-hep-webpage/",
  "topics": [
    "hacktoberfest",
    "hey",
    "test"
  ],
  "license": {
    "key": "other",
    "name": "Other",
    "spdx_id": "NOASSERTION"
  },
  "archived": false,
  "created_at": "2022-03-23T17:00:05Z",
  "updated_at": "2025-09-29T06:14:55Z",
  "pushed_at": "2025-09-23T20:09:10Z",
  "contributors_url": "https://api.github.com/repos/hsf-training/hsf-training-scikit-hep-webpage/contributors"
}
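
With "key": "other" and an unusable "spdx_id" of "NOASSERTION", this fixture should hit the 'other-at' branch of fetch_licence; a standalone sketch of that mapping (method body copied from the diff above):

def fetch_licence(licence)
  return 'notspecified' if licence.nil? || licence == 'null'
  return 'other-at' if licence['key'] == 'other'

  licence['spdx_id']
end

fetch_licence({ 'key' => 'other', 'name' => 'Other', 'spdx_id' => 'NOASSERTION' })
# => "other-at"
fetch_licence(nil)
# => "notspecified"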
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
{
  "name": "bigchange",
  "full_name": "hsf-training/cpluspluscourse",
  "html_url": "https://github.com/hsf-training/cpluspluscourse",
  "topics": [
    "those",
    "are",
    "NOT"
  ],
  "contributors_url": "https://api.github.com/repos/hsf-training/cpluspluscourse/contributors"
}
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
[
  {
    "login": "jane"
  },
  {
    "login": "doe"
  }
]
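
Fed through fetch_contributors, this fixture should reduce to the login names; a short sketch, parsing the JSON above inline:

require 'json'

contributors = JSON.parse('[{"login": "jane"}, {"login": "doe"}]')
contributors.map { |c| c['login'] }
# => ["jane", "doe"] — becomes material.contributors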
