Skip to content

Commit 79af30f

Browse files
committed
Convert sibling API event scraper classes into one class and a mixin
1 parent 4ca46ef commit 79af30f

1 file changed

Lines changed: 55 additions & 50 deletions

File tree

legistar/events.py

Lines changed: 55 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from abc import ABCMeta, abstractmethod
21
import time
32
import datetime
43
from collections import deque
@@ -171,7 +170,7 @@ def _get_ecomment_link(self, link):
171170
return self.ecomment_dict.get(event_id, None)
172171

173172

174-
class LegistarAPIEventScraperBase(LegistarAPIScraper, metaclass=ABCMeta):
173+
class LegistarAPIEventScraper(LegistarAPIScraper):
175174
webscraper_class = LegistarEventsScraper
176175
WEB_RETRY_EVENTS = 3
177176

@@ -197,10 +196,6 @@ def _init_webscraper(self):
197196

198197
return webscraper
199198

200-
@abstractmethod
201-
def _get_web_event(self, api_event):
202-
pass
203-
204199
def api_events(self, since_datetime=None):
205200
# scrape from oldest to newest. This makes resuming big
206201
# scraping jobs easier because upon a scrape failure we can
@@ -315,25 +310,6 @@ def minutes(self, event):
315310
self._suppress_item_matter(item, minutes_url)
316311
yield item
317312

318-
def _suppress_item_matter(self, item, agenda_url):
319-
'''
320-
Agenda items in Legistar do not always display links to
321-
associated matter files even if the same agenda item
322-
in the API references a Matter File. The agenda items
323-
we scrape should honor the suppression on the Legistar
324-
agendas.
325-
326-
This is also practical because matter files that are hidden
327-
in the Legistar Agenda do not seem to available for scraping
328-
on Legistar or through the API
329-
330-
Since we are not completely sure that the same suppression
331-
logic should be used for all Legislative Bodies, this method
332-
is currently just a hook for being overridden in particular
333-
scrapers. As of now, at least LA Metro uses this hook.
334-
'''
335-
pass
336-
337313
def rollcalls(self, event):
338314
for item in self.agenda(event):
339315
if item['EventItemRollCallFlag']:
@@ -354,6 +330,25 @@ def addDocs(self, e, events, doc_type):
354330
except ValueError:
355331
pass
356332

333+
def _suppress_item_matter(self, item, agenda_url):
334+
'''
335+
Agenda items in Legistar do not always display links to
336+
associated matter files even if the same agenda item
337+
in the API references a Matter File. The agenda items
338+
we scrape should honor the suppression on the Legistar
339+
agendas.
340+
341+
This is also practical because matter files that are hidden
342+
in the Legistar Agenda do not seem to be available for scraping
343+
on Legistar or through the API
344+
345+
Since we are not completely sure that the same suppression
346+
logic should be used for all Legislative Bodies, this method
347+
is currently just a hook for being overridden in particular
348+
scrapers. As of now, at least LA Metro uses this hook.
349+
'''
350+
pass
351+
357352
def _event_status(self, event):
358353
'''Events can have a status of tentative, confirmed, cancelled, or
359354
passed (http://docs.opencivicdata.org/en/latest/data/event.html). By
@@ -367,11 +362,20 @@ def _event_status(self, event):
367362
status = 'confirmed'
368363

369364
return status
370-
371-
372-
class LegistarAPIEventScraper(LegistarAPIEventScraperBase):
365+
366+
def _not_in_web_interface(self, event):
367+
'''Occasionally, an event will appear in the API before it appears in
368+
the web interface. This method checks attributes of the API event that
369+
tell us whether the given event is one of those cases, returning True if
370+
so, and False otherwise. Available for override in jurisdictional
371+
scrapers.
372+
'''
373+
return False
373374

374375
def _get_web_event(self, api_event):
376+
if self._not_in_web_interface(api_event):
377+
return None
378+
375379
return self.web_detail(api_event)
376380

377381
def web_detail(self, event):
@@ -405,12 +409,13 @@ def web_detail(self, event):
405409
return event_page_details
406410

407411

408-
class LegistarAPIEventScraperZip(LegistarAPIEventScraperBase):
409-
'''
410-
There are some inSite sites that have information that only appears
411-
event listing page, like NYC's 'Meeting Topic.' This scraper visits
412-
the listing page and attempts to zip API and web events together
413-
'''
412+
class WebCalendarMixin:
413+
"""
414+
Sometimes, it's desirable to retrieve information from the web calendar,
415+
in addition to the API. This mixin extends the base functionality to get
416+
event information from both the detail page, if accessible, and the web
417+
calendar listing.
418+
"""
414419
def __init__(self, *args, **kwargs):
415420
super().__init__(*args, **kwargs)
416421

@@ -422,13 +427,20 @@ def __init__(self, *args, **kwargs):
422427
# Instantiate dictionary where events from generator are stored as they
423428
# are scraped.
424429
self._scraped_events = {}
425-
430+
426431
def _get_web_event(self, api_event):
427432
if self._not_in_web_interface(api_event):
428-
return None
433+
event_detail = {}
429434
else:
430-
# None if entire web calendar scraped but API event not found
431-
return self.web_results(api_event)
435+
# None if detail link does not exist or cannot be found.
436+
event_detail = super()._get_web_event(api_event) or {}
437+
438+
# Sometimes events can appear on the calendar before their detail links
439+
# become available. None if entire web calendar scraped but event not
440+
# found.
441+
event_listing = self.web_results(api_event) or {}
442+
443+
return (event_listing | event_detail) or None
432444

433445
def web_results(self, event):
434446
api_key = (event['EventBodyName'].strip(),
@@ -453,29 +465,22 @@ def _scrapeWebCalendar(self):
453465
chronological order.
454466
'''
455467
for event, _ in self._webscraper.events(follow_links=False):
456-
event_key = self._event_key(event, self._webscraper)
468+
event_key = self._event_key(event)
469+
print(event_key, event)
457470
yield event_key, event
458471

459-
def _event_key(self, event, web_scraper):
472+
def _event_key(self, event):
460473
'''Since Legistar InSite contains more information about events than
461474
are available in the API, we need to scrape both. Then, we have
462475
to line them up. This method makes a key that should
463476
uniquely identify every event and will allow us to link
464477
events from the two data sources.
465478
'''
466-
response = web_scraper.get(event['iCalendar']['url'], verify=False)
467-
event_time = web_scraper.ical(response.text).subcomponents[0]['DTSTART'].dt
479+
response = self._webscraper.get(event['iCalendar']['url'], verify=False)
480+
event_time = self._webscraper.ical(response.text).subcomponents[0]['DTSTART'].dt
468481
event_time = pytz.timezone(self.TIMEZONE).localize(event_time)
469482

470483
key = (event['Name']['label'],
471484
event_time)
472485

473486
return key
474-
475-
def _not_in_web_interface(self, event):
476-
'''Occasionally, an event will appear in the API, but not in the web
477-
interface. This method checks attributes of the API event that tell us
478-
whether the given event is one of those cases, returning True if so, and
479-
False otherwise. Available for override in jurisdictional scrapers.
480-
'''
481-
return False

0 commit comments

Comments
 (0)