1- from abc import ABCMeta , abstractmethod
21import time
32import datetime
43from collections import deque
@@ -171,7 +170,7 @@ def _get_ecomment_link(self, link):
171170 return self .ecomment_dict .get (event_id , None )
172171
173172
174- class LegistarAPIEventScraperBase (LegistarAPIScraper , metaclass = ABCMeta ):
173+ class LegistarAPIEventScraper (LegistarAPIScraper ):
175174 webscraper_class = LegistarEventsScraper
176175 WEB_RETRY_EVENTS = 3
177176
@@ -197,10 +196,6 @@ def _init_webscraper(self):
197196
198197 return webscraper
199198
200- @abstractmethod
201- def _get_web_event (self , api_event ):
202- pass
203-
204199 def api_events (self , since_datetime = None ):
205200 # scrape from oldest to newest. This makes resuming big
206201 # scraping jobs easier because upon a scrape failure we can
@@ -315,25 +310,6 @@ def minutes(self, event):
315310 self ._suppress_item_matter (item , minutes_url )
316311 yield item
317312
318- def _suppress_item_matter (self , item , agenda_url ):
319- '''
320- Agenda items in Legistar do not always display links to
321- associated matter files even if the same agenda item
322- in the API references a Matter File. The agenda items
323- we scrape should honor the suppression on the Legistar
324- agendas.
325-
326- This is also practical because matter files that are hidden
327- in the Legistar Agenda do not seem to available for scraping
328- on Legistar or through the API
329-
330- Since we are not completely sure that the same suppression
331- logic should be used for all Legislative Bodies, this method
332- is currently just a hook for being overridden in particular
333- scrapers. As of now, at least LA Metro uses this hook.
334- '''
335- pass
336-
337313 def rollcalls (self , event ):
338314 for item in self .agenda (event ):
339315 if item ['EventItemRollCallFlag' ]:
@@ -354,6 +330,25 @@ def addDocs(self, e, events, doc_type):
354330 except ValueError :
355331 pass
356332
333+ def _suppress_item_matter (self , item , agenda_url ):
334+ '''
335+ Agenda items in Legistar do not always display links to
336+ associated matter files even if the same agenda item
337+ in the API references a Matter File. The agenda items
338+ we scrape should honor the suppression on the Legistar
339+ agendas.
340+
341+ This is also practical because matter files that are hidden
342+ in the Legistar Agenda do not seem to be available for scraping
343+ on Legistar or through the API
344+
345+ Since we are not completely sure that the same suppression
346+ logic should be used for all Legislative Bodies, this method
347+ is currently just a hook for being overridden in particular
348+ scrapers. As of now, at least LA Metro uses this hook.
349+ '''
350+ pass
351+
357352 def _event_status (self , event ):
358353 '''Events can have a status of tentative, confirmed, cancelled, or
359354 passed (http://docs.opencivicdata.org/en/latest/data/event.html). By
@@ -367,11 +362,20 @@ def _event_status(self, event):
367362 status = 'confirmed'
368363
369364 return status
370-
371-
372- class LegistarAPIEventScraper (LegistarAPIEventScraperBase ):
365+
366+ def _not_in_web_interface (self , event ):
367+ '''Occasionally, an event will appear in the API before it appears in
368+ the web interface. This method checks attributes of the API event that
369+ tell us whether the given event is one of those cases, returning True if
370+ so, and False otherwise. Available for override in jurisdictional
371+ scrapers.
372+ '''
373+ return False
373374
374375 def _get_web_event (self , api_event ):
376+ if self ._not_in_web_interface (api_event ):
377+ return None
378+
375379 return self .web_detail (api_event )
376380
377381 def web_detail (self , event ):
@@ -405,12 +409,13 @@ def web_detail(self, event):
405409 return event_page_details
406410
407411
408- class LegistarAPIEventScraperZip (LegistarAPIEventScraperBase ):
409- '''
410- There are some inSite sites that have information that only appears
411- event listing page, like NYC's 'Meeting Topic.' This scraper visits
412- the listing page and attempts to zip API and web events together
413- '''
412+ class WebCalendarMixin :
413+ """
414+ Sometimes, it's desirable to retrieve information from the web calendar,
415+ in addition to the API. This mixin extends the base functionality to get
416+ event information from both the detail page, if accessible, and the web
417+ calendar listing.
418+ """
414419 def __init__ (self , * args , ** kwargs ):
415420 super ().__init__ (* args , ** kwargs )
416421
@@ -422,13 +427,20 @@ def __init__(self, *args, **kwargs):
422427 # Instantiate dictionary where events from generator are stored as they
423428 # are scraped.
424429 self ._scraped_events = {}
425-
430+
426431 def _get_web_event (self , api_event ):
427432 if self ._not_in_web_interface (api_event ):
428- return None
433+ event_detail = {}
429434 else :
430- # None if entire web calendar scraped but API event not found
431- return self .web_results (api_event )
435+ # None if detail link does not exist or cannot be found.
436+ event_detail = super ()._get_web_event (api_event ) or {}
437+
438+ # Sometimes events can appear on the calendar before their detail links
439+ # become available. None if entire web calendar scraped but event not
440+ # found.
441+ event_listing = self .web_results (api_event ) or {}
442+
443+ return (event_listing | event_detail ) or None
432444
433445 def web_results (self , event ):
434446 api_key = (event ['EventBodyName' ].strip (),
@@ -453,29 +465,22 @@ def _scrapeWebCalendar(self):
453465 chronological order.
454466 '''
455467 for event , _ in self ._webscraper .events (follow_links = False ):
456- event_key = self ._event_key (event , self ._webscraper )
468+ event_key = self ._event_key (event )
469+ print (event_key , event )
457470 yield event_key , event
458471
459- def _event_key (self , event , web_scraper ):
472+ def _event_key (self , event ):
460473 '''Since Legistar InSite contains more information about events than
461474 are available in the API, we need to scrape both. Then, we have
462475 to line them up. This method makes a key that should
463476 uniquely identify every event and will allow us to link
464477 events from the two data sources.
465478 '''
466- response = web_scraper .get (event ['iCalendar' ]['url' ], verify = False )
467- event_time = web_scraper .ical (response .text ).subcomponents [0 ]['DTSTART' ].dt
479+ response = self . _webscraper .get (event ['iCalendar' ]['url' ], verify = False )
480+ event_time = self . _webscraper .ical (response .text ).subcomponents [0 ]['DTSTART' ].dt
468481 event_time = pytz .timezone (self .TIMEZONE ).localize (event_time )
469482
470483 key = (event ['Name' ]['label' ],
471484 event_time )
472485
473486 return key
474-
475- def _not_in_web_interface (self , event ):
476- '''Occasionally, an event will appear in the API, but not in the web
477- interface. This method checks attributes of the API event that tell us
478- whether the given event is one of those cases, returning True if so, and
479- False otherwise. Available for override in jurisdictional scrapers.
480- '''
481- return False
0 commit comments