-
Notifications
You must be signed in to change notification settings - Fork 139
fixed process.py in UsCensuspep_sex import #1893
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
b2a4ca4
a061ef2
8edf119
cc9af8a
be35e58
6d88082
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1184,6 +1184,24 @@ def add_future_year_urls(): | |
| # Loop through years in reverse order from 2030 to 2023 | ||
| for future_year in range(2030, 2022, -1): # From 2030 to 2023 | ||
|
|
||
| # We check the National CSV first. If it's 404, the whole year is skipped. | ||
| gatekeeper_url = urls_to_scan[0].format(YEAR=future_year) | ||
| try: | ||
| # Use a short 5-second timeout for the check | ||
| response = requests.head(gatekeeper_url, | ||
| allow_redirects=True, | ||
| timeout=5) | ||
| if response.status_code != 200: | ||
| logging.info( | ||
| f"Skipping year {future_year}: National file not found (status code: {response.status_code})." | ||
| ) | ||
| continue | ||
| except requests.exceptions.RequestException as e: | ||
| logging.warning( | ||
| f"Skipping year {future_year} due to an error checking the gatekeeper URL: {e}" | ||
| ) | ||
| continue | ||
|
|
||
| YEAR = future_year | ||
| # Loop through URLs | ||
| for url in urls_to_scan: | ||
|
|
@@ -1242,7 +1260,7 @@ def download_files(): | |
| global _FILES_TO_DOWNLOAD | ||
| session = requests.session() | ||
|
|
||
| #Get set of already downloaded files | ||
| # Get set of already downloaded files | ||
| downloaded_files = set(os.listdir(_GCS_OUTPUT_PERSISTENT_PATH)) | ||
|
|
||
| for file_to_download in _FILES_TO_DOWNLOAD: | ||
|
|
@@ -1255,6 +1273,12 @@ def download_files(): | |
| else: | ||
| file_name_to_save = url.split('/')[-1] | ||
|
|
||
| # Skip if file already exists (Moved up for efficiency) | ||
| if file_name_to_save in downloaded_files: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there a TTL for files in persistent GCS so that files that are too old, say older than a month, aren't reused but are downloaded again? |
||
| logging.info( | ||
| f"Skipping already downloaded file: {file_name_to_save}") | ||
| continue | ||
|
|
||
| headers = {'User-Agent': 'Mozilla/5.0'} | ||
| try: | ||
| with session.get(url, stream=True, timeout=120, | ||
|
|
@@ -1263,45 +1287,39 @@ def download_files(): | |
|
|
||
| content_type = response.headers.get('Content-Type', '') | ||
|
|
||
| # Skip if file already exists | ||
| if file_name_to_save in downloaded_files: | ||
| logging.info( | ||
| f"Skipping already downloaded file: {file_name_to_save}" | ||
| # Minimal fix: Log error and continue to skip HTML pages | ||
| if 'html' in content_type.lower(): | ||
| logging.error( | ||
| f"Server returned HTML error page for URL: {url}. Skipping." | ||
| ) | ||
| continue | ||
| if 'html' in content_type.lower(): | ||
| logging.fatal( | ||
| f"Server returned HTML error page for URL: {url}") | ||
| else: | ||
| if response.status_code == 200: | ||
| with tempfile.NamedTemporaryFile( | ||
| delete=False) as tmp_file: | ||
| # Stream the response into a temp file | ||
| for chunk in response.iter_content(chunk_size=8192): | ||
| if chunk: | ||
| tmp_file.write(chunk) | ||
| tmp_file_path = tmp_file.name | ||
|
|
||
| # Copy to local destination | ||
| shutil.copy( | ||
| tmp_file_path, | ||
| os.path.join(_INPUT_FILE_PATH, file_name_to_save)) | ||
|
|
||
| # Copy to gcs destination | ||
| shutil.copy( | ||
| tmp_file_path, | ||
| os.path.join(_GCS_OUTPUT_PERSISTENT_PATH, | ||
| file_name_to_save)) | ||
|
|
||
| # Optionally delete the temp file | ||
| os.remove(tmp_file_path) | ||
| file_to_download['is_downloaded'] = True | ||
| logging.info(f"Downloaded file: {url}") | ||
|
|
||
| if response.status_code == 200: | ||
| with tempfile.NamedTemporaryFile(delete=False) as tmp_file: | ||
| for chunk in response.iter_content(chunk_size=8192): | ||
| if chunk: | ||
| tmp_file.write(chunk) | ||
| tmp_file_path = tmp_file.name | ||
|
|
||
| # Copy to local destination | ||
| shutil.copy( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not use shutil.move(), which could be faster for local files within the same drive? |
||
| tmp_file_path, | ||
| os.path.join(_INPUT_FILE_PATH, file_name_to_save)) | ||
|
|
||
| # Copy to gcs destination | ||
| shutil.copy( | ||
| tmp_file_path, | ||
| os.path.join(_GCS_OUTPUT_PERSISTENT_PATH, | ||
| file_name_to_save)) | ||
|
|
||
| os.remove(tmp_file_path) | ||
| file_to_download['is_downloaded'] = True | ||
| logging.info(f"Downloaded file: {url}") | ||
|
|
||
| except Exception as e: | ||
| file_to_download['is_downloaded'] = False | ||
| logging.error(f"Error downloading {url}: {e}") | ||
| raise # re-raise to trigger @retry | ||
| raise | ||
| time.sleep(1) | ||
|
|
||
| return True | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.