Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 52 additions & 34 deletions scripts/us_census/pep/us_pep_sex/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -1184,6 +1184,24 @@ def add_future_year_urls():
# Loop through years in reverse order from 2030 to 2023
for future_year in range(2030, 2022, -1): # From 2030 to 2023

# We check the National CSV first. If it's 404, the whole year is skipped.
gatekeeper_url = urls_to_scan[0].format(YEAR=future_year)
try:
# Use a short 5-second timeout for the check
response = requests.head(gatekeeper_url,
allow_redirects=True,
timeout=5)
if response.status_code != 200:
logging.info(
f"Skipping year {future_year}: National file not found (status code: {response.status_code})."
)
continue
except requests.exceptions.RequestException as e:
logging.warning(
f"Skipping year {future_year} due to an error checking the gatekeeper URL: {e}"
)
continue
Comment thread
niveditasing marked this conversation as resolved.

YEAR = future_year
# Loop through URLs
for url in urls_to_scan:
Expand Down Expand Up @@ -1242,7 +1260,7 @@ def download_files():
global _FILES_TO_DOWNLOAD
session = requests.session()

#Get set of already downloaded files
# Get set of already downloaded files
downloaded_files = set(os.listdir(_GCS_OUTPUT_PERSISTENT_PATH))

for file_to_download in _FILES_TO_DOWNLOAD:
Expand All @@ -1255,6 +1273,12 @@ def download_files():
else:
file_name_to_save = url.split('/')[-1]

# Skip if file already exists (Moved up for efficiency)
if file_name_to_save in downloaded_files:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a TTL for files in persistent GCS so that files that are too old, say older than a month, aren't reused but are downloaded again?

logging.info(
f"Skipping already downloaded file: {file_name_to_save}")
continue

headers = {'User-Agent': 'Mozilla/5.0'}
try:
with session.get(url, stream=True, timeout=120,
Expand All @@ -1263,45 +1287,39 @@ def download_files():

content_type = response.headers.get('Content-Type', '')

# Skip if file already exists
if file_name_to_save in downloaded_files:
logging.info(
f"Skipping already downloaded file: {file_name_to_save}"
# Minimal fix: Log error and continue to skip HTML pages
if 'html' in content_type.lower():
logging.error(
f"Server returned HTML error page for URL: {url}. Skipping."
)
continue
if 'html' in content_type.lower():
logging.fatal(
f"Server returned HTML error page for URL: {url}")
else:
if response.status_code == 200:
with tempfile.NamedTemporaryFile(
delete=False) as tmp_file:
# Stream the response into a temp file
for chunk in response.iter_content(chunk_size=8192):
if chunk:
tmp_file.write(chunk)
tmp_file_path = tmp_file.name

# Copy to local destination
shutil.copy(
tmp_file_path,
os.path.join(_INPUT_FILE_PATH, file_name_to_save))

# Copy to gcs destination
shutil.copy(
tmp_file_path,
os.path.join(_GCS_OUTPUT_PERSISTENT_PATH,
file_name_to_save))

# Optionally delete the temp file
os.remove(tmp_file_path)
file_to_download['is_downloaded'] = True
logging.info(f"Downloaded file: {url}")

if response.status_code == 200:
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
tmp_file.write(chunk)
tmp_file_path = tmp_file.name

# Copy to local destination
shutil.copy(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not shutil.move(), which could be faster for local files within the same drive?

tmp_file_path,
os.path.join(_INPUT_FILE_PATH, file_name_to_save))

# Copy to gcs destination
shutil.copy(
tmp_file_path,
os.path.join(_GCS_OUTPUT_PERSISTENT_PATH,
file_name_to_save))

os.remove(tmp_file_path)
file_to_download['is_downloaded'] = True
logging.info(f"Downloaded file: {url}")

except Exception as e:
file_to_download['is_downloaded'] = False
logging.error(f"Error downloading {url}: {e}")
raise # re-raise to trigger @retry
raise
time.sleep(1)

return True
Expand Down
Loading