Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions arc/job/trsh.py
Original file line number Diff line number Diff line change
Expand Up @@ -1150,10 +1150,30 @@ def trsh_ess_job(label: str,
couldnt_trsh = True

if couldnt_trsh:
logger.error(f'Could not troubleshoot geometry optimization for {label}! '
f'Tried troubleshooting with the following methods: {ess_trsh_methods}')
# Count and remove 'trsh_attempt' entries for cleaner reporting
trsh_attempt_count = ess_trsh_methods.count('trsh_attempt')
filtered_methods = [method for method in ess_trsh_methods if method != 'trsh_attempt']

# Build the message with the count and filtered methods
if trsh_attempt_count > 0:
# We attempted troubleshooting some number of times.
if filtered_methods:
message = f'Tried troubleshooting {trsh_attempt_count} time(s), with the following methods: {filtered_methods}'
else:
# Edge case: scheduler/appender added 'trsh_attempt' but no concrete methods were applied
message = f'Tried troubleshooting {trsh_attempt_count} time(s); No applicable troubleshooting methods found'
else:
# No explicit 'trsh_attempt' marker present. Could be called directly (e.g. unit tests)
if filtered_methods:
message = f'Tried troubleshooting with the following methods: {filtered_methods}'
else:
# No methods at all
message = 'No applicable troubleshooting methods found'

logger.error(f'Could not troubleshoot {job_type} for {label}! '
f'{message}')
output_errors.append(f'Error: Could not troubleshoot {job_type} for {label}! '
f'Tried troubleshooting with the following methods: {ess_trsh_methods}; ')
f'{message}; ')
return (output_errors,
ess_trsh_methods,
remove_checkfile,
Expand Down
54 changes: 49 additions & 5 deletions arc/job/trsh_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def test_trsh_ess_job(self):
self.assertEqual(memory, capped_memory_gb)
self.assertIn('Use a higher-memory node or lower the job cost', output_errors[0])

# Gaussian: test 7
# Gaussian: test 7 - part 1
job_status = {'keywords': ['SCF', 'GL502', 'NoSymm']}
ess_trsh_methods = ['scf=(NoDIIS)', 'int=(Acc2E=14)', 'checkfile=None', 'scf=(qc)', 'NoSymm','scf=(NDamp=30)', 'guess=INDO', 'scf=(Fermi)',
'scf=(Noincfock)', 'scf=(NoVarAcc)']
Expand All @@ -454,10 +454,32 @@ def test_trsh_ess_job(self):
job_type, software, fine, memory_gb,
num_heavy_atoms, cpu_cores, ess_trsh_methods)
self.assertTrue(couldnt_trsh)
self.assertIn(
"Error: Could not troubleshoot opt for ethanol! Tried troubleshooting with the following methods: ['scf=(NoDIIS)', 'int=(Acc2E=14)', 'checkfile=None', 'scf=(qc)', 'NoSymm', 'scf=(NDamp=30)', 'guess=INDO', 'scf=(Fermi)', 'scf=(Noincfock)', 'scf=(NoVarAcc)', 'all_attempted']; ",
output_errors,
)
# assert the output contains the expected troubleshooting summary and final marker
self.assertTrue(any('Tried troubleshooting' in e for e in output_errors))
self.assertTrue(any('with the following methods' in e for e in output_errors))
self.assertTrue(any('all_attempted' in e for e in output_errors))
self.assertTrue(all('trsh_attempt' not in e for e in output_errors))

# Gaussian: test 7 - part 2
# verify troubleshoot attempts counting (consolidated)
job_status = {'keywords': ['MaxOptCycles', 'GL9999']}
ess_trsh_methods = ['trsh_attempt',
'int=(Acc2E=14)', 'opt=(maxcycle=200)',
'trsh_attempt', 'opt=(RFO)',
'trsh_attempt', 'opt=(GDIIS)',
'trsh_attempt', 'opt=(GEDIIS)',
'trsh_attempt']
output_errors, ess_trsh_methods, remove_checkfile, level_of_theory, software, job_type, fine, trsh_keyword, \
memory, shift, cpu_cores, couldnt_trsh = trsh.trsh_ess_job(label, level_of_theory, server, job_status,
job_type, software, fine, memory_gb,
num_heavy_atoms, cpu_cores, ess_trsh_methods)
self.assertTrue(couldnt_trsh)
e = output_errors[-1]
self.assertIn('Tried troubleshooting 5 time(s)', e)
self.assertNotIn('trsh_attempt', e)
for opt in ("opt=(maxcycle=200)", "opt=(RFO)", "opt=(GDIIS)", "opt=(GEDIIS)"):
self.assertIn(opt, e)
self.assertIn('all_attempted', e)

# Gaussian: test 8
job_status = {'keywords': ['MaxOptCycles', 'GL9999','SCF']}
Expand Down Expand Up @@ -810,6 +832,28 @@ def test_trsh_ess_job(self):
num_heavy_atoms, cpu_cores, ess_trsh_methods,
is_h=True, is_monoatomic=True)

def test_trsh_ess_job_terachem_trsh_attempt_only(self):
    """
    Check the reported error when only a 'trsh_attempt' marker was recorded for a TeraChem job.

    All inputs are defined inside this test, so it does not depend on variables
    that are re-assigned between consecutive cases of a larger test.
    """
    # Positional arguments, in the order they are passed to trsh_ess_job().
    call_args = (
        'ethanol',                           # label
        {'method': 'ccsd', 'basis': 'vdz'},  # level_of_theory
        'server1',                           # server
        {'keywords': []},                    # job_status
        'opt',                               # job_type
        'terachem',                          # software
        False,                               # fine
        16,                                  # memory_gb
        2,                                   # num_heavy_atoms
        8,                                   # cpu_cores
        ['trsh_attempt'],                    # ess_trsh_methods
    )

    # Unpack all twelve returned values so a change in the number of
    # returned items still makes this test fail, as in the original form.
    (output_errors, _methods, _remove_checkfile, _level, _software, _job_type,
     _fine, _trsh_keyword, _memory, _shift, _cpu_cores, couldnt_trsh) = trsh.trsh_ess_job(*call_args)

    self.assertTrue(couldnt_trsh)
    message_found = any('No applicable troubleshooting methods found' in entry
                        for entry in output_errors)
    self.assertTrue(message_found)

def test_determine_job_log_memory_issues(self):
"""Test the determine_job_log_memory_issues() function."""
job_log_path_1 = os.path.join(ARC_TESTING_PATH, 'job_log', 'no_issues.log')
Expand Down
6 changes: 5 additions & 1 deletion arc/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3562,7 +3562,8 @@ def troubleshoot_ess(self,

level_of_theory = Level(repr=level_of_theory)
logger.info('\n')
warning_message = f'Troubleshooting {label} job {job.job_name} which failed'
# log job failure information before troubleshooting
warning_message = f'{label} Job {job.job_name} failed'
if job.job_status[1]["status"] and job.job_status[1]["status"] != 'done':
warning_message += f' with status: "{job.job_status[1]["status"]},"'
if job.job_status[1]["keywords"]:
Expand Down Expand Up @@ -3596,11 +3597,14 @@ def troubleshoot_ess(self,
self.species_dict[label].checkfile = job.checkfile
# Guard against infinite troubleshooting loops.
trsh_attempts = job.ess_trsh_methods.count('trsh_attempt')
next_attempt = trsh_attempts + 1
if trsh_attempts >= max_ess_trsh:
logger.info(f'Could not troubleshoot {job.job_type} for {label}. '
f'Reached max troubleshooting attempts ({max_ess_trsh}).')
self.output[label]['errors'] += f'Error: ESS troubleshooting attempts exhausted for {label} {job.job_type}; '
return
logger.warning(f'Troubleshooting {label} job {job.job_name} '
f'(attempt number {next_attempt}).')
job.ess_trsh_methods.append('trsh_attempt')

# Determine if the species is a hydrogen atom (or its isotope).
Expand Down