-
Notifications
You must be signed in to change notification settings - Fork 4
Avoid reporting crash results if it is like GPU problem #7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -493,7 +493,7 @@ def pgnout_settings(config, timestamp, cutechess_idx): | |
| return '-pgnout %s' % (Cutechess.pgn_name(config, timestamp, cutechess_idx)) | ||
|
|
||
| @staticmethod | ||
| def update_results(results, line): | ||
| def update_results(config, results, line, base_name, base_network): | ||
|
|
||
| # Given any game #, find the other in the pair | ||
| def game_to_pair(g): | ||
|
|
@@ -514,21 +514,38 @@ def parse_finished_game(line): | |
| tokens = line.split() | ||
| return int(tokens[2]), tokens[6] | ||
|
|
||
| def is_gpu_crashed(config, engine, network): | ||
| print('[WARNING] Checking if crash was caused by a GPU problem...') | ||
| try: | ||
| safe_run_benchmarks(config, 'base', engine, network) | ||
| return False | ||
| except utils.OpenBenchBadBenchException: | ||
| print('[ERROR] GPU crash detected!') | ||
| return True | ||
|
Comment on lines
+517
to
+524
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I want the server error report (and blacklisting) after the benchmark fails. It was only about not reporting game results. |
||
|
|
||
| # Parse for errors resulting in adjudication | ||
| reason = line.split(':')[1] | ||
| crashed = 'disconnect' in reason or 'stalls' in reason | ||
| hw_crashed = crashed and is_gpu_crashed(config, base_name, base_network) | ||
| results['crashes' ] += 'disconnect' in reason or 'stalls' in reason | ||
| results['timelosses'] += 'on time' in reason | ||
| results['illegals' ] += 'illegal' in reason | ||
|
|
||
|
Comment on lines
530
to
533
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I intentionally want to increment the crash count for HW errors too. It makes sure we will notice to check |
||
| # Parse Game # and result, and save | ||
| game, result = parse_finished_game(line) | ||
| results['games'][game] = result | ||
| results['games'][game] = result if not hw_crashed else 'hw_crash' | ||
|
|
||
| # Check to see if the Pair has finished | ||
| first, second = game_to_pair(game) | ||
| if first not in results['games'] or second not in results['games']: | ||
| return | ||
|
|
||
| # Don't report results when we detect a GPU issue. | ||
| if results['games'][first] == 'hw_crash' or results['games'][second] == 'hw_crash': | ||
|
|
||
| del results['games'][first] | ||
| del results['games'][second] | ||
| return | ||
|
|
||
| # Get the indices for the Pentanomial, and the two for Trinomial | ||
| p = pair_to_penta(results['games'][first], results['games'][second]) | ||
| t1, t2 = pair_to_trinomial(results['games'][first], results['games'][second]) | ||
|
|
@@ -1024,7 +1041,7 @@ def complete_workload(config): | |
| tasks = [] # Create each of the Cutechess workers | ||
| for x in range(cutechess_cnt): | ||
| cmd = build_cutechess_command(config, dev_name, base_name, scale_factor, timestamp, x) | ||
| tasks.append(executor.submit(run_and_parse_cutechess, config, cmd, x, results, abort_flag)) | ||
| tasks.append(executor.submit(run_and_parse_cutechess, config, cmd, x, results, abort_flag, base_name, base_network)) | ||
|
|
||
| # Process the Queue until we exit, finish, or are told to stop by the server | ||
| try: | ||
|
|
@@ -1148,7 +1165,7 @@ def build_cutechess_command(config, dev_cmd, base_cmd, scale_factor, timestamp, | |
|
|
||
| return ['cutechess-ob.exe', './cutechess-ob'][IS_LINUX] + flags | ||
|
|
||
| def run_and_parse_cutechess(config, command, cutechess_idx, results_queue, abort_flag): | ||
| def run_and_parse_cutechess(config, command, cutechess_idx, results_queue, abort_flag, base_name, base_network): | ||
|
|
||
| print('\n[#%d] Launching Cutechess...\n%s\n' % (cutechess_idx, command)) | ||
| cutechess = Popen(command.split(), stdout=PIPE) | ||
|
|
@@ -1178,7 +1195,7 @@ def run_and_parse_cutechess(config, command, cutechess_idx, results_queue, abort | |
| print('[#%d] %s' % (cutechess_idx, line)) | ||
|
|
||
| if 'Finished game' in line: | ||
| Cutechess.update_results(results, line) | ||
| Cutechess.update_results(config, results, line, base_name, base_network) | ||
|
|
||
| # Add to the results queue every time we have a game-pair finished | ||
| if any(results['pentanomial']): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think crash-handling performance matters. The client is either going to stop working or the test is about to fail.