Skip to content

Commit edfb0dc

Browse files
authored
Merge pull request #5 from ANcpLua/fix/capture-tesseract-stderr
Capture Tesseract stderr in OCR-failure exception message
2 parents 2173626 + 170d844 commit edfb0dc

5 files changed

Lines changed: 50 additions & 8 deletions

File tree

CreatePdf.NET.Tests/TesseractOcrProviderTests.cs

Lines changed: 28 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ public class TesseractOcrProviderTests
1111
[Fact]
1212
public async Task ExtractTextFromImageAsync_WhenOutputIsMissing_ThrowsFileNotFound()
1313
{
14-
var processRunner = new FakeProcessRunner();
14+
var processRunner = new FakeProcessRunner { NextResult = new ProcessResult(1, "", "") };
1515
var environment = new FakeSystemEnvironment { FileExistsImpl = _ => false };
1616
var engine = new TesseractOcrProvider(environment, processRunner);
1717

@@ -21,14 +21,36 @@ public async Task ExtractTextFromImageAsync_WhenOutputIsMissing_ThrowsFileNotFou
2121
new OcrOptions { TesseractPath = "/bin/echo" });
2222

2323
await act.Should().ThrowAsync<FileNotFoundException>()
24-
.WithMessage("OCR output file not found. Tesseract execution failed*")
24+
.WithMessage("OCR output file not found (Tesseract exited with code 1)*")
2525
.ConfigureAwait(true);
2626

2727
processRunner.StartInfos.Should().HaveCount(1);
2828
processRunner.StartInfos[0].FileName.Should().Be("/bin/echo");
2929
processRunner.StartInfos[0].Arguments.Should().Contain("input.png");
3030
}
3131

32+
[Fact]
33+
public async Task ExtractTextFromImageAsync_WhenOutputIsMissing_IncludesStderrInExceptionMessage()
34+
{
35+
const string tesseractStderr = "Error opening data file /usr/share/tessdata/eng.traineddata";
36+
var processRunner = new FakeProcessRunner
37+
{
38+
NextResult = new ProcessResult(ExitCode: 1, StandardOutput: "", StandardError: tesseractStderr)
39+
};
40+
var environment = new FakeSystemEnvironment { FileExistsImpl = _ => false };
41+
var engine = new TesseractOcrProvider(environment, processRunner);
42+
43+
var act = () => engine.ExtractTextFromImageAsync(
44+
"input.png",
45+
Path.Combine(Path.GetTempPath(), "missing-output.txt"),
46+
new OcrOptions { TesseractPath = "/bin/echo" });
47+
48+
await act.Should().ThrowAsync<FileNotFoundException>()
49+
.Where(e => e.Message.Contains(tesseractStderr, StringComparison.Ordinal)
50+
&& e.Message.Contains("exited with code 1", StringComparison.Ordinal))
51+
.ConfigureAwait(true);
52+
}
53+
3254
[Theory]
3355
[InlineData("out.log", "out.txt")]
3456
[InlineData("out", "out.txt")]
@@ -207,10 +229,12 @@ private sealed class FakeProcessRunner : IProcessRunner
207229
{
208230
public List<ProcessStartInfo> StartInfos { get; } = [];
209231

210-
public Task RunAsync(ProcessStartInfo startInfo, CancellationToken cancellationToken)
232+
public ProcessResult NextResult { get; set; } = new(ExitCode: 0, StandardOutput: "", StandardError: "");
233+
234+
public Task<ProcessResult> RunAsync(ProcessStartInfo startInfo, CancellationToken cancellationToken)
211235
{
212236
StartInfos.Add(startInfo);
213-
return Task.CompletedTask;
237+
return Task.FromResult(NextResult);
214238
}
215239
}
216240
}

CreatePdf.NET/Internal/IProcessRunner.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,5 +4,5 @@ namespace CreatePdf.NET.Internal;
44

55
internal interface IProcessRunner
66
{
7-
Task RunAsync(ProcessStartInfo startInfo, CancellationToken cancellationToken);
7+
Task<ProcessResult> RunAsync(ProcessStartInfo startInfo, CancellationToken cancellationToken);
88
}
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
namespace CreatePdf.NET.Internal;
2+
3+
internal readonly record struct ProcessResult(int ExitCode, string StandardOutput, string StandardError);

CreatePdf.NET/Internal/ProcessRunner.cs

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,13 +6,22 @@ internal sealed class ProcessRunner : IProcessRunner
66
{
77
public static ProcessRunner Instance { get; } = new();
88

9-
public async Task RunAsync(ProcessStartInfo startInfo, CancellationToken cancellationToken)
9+
public async Task<ProcessResult> RunAsync(ProcessStartInfo startInfo, CancellationToken cancellationToken)
1010
{
1111
ArgumentNullException.ThrowIfNull(startInfo);
1212

1313
using var process = Process.Start(startInfo)
1414
?? throw new InvalidOperationException(
1515
$"Process.Start returned null for '{startInfo.FileName}' — no new process was created.");
16+
17+
// Read both streams concurrently to avoid the classic "child process blocks on a full pipe" deadlock.
18+
var stdoutTask = process.StandardOutput.ReadToEndAsync(cancellationToken);
19+
var stderrTask = process.StandardError.ReadToEndAsync(cancellationToken);
20+
1621
await process.WaitForExitAsync(cancellationToken).ConfigureAwait(false);
22+
var stdout = await stdoutTask.ConfigureAwait(false);
23+
var stderr = await stderrTask.ConfigureAwait(false);
24+
25+
return new ProcessResult(process.ExitCode, stdout, stderr);
1726
}
1827
}

CreatePdf.NET/Internal/TesseractOcrProvider.cs

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -56,15 +56,21 @@ public async Task<string> ExtractTextFromImageAsync(string pngPath, string txtPa
5656
Path.GetFileNameWithoutExtension(txtPath));
5757
var actualTxtPath = outputBase + ".txt";
5858

59-
await _processRunner.RunAsync(
59+
var result = await _processRunner.RunAsync(
6060
CreateProcessInfo(
6161
GetTesseractExecutable(options),
6262
GetOcrArguments(pngPath, outputBase, options)),
6363
cancellationToken)
6464
.ConfigureAwait(false);
6565

6666
if (!_systemEnvironment.FileExists(actualTxtPath))
67-
throw new FileNotFoundException("OCR output file not found. Tesseract execution failed.", actualTxtPath);
67+
{
68+
var exitCode = result.ExitCode.ToString(CultureInfo.InvariantCulture);
69+
var message = string.IsNullOrWhiteSpace(result.StandardError)
70+
? $"OCR output file not found (Tesseract exited with code {exitCode})."
71+
: $"OCR output file not found (Tesseract exited with code {exitCode}): {result.StandardError.Trim()}";
72+
throw new FileNotFoundException(message, actualTxtPath);
73+
}
6874

6975
var text = await File.ReadAllTextAsync(actualTxtPath, cancellationToken).ConfigureAwait(false);
7076
return text.Trim().Replace("\n", " ").Replace("\r", " ");

0 commit comments

Comments
 (0)