@@ -2,6 +2,8 @@ namespace CreatePdf.NET.Internal;
22
33internal sealed class OcrService
44{
5+ private const string TempDirPrefix = "createpdf-ocr-" ;
6+
57 private readonly IOcrProvider _provider ;
68
79 public OcrService ( ) : this ( new TesseractOcrProvider ( ) )
@@ -16,49 +18,53 @@ internal OcrService(IOcrProvider provider)
/// <summary>
/// Runs OCR over the PDF at <paramref name="pdfPath"/> inside a freshly created temporary
/// work directory and returns the text produced by <c>OcrAsync</c>.
/// </summary>
/// <param name="pdfPath">Path of the PDF file to process.</param>
/// <param name="options">OCR options forwarded unchanged to <c>OcrAsync</c>.</param>
/// <param name="cancellationToken">Token forwarded to the OCR pipeline.</param>
/// <returns>The string returned by <c>OcrAsync</c>.</returns>
public async Task<string> ProcessPdfAsync(string pdfPath, OcrOptions options,
    CancellationToken cancellationToken = default)
{
    // Created before the try block: if creation itself fails there is nothing to clean up.
    var scratchPath = Directory.CreateTempSubdirectory(TempDirPrefix).FullName;

    try
    {
        var text = await OcrAsync(pdfPath, scratchPath, options, cancellationToken).ConfigureAwait(false);
        return text;
    }
    finally
    {
        // The whole work directory is removed whether OCR succeeded or threw.
        TryDeleteDirectory(scratchPath);
    }
}
3631
/// <summary>
/// Copies <paramref name="pdfStream"/> to an <c>input.pdf</c> file inside a freshly created
/// temporary work directory, runs <c>OcrAsync</c> on that file and returns its result.
/// </summary>
/// <param name="pdfStream">Stream whose content is copied to the staged PDF file.</param>
/// <param name="options">OCR options forwarded unchanged to <c>OcrAsync</c>.</param>
/// <param name="cancellationToken">Token forwarded to the copy and to the OCR pipeline.</param>
/// <returns>The string returned by <c>OcrAsync</c>.</returns>
public async Task<string> ProcessPdfStreamAsync(Stream pdfStream, OcrOptions options,
    CancellationToken cancellationToken = default)
{
    var scratchPath = Directory.CreateTempSubdirectory(TempDirPrefix).FullName;

    try
    {
        var stagedPdf = Path.Combine(scratchPath, "input.pdf");

        // Dedicated scope so the file stream is disposed before OcrAsync is invoked.
        {
            await using var sink = new FileStream(stagedPdf, FileMode.CreateNew, FileAccess.Write, FileShare.None);
            await pdfStream.CopyToAsync(sink, cancellationToken).ConfigureAwait(false);
        }

        var text = await OcrAsync(stagedPdf, scratchPath, options, cancellationToken).ConfigureAwait(false);
        return text;
    }
    finally
    {
        // Removes the staged PDF together with anything else left in the work directory.
        TryDeleteDirectory(scratchPath);
    }
}
5852
59- internal static void TryDeleteFile ( string path )
/// <summary>
/// Derives a <c>.png</c> and a <c>.txt</c> path inside <paramref name="workDir"/> from the PDF's
/// file name (without extension), then calls the provider's <c>RasterizePdfToPngAsync</c>
/// followed by <c>ExtractTextFromImageAsync</c>.
/// </summary>
/// <param name="pdfPath">Path of the PDF handed to the provider for rasterization.</param>
/// <param name="workDir">Directory in which the intermediate file paths are placed.</param>
/// <param name="options">OCR options passed to both provider calls.</param>
/// <param name="cancellationToken">Token passed to both provider calls.</param>
/// <returns>The string returned by the provider's <c>ExtractTextFromImageAsync</c>.</returns>
private async Task<string> OcrAsync(string pdfPath, string workDir, OcrOptions options,
    CancellationToken cancellationToken)
{
    var stem = Path.GetFileNameWithoutExtension(pdfPath);
    var imagePath = Path.Combine(workDir, stem + ".png");
    var textPath = Path.Combine(workDir, stem + ".txt");

    await _provider.RasterizePdfToPngAsync(pdfPath, imagePath, options, cancellationToken).ConfigureAwait(false);

    var text = await _provider.ExtractTextFromImageAsync(imagePath, textPath, options, cancellationToken)
        .ConfigureAwait(false);
    return text;
}
64+
/// <summary>
/// Best-effort recursive removal of the directory at <paramref name="path"/>.
/// Does nothing when the directory does not exist.
/// </summary>
/// <remarks>
/// This method is invoked from <c>finally</c> blocks. An exception escaping from here would
/// replace an in-flight OCR exception or discard an already computed result, so I/O and
/// permission failures during deletion are deliberately swallowed.
/// </remarks>
/// <param name="path">Directory to delete together with all of its content.</param>
internal static void TryDeleteDirectory(string path)
{
    try
    {
        if (Directory.Exists(path))
        {
            Directory.Delete(path, recursive: true);
        }
    }
    catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
    {
        // Cleanup only: a locked or read-only file, or a directory that vanished between the
        // existence check and the delete, must not fail the caller. Leftovers stay in the
        // system temp folder.
    }
}
6470}
0 commit comments