1+ import logging
2+ import os
3+
4+ import pytest
5+ import requests
6+ from dotenv import load_dotenv
7+
8+ import jigsawstack
9+ from jigsawstack .exceptions import JigsawStackError
10+
# Load variables from a local .env file into the process environment so the
# API key lookup below can find them.
load_dotenv()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The sync and async clients are configured identically: same local API
# endpoint, same key read from the environment.
_client_settings = {
    "api_url": "http://localhost:3000/api/",
    "api_key": os.getenv("JIGSAWSTACK_API_KEY"),
}
jigsaw = jigsawstack.JigsawStack(**_client_settings)
async_jigsaw = jigsawstack.AsyncJigsawStack(**_client_settings)

# Image used by the image-based VOCR test cases.
IMAGE_URL = "https://jigsawstack.com/preview/vocr-example.jpg"

# PDF URL for testing page_range functionality
PDF_URL = "https://arxiv.org/pdf/1706.03762"
23+
# Image test cases shared by the sync and async test classes.
#
# Each entry is consumed as follows by the tests:
#   * "blob"    - when truthy, a URL whose bytes are downloaded and passed as
#                 the first argument, with "options" as the second argument.
#   * "params"  - otherwise, the dict passed directly as the only argument.
# Some blob cases omit the "params" key entirely; the tests read it with
# ``.get`` so both a missing key and an explicit ``None`` are accepted here.
TEST_CASES = [
    # --- minimal inputs -------------------------------------------------
    {
        "name": "with_url_only",
        "params": {"url": IMAGE_URL},
        "blob": None,
        "options": None,
    },
    {
        "name": "with_blob_only",
        "params": None,
        "blob": IMAGE_URL,
        "options": None,
    },
    # --- blob input with each supported prompt shape -------------------
    {
        "name": "with_string_prompt",
        "blob": IMAGE_URL,
        "options": {"prompt": "Extract all text from the image"},
    },
    {
        "name": "with_list_prompt",
        "blob": IMAGE_URL,
        "options": {
            "prompt": [
                "What is the main heading?",
                "Extract any dates mentioned",
                "What are the key points?",
            ],
        },
    },
    {
        "name": "with_dict_prompt",
        "blob": IMAGE_URL,
        "options": {
            "prompt": {
                "title": "Extract the main title",
                "content": "What is the main content?",
                "metadata": "Extract any metadata or additional information",
            },
        },
    },
    # --- URL input with a prompt ----------------------------------------
    {
        "name": "url_with_string_prompt",
        "params": {
            "url": IMAGE_URL,
            "prompt": "Summarize the text content",
        },
        "blob": None,
        "options": None,
    },
    {
        "name": "url_with_list_prompt",
        "params": {
            "url": IMAGE_URL,
            "prompt": ["Extract headers", "Extract body text"],
        },
        "blob": None,
        "options": None,
    },
]
83+
# PDF specific test cases
#
# Same entry layout as TEST_CASES. Every case here requests a "page_range",
# either inside "params" (URL input) or inside "options" (blob input).
PDF_TEST_CASES = [
    # --- URL input --------------------------------------------------------
    {
        "name": "pdf_with_page_range",
        "params": {
            "url": PDF_URL,
            "page_range": [1, 3],
            "prompt": "Extract text from these pages",
        },
        "blob": None,
        "options": None,
    },
    {
        "name": "pdf_single_page",
        "params": {
            "url": PDF_URL,
            "page_range": [1, 1],
            "prompt": "What is on the first page?",
        },
        "blob": None,
        "options": None,
    },
    # --- blob input (no "params" key on purpose) ----------------------------
    {
        "name": "pdf_blob_with_page_range",
        "blob": PDF_URL,
        "options": {
            "page_range": [1, 3],
            "prompt": "what is this about?",
        },
    },
]
115+
116+
class TestVOCRSync:
    """Test synchronous VOCR methods."""

    sync_test_cases = TEST_CASES
    pdf_test_cases = PDF_TEST_CASES

    # Upper bound, in seconds, for downloading a fixture file. Without it a
    # stalled download would hang the whole test run.
    _DOWNLOAD_TIMEOUT = 30

    @classmethod
    def _run_vocr(cls, test_case):
        """Call ``jigsaw.vision.vocr`` the way *test_case* describes.

        When the case has a truthy ``"blob"`` URL, its bytes are downloaded
        and passed together with the case's ``"options"``; otherwise the
        case's ``"params"`` dict is passed as the only argument.

        Raises:
            requests.HTTPError: if the fixture download returns an error
                status, so an error page is never sent as the document.
        """
        blob_url = test_case.get("blob")
        if not blob_url:
            return jigsaw.vision.vocr(test_case["params"])

        response = requests.get(blob_url, timeout=cls._DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        return jigsaw.vision.vocr(response.content, test_case.get("options", {}))

    @staticmethod
    def _page_range_requested(test_case):
        """Return True when the case asks for a page range in params or options.

        ``"params"`` / ``"options"`` may be missing or explicitly ``None``;
        both are treated as empty.
        """
        params = test_case.get("params") or {}
        options = test_case.get("options") or {}
        return bool(params.get("page_range") or options.get("page_range"))

    @pytest.mark.parametrize(
        "test_case", sync_test_cases, ids=[tc["name"] for tc in sync_test_cases]
    )
    def test_vocr(self, test_case):
        """Test synchronous VOCR with various inputs."""
        try:
            result = self._run_vocr(test_case)

            logger.info(
                "Test %s: Success=%s", test_case["name"], result.get("success")
            )

            # Verify response structure
            assert result["success"] is True
            # NOTE(review): only a prompt passed via "params" is checked here;
            # blob cases carry their prompt in "options" and are not covered
            # by this assertion - confirm whether that is intentional.
            if "prompt" in (test_case.get("params") or {}):
                assert "context" in result
            assert "width" in result
            assert "height" in result
            assert "has_text" in result
            assert "tags" in result
            assert isinstance(result["tags"], list)
            assert "sections" in result
            assert isinstance(result["sections"], list)

        except JigsawStackError as e:
            pytest.fail(f"Unexpected JigsawStackError in {test_case['name']}: {e}")

    @pytest.mark.parametrize(
        "test_case", pdf_test_cases, ids=[tc["name"] for tc in pdf_test_cases]
    )
    def test_vocr_pdf(self, test_case):
        """Test synchronous VOCR with PDF inputs."""
        try:
            result = self._run_vocr(test_case)

            # Verify response structure
            assert result["success"] is True
            if "prompt" in (test_case.get("params") or {}):
                assert "context" in result
            assert "total_pages" in result

            # page_range is only expected back when the case requested one.
            if self._page_range_requested(test_case):
                assert "page_range" in result
                assert isinstance(result["page_range"], list)

            logger.info(
                "Test %s: total_pages=%s",
                test_case["name"],
                result.get("total_pages"),
            )

        except JigsawStackError as e:
            pytest.fail(f"Unexpected JigsawStackError in {test_case['name']}: {e}")
182+
183+
class TestVOCRAsync:
    """Test asynchronous VOCR methods."""

    async_test_cases = TEST_CASES
    pdf_test_cases = PDF_TEST_CASES

    # Upper bound, in seconds, for downloading a fixture file. Without it a
    # stalled download would hang the whole test run.
    _DOWNLOAD_TIMEOUT = 30

    @classmethod
    async def _run_vocr(cls, test_case):
        """Await ``async_jigsaw.vision.vocr`` the way *test_case* describes.

        When the case has a truthy ``"blob"`` URL, its bytes are downloaded
        and passed together with the case's ``"options"``; otherwise the
        case's ``"params"`` dict is passed as the only argument.

        Raises:
            requests.HTTPError: if the fixture download returns an error
                status, so an error page is never sent as the document.
        """
        blob_url = test_case.get("blob")
        if not blob_url:
            return await async_jigsaw.vision.vocr(test_case["params"])

        # The download itself is a blocking call; only the VOCR request is
        # awaited.
        response = requests.get(blob_url, timeout=cls._DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        return await async_jigsaw.vision.vocr(
            response.content, test_case.get("options", {})
        )

    @staticmethod
    def _page_range_requested(test_case):
        """Return True when the case asks for a page range in params or options.

        ``"params"`` / ``"options"`` may be missing or explicitly ``None``;
        both are treated as empty.
        """
        params = test_case.get("params") or {}
        options = test_case.get("options") or {}
        return bool(params.get("page_range") or options.get("page_range"))

    @pytest.mark.parametrize(
        "test_case", async_test_cases, ids=[tc["name"] for tc in async_test_cases]
    )
    @pytest.mark.asyncio
    async def test_vocr_async(self, test_case):
        """Test asynchronous VOCR with various inputs."""
        try:
            result = await self._run_vocr(test_case)

            logger.info(
                "Test %s: Success=%s", test_case["name"], result.get("success")
            )

            # Verify response structure
            assert result["success"] is True
            # NOTE(review): only a prompt passed via "params" is checked here;
            # blob cases carry their prompt in "options" and are not covered
            # by this assertion - confirm whether that is intentional.
            if "prompt" in (test_case.get("params") or {}):
                assert "context" in result
            assert "width" in result
            assert "height" in result
            assert "has_text" in result
            assert "tags" in result
            assert isinstance(result["tags"], list)
            assert "sections" in result
            assert isinstance(result["sections"], list)

            # Log some details
            logger.info(
                "Test %s: has_text=%s, tags=%s",
                test_case["name"],
                result["has_text"],
                result["tags"][:3] if result["tags"] else [],
            )

        except JigsawStackError as e:
            pytest.fail(f"Unexpected JigsawStackError in {test_case['name']}: {e}")

    @pytest.mark.parametrize(
        "test_case", pdf_test_cases, ids=[tc["name"] for tc in pdf_test_cases]
    )
    @pytest.mark.asyncio
    async def test_vocr_pdf_async(self, test_case):
        """Test asynchronous VOCR with PDF inputs."""
        try:
            result = await self._run_vocr(test_case)

            logger.info(
                "Test %s: Success=%s", test_case["name"], result.get("success")
            )

            # Verify response structure
            assert result["success"] is True
            if "prompt" in (test_case.get("params") or {}):
                assert "context" in result
            assert "total_pages" in result  # PDF specific

            # Check if page_range is in response when requested
            if self._page_range_requested(test_case):
                assert "page_range" in result
                assert isinstance(result["page_range"], list)

            logger.info(
                "Test %s: total_pages=%s",
                test_case["name"],
                result.get("total_pages"),
            )

        except JigsawStackError as e:
            pytest.fail(f"Unexpected JigsawStackError in {test_case['name']}: {e}")
0 commit comments