-
Notifications
You must be signed in to change notification settings - Fork 3
tesseract_integration #656
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
d6b163c
88f199c
5ecde02
52d1afe
e783ed9
5cc96ec
c15f5ab
f645a68
40cfc2d
b7acfcd
456e238
89ee03b
c59c475
3f25405
7906f70
7c195c8
40f51be
5fa3014
24eb092
fc8e0c1
d05230a
57bee01
3b18e36
050163a
3f3ef52
5796e5f
227031d
721059b
a2033eb
87337fc
de68188
5dc1430
f6258b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -150,3 +150,43 @@ def none_to_false(x): | |
| is_ended = none_to_true(self.is_ended) # None for old checks => True, True->True, False->False | ||
| is_failed = none_to_false(self.is_failed) # None for old checks => False, True->True, False->False | ||
| return {'is_ended': is_ended, 'is_failed': is_failed} | ||
|
|
||
class Image:
    """Plain record describing one stored image and its metadata.

    The constructor copies known keys out of an optional dictionary;
    ``pack`` turns the instance back into a dictionary with the same keys.
    """

    def __init__(self, dictionary=None):
        """Populate the record from ``dictionary`` (``None`` or empty means no data).

        A missing ``_id`` is replaced by a newly generated ``ObjectId``; a
        string ``_id`` is converted to ``ObjectId``. ``caption`` defaults to
        an empty string, every other missing key becomes ``None``.
        """
        source = dictionary if dictionary else {}

        identifier = source.get('_id', ObjectId())
        self._id: ObjectId = ObjectId(identifier) if isinstance(identifier, str) else identifier

        # Identifiers of the owning check and document.
        self.check_id: str = source.get('check_id')
        self.document_id: str = source.get('document_id')

        # Image payload and descriptive metadata.
        self.caption: str = source.get('caption', '')
        self.image_data: bytes = source.get('image_data')
        self.image_size: tuple[int, int] = source.get('image_size')
        self.text: str = source.get('text')
        self.page: int = source.get('page')
        self.checksum: str = source.get('checksum')

        # Numeric text statistics (presumably filled in by the OCR step - confirm).
        self.text_density: float = source.get('text_density')
        self.symbols_percentage: float = source.get('symbols_percentage')

    def pack(self):
        """Return the record as a dictionary keyed by attribute name."""
        field_names = (
            "_id",
            "check_id",
            "document_id",
            "caption",
            "image_data",
            "image_size",
            "text",
            "page",
            "checksum",
            "text_density",
            "symbols_percentage",
        )
        return {field: getattr(self, field) for field in field_names}
|
|
||
class ParsedText(PackableWithId):
    """Record holding a file name together with its list of parsed chapters."""

    def __init__(self, dictionary=None):
        """Initialise the base class, then read ``filename`` and ``parsed_chapters``.

        ``dictionary`` is handed to the base initialiser unchanged; a missing
        or empty dictionary yields an empty file name and an empty chapter list.
        """
        super().__init__(dictionary)
        source = dictionary if dictionary else {}
        self.filename = source.get('filename', '')
        self.parsed_chapters = source.get('parsed_chapters', [])
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Чтобы заранее определить структуру "parsed_chapters" (а не вспоминать потом, как она выглядит и что в ней лежит), используйте pymodm.EmbeddedMongoModel (не будет создавать отдельную коллекцию, но позволит описать структуру одного раздела / списка разделов) - об этом тоже ранее говорил |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| from ..base_check import BaseReportCriterion, answer | ||
| import cv2 | ||
| import numpy as np | ||
|
|
||
class ImageQualityCheck(BaseReportCriterion):
    """Report criterion that flags images with a low sharpness or detail score.

    Every image of the checked file is decoded with OpenCV and scored by the
    variance of its Laplacian and by the entropy of its grayscale histogram.
    An image whose score falls below the configured minimum is listed in the
    failure message.
    """
    label = "Проверка качества изображений"
    _description = ''
    id = 'image_quality_check'

    # TODO: the min_laplacian and min_entropy defaults still need to be tuned.
    def __init__(self, file_info, min_laplacian=100, min_entropy=1):
        """Store the file's images and the two score thresholds.

        Args:
            file_info: passed unchanged to the base criterion initialiser.
            min_laplacian: lowest accepted Laplacian variance.
            min_entropy: lowest accepted histogram entropy.
        """
        super().__init__(file_info)
        self.images = self.file.images
        self.min_laplacian = min_laplacian
        self.min_entropy = min_entropy
        # Scores of the most recently analysed image (see find_params).
        self.laplacian_score = None
        self.entropy_score = None

    def check(self):
        """Score every image and return the criterion answer.

        Returns:
            ``answer(True, ...)`` when there are no images or all of them pass,
            ``answer(False, ...)`` listing every rejected image otherwise.
        """
        if not self.images:
            return answer(True, 'Изображения не найдены!')

        deny_list = []
        for img in self.images:
            img_cv = self._decode(img.image_data)
            if img_cv is None:
                deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.<br>")
                continue

            self.find_params(img_cv)

            if self.laplacian_score is None or self.entropy_score is None:
                deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.<br>")
                continue

            if self.laplacian_score < self.min_laplacian:
                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {self.laplacian_score:.2f} (минимум {self.min_laplacian:.2f}).<br>")

            if self.entropy_score < self.min_entropy:
                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {self.entropy_score:.2f} (минимум {self.min_entropy:.2f}).<br>")

        if deny_list:
            return answer(False, f'Изображения нечитаемы! <br>Попробуйте улучшить качество изображений, возможно они слишком размыты или зашумлены.<br>{"".join(deny_list)}')
        return answer(True, 'Изображения корректны!')

    @staticmethod
    def _decode(image_data):
        """Decode encoded image bytes into a BGR array; return None if impossible."""
        # Missing or empty payloads would make frombuffer/imdecode raise, so
        # they are reported as undecodable instead of aborting the whole check.
        if image_data is None or len(image_data) == 0:
            return None
        image_array = np.frombuffer(image_data, dtype=np.uint8)
        return cv2.imdecode(image_array, cv2.IMREAD_COLOR)

    def find_params(self, image):
        """Compute and store the Laplacian variance and entropy of ``image``.

        Args:
            image: decoded BGR image array, or ``None``.

        Returns:
            ``(laplacian_score, entropy_score)``; both are ``None`` when the
            image is missing or empty. The same values are stored on ``self``.
        """
        # Reset first so that a failed call never leaves the previous image's scores behind.
        self.laplacian_score = None
        self.entropy_score = None
        if image is None or image.size == 0:
            return None, None

        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        self.laplacian_score = cv2.Laplacian(gray_image, cv2.CV_64F).var()

        hist, _ = np.histogram(gray_image.flatten(), bins=256, range=[0, 256])
        hist = hist / hist.sum()
        # The small epsilon keeps log2 finite for empty histogram bins.
        self.entropy_score = -np.sum(hist * np.log2(hist + 1e-10))
        return self.laplacian_score, self.entropy_score
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| from ..base_check import BaseReportCriterion, answer | ||
|
|
||
|
|
||
class ImageTextCheck(BaseReportCriterion):
    """Report criterion that queues recognition of text found in the file's images.

    ``check`` does not evaluate the images itself: it submits the
    ``tesseract_recognize`` task asynchronously, records the task id through
    ``add_celery_tesseract_task`` and returns straight away.
    """
    label = "Проверка текста, считанного с изображений"
    _description = ''
    id = 'image_text_check'

    # Characters used as symbols_set when the caller does not supply one.
    _DEFAULT_SYMBOLS = "@#$%^&*~`‘|±§№¤¢£€{¥}©®™•¶÷×"

    # TODO: tune the values of symbols_set, max_symbols_percentage and max_text_density.
    def __init__(self, file_info, symbols_set=None, max_symbols_percentage=5, max_text_density=4):
        """Store the file's images and the parameters forwarded to the task.

        Args:
            file_info: passed unchanged to the base criterion initialiser.
            symbols_set: list of characters forwarded to the recognition task;
                ``None`` selects the built-in default set.
            max_symbols_percentage: forwarded to the recognition task.
            max_text_density: forwarded to the recognition task.
        """
        super().__init__(file_info)
        self.images = self.file.images
        # Build a fresh list per instance: a list literal in the signature
        # would be one object shared (and mutable) across all instances.
        self.symbols_set = list(self._DEFAULT_SYMBOLS) if symbols_set is None else symbols_set
        self.max_symbols_percentage = max_symbols_percentage
        self.max_text_density = max_text_density

    def check(self):
        """Submit the recognition task for this check and return the answer.

        Returns:
            ``answer(True, ...)`` in both cases: either the task was queued or
            the file has no images.
        """
        # Kept as function-level imports, as in the original code
        # (presumably to avoid a circular import - confirm).
        from app.tesseract_tasks import tesseract_recognize, callback_task
        from db.db_methods import add_celery_tesseract_task

        if not self.images:
            return answer(True, 'Изображения не найдены!')

        # The check id is read from the first image and used for the whole task.
        check_id = self.images[0].check_id
        tesseract_task = tesseract_recognize.apply_async(
            args=[check_id, self.symbols_set, self.max_symbols_percentage, self.max_text_density],
            link=callback_task.s(check_id),
            link_error=callback_task.s(check_id),
        )
        add_celery_tesseract_task(tesseract_task.id, check_id)
        return answer(True, 'Изображения проверяются!')
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Напоминаю о том, что новые модели БД надо делать через https://pymodm.readthedocs.io/en/latest/api/#pymodm.MongoModel