moevm · Dariiiii · Sep 27, 2024 · Sep 29, 2024 · Sep 30, 2024 · Feb 6, 2025
diff --git a/Dockerfile_worker_base b/Dockerfile_worker_base
@@ -9,4 +9,5 @@ ENV LANG=en_US.UTF-8
 ENV TZ=Europe/Moscow
 
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-RUN apt update && apt install -y g++ gcc libreoffice-writer libreoffice-impress default-jre libreoffice-java-common
+
+RUN apt update && apt install -y g++ gcc libreoffice-writer libreoffice-impress default-jre libreoffice-java-common tesseract-ocr tesseract-ocr-rus
diff --git a/app/db/db_methods.py b/app/db/db_methods.py
@@ -1,13 +1,14 @@
 from datetime import datetime
 from os.path import basename
 
+import hashlib
 import pymongo
 from bson import ObjectId
 from gridfs import GridFSBucket, NoFile, errors as gridfs_errors
 from pymongo import MongoClient, errors as pymongo_errors
 from utils import convert_to
 
-from .db_types import User, Presentation, Check, Consumers, Logs
+from .db_types import User, Presentation, Check, Consumers, Logs, Image
 
 client = MongoClient("mongodb://mongodb:27017")
 db = client['dis-db']
@@ -18,14 +19,61 @@
 checks_collection = db['checks']
 consumers_collection = db['consumers']
 criteria_pack_collection = db['criteria_pack']
+parsed_texts_collection = db['parsed_texts']
 logs_collection = db.create_collection(
     'logs', capped=True, size=5242880) if not db['logs'] else db['logs']
 celery_check_collection = db['celery_check']  # collection for mapping celery_task to check
+# collection for mapping celery tesseract task to check
+celery_tesseract_collection = db['celery_tesseract']
+images_collection = db['images']  # collection for storing images
 
 
 def get_client():
     return client
 
+def get_image(image_id):
+    """Look up one image document by ``_id``.
+
+    Returns the document wrapped in ``Image``, or ``None`` when no document matches.
+    """
+    document = images_collection.find_one({'_id': image_id})
+    return None if document is None else Image(document)
+
+def get_images_by_check_id(check_id):
+    """Return all stored images of a check as a list of ``Image`` objects.
+
+    ``check_id`` is converted with ``str()`` before querying, so only documents
+    whose ``check_id`` field holds that string form are matched.
+
+    The result is an empty list when nothing matches: ``find()`` always returns
+    a cursor (never ``None``), so there is no ``None`` result to handle.
+    """
+    cursor = images_collection.find({'check_id': str(check_id)})
+    return [Image(document) for document in cursor]
+
+def save_image_to_db(check_id, image_data, caption, image_size, document_id=None, text=None, page=None, checksum=None, text_density=None, symbols_percentage=None):
+    """Insert a new image document and return its inserted ``_id``.
+
+    When ``checksum`` is falsy it is derived from ``image_data`` with
+    ``calculate_image_checksum``. All other arguments are stored as given.
+    """
+    if not checksum:
+        checksum = calculate_image_checksum(image_data)
+    fields = {
+        'check_id': check_id,
+        'document_id': document_id,
+        'image_data': image_data,
+        'caption': caption,
+        'image_size': image_size,
+        'text': text,
+        'page': page,
+        'checksum': checksum,
+        'text_density': text_density,
+        'symbols_percentage': symbols_percentage,
+    }
+    inserted = images_collection.insert_one(Image(fields).pack())
+    return inserted.inserted_id
+
+def update_image(image):
+    """Replace the stored document whose ``_id`` equals ``image._id`` with ``image.pack()``.
+
+    Returns the truthiness of what ``find_one_and_replace`` gives back,
+    i.e. False when no document matched.
+    """
+    previous = images_collection.find_one_and_replace({'_id': image._id}, image.pack())
+    return bool(previous)
+
+def calculate_image_checksum(image_bytes):
+    """Return the SHA-256 hex digest of ``image_bytes``, or ``None`` when it is empty or missing."""
+    if not image_bytes:
+        return None
+    return hashlib.sha256(image_bytes).hexdigest()
+
+def is_checksum_in_db(checksum):
+    """Tell whether any stored image has this checksum.
+
+    A falsy checksum is never looked up and yields False.
+    """
+    return bool(checksum) and images_collection.find_one({"checksum": checksum}) is not None
 
 # Returns user if user was created and None if already exists
 def add_user(username, password_hash='', is_LTI=False):
@@ -145,6 +193,12 @@ def add_check(file_id, check):
 def update_check(check):
     return bool(checks_collection.find_one_and_replace({'_id': check._id}, check.pack()))
 
+def add_parsed_text(check_id, parsed_text):
+    """Upsert a parsed text keyed by its filename and register its id.
+
+    The document matching ``parsed_text.filename`` is updated with
+    ``parsed_text.pack()`` (or created), then its ``_id`` is pushed to the
+    ``parsed_texts`` array of the ``files_info`` document whose ``_id`` equals
+    ``check_id``. Returns the parsed text document id.
+
+    NOTE(review): ``$push`` appends unconditionally, so calling this twice for
+    the same filename and check presumably stores the id twice - confirm
+    whether ``$addToSet`` is wanted.
+    """
+    by_filename = {'filename': parsed_text.filename}
+    result = parsed_texts_collection.update_one(by_filename, {'$set': parsed_text.pack()}, upsert=True)
+    parsed_texts_id = result.upserted_id
+    if not parsed_texts_id:
+        # nothing was inserted, so an existing document was matched: look its id up
+        parsed_texts_id = parsed_texts_collection.find_one(by_filename)['_id']
+    files_info_collection.update_one({'_id': check_id}, {"$push": {'parsed_texts': parsed_texts_id}})
+    return parsed_texts_id
 
 def get_pdf_id(file_id=None):
     if not file_id: file_id = ObjectId()
@@ -462,3 +516,40 @@ def get_celery_task(celery_task_id):
 
 def get_celery_task_by_check(check_id):
     return celery_check_collection.find_one({'check_id': check_id})
+
+
+def get_celery_task_status_by_check(check_id):
+    """Return True when the celery task record of this check has a ``finished_at`` field, else False."""
+    task = get_celery_task_by_check(check_id)
+    return bool(task) and 'finished_at' in task
+
+
+def add_celery_tesseract_task(celery_tesseract_task_id, check_id):
+    """Record a tesseract celery task for a check, stamped with the current time.
+
+    Returns the ``_id`` of the inserted record.
+    """
+    record = {
+        'celery_tesseract_task_id': celery_tesseract_task_id,
+        'check_id': check_id,
+        'started_at': datetime.now(),
+    }
+    return celery_tesseract_collection.insert_one(record).inserted_id
+
+
+def get_celery_tesseract_task_status_by_check(check_id):
+    """Return True when the tesseract task record of this check has a ``finished_at`` field, else False."""
+    task = get_celery_tesseract_task_by_check(check_id)
+    return bool(task) and 'finished_at' in task
+
+
+def mark_celery_tesseract_task_as_finished_by_check(check_id, tesseract_result, finished_time=None):
+    """Store the result and finish time of the tesseract task of a check.
+
+    Does nothing and returns ``None`` when no task record exists for
+    ``check_id``. ``finished_time`` defaults to the current time;
+    ``processing_time`` is stored as the number of seconds between the
+    record's ``started_at`` and the finish time. Returns the result of
+    ``update_one``.
+    """
+    task = get_celery_tesseract_task_by_check(check_id)
+    if not task:
+        return None
+    finished_at = datetime.now() if finished_time is None else finished_time
+    elapsed = finished_at - task['started_at']
+    changes = {
+        'finished_at': finished_at,
+        'tesseract_result': tesseract_result,
+        'processing_time': elapsed.total_seconds(),
+    }
+    return celery_tesseract_collection.update_one({'check_id': check_id}, {'$set': changes})
+
+
+def get_celery_tesseract_task(celery_tesseract_task_id):
+    """Return the tesseract task record with this celery task id, or ``None`` if there is none."""
+    query = {'celery_tesseract_task_id': celery_tesseract_task_id}
+    return celery_tesseract_collection.find_one(query)
+
+
+def get_celery_tesseract_task_by_check(check_id):
+    """Return the tesseract task record stored for this check id, or ``None`` if there is none."""
+    query = {'check_id': check_id}
+    return celery_tesseract_collection.find_one(query)
diff --git a/app/db/db_types.py b/app/db/db_types.py
@@ -150,3 +150,43 @@ def none_to_false(x):
         is_ended = none_to_true(self.is_ended)  # None for old checks => True, True->True, False->False
         is_failed = none_to_false(self.is_failed)  # None for old checks => False, True->True, False->False
         return {'is_ended': is_ended, 'is_failed': is_failed}
+
+class Image:
+    """An image record: raw bytes, caption and the values computed for it."""
+
+    def __init__(self, dictionary=None):
+        """Fill the attributes from ``dictionary``.
+
+        A missing ``_id`` gets a fresh ``ObjectId``; a string ``_id`` is
+        converted to ``ObjectId``. Other missing keys become ``None``
+        (``''`` for ``caption``).
+        """
+        source = dictionary or {}
+        raw_id = source.get('_id', ObjectId())
+        self._id: ObjectId = ObjectId(raw_id) if isinstance(raw_id, str) else raw_id
+
+        self.check_id: str = source.get('check_id')
+        self.document_id: str = source.get('document_id')
+        self.caption: str = source.get('caption', '')
+        self.image_data: bytes = source.get('image_data')
+        self.image_size: tuple[int, int] = source.get('image_size')
+        self.text: str = source.get('text')
+        self.page: int = source.get('page')
+        self.checksum: str = source.get('checksum')
+        self.text_density: float = source.get('text_density')
+        self.symbols_percentage: float = source.get('symbols_percentage')
+
+    def pack(self):
+        """Return all attributes, ``_id`` included, as a plain dict."""
+        return dict(
+            _id=self._id,
+            check_id=self.check_id,
+            document_id=self.document_id,
+            caption=self.caption,
+            image_data=self.image_data,
+            image_size=self.image_size,
+            text=self.text,
+            page=self.page,
+            checksum=self.checksum,
+            text_density=self.text_density,
+            symbols_percentage=self.symbols_percentage,
+        )
+
+class ParsedText(PackableWithId):
+    """Parsed text of a file: its ``filename`` and its ``parsed_chapters``."""
+
+    def __init__(self, dictionary=None):
+        """Read ``filename`` (default ``''``) and ``parsed_chapters`` (default ``[]``) from ``dictionary``."""
+        # the base class receives the argument untouched, including None
+        super().__init__(dictionary)
+        fields = dictionary or {}
+        self.filename = fields.get('filename', '')
+        self.parsed_chapters = fields.get('parsed_chapters', [])
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
@@ -27,6 +27,8 @@
 ]
 BASE_REPORT_CRITERION = [
     ["simple_check"],
+    ["image_text_check"],
+    ['image_quality_check'],
     ["banned_words_in_literature"],
     ["page_counter"],
     ["image_share_check"],

diff --git a/app/main/checks/report_checks/image_quality_check.py b/app/main/checks/report_checks/image_quality_check.py
@@ -0,0 +1,54 @@
+from ..base_check import BaseReportCriterion, answer
+import cv2
+import numpy as np
+
+class ImageQualityCheck(BaseReportCriterion):
+    """Report criterion that flags images with a low sharpness or entropy score.
+
+    Every image is decoded with OpenCV and scored by the variance of its
+    Laplacian and by the Shannon entropy of its grayscale histogram. An image
+    scoring below ``min_laplacian`` or ``min_entropy`` is reported.
+    """
+    label = "Проверка качества изображений"
+    _description = ''
+    id = 'image_quality_check'
+
+    # TODO: suitable values for min_laplacian and min_entropy still have to be chosen
+    def __init__(self, file_info, min_laplacian=100, min_entropy=1):
+        super().__init__(file_info)
+        self.images = self.file.images
+        self.min_laplacian = min_laplacian
+        self.min_entropy = min_entropy
+        # scores of the most recently analysed image; None when it could not be analysed
+        self.laplacian_score = None
+        self.entropy_score = None
+
+    def check(self):
+        """Score every image and return a failing answer listing the ones below the thresholds.
+
+        Returns a passing answer when there are no images or when every image
+        passes both thresholds.
+        """
+        if not self.images:
+            return answer(True, 'Изображения не найдены!')
+
+        deny_list = []
+        for img in self.images:
+            unreadable = f"Изображение с подписью '{img.caption}' не может быть обработано.<br>"
+
+            # None breaks np.frombuffer and an empty buffer is rejected by
+            # cv2.imdecode, so missing data is reported instead of decoded
+            if not img.image_data:
+                deny_list.append(unreadable)
+                continue
+
+            image_array = np.frombuffer(img.image_data, dtype=np.uint8)
+            img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
+            if img_cv is None:
+                deny_list.append(unreadable)
+                continue
+
+            self.find_params(img_cv)
+            if self.laplacian_score is None or self.entropy_score is None:
+                deny_list.append(unreadable)
+                continue
+
+            if self.laplacian_score < self.min_laplacian:
+                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {self.laplacian_score:.2f} (минимум {self.min_laplacian:.2f}).<br>")
+
+            if self.entropy_score < self.min_entropy:
+                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {self.entropy_score:.2f} (минимум {self.min_entropy:.2f}).<br>")
+
+        if deny_list:
+            return answer(False, f'Изображения нечитаемы! <br>Попробуйте улучшить качество изображений, возможно они слишком размыты или зашумлены.<br>{"".join(deny_list)}')
+        return answer(True, 'Изображения корректны!')
+
+    def find_params(self, image):
+        """Compute ``laplacian_score`` and ``entropy_score`` for a decoded BGR image.
+
+        Both attributes are reset first, so the scores of a previously analysed
+        image can never be mistaken for the scores of this one.
+
+        Returns the pair ``(laplacian_score, entropy_score)``; both are ``None``
+        when ``image`` is ``None`` or empty.
+        """
+        self.laplacian_score = None
+        self.entropy_score = None
+        if image is None or image.size == 0:
+            return None, None
+
+        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        self.laplacian_score = cv2.Laplacian(gray_image, cv2.CV_64F).var()
+
+        hist, _ = np.histogram(gray_image.flatten(), bins=256, range=[0, 256])
+        hist = hist / hist.sum()
+        # the small constant keeps log2 finite for empty histogram bins
+        self.entropy_score = -np.sum(hist * np.log2(hist + 1e-10))
+        return self.laplacian_score, self.entropy_score
diff --git a/app/main/checks/report_checks/image_text_check.py b/app/main/checks/report_checks/image_text_check.py
@@ -0,0 +1,28 @@
+from ..base_check import BaseReportCriterion, answer
+
+
+class ImageTextCheck(BaseReportCriterion):
+    """Report criterion that queues text recognition of the file's images as a celery task.
+
+    The criterion itself never fails: it only starts ``tesseract_recognize``
+    and records the started task for the check.
+    """
+    label = "Проверка текста, считанного с изображений"
+    _description = ''
+    id = 'image_text_check'
+    # characters used for symbols_set when the caller passes none
+    _DEFAULT_SYMBOLS = "@#$%^&*~`‘|±§№¤¢£€{¥}©®™•¶÷×"
+
+    # TODO: values for symbols_set, max_symbols_percentage and max_text_density still have to be chosen
+    def __init__(self, file_info, symbols_set=None, max_symbols_percentage=5, max_text_density=4):
+        super().__init__(file_info)
+        self.images = self.file.images
+        # a fresh list per instance, so instances never share one mutable default
+        self.symbols_set = list(self._DEFAULT_SYMBOLS) if symbols_set is None else symbols_set
+        self.max_symbols_percentage = max_symbols_percentage
+        self.max_text_density = max_text_density
+
+    def check(self):
+        """Start the recognition task for the check of the first image and return a passing answer.
+
+        When there are no images nothing is queued.
+        """
+        # imported lazily - presumably to avoid a circular import; TODO confirm
+        from app.tesseract_tasks import tesseract_recognize, callback_task
+        from db.db_methods import add_celery_tesseract_task
+        if not self.images:
+            return answer(True, 'Изображения не найдены!')
+
+        # the check id is taken from the first image only
+        check_id = self.images[0].check_id
+        tesseract_task = tesseract_recognize.apply_async(
+            args=[check_id, self.symbols_set, self.max_symbols_percentage, self.max_text_density],
+            link=callback_task.s(check_id),
+            link_error=callback_task.s(check_id)
+        )
+        add_celery_tesseract_task(tesseract_task.id, check_id)
+        return answer(True, 'Изображения проверяются!')
diff --git a/app/main/parser.py b/app/main/parser.py
@@ -8,18 +8,24 @@
 from main.reports.md_uploader import MdUploader
 from utils import convert_to
 
+
 logger = logging.getLogger('root_logger')
 
+def parse(filepath, pdf_filepath, check_id):
 
-def parse(filepath, pdf_filepath):
     tmp_filepath = filepath.lower()
     try:
         if tmp_filepath.endswith(('.odp', '.ppt', '.pptx')):
             new_filepath = filepath
             if tmp_filepath.endswith(('.odp', '.ppt')):
                 logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
                 new_filepath = convert_to(filepath, target_format='pptx')
-            file_object = PresentationPPTX(new_filepath)
+
+            presentation = PresentationPPTX(new_filepath)
+            presentation.extract_images_with_captions(check_id)
+            file_object = presentation
+
+
         elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
             new_filepath = filepath
             if tmp_filepath.endswith(('.doc', '.odt')):
@@ -29,6 +35,7 @@ def parse(filepath, pdf_filepath):
             docx = DocxUploader()
             docx.upload(new_filepath, pdf_filepath)
             docx.parse()
+            docx.extract_images_with_captions(check_id)
             file_object = docx
 
         elif tmp_filepath.endswith('.md' ):
@@ -54,4 +61,4 @@ def save_to_temp_file(file):
     temp_file.write(file.read())
     temp_file.close()
     file.seek(0)
-    return temp_file.name
+    return temp_file.name
diff --git a/app/main/presentations/pptx/presentation_pptx.py b/app/main/presentations/pptx/presentation_pptx.py
@@ -1,4 +1,7 @@
+from io import BytesIO
+
 from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE
 
 from .slide_pptx import SlidePPTX
 from ..presentation_basic import PresentationBasic
@@ -17,3 +20,39 @@ def add_slides(self):
 
     def __str__(self):
         return super().__str__()
+
+    def extract_images_with_captions(self, check_id):
+        """Save every picture of the presentation to the database with a caption.
+
+        The caption of a picture is the first non-empty text found among the
+        shapes of its slide; pictures on slides without any text are not saved.
+        Each picture is stored with its raw bytes and its pixel size, since
+        ``save_image_to_db`` requires ``image_size``.
+
+        :param check_id: id of the check the images are stored under
+        """
+        from app.db.db_methods import save_image_to_db
+
+        for slide in self.slides:
+            # slide.slide is used to reach the shapes of the current slide
+            shapes = slide.slide.shapes
+
+            # first non-empty text on the slide (presumably the caption)
+            texts = (shape.text.strip() for shape in shapes if shape.has_text_frame)
+            caption_text = next((text for text in texts if text), None)
+            if caption_text is None:
+                continue
+
+            for shape in shapes:
+                if shape.shape_type != MSO_SHAPE_TYPE.PICTURE:
+                    continue
+                image = shape.image
+                # image.size is the (width, height) of the picture in pixels
+                save_image_to_db(check_id, image.blob, caption_text, image.size)
diff --git a/app/main/reports/document_uploader.py b/app/main/reports/document_uploader.py
@@ -12,6 +12,7 @@ def __init__(self):
         self.literature_page = 0
         self.first_lines = []
         self.page_count = 0
+        self.images = []
 
     @abstractmethod
     def upload(self):