Refactor image transform code and fix several edge-case resizing/cropping bugs, avoid downloading the same file multiple times, add unit tests and extensive debug logging

tykling · tykling · commit 3f054bd6b5c3 · 2024-12-03T20:06:05.000+01:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -16,4 +16,5 @@ repos:
           - "exifread"
           - "httpx"
           - "pillow"
+        exclude: "tests/.*"
 ...
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,6 +28,10 @@ dev = [
     "pre-commit==4.0.0",
     "setuptools-scm==8.0.4",
 ]
+test = [
+    "pytest==8.3.3",
+    "pytest-mock==3.14.0",
+]
 
 [project.urls]
 homepage = "https://github.com/bornhack/bma-client-lib-python"
@@ -63,3 +67,14 @@ line-length = 120
 
 [tool.ruff.lint.pydocstyle]
 convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+"*/tests/*" = [
+    "S101", # https://docs.astral.sh/ruff/rules/assert/
+    "ANN002", # https://docs.astral.sh/ruff/rules/missing-type-args/
+    "ANN003", # https://docs.astral.sh/ruff/rules/missing-type-kwargs/
+]
+
+[tool.mypy]
+mypy_path = "src"
+strict = true
diff --git a/src/bma_client_lib/bma_client.py b/src/bma_client_lib/bma_client.py
@@ -2,7 +2,6 @@
 
 import json
 import logging
-import math
 import time
 import uuid
 from fractions import Fraction
@@ -39,7 +38,7 @@
 
 # get version
 try:
-    __version__ = version("bma-client-lib")
+    __version__ = version("bma_client_lib")
 except PackageNotFoundError:
     __version__ = "0.0.0"
 
@@ -79,6 +78,11 @@ def __init__(
         self.skip_exif_tags = SKIP_EXIF_TAGS
         self.get_server_settings()
         self.__version__ = __version__
+        # build client object
+        self.clientjson = {
+            "client_uuid": self.uuid,
+            "client_version": f"bma-client-lib {__version__}",
+        }
 
     def update_access_token(self) -> None:
         """Set or update self.access_token using self.refresh_token."""
@@ -104,54 +108,57 @@ def get_server_settings(self) -> dict[str, dict[str, dict[str, list[str]]]]:
             self.base_url + "/api/v1/json/jobs/settings/",
         ).raise_for_status()
         self.settings = r.json()["bma_response"]
-        return r.json()
+        return self.settings  # type: ignore[no-any-return]
 
     def get_jobs(self, job_filter: str = "?limit=0") -> list[Job]:
         """Get a filtered list of the jobs this user has access to."""
         r = self.client.get(self.base_url + f"/api/v1/json/jobs/{job_filter}").raise_for_status()
         response = r.json()["bma_response"]
         logger.debug(f"Returning {len(response)} jobs with filter {job_filter}")
-        return response
+        return response  # type: ignore[no-any-return]
 
     def get_file_info(self, file_uuid: uuid.UUID) -> dict[str, str]:
         """Get metadata for a file."""
         r = self.client.get(self.base_url + f"/api/v1/json/files/{file_uuid}/").raise_for_status()
-        return r.json()["bma_response"]
+        return r.json()["bma_response"]  # type: ignore[no-any-return]
 
     def download(self, url: str, path: Path) -> Path:
         """Download a file to a path."""
         r = self.client.get(url).raise_for_status()
         logger.debug(f"Done downloading {len(r.content)} bytes from {url}, saving to {path}")
+        path.parent.mkdir(parents=True, exist_ok=True)
         with path.open("wb") as f:
             f.write(r.content)
         return path
 
     def download_job_source(self, job: Job) -> Path:
         """Download the file needed to do a job."""
+        # skip the leading slash when using url as a local path
+        path = self.path / job.source_url[1:]
+        if path.exists():
+            # file was downloaded previously
+            return path
+        # get the file
         return self.download(
             url=self.base_url + job.source_url,
-            path=self.path / job.source_filename,
+            path=path,
         )
 
     def get_job_assignment(self, job_filter: str = "") -> list[Job]:
         """Ask for new job(s) from the API."""
         url = self.base_url + "/api/v1/json/jobs/assign/"
         if job_filter:
             url += job_filter
-        data = {
-            "client_uuid": self.uuid,
-            "client_version": f"bma-client-lib {__version__}",
-        }
         try:
-            r = self.client.post(url, json=data).raise_for_status()
+            r = self.client.post(url, json=self.clientjson).raise_for_status()
             response = r.json()["bma_response"]
         except httpx.HTTPStatusError as e:
             if e.response.status_code == HTTPStatus.NOT_FOUND:
                 response = []
             else:
                 raise
         logger.debug(f"Returning {len(response)} assigned jobs")
-        return response
+        return response  # type: ignore[no-any-return]
 
     def unassign_job(self, job: Job) -> bool:
         """Unassign a job."""
@@ -163,7 +170,7 @@ def unassign_job(self, job: Job) -> bool:
 
     def upload_file(self, path: Path, attribution: str, file_license: str) -> dict[str, dict[str, str]]:
         """Upload a file."""
-        # get mimetype
+        # get mimetype using magic on the first 2kb of the file
         with path.open("rb") as fh:
             mimetype = magic.from_buffer(fh.read(2048), mime=True)
 
@@ -209,11 +216,11 @@ def upload_file(self, path: Path, attribution: str, file_license: str) -> dict[s
             # doit
             r = self.client.post(
                 self.base_url + "/api/v1/json/files/upload/",
-                data={"metadata": json.dumps(data)},
+                data={"f_metadata": json.dumps(data), "client": json.dumps(self.clientjson)},
                 files=files,
                 timeout=30,
             )
-            return r.json()
+            return r.json()  # type: ignore[no-any-return]
 
     def handle_job(self, job: Job) -> None:
         """Do the thing and upload the result."""
@@ -236,7 +243,7 @@ def handle_job(self, job: Job) -> None:
                 raise JobNotSupportedError(job=job)
             source = self.download_job_source(job)
             result = self.create_thumbnail_source(job=job)
-            filename = job.source_filename
+            filename = job.source_url
 
         else:
             raise JobNotSupportedError(job=job)
@@ -264,6 +271,7 @@ def write_and_upload_result(self, job: Job, result: "JobResult", filename: str)
                     kwargs["append_images"] = image[1:]
                     kwargs["save_all"] = True
                 image[0].save(buf, format=job.filetype, exif=exif, **kwargs)
+                metadata = {"width": image[0].width, "height": image[0].height, "mimetype": job.mimetype}
 
             elif isinstance(job, ImageExifExtractionJob):
                 logger.debug(f"Got exif data {result}")
@@ -329,6 +337,7 @@ def handle_image_conversion_job(
         logger.debug(f"Desired image size is {size}, aspect ratio: {ratio} ({orig_str}), converting image...")
         start = time.time()
         images = transform_image(original_img=image, crop_w=size[0], crop_h=size[1])
+        logger.debug(f"Result image size is {images[0].width}*{images[0].height}")
         logger.debug(f"Converting image size and AR took {time.time() - start} seconds")
 
         logger.debug("Done, returning result...")
@@ -340,20 +349,15 @@ def upload_job_result(
         buf: "BytesIO",
         filename: str,
         metadata: dict[str, str | int] | None = None,
-    ) -> dict:
+    ) -> dict[str, str]:
         """Upload the result of a job."""
         size = buf.getbuffer().nbytes
         logger.debug(f"Uploading {size} bytes result for job {job.job_uuid} with filename {filename}")
         start = time.time()
         files = {"f": (filename, buf)}
-        # build client object
-        client = {
-            "client_uuid": self.uuid,
-            "client_version": "bma-client-lib {__version__}",
-        }
-        data = {"client": json.dumps(client)}
-        if isinstance(job, ThumbnailSourceJob):
-            # ThumbnailSourceJob needs a metadata object as well
+        data = {"client": json.dumps(self.clientjson)}
+        if isinstance(job, ThumbnailJob | ThumbnailSourceJob | ImageConversionJob):
+            # Image-generating jobs need a metadata object as well
             data["metadata"] = json.dumps(metadata)
         # doit
         r = self.client.post(
@@ -363,7 +367,7 @@ def upload_job_result(
         ).raise_for_status()
         t = time.time() - start
         logger.debug(f"Done, it took {t} seconds to upload {size} bytes, speed {round(size/t)} bytes/sec")
-        return r.json()
+        return r.json()  # type: ignore[no-any-return]
 
     def get_exif(self, fname: Path) -> "ExifExtractionJobResult":
         """Return a dict with exif data as read by exifread from the file.
@@ -398,27 +402,9 @@ def create_album(self, file_uuids: list[uuid.UUID], title: str, description: str
             "description": description,
         }
         r = self.client.post(url, json=data).raise_for_status()
-        return r.json()["bma_response"]
+        return r.json()["bma_response"]  # type: ignore[no-any-return]
 
     def create_thumbnail_source(self, job: ThumbnailSourceJob) -> "ThumbnailSourceJobResult":
         """Create a thumbnail source for this file."""
-        info = self.get_file_info(file_uuid=job.basefile_uuid)
-        if info["filetype"] == "image":
-            # use a max 500px wide version of the image as thumbnail source
-            path = self.path / info["filename"]
-            original_ratio = Fraction(int(info["width"]), int(info["height"]))
-            height = math.floor(500 / original_ratio)
-            # just call the regular image conversion method to make a thumbnail
-            return self.handle_image_conversion_job(
-                job=ImageConversionJob(
-                    **job.__dict__,
-                    width=500,
-                    height=height,
-                    custom_aspect_ratio=False,
-                    filetype="WEBP",
-                    mimetype="image/webp",
-                ),
-                orig=path,
-            )
         # unsupported filetype
         raise JobNotSupportedError(job=job)
diff --git a/src/bma_client_lib/datastructures.py b/src/bma_client_lib/datastructures.py
@@ -19,8 +19,6 @@ class BaseJob:
     client_version: str
     finished: bool
     source_url: str
-    source_filename: str
-    source_mimetype: str
     schema_name: str
 
 
diff --git a/src/bma_client_lib/pillow_resize_and_crop.py b/src/bma_client_lib/pillow_resize_and_crop.py
@@ -1,64 +1,106 @@
 """Pillow cropping with sequence (gif, webp) support.
 
-Borrowed from https://gist.github.com/muratgozel/ce1aa99f97fc1a99b3f3ec90cf77e5f5
+Originally based on https://gist.github.com/muratgozel/ce1aa99f97fc1a99b3f3ec90cf77e5f5
 """
 
-from math import fabs, floor
+import logging
+from fractions import Fraction
+from math import floor
 
 from PIL import Image, ImageFile, ImageSequence
 
+logger = logging.getLogger("bma_client")
 
-def transform_image(original_img: Image.Image, crop_w: int, crop_h: int, center_point: tuple[int, int] = (0.5,0.5)) -> list[Image.Image | ImageFile.ImageFile]:
-    """Resizes and crops the image to the specified crop_w and crop_h if necessary.
+
+def transform_image(
+    original_img: Image.Image, crop_w: int, crop_h: int, center_point: tuple[float, float] = (0.5, 0.5)
+) -> list[Image.Image | ImageFile.ImageFile]:
+    """Shrinks and crops the image to the specified crop_w and crop_h if necessary.
 
     Works with multi frame gif and webp images.
 
     Args:
       original_img(Image.Image): is the image instance created by pillow ( Image.open(filepath) )
-      crop_w(int): is the width in pixels for the image that will be resized and cropped
-      crop_h(int): is the height in pixels for the image that will be resized and cropped
+      crop_w(int): is the desired width in pixels
+      crop_h(int): is the desired height in pixels
+      center_point(tuple[float,float]): The center point of cropping, as fractions (0.0-1.0) of width and height.
 
     returns:
-      Instance of an Image or list of frames which they are instances of an Image individually
+      List of one or more Image instances
     """
     img_w, img_h = (original_img.size[0], original_img.size[1])
     # sequence?
     n_frames = getattr(original_img, "n_frames", 1)
 
     def transform_frame(frame: Image.Image) -> Image.Image | ImageFile.ImageFile:
         """Resizes and crops the individual frame in the image."""
-        # resize the image to the specified height if crop_w is null in the recipe
-        if crop_w is None:
-            if crop_h == img_h:
-                return frame
-            new_w = floor(img_w * crop_h / img_h)
-            new_h = crop_h
-            return frame.resize((new_w, new_h), resample=Image.Resampling.LANCZOS)
-
         # return the original image if crop size is equal to img size
         if crop_w == img_w and crop_h == img_h:
+            logger.debug(
+                f"Image size and requested size are the same ({crop_w}*{crop_h}), returning image without resizing"
+            )
             return frame
 
-        # first resize to get most visible area of the image and then crop
-        w_diff = fabs(crop_w - img_w)
-        h_diff = fabs(crop_h - img_h)
-        enlarge_image = bool(crop_w > img_w or crop_h > img_h)
-        shrink_image = bool(crop_w < img_w or crop_h < img_h)
-
-        if enlarge_image is True:
-            new_w = floor(crop_h * img_w / img_h) if h_diff > w_diff else crop_w
-            new_h = floor(crop_w * img_h / img_w) if h_diff < w_diff else crop_h
-
-        if shrink_image is True:
-            new_w = crop_w if h_diff > w_diff else floor(crop_h * img_w / img_h)
-            new_h = crop_h if h_diff < w_diff else floor(crop_w * img_h / img_w)
-
-        left = (new_w - crop_w) * center_point[0]
+        # resizing is required before cropping only if both image dimensions are bigger than the crop size
+        if crop_w < img_w and crop_h < img_h:
+            # if calculated height is bigger than requested crop height
+            if floor(crop_w * img_h / img_w) > crop_h:
+                # then resize image to requested crop width keeping proportional height
+                new_w = crop_w
+                new_h = floor(crop_w * img_h / img_w)
+            else:
+                # else resize the image to requested crop height keeping proportional width
+                new_w = floor(crop_h * img_w / img_h)
+                new_h = crop_h
+        else:
+            # keep size since one or both dimensions are <= crop size
+            new_w = img_w
+            new_h = img_h
+
+        # get crop coordinates
+        left = floor((new_w - crop_w) * center_point[0])
+        top = floor((new_h - crop_h) * center_point[1])
         right = left + crop_w
-        top = (new_h - crop_h) * center_point[1]
         bottom = top + crop_h
 
-        return frame.resize((new_w, new_h), resample=Image.Resampling.LANCZOS).crop((left, top, right, bottom))
+        orig_ratio = Fraction(img_w, img_h)
+        new_ratio = Fraction(new_w, new_h)
+        logger.debug(
+            f"Original size is {img_w}*{img_h} ({orig_ratio}), requested size is {crop_w}*{crop_h}, resizing "
+            f"to {new_w}*{new_h} ({new_ratio}), then cropping - initial crop coord are {left, top, right, bottom}"
+        )
+
+        # make sure any space outside the image is transparent
+        t_left = max(left, 0)
+        t_top = max(top, 0)
+        t_right = min(right, new_w)
+        t_bottom = min(bottom, new_h)
+        logger.debug(f"after transparency adjustments crop coords are {t_left, t_top, t_right, t_bottom}")
+
+        # resize and crop the image
+        frame = frame.resize((new_w, new_h), resample=Image.Resampling.LANCZOS).crop((t_left, t_top, t_right, t_bottom))
+        logger.debug(f"Result frame size is {frame.width}*{frame.height}")
+
+        logger.debug(f"new {crop_w}*{crop_h} orig {img_w}*{img_h}")
+        if crop_w > img_w or crop_h > img_h:
+            # original image has one or both dimensions smaller than the requested size,
+            # paste the image onto a transparent canvas exactly as big as requested
+            canvas = Image.new("RGBA", (crop_w, crop_h), (0, 0, 0, 0))
+            logger.debug(f"Transparent canvas size is {canvas.width}*{canvas.height}")
+            c_left = t_left - left
+            c_top = t_top - top
+            logger.debug(f"Positioning image on canvas at {c_left} {c_top}")
+            canvas.paste(frame, (c_left, c_top))
+            return canvas
+        # original image is at least as large as the requested size in both dimensions,
+        # no transparent canvas needed, just return the resized frame as-is
+        # frame might be 1px smaller than requested in one or both dimensions
+        # due to rounding
+        logger.debug(
+            f"Image has been downsized from {img_w}*{img_h} to {frame.width}*{frame.height} - "
+            f"requested size was {crop_w}*{crop_h}."
+        )
+        return frame
 
     # single frame image
     if n_frames == 1:
diff --git a/src/bma_client_lib/tests/__init__.py b/src/bma_client_lib/tests/__init__.py
@@ -0,0 +1 @@
+"""This __init__.py is empty."""
diff --git a/src/bma_client_lib/tests/test_pillow_resize_and_crop.py b/src/bma_client_lib/tests/test_pillow_resize_and_crop.py