Better testing of json+struct round trip and alignment; fix struct blob offset past padding in decode

petrelharp · petrelharp · commit 536364495986 · 2026-03-21T07:43:53.000-07:00
diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py
@@ -649,7 +649,7 @@ def test_disallow_duplicate_keys(self):
         ):
             tskit.MetadataSchema(schema)
 
-    def test_round_trip_with_struct_and_json(self):
+    def schema_with_blobs(self, num_blobs):
         schema = {
             "codec": "json+struct",
             "json": {
@@ -662,59 +662,36 @@ def test_round_trip_with_struct_and_json(self):
             },
             "struct": {
                 "type": "object",
-                "properties": {"blob": {"type": "integer", "binaryFormat": "i"}},
+                "properties": {},
             },
         }
-        ms = tskit.MetadataSchema(schema)
-        row = {"label": "alpha", "count": 7, "blob": 5}
-        encoded = ms.validate_and_encode_row(row)
-        out = ms.decode_row(encoded)
-        assert out == row
-
-    def test_blob_bytes_aligned(self):
-        # test that the portion of the encoded metadata up until the struct
-        # is 8-byte aligned; we do that in the pedantic way
-        # of figuring out how much memory is being used per int
-        # in the struct part and subtracting that off
-        def schema_with_blobs(k):
-            schema = {
-                "codec": "json+struct",
-                "json": {
-                    "type": "object",
-                    "properties": {
-                        "label": {"type": "string"},
-                        "count": {"type": "number"},
-                    },
-                    "required": ["label"],
-                },
-                "struct": {
-                    "type": "object",
-                    "properties": {},
-                },
+        for j in range(num_blobs):
+            schema["struct"]["properties"][f"b{j}"] = {
+                "type": "integer",
+                "binaryFormat": "i",
             }
-            for j in range(k):
-                schema["struct"]["properties"][f"b{j}"] = {
-                    "type": "integer",
-                    "binaryFormat": "i",
-                }
-            return tskit.MetadataSchema(schema)
-
-        k_list = (0, 1, 2, 3)
-        schemas = [schema_with_blobs(k) for k in k_list]
-        rows = []
-        for k in k_list:
-            row = {"label": "alpha", "count": 7}
-            for j in range(k):
-                row[f"b{j}"] = j
-            rows.append(row)
-        encoded = [ms.validate_and_encode_row(row) for ms, row in zip(schemas, rows)]
-        dbytes = len(encoded[2]) - len(encoded[1])
-        assert len(encoded[3]) - len(encoded[2]) == dbytes
-        for k, en in zip(k_list, encoded):
-            assert (len(en) - k * dbytes) % 8 == 0
-        for ms, en, row in zip(schemas, encoded, rows):
-            decoded = ms.decode_row(en)
-            assert decoded == row
+        return tskit.MetadataSchema(schema)
+
+    @pytest.mark.parametrize("k", (0, 1, 5, 1001))
+    def test_round_trip_with_struct_and_json(self, k):
+        ms = self.schema_with_blobs(k)
+        ms0 = self.schema_with_blobs(0)
+        bytes_per_blob = len(struct.pack("i", 0))
+        for s in [
+            "",
+            "abc",
+            "superfragilisticexpialodocious",
+            " " * 1000 + "foo" + " " * 1000,
+        ]:
+            row = {"label": s, "count": 7}
+            encoded0 = ms0.validate_and_encode_row(row)
+            row.update({f"b{j}": j for j in range(k)})
+            encoded = ms.validate_and_encode_row(row)
+            out = ms.decode_row(encoded)
+            assert out == row
+            # validate byte alignment
+            assert len(encoded) - len(encoded0) == k * bytes_per_blob
+            assert len(encoded0) % 8 == 0
 
     def test_json_defaults_applied(self):
         schema = {
diff --git a/python/tskit/metadata.py b/python/tskit/metadata.py
@@ -318,7 +318,9 @@ def decode(self, encoded: bytes) -> Any:
                     "Invalid json+struct payload: declared lengths exceed buffer size"
                 )
             json_bytes = encoded[start : start + jlen]
-            blob_bytes = encoded[start + jlen : start + jlen + blen + padding_length]
+            blob_bytes = encoded[
+                start + jlen + padding_length : start + jlen + padding_length + blen
+            ]
             json_data = self.json_codec.decode(json_bytes)
             struct_data = self.struct_codec.decode(blob_bytes)
             overlap = set(json_data).intersection(struct_data)