Skip to content

Commit 463618b

Browse files
Muad Abd El Hayclaude
authored andcommitted
Fix test assertions and add regression test for overlapping secondary attrs
- Fix assertion counts: Experiment.make() inserts fake_experiments_per_subject rows per key, so populate(max_calls=2) produces 10 rows, not 2 - Add test_populate_antijoin_overlapping_attrs: self-contained test with Sensor/ProcessedSensor tables that share secondary attribute names (num_samples, quality), reproducing the exact conditions where the antijoin fails without .proj() - Run ruff-format to fix lint Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 14d44d7 commit 463618b

File tree

1 file changed

+102
-29
lines changed

1 file changed

+102
-29
lines changed

tests/integration/test_autopopulate.py

Lines changed: 102 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -115,54 +115,39 @@ def test_allow_insert(clean_autopopulate, subject, experiment):
115115
def test_populate_antijoin_with_secondary_attrs(clean_autopopulate, subject, experiment):
116116
"""Test that populate correctly computes pending keys via antijoin.
117117
118-
Regression test for a bug where `key_source - self` returned all keys
119-
instead of just unpopulated ones when the target table has secondary
120-
attributes. The antijoin must match on primary key only, ignoring
121-
secondary attributes. Without `.proj()`, the antijoin could fail to
122-
exclude already-populated keys.
123-
124-
This affected both direct mode (reserve_jobs=False) and distributed mode
125-
(reserve_jobs=True), causing workers to waste time re-checking already
126-
completed entries.
118+
Verifies that partial populate + antijoin gives correct pending counts.
119+
Note: Experiment.make() inserts fake_experiments_per_subject rows per key.
127120
"""
128121
assert subject, "root tables are empty"
129122
assert not experiment, "table already filled?"
130123

131124
total_keys = len(experiment.key_source)
132125
assert total_keys > 0
133126

134-
# Partially populate (only 2 entries)
127+
# Partially populate (2 keys from key_source)
135128
experiment.populate(max_calls=2)
136-
assert len(experiment) == 2
129+
assert len(experiment) == 2 * experiment.fake_experiments_per_subject
137130

138-
# The critical test: key_source - target must return only unpopulated keys.
139-
# Before the fix, this returned all keys (== total_keys) because the
140-
# antijoin failed to match on PK when secondary attributes were present.
131+
# key_source - target must return only unpopulated keys
141132
pending = experiment.key_source - experiment
142-
assert len(pending) == total_keys - 2, (
143-
f"Antijoin returned {len(pending)} pending keys, expected {total_keys - 2}. "
144-
f"key_source - target may not be matching on primary key only."
145-
)
133+
assert len(pending) == total_keys - 2, f"Antijoin returned {len(pending)} pending keys, expected {total_keys - 2}."
146134

147-
# Also verify progress() reports correct counts
135+
# Verify progress() reports correct counts
148136
remaining, total = experiment.progress()
149137
assert total == total_keys
150138
assert remaining == total_keys - 2
151139

152140
# Populate the rest and verify antijoin returns 0
153141
experiment.populate()
154142
pending_after = experiment.key_source - experiment
155-
assert len(pending_after) == 0, (
156-
f"Antijoin returned {len(pending_after)} pending keys after full populate, expected 0."
157-
)
143+
assert len(pending_after) == 0, f"Antijoin returned {len(pending_after)} pending keys after full populate, expected 0."
158144

159145

160146
def test_populate_distributed_antijoin(clean_autopopulate, subject, experiment):
161147
"""Test that reserve_jobs=True correctly identifies pending keys.
162148
163149
When using distributed mode, jobs.refresh() must only insert truly pending
164-
keys into the jobs table, not already-completed ones. This verifies the
165-
antijoin in jobs.refresh() works correctly with secondary attributes.
150+
keys into the jobs table, not already-completed ones.
166151
"""
167152
assert subject, "root tables are empty"
168153
assert not experiment, "table already filled?"
@@ -171,20 +156,108 @@ def test_populate_distributed_antijoin(clean_autopopulate, subject, experiment):
171156

172157
# Partially populate
173158
experiment.populate(max_calls=2)
174-
assert len(experiment) == 2
159+
assert len(experiment) == 2 * experiment.fake_experiments_per_subject
175160

176161
# Refresh jobs — should only create entries for unpopulated keys
177162
experiment.jobs.refresh(delay=-1)
178163
pending_jobs = len(experiment.jobs.pending)
179-
assert pending_jobs == total_keys - 2, (
180-
f"jobs.refresh() created {pending_jobs} pending jobs, expected {total_keys - 2}. "
181-
f"The antijoin in refresh() may not be excluding already-completed keys."
182-
)
164+
assert pending_jobs == total_keys - 2, f"jobs.refresh() created {pending_jobs} pending jobs, expected {total_keys - 2}."
183165

184166
# Clean up
185167
experiment.jobs.delete_quick()
186168

187169

170+
def test_populate_antijoin_overlapping_attrs(prefix, connection_test):
171+
"""Regression test: antijoin with overlapping secondary attribute names.
172+
173+
This reproduces the bug where `key_source - self` returns ALL keys instead
174+
of just unpopulated ones. The condition is:
175+
176+
1. key_source returns secondary attributes (e.g., num_samples, quality)
177+
2. The target table has secondary attributes with the SAME NAMES
178+
3. The VALUES differ between source and target after populate
179+
180+
Without .proj() on the target, SQL matches on ALL common column names
181+
(including secondary attrs), so different values mean no match, and all
182+
keys appear "pending" even after populate.
183+
184+
Real-world example: LightningPoseOutput (key_source) has num_frames,
185+
quality, processing_datetime as secondary attrs. InitialContainer (target)
186+
also has those same-named columns with different values.
187+
"""
188+
test_schema = dj.Schema(f"{prefix}_antijoin_overlap", connection=connection_test)
189+
190+
@test_schema
191+
class Sensor(dj.Lookup):
192+
definition = """
193+
sensor_id : int32
194+
---
195+
num_samples : int32
196+
quality : float
197+
"""
198+
contents = [
199+
(1, 100, 0.95),
200+
(2, 200, 0.87),
201+
(3, 150, 0.92),
202+
(4, 175, 0.89),
203+
]
204+
205+
@test_schema
206+
class ProcessedSensor(dj.Computed):
207+
definition = """
208+
-> Sensor
209+
---
210+
num_samples : int32 # same name as Sensor's secondary attr
211+
quality : float # same name as Sensor's secondary attr
212+
result : float
213+
"""
214+
215+
@property
216+
def key_source(self):
217+
return Sensor() # returns sensor_id + num_samples + quality
218+
219+
def make(self, key):
220+
# Values intentionally differ from source — this is what triggers
221+
# the bug: the antijoin tries to match on num_samples and quality
222+
# too, and since values differ, no match is found.
223+
self.insert1(
224+
dict(
225+
sensor_id=key["sensor_id"],
226+
num_samples=key["num_samples"] * 2,
227+
quality=round(key["quality"] + 0.05, 2),
228+
result=key["num_samples"] * key["quality"],
229+
)
230+
)
231+
232+
try:
233+
# Partially populate (2 out of 4)
234+
ProcessedSensor().populate(max_calls=2)
235+
assert len(ProcessedSensor()) == 2
236+
237+
total_keys = len(ProcessedSensor().key_source)
238+
assert total_keys == 4
239+
240+
# The critical test: populate() must correctly identify remaining keys.
241+
# Before the fix, populate() used `key_source - self` which matched on
242+
# num_samples and quality too, returning all 4 keys as "pending".
243+
ProcessedSensor().populate()
244+
assert len(ProcessedSensor()) == 4, (
245+
f"After full populate, expected 4 entries but got {len(ProcessedSensor())}. "
246+
f"populate() likely re-processed already-completed keys."
247+
)
248+
249+
# Verify progress reports 0 remaining
250+
remaining, total = ProcessedSensor().progress()
251+
assert remaining == 0, f"Expected 0 remaining, got {remaining}"
252+
assert total == 4
253+
254+
# Verify antijoin with .proj() is correct
255+
pending = ProcessedSensor().key_source - ProcessedSensor().proj()
256+
assert len(pending) == 0
257+
finally:
258+
test_schema.drop(force=True)
259+
260+
188261
def test_load_dependencies(prefix, connection_test):
189262
schema = dj.Schema(f"{prefix}_load_dependencies_populate", connection=connection_test)
190263

0 commit comments

Comments
 (0)