@@ -115,54 +115,39 @@ def test_allow_insert(clean_autopopulate, subject, experiment):
def test_populate_antijoin_with_secondary_attrs(clean_autopopulate, subject, experiment):
    """Test that populate correctly computes pending keys via antijoin.

    Verifies that partial populate + antijoin gives correct pending counts.
    Note: Experiment.make() inserts fake_experiments_per_subject rows per key.
    """
    assert subject, "root tables are empty"
    assert not experiment, "table already filled?"

    total_keys = len(experiment.key_source)
    assert total_keys > 0

    # Partially populate (2 keys from key_source)
    experiment.populate(max_calls=2)
    # Each make() call inserts fake_experiments_per_subject rows, so the row
    # count is a multiple of the number of processed keys.
    assert len(experiment) == 2 * experiment.fake_experiments_per_subject

    # key_source - target must return only unpopulated keys
    pending = experiment.key_source - experiment
    assert len(pending) == total_keys - 2, (
        f"Antijoin returned {len(pending)} pending keys, expected {total_keys - 2}."
    )

    # Verify progress() reports correct counts
    remaining, total = experiment.progress()
    assert total == total_keys
    assert remaining == total_keys - 2

    # Populate the rest and verify antijoin returns 0
    experiment.populate()
    pending_after = experiment.key_source - experiment
    assert len(pending_after) == 0, (
        f"Antijoin returned {len(pending_after)} pending keys after full populate, expected 0."
    )
159145
def test_populate_distributed_antijoin(clean_autopopulate, subject, experiment):
    """Test that reserve_jobs=True correctly identifies pending keys.

    When using distributed mode, jobs.refresh() must only insert truly pending
    keys into the jobs table, not already-completed ones.
    """
    assert subject, "root tables are empty"
    assert not experiment, "table already filled?"

    # NOTE(review): this line was elided between diff hunks; it mirrors the
    # sibling test above (total_keys is referenced below) — confirm against
    # the original file.
    total_keys = len(experiment.key_source)

    # Partially populate
    experiment.populate(max_calls=2)
    assert len(experiment) == 2 * experiment.fake_experiments_per_subject

    # Refresh jobs — should only create entries for unpopulated keys
    experiment.jobs.refresh(delay=-1)
    pending_jobs = len(experiment.jobs.pending)
    assert pending_jobs == total_keys - 2, (
        f"jobs.refresh() created {pending_jobs} pending jobs, expected {total_keys - 2}."
    )

    # Clean up
    experiment.jobs.delete_quick()
187169
def test_populate_antijoin_overlapping_attrs(prefix, connection_test):
    """Regression test: antijoin with overlapping secondary attribute names.

    This reproduces the bug where `key_source - self` returns ALL keys instead
    of just unpopulated ones. The condition is:

    1. key_source returns secondary attributes (e.g., num_samples, quality)
    2. The target table has secondary attributes with the SAME NAMES
    3. The VALUES differ between source and target after populate

    Without .proj() on the target, SQL matches on ALL common column names
    (including secondary attrs), so different values mean no match, and all
    keys appear "pending" even after populate.

    Real-world example: LightningPoseOutput (key_source) has num_frames,
    quality, processing_datetime as secondary attrs. InitialContainer (target)
    also has those same-named columns with different values.
    """
    test_schema = dj.Schema(f"{prefix}_antijoin_overlap", connection=connection_test)

    @test_schema
    class Sensor(dj.Lookup):
        definition = """
        sensor_id : int32
        ---
        num_samples : int32
        quality : float
        """
        contents = [
            (1, 100, 0.95),
            (2, 200, 0.87),
            (3, 150, 0.92),
            (4, 175, 0.89),
        ]

    @test_schema
    class ProcessedSensor(dj.Computed):
        definition = """
        -> Sensor
        ---
        num_samples : int32  # same name as Sensor's secondary attr
        quality : float  # same name as Sensor's secondary attr
        result : float
        """

        @property
        def key_source(self):
            return Sensor()  # returns sensor_id + num_samples + quality

        def make(self, key):
            # Values intentionally differ from source — this is what triggers
            # the bug: the antijoin tries to match on num_samples and quality
            # too, and since values differ, no match is found.
            self.insert1(
                dict(
                    sensor_id=key["sensor_id"],
                    num_samples=key["num_samples"] * 2,
                    quality=round(key["quality"] + 0.05, 2),
                    result=key["num_samples"] * key["quality"],
                )
            )

    try:
        # Partially populate (2 out of 4)
        ProcessedSensor().populate(max_calls=2)
        assert len(ProcessedSensor()) == 2

        total_keys = len(ProcessedSensor().key_source)
        assert total_keys == 4

        # The critical test: populate() must correctly identify remaining keys.
        # Before the fix, populate() used `key_source - self` which matched on
        # num_samples and quality too, returning all 4 keys as "pending".
        ProcessedSensor().populate()
        assert len(ProcessedSensor()) == 4, (
            f"After full populate, expected 4 entries but got {len(ProcessedSensor())}. "
            f"populate() likely re-processed already-completed keys."
        )

        # Verify progress reports 0 remaining
        remaining, total = ProcessedSensor().progress()
        assert remaining == 0, f"Expected 0 remaining, got {remaining}"
        assert total == 4

        # Verify antijoin with .proj() is correct
        pending = ProcessedSensor().key_source - ProcessedSensor().proj()
        assert len(pending) == 0
    finally:
        # Drop the throwaway schema even if assertions above fail.
        test_schema.drop(force=True)
260+
188261def test_load_dependencies (prefix , connection_test ):
189262 schema = dj .Schema (f"{ prefix } _load_dependencies_populate" , connection = connection_test )
190263
0 commit comments