@@ -361,7 +361,7 @@ class BitArray
361361 size_t idx = 0 ;
362362 while (idx + 8 <= cycleLen)
363363 {
364- __m512i existing = _mm512_load_si512 (reinterpret_cast <const __m512i *>(words + wordIndex + idx));
364+ __m512i existing = _mm512_loadu_si512 (reinterpret_cast <const void *>(words + wordIndex + idx));
365365 const __m512i masks = _mm512_set_epi64 (
366366 static_cast <long long >(cycleMasks[idx + 7 ]),
367367 static_cast <long long >(cycleMasks[idx + 6 ]),
@@ -372,29 +372,29 @@ class BitArray
372372 static_cast <long long >(cycleMasks[idx + 1 ]),
373373 static_cast <long long >(cycleMasks[idx + 0 ]));
374374 existing = _mm512_or_si512 (existing, masks);
375- _mm512_store_si512 (reinterpret_cast <__m512i *>(words + wordIndex + idx), existing);
375+ _mm512_storeu_si512 (reinterpret_cast <void *>(words + wordIndex + idx), existing);
376376 idx += 8 ;
377377 }
378378 while (idx + 4 <= cycleLen)
379379 {
380- __m256i existing = _mm256_load_si256 (reinterpret_cast <const __m256i *>(words + wordIndex + idx));
380+ __m256i existing = _mm256_loadu_si256 (reinterpret_cast <const __m256i_u *>(words + wordIndex + idx));
381381 const __m256i masks = _mm256_set_epi64x (
382382 static_cast <long long >(cycleMasks[idx + 3 ]),
383383 static_cast <long long >(cycleMasks[idx + 2 ]),
384384 static_cast <long long >(cycleMasks[idx + 1 ]),
385385 static_cast <long long >(cycleMasks[idx + 0 ]));
386386 existing = _mm256_or_si256 (existing, masks);
387- _mm256_store_si256 (reinterpret_cast <__m256i *>(words + wordIndex + idx), existing);
387+ _mm256_storeu_si256 (reinterpret_cast <__m256i_u *>(words + wordIndex + idx), existing);
388388 idx += 4 ;
389389 }
390390 while (idx + 2 <= cycleLen)
391391 {
392- __m128i existing = _mm_load_si128 (reinterpret_cast <const __m128i *>(words + wordIndex + idx));
392+ __m128i existing = _mm_loadu_si128 (reinterpret_cast <const __m128i_u *>(words + wordIndex + idx));
393393 const __m128i masks = _mm_set_epi64x (
394394 static_cast <long long >(cycleMasks[idx + 1 ]),
395395 static_cast <long long >(cycleMasks[idx + 0 ]));
396396 existing = _mm_or_si128 (existing, masks);
397- _mm_store_si128 (reinterpret_cast <__m128i *>(words + wordIndex + idx), existing);
397+ _mm_storeu_si128 (reinterpret_cast <__m128i_u *>(words + wordIndex + idx), existing);
398398 idx += 2 ;
399399 }
400400 while (idx < cycleLen)
@@ -410,24 +410,24 @@ class BitArray
410410 size_t idx = 0 ;
411411 while (idx + 4 <= cycleLen)
412412 {
413- __m256i existing = _mm256_load_si256 (reinterpret_cast <const __m256i *>(words + wordIndex + idx));
413+ __m256i existing = _mm256_loadu_si256 (reinterpret_cast <const __m256i_u *>(words + wordIndex + idx));
414414 const __m256i masks = _mm256_set_epi64x (
415415 static_cast <long long >(cycleMasks[idx + 3 ]),
416416 static_cast <long long >(cycleMasks[idx + 2 ]),
417417 static_cast <long long >(cycleMasks[idx + 1 ]),
418418 static_cast <long long >(cycleMasks[idx + 0 ]));
419419 existing = _mm256_or_si256 (existing, masks);
420- _mm256_store_si256 (reinterpret_cast <__m256i *>(words + wordIndex + idx), existing);
420+ _mm256_storeu_si256 (reinterpret_cast <__m256i_u *>(words + wordIndex + idx), existing);
421421 idx += 4 ;
422422 }
423423 while (idx + 2 <= cycleLen)
424424 {
425- __m128i existing = _mm_load_si128 (reinterpret_cast <const __m128i *>(words + wordIndex + idx));
425+ __m128i existing = _mm_loadu_si128 (reinterpret_cast <const __m128i_u *>(words + wordIndex + idx));
426426 const __m128i masks = _mm_set_epi64x (
427427 static_cast <long long >(cycleMasks[idx + 1 ]),
428428 static_cast <long long >(cycleMasks[idx + 0 ]));
429429 existing = _mm_or_si128 (existing, masks);
430- _mm_store_si128 (reinterpret_cast <__m128i *>(words + wordIndex + idx), existing);
430+ _mm_storeu_si128 (reinterpret_cast <__m128i_u *>(words + wordIndex + idx), existing);
431431 idx += 2 ;
432432 }
433433 while (idx < cycleLen)
@@ -443,12 +443,12 @@ class BitArray
443443 size_t idx = 0 ;
444444 while (idx + 2 <= cycleLen)
445445 {
446- __m128i existing = _mm_load_si128 (reinterpret_cast <const __m128i *>(words + wordIndex + idx));
446+ __m128i existing = _mm_loadu_si128 (reinterpret_cast <const __m128i_u *>(words + wordIndex + idx));
447447 const __m128i masks = _mm_set_epi64x (
448448 static_cast <long long >(cycleMasks[idx + 1 ]),
449449 static_cast <long long >(cycleMasks[idx + 0 ]));
450450 existing = _mm_or_si128 (existing, masks);
451- _mm_store_si128 (reinterpret_cast <__m128i *>(words + wordIndex + idx), existing);
451+ _mm_storeu_si128 (reinterpret_cast <__m128i_u *>(words + wordIndex + idx), existing);
452452 idx += 2 ;
453453 }
454454 while (idx < cycleLen)
0 commit comments