Skip to content

Commit 33aa68d

Browse files
author
Dave Plummer
committed
Fix x86 vectorization
1 parent ad14116 commit 33aa68d

File tree

1 file changed

+12
-12
lines changed

1 file changed

+12
-12
lines changed

PrimeCPP/solution_5/PrimeCPP_array.cpp

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -361,7 +361,7 @@ class BitArray
361361
size_t idx = 0;
362362
while (idx + 8 <= cycleLen)
363363
{
364-
__m512i existing = _mm512_load_si512(reinterpret_cast<const __m512i*>(words + wordIndex + idx));
364+
__m512i existing = _mm512_loadu_si512(reinterpret_cast<const void*>(words + wordIndex + idx));
365365
const __m512i masks = _mm512_set_epi64(
366366
static_cast<long long>(cycleMasks[idx + 7]),
367367
static_cast<long long>(cycleMasks[idx + 6]),
@@ -372,29 +372,29 @@ class BitArray
372372
static_cast<long long>(cycleMasks[idx + 1]),
373373
static_cast<long long>(cycleMasks[idx + 0]));
374374
existing = _mm512_or_si512(existing, masks);
375-
_mm512_store_si512(reinterpret_cast<__m512i*>(words + wordIndex + idx), existing);
375+
_mm512_storeu_si512(reinterpret_cast<void*>(words + wordIndex + idx), existing);
376376
idx += 8;
377377
}
378378
while (idx + 4 <= cycleLen)
379379
{
380-
__m256i existing = _mm256_load_si256(reinterpret_cast<const __m256i*>(words + wordIndex + idx));
380+
__m256i existing = _mm256_loadu_si256(reinterpret_cast<const __m256i_u*>(words + wordIndex + idx));
381381
const __m256i masks = _mm256_set_epi64x(
382382
static_cast<long long>(cycleMasks[idx + 3]),
383383
static_cast<long long>(cycleMasks[idx + 2]),
384384
static_cast<long long>(cycleMasks[idx + 1]),
385385
static_cast<long long>(cycleMasks[idx + 0]));
386386
existing = _mm256_or_si256(existing, masks);
387-
_mm256_store_si256(reinterpret_cast<__m256i*>(words + wordIndex + idx), existing);
387+
_mm256_storeu_si256(reinterpret_cast<__m256i_u*>(words + wordIndex + idx), existing);
388388
idx += 4;
389389
}
390390
while (idx + 2 <= cycleLen)
391391
{
392-
__m128i existing = _mm_load_si128(reinterpret_cast<const __m128i*>(words + wordIndex + idx));
392+
__m128i existing = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(words + wordIndex + idx));
393393
const __m128i masks = _mm_set_epi64x(
394394
static_cast<long long>(cycleMasks[idx + 1]),
395395
static_cast<long long>(cycleMasks[idx + 0]));
396396
existing = _mm_or_si128(existing, masks);
397-
_mm_store_si128(reinterpret_cast<__m128i*>(words + wordIndex + idx), existing);
397+
_mm_storeu_si128(reinterpret_cast<__m128i_u*>(words + wordIndex + idx), existing);
398398
idx += 2;
399399
}
400400
while (idx < cycleLen)
@@ -410,24 +410,24 @@ class BitArray
410410
size_t idx = 0;
411411
while (idx + 4 <= cycleLen)
412412
{
413-
__m256i existing = _mm256_load_si256(reinterpret_cast<const __m256i*>(words + wordIndex + idx));
413+
__m256i existing = _mm256_loadu_si256(reinterpret_cast<const __m256i_u*>(words + wordIndex + idx));
414414
const __m256i masks = _mm256_set_epi64x(
415415
static_cast<long long>(cycleMasks[idx + 3]),
416416
static_cast<long long>(cycleMasks[idx + 2]),
417417
static_cast<long long>(cycleMasks[idx + 1]),
418418
static_cast<long long>(cycleMasks[idx + 0]));
419419
existing = _mm256_or_si256(existing, masks);
420-
_mm256_store_si256(reinterpret_cast<__m256i*>(words + wordIndex + idx), existing);
420+
_mm256_storeu_si256(reinterpret_cast<__m256i_u*>(words + wordIndex + idx), existing);
421421
idx += 4;
422422
}
423423
while (idx + 2 <= cycleLen)
424424
{
425-
__m128i existing = _mm_load_si128(reinterpret_cast<const __m128i*>(words + wordIndex + idx));
425+
__m128i existing = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(words + wordIndex + idx));
426426
const __m128i masks = _mm_set_epi64x(
427427
static_cast<long long>(cycleMasks[idx + 1]),
428428
static_cast<long long>(cycleMasks[idx + 0]));
429429
existing = _mm_or_si128(existing, masks);
430-
_mm_store_si128(reinterpret_cast<__m128i*>(words + wordIndex + idx), existing);
430+
_mm_storeu_si128(reinterpret_cast<__m128i_u*>(words + wordIndex + idx), existing);
431431
idx += 2;
432432
}
433433
while (idx < cycleLen)
@@ -443,12 +443,12 @@ class BitArray
443443
size_t idx = 0;
444444
while (idx + 2 <= cycleLen)
445445
{
446-
__m128i existing = _mm_load_si128(reinterpret_cast<const __m128i*>(words + wordIndex + idx));
446+
__m128i existing = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(words + wordIndex + idx));
447447
const __m128i masks = _mm_set_epi64x(
448448
static_cast<long long>(cycleMasks[idx + 1]),
449449
static_cast<long long>(cycleMasks[idx + 0]));
450450
existing = _mm_or_si128(existing, masks);
451-
_mm_store_si128(reinterpret_cast<__m128i*>(words + wordIndex + idx), existing);
451+
_mm_storeu_si128(reinterpret_cast<__m128i_u*>(words + wordIndex + idx), existing);
452452
idx += 2;
453453
}
454454
while (idx < cycleLen)

0 commit comments

Comments
 (0)