Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .cirrus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,10 @@ rocky_task:
task:
name: freebsd
freebsd_instance:
image_family: freebsd-14-2
image_family: freebsd-15-0-snap

pkginstall_script:
- pkg update -f
- IGNORE_OSVERSION=yes pkg update -f
- pkg install -y gcc autoconf automake libdeflate libtool

compile_script:
Expand Down
199 changes: 104 additions & 95 deletions htscodecs/rANS_static32x16pr_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -1238,153 +1238,162 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
// }

for (z = 0; z < NX; z+=4) {
*(uint64_t *)&out[iN[z]] =
uint64_t t0[4] = {
((uint64_t)(t[0][z])<< 0) +
((uint64_t)(t[1][z])<< 8) +
((uint64_t)(t[2][z])<<16) +
((uint64_t)(t[3][z])<<24) +
((uint64_t)(t[4][z])<<32) +
((uint64_t)(t[5][z])<<40) +
((uint64_t)(t[6][z])<<48) +
((uint64_t)(t[7][z])<<56);
*(uint64_t *)&out[iN[z+1]] =
((uint64_t)(t[0][z+1])<< 0) +
((uint64_t)(t[1][z+1])<< 8) +
((uint64_t)(t[2][z+1])<<16) +
((uint64_t)(t[3][z+1])<<24) +
((uint64_t)(t[4][z+1])<<32) +
((uint64_t)(t[5][z+1])<<40) +
((uint64_t)(t[6][z+1])<<48) +
((uint64_t)(t[7][z+1])<<56);
*(uint64_t *)&out[iN[z+2]] =
((uint64_t)(t[0][z+2])<< 0) +
((uint64_t)(t[1][z+2])<< 8) +
((uint64_t)(t[2][z+2])<<16) +
((uint64_t)(t[3][z+2])<<24) +
((uint64_t)(t[4][z+2])<<32) +
((uint64_t)(t[5][z+2])<<40) +
((uint64_t)(t[6][z+2])<<48) +
((uint64_t)(t[7][z+2])<<56);
*(uint64_t *)&out[iN[z+3]] =
((uint64_t)(t[0][z+3])<< 0) +
((uint64_t)(t[1][z+3])<< 8) +
((uint64_t)(t[2][z+3])<<16) +
((uint64_t)(t[3][z+3])<<24) +
((uint64_t)(t[4][z+3])<<32) +
((uint64_t)(t[5][z+3])<<40) +
((uint64_t)(t[6][z+3])<<48) +
((uint64_t)(t[7][z+3])<<56);
((uint64_t)(t[7][z])<<56),

*(uint64_t *)&out[iN[z]+8] =
((uint64_t)(t[8+0][z])<< 0) +
((uint64_t)(t[8+1][z])<< 8) +
((uint64_t)(t[8+2][z])<<16) +
((uint64_t)(t[8+3][z])<<24) +
((uint64_t)(t[8+4][z])<<32) +
((uint64_t)(t[8+5][z])<<40) +
((uint64_t)(t[8+6][z])<<48) +
((uint64_t)(t[8+7][z])<<56);
*(uint64_t *)&out[iN[z+1]+8] =
((uint64_t)(t[8+0][z+1])<< 0) +
((uint64_t)(t[8+1][z+1])<< 8) +
((uint64_t)(t[8+2][z+1])<<16) +
((uint64_t)(t[8+3][z+1])<<24) +
((uint64_t)(t[8+4][z+1])<<32) +
((uint64_t)(t[8+5][z+1])<<40) +
((uint64_t)(t[8+6][z+1])<<48) +
((uint64_t)(t[8+7][z+1])<<56);
*(uint64_t *)&out[iN[z+2]+8] =
((uint64_t)(t[8+0][z+2])<< 0) +
((uint64_t)(t[8+1][z+2])<< 8) +
((uint64_t)(t[8+2][z+2])<<16) +
((uint64_t)(t[8+3][z+2])<<24) +
((uint64_t)(t[8+4][z+2])<<32) +
((uint64_t)(t[8+5][z+2])<<40) +
((uint64_t)(t[8+6][z+2])<<48) +
((uint64_t)(t[8+7][z+2])<<56);
*(uint64_t *)&out[iN[z+3]+8] =
((uint64_t)(t[8+0][z+3])<< 0) +
((uint64_t)(t[8+1][z+3])<< 8) +
((uint64_t)(t[8+2][z+3])<<16) +
((uint64_t)(t[8+3][z+3])<<24) +
((uint64_t)(t[8+4][z+3])<<32) +
((uint64_t)(t[8+5][z+3])<<40) +
((uint64_t)(t[8+6][z+3])<<48) +
((uint64_t)(t[8+7][z+3])<<56);
((uint64_t)(t[8+7][z])<<56),

*(uint64_t *)&out[iN[z]+16] =
((uint64_t)(t[16+0][z])<< 0) +
((uint64_t)(t[16+1][z])<< 8) +
((uint64_t)(t[16+2][z])<<16) +
((uint64_t)(t[16+3][z])<<24) +
((uint64_t)(t[16+4][z])<<32) +
((uint64_t)(t[16+5][z])<<40) +
((uint64_t)(t[16+6][z])<<48) +
((uint64_t)(t[16+7][z])<<56);
*(uint64_t *)&out[iN[z+1]+16] =
((uint64_t)(t[16+0][z+1])<< 0) +
((uint64_t)(t[16+1][z+1])<< 8) +
((uint64_t)(t[16+2][z+1])<<16) +
((uint64_t)(t[16+3][z+1])<<24) +
((uint64_t)(t[16+4][z+1])<<32) +
((uint64_t)(t[16+5][z+1])<<40) +
((uint64_t)(t[16+6][z+1])<<48) +
((uint64_t)(t[16+7][z+1])<<56);
*(uint64_t *)&out[iN[z+2]+16] =
((uint64_t)(t[16+0][z+2])<< 0) +
((uint64_t)(t[16+1][z+2])<< 8) +
((uint64_t)(t[16+2][z+2])<<16) +
((uint64_t)(t[16+3][z+2])<<24) +
((uint64_t)(t[16+4][z+2])<<32) +
((uint64_t)(t[16+5][z+2])<<40) +
((uint64_t)(t[16+6][z+2])<<48) +
((uint64_t)(t[16+7][z+2])<<56);
*(uint64_t *)&out[iN[z+3]+16] =
((uint64_t)(t[16+0][z+3])<< 0) +
((uint64_t)(t[16+1][z+3])<< 8) +
((uint64_t)(t[16+2][z+3])<<16) +
((uint64_t)(t[16+3][z+3])<<24) +
((uint64_t)(t[16+4][z+3])<<32) +
((uint64_t)(t[16+5][z+3])<<40) +
((uint64_t)(t[16+6][z+3])<<48) +
((uint64_t)(t[16+7][z+3])<<56);
((uint64_t)(t[16+7][z])<<56),

*(uint64_t *)&out[iN[z]+24] =
((uint64_t)(t[24+0][z])<< 0) +
((uint64_t)(t[24+1][z])<< 8) +
((uint64_t)(t[24+2][z])<<16) +
((uint64_t)(t[24+3][z])<<24) +
((uint64_t)(t[24+4][z])<<32) +
((uint64_t)(t[24+5][z])<<40) +
((uint64_t)(t[24+6][z])<<48) +
((uint64_t)(t[24+7][z])<<56);
*(uint64_t *)&out[iN[z+1]+24] =
((uint64_t)(t[24+7][z])<<56)
};
memcpy(&out[iN[z]], &t0, 32);

uint64_t t1[4] = {
((uint64_t)(t[0][z+1])<< 0) +
((uint64_t)(t[1][z+1])<< 8) +
((uint64_t)(t[2][z+1])<<16) +
((uint64_t)(t[3][z+1])<<24) +
((uint64_t)(t[4][z+1])<<32) +
((uint64_t)(t[5][z+1])<<40) +
((uint64_t)(t[6][z+1])<<48) +
((uint64_t)(t[7][z+1])<<56),

((uint64_t)(t[8+0][z+1])<< 0) +
((uint64_t)(t[8+1][z+1])<< 8) +
((uint64_t)(t[8+2][z+1])<<16) +
((uint64_t)(t[8+3][z+1])<<24) +
((uint64_t)(t[8+4][z+1])<<32) +
((uint64_t)(t[8+5][z+1])<<40) +
((uint64_t)(t[8+6][z+1])<<48) +
((uint64_t)(t[8+7][z+1])<<56),

((uint64_t)(t[16+0][z+1])<< 0) +
((uint64_t)(t[16+1][z+1])<< 8) +
((uint64_t)(t[16+2][z+1])<<16) +
((uint64_t)(t[16+3][z+1])<<24) +
((uint64_t)(t[16+4][z+1])<<32) +
((uint64_t)(t[16+5][z+1])<<40) +
((uint64_t)(t[16+6][z+1])<<48) +
((uint64_t)(t[16+7][z+1])<<56),

((uint64_t)(t[24+0][z+1])<< 0) +
((uint64_t)(t[24+1][z+1])<< 8) +
((uint64_t)(t[24+2][z+1])<<16) +
((uint64_t)(t[24+3][z+1])<<24) +
((uint64_t)(t[24+4][z+1])<<32) +
((uint64_t)(t[24+5][z+1])<<40) +
((uint64_t)(t[24+6][z+1])<<48) +
((uint64_t)(t[24+7][z+1])<<56);
*(uint64_t *)&out[iN[z+2]+24] =
((uint64_t)(t[24+7][z+1])<<56)
};
memcpy(&out[iN[z+1]], &t1, 32);

uint64_t t2[4] = {
((uint64_t)(t[0][z+2])<< 0) +
((uint64_t)(t[1][z+2])<< 8) +
((uint64_t)(t[2][z+2])<<16) +
((uint64_t)(t[3][z+2])<<24) +
((uint64_t)(t[4][z+2])<<32) +
((uint64_t)(t[5][z+2])<<40) +
((uint64_t)(t[6][z+2])<<48) +
((uint64_t)(t[7][z+2])<<56),

((uint64_t)(t[8+0][z+2])<< 0) +
((uint64_t)(t[8+1][z+2])<< 8) +
((uint64_t)(t[8+2][z+2])<<16) +
((uint64_t)(t[8+3][z+2])<<24) +
((uint64_t)(t[8+4][z+2])<<32) +
((uint64_t)(t[8+5][z+2])<<40) +
((uint64_t)(t[8+6][z+2])<<48) +
((uint64_t)(t[8+7][z+2])<<56),

((uint64_t)(t[16+0][z+2])<< 0) +
((uint64_t)(t[16+1][z+2])<< 8) +
((uint64_t)(t[16+2][z+2])<<16) +
((uint64_t)(t[16+3][z+2])<<24) +
((uint64_t)(t[16+4][z+2])<<32) +
((uint64_t)(t[16+5][z+2])<<40) +
((uint64_t)(t[16+6][z+2])<<48) +
((uint64_t)(t[16+7][z+2])<<56),

((uint64_t)(t[24+0][z+2])<< 0) +
((uint64_t)(t[24+1][z+2])<< 8) +
((uint64_t)(t[24+2][z+2])<<16) +
((uint64_t)(t[24+3][z+2])<<24) +
((uint64_t)(t[24+4][z+2])<<32) +
((uint64_t)(t[24+5][z+2])<<40) +
((uint64_t)(t[24+6][z+2])<<48) +
((uint64_t)(t[24+7][z+2])<<56);
*(uint64_t *)&out[iN[z+3]+24] =
((uint64_t)(t[24+7][z+2])<<56),

};
memcpy(&out[iN[z+2]], &t2, 32);

uint64_t t3[4] = {
((uint64_t)(t[0][z+3])<< 0) +
((uint64_t)(t[1][z+3])<< 8) +
((uint64_t)(t[2][z+3])<<16) +
((uint64_t)(t[3][z+3])<<24) +
((uint64_t)(t[4][z+3])<<32) +
((uint64_t)(t[5][z+3])<<40) +
((uint64_t)(t[6][z+3])<<48) +
((uint64_t)(t[7][z+3])<<56),

((uint64_t)(t[8+0][z+3])<< 0) +
((uint64_t)(t[8+1][z+3])<< 8) +
((uint64_t)(t[8+2][z+3])<<16) +
((uint64_t)(t[8+3][z+3])<<24) +
((uint64_t)(t[8+4][z+3])<<32) +
((uint64_t)(t[8+5][z+3])<<40) +
((uint64_t)(t[8+6][z+3])<<48) +
((uint64_t)(t[8+7][z+3])<<56),

((uint64_t)(t[16+0][z+3])<< 0) +
((uint64_t)(t[16+1][z+3])<< 8) +
((uint64_t)(t[16+2][z+3])<<16) +
((uint64_t)(t[16+3][z+3])<<24) +
((uint64_t)(t[16+4][z+3])<<32) +
((uint64_t)(t[16+5][z+3])<<40) +
((uint64_t)(t[16+6][z+3])<<48) +
((uint64_t)(t[16+7][z+3])<<56),

((uint64_t)(t[24+0][z+3])<< 0) +
((uint64_t)(t[24+1][z+3])<< 8) +
((uint64_t)(t[24+2][z+3])<<16) +
((uint64_t)(t[24+3][z+3])<<24) +
((uint64_t)(t[24+4][z+3])<<32) +
((uint64_t)(t[24+5][z+3])<<40) +
((uint64_t)(t[24+6][z+3])<<48) +
((uint64_t)(t[24+7][z+3])<<56);
((uint64_t)(t[24+7][z+3])<<56)
};
memcpy(&out[iN[z+3]], &t3, 32);

iN[z+0] += 32;
iN[z+1] += 32;
Expand Down