#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
#ifndef SSE2NEON_PRECISE_DIV
#define SSE2NEON_PRECISE_DIV (0)
#endif
#ifndef SSE2NEON_PRECISE_SQRT
#define SSE2NEON_PRECISE_SQRT (0)
#endif

#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#error "Macro name collisions may happen with unsupported compiler."
#define FORCE_INLINE static inline
#define ALIGN_STRUCT(x) __declspec(align(x))
#define likely(x) (x)
#define unlikely(x) (x)
#endif

#include <stdint.h>
#include <stdlib.h>

/* Architecture-specific build options */
#if defined(__arm__) && __ARM_ARCH == 7
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("fpu=neon")
#endif
#elif defined(__aarch64__)
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("+simd")
#endif
#else
#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
#endif

#include <arm_neon.h>

/* Rounding on ARMv7-A falls back to libm. */
#if !defined(__aarch64__)
#include <math.h>
#endif

/* "__has_builtin" queries support for compiler built-in functions. */
#ifndef __has_builtin
#if defined(__GNUC__) && (__GNUC__ <= 9)
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1
#else
#define __has_builtin(x) 0
#endif
#endif
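/* _MM_SHUFFLE(z, y, x, w) packs four 2-bit lane selectors into one 8-bit
 * immediate, e.g. _MM_SHUFFLE(3, 2, 1, 0) == 0xE4 selects the identity order
 * when passed to _mm_shuffle_ps / _mm_shuffle_epi32 below. */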
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define __constrange(a, b) const
typedef int64x1_t __m64;
typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
#if defined(__aarch64__)
typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
#else
typedef float32x4_t __m128d;
#endif
typedef int64x2_t __m128i; /* 128-bit vector containing integers */
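/* These typedefs map the SSE vector types onto NEON container types: __m128
 * is four packed floats, __m128i is a 128-bit integer vector (reinterpreted
 * per element width by the macros below), and __m128d is two doubles on
 * AArch64 or a float32x4_t stand-in on ARMv7-A, which has no 2 x double
 * vector type. */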
193#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
194#define vreinterpretq_m128_f32(x) (x)
195#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
197#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
198#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
199#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
200#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
202#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
203#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
204#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
205#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
207#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
208#define vreinterpretq_f32_m128(x) (x)
209#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
211#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
212#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
213#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
214#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
216#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
217#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
218#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
219#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
221#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
222#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
223#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
224#define vreinterpretq_m128i_s64(x) (x)
226#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
227#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
228#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
229#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
231#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
232#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
234#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
235#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
236#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
237#define vreinterpretq_s64_m128i(x) (x)
239#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
240#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
241#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
242#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
244#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
245#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
246#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
247#define vreinterpret_m64_s64(x) (x)
249#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
250#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
251#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
252#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
254#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
255#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
256#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
258#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
259#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
260#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
261#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
263#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
264#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
265#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
266#define vreinterpret_s64_m64(x) (x)
268#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
#if defined(__aarch64__)
#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
#define vreinterpretq_m128d_f64(x) (x)
#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
#define vreinterpretq_f64_m128d(x) (x)
#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
#else
#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
#define vreinterpretq_m128d_f32(x) (x)
#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
#define vreinterpretq_f32_m128d(x) (x)
#endif
typedef union ALIGN_STRUCT(16) SIMDVec {
    float m128_f32[4];
    uint8_t m128_u8[16];
    uint16_t m128_u16[8];
    uint32_t m128_u32[4];
    uint64_t m128_u64[2];
} SIMDVec;

#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
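/* Forward declarations for intrinsics that are referenced before their
 * definitions later in this file. */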
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
FORCE_INLINE __m128 _mm_set_ps1(float);
FORCE_INLINE __m128 _mm_setzero_ps(void);
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
FORCE_INLINE __m128i _mm_castps_si128(__m128);
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
FORCE_INLINE __m128d _mm_set_pd(double, double);
FORCE_INLINE __m128i _mm_set1_epi32(int);
FORCE_INLINE __m128i _mm_setzero_si128();
FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
FORCE_INLINE __m128 _mm_ceil_ps(__m128);
FORCE_INLINE __m128d _mm_floor_pd(__m128d);
FORCE_INLINE __m128 _mm_floor_ps(__m128);
FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE __m128 _mm_round_ps(__m128, int);
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
/* Older gcc does not define vld1q_u8_x4. */
#if defined(__GNUC__) && !defined(__clang__) &&                        \
    ((__GNUC__ <= 10 && defined(__arm__)) ||                           \
     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
     (__GNUC__ <= 9 && defined(__aarch64__)))
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{
    uint8x16x4_t ret;
    ret.val[0] = vld1q_u8(p + 0);
    ret.val[1] = vld1q_u8(p + 16);
    ret.val[2] = vld1q_u8(p + 32);
    ret.val[3] = vld1q_u8(p + 48);
    return ret;
}
#else
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{
    return vld1q_u8_x4(p);
}
#endif
492#if defined(__aarch64__)
499FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
501 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
502 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
503 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
509FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
511 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
512 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
513 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
516FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
518 float32x2_t a21 = vget_high_f32(
519 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
520 float32x2_t b03 = vget_low_f32(
521 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
522 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
525FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
527 float32x2_t a03 = vget_low_f32(
528 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
529 float32x2_t b21 = vget_high_f32(
530 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
531 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
534FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
536 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
537 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
538 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
541FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
543 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
544 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
545 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
548FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
550 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
551 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
552 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
557FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
559 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
560 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
561 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
564FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
566 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
567 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
568 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
{
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
{
    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
}

FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t a02 = vset_lane_f32(a0, a22, 1);
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
{
    float32x2_t a33 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
}
605FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
607 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
608 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
609 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
610 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
611 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
614FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
616 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
617 float32_t b2 = vgetq_lane_f32(b, 2);
618 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
619 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
620 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
623FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
625 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
626 float32_t b2 = vgetq_lane_f32(b, 2);
627 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
628 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
629 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
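/* _sse2neon_kadd_f32 below performs one step of Kahan (compensated)
 * summation: the running compensation *c accumulates the low-order bits lost
 * when y is added to *sum, so that long reductions keep extra precision. */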
FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
{
    float t = *sum + y;
    *c += (t - *sum) - y;
    *sum = t;
}
642#if defined(__ARM_FEATURE_CRYPTO)
644FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
646 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
647 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
648 return vreinterpretq_u64_p128(vmull_p64(a, b));
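/* Without the Crypto extension (PMULL), the polyfill that follows builds the
 * 64x64 -> 128-bit carry-less multiply out of 8-bit vmull_p8 partial
 * products that are shifted and XOR-folded together. */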
664static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
666 poly8x8_t a = vreinterpret_p8_u64(_a);
667 poly8x8_t b = vreinterpret_p8_u64(_b);
670 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
671 vcreate_u8(0x00000000ffffffff));
672 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
673 vcreate_u8(0x0000000000000000));
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));
693 uint8x16_t l = veorq_u8(e, f);
694 uint8x16_t m = veorq_u8(g, h);
695 uint8x16_t n = veorq_u8(i, j);
699#if defined(__aarch64__)
700 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
701 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
702 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
703 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
704 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
705 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
706 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
707 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
709 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
710 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
711 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
712 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
716 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
717 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
718 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
722 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
723 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
724 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
727#if defined(__aarch64__)
728 uint8x16_t t0 = vreinterpretq_u8_u64(
729 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
730 uint8x16_t t1 = vreinterpretq_u8_u64(
731 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
732 uint8x16_t t2 = vreinterpretq_u8_u64(
733 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
734 uint8x16_t t3 = vreinterpretq_u8_u64(
735 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
737 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
738 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
739 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
740 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
743 uint8x16_t t0_shift = vextq_u8(t0, t0, 15);
744 uint8x16_t t1_shift = vextq_u8(t1, t1, 14);
745 uint8x16_t t2_shift = vextq_u8(t2, t2, 13);
746 uint8x16_t t3_shift = vextq_u8(t3, t3, 12);
749 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
750 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
751 uint8x16_t mix = veorq_u8(d, cross1);
752 uint8x16_t r = veorq_u8(mix, cross2);
753 return vreinterpretq_u64_u8(r);
#define _mm_shuffle_epi32_default(a, imm)                                   \
    __extension__({                                                         \
        int32x4_t ret;                                                      \
        ret = vmovq_n_s32(                                                  \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
            ret, 2);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
            ret, 3);                                                        \
        vreinterpretq_m128i_s32(ret);                                       \
    })
784FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
786 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
787 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
788 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
794FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
796 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
797 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
798 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
803FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
805 return vreinterpretq_m128i_s32(
806 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
811FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
813 return vreinterpretq_m128i_s32(
814 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
819FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
821 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
822 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
827FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
829 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
830 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
831 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
837FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
839 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
840 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
843FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
845 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
846 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
847 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
850FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
852 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
853 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
854 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
857FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
859 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
860 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
861 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
#if defined(__aarch64__)
#define _mm_shuffle_epi32_splat(a, imm)                          \
    __extension__({                                              \
        vreinterpretq_m128i_s32(                                 \
            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
    })
#else
#define _mm_shuffle_epi32_splat(a, imm)                                      \
    __extension__({                                                          \
        vreinterpretq_m128i_s32(                                             \
            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
    })
#endif
#define _mm_shuffle_ps_default(a, b, imm)                                  \
    __extension__({                                                        \
        float32x4_t ret;                                                   \
        ret = vmovq_n_f32(                                                 \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
            ret, 2);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
            ret, 3);                                                       \
        vreinterpretq_m128_f32(ret);                                       \
    })
#define _mm_shufflelo_epi16_function(a, imm)                                   \
    __extension__({                                                           \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        vreinterpretq_m128i_s16(ret);                                         \
    })

#define _mm_shufflehi_epi16_function(a, imm)                                   \
    __extension__({                                                            \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        vreinterpretq_m128i_s16(ret);                                          \
    })
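/* Illustrative use of the shuffle helpers above (not part of the original
 * API surface): _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)) yields
 * {a[0], a[1], b[2], b[3]}, i.e. the two low lanes come from a and the two
 * high lanes from b, each selected by a 2-bit field of the immediate. */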
961FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
963 return vreinterpretq_m128_f32(
964 vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
969FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
971 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
972 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
974 return vreinterpretq_m128_f32(vaddq_f32(a, value));
986FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
988 return vreinterpretq_m128_s32(
989 vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1001FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1003 return vreinterpretq_m128_s32(
1004 vbicq_s32(vreinterpretq_s32_m128(b),
1005 vreinterpretq_s32_m128(a)));
1017FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1019 return vreinterpret_m64_u16(
1020 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1032FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1034 return vreinterpret_m64_u8(
1035 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1040FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
1042 return vreinterpretq_m128_u32(
1043 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1048FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
1050 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1055FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
1057 return vreinterpretq_m128_u32(
1058 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1063FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
1065 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1076FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
1078 return vreinterpretq_m128_u32(
1079 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1084FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
1086 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1097FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
1099 return vreinterpretq_m128_u32(
1100 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1105FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
1107 return _mm_move_ss(a, _mm_cmple_ps(a, b));
1112FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
1114 return vreinterpretq_m128_u32(
1115 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1120FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
1122 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1127FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
1129 return vreinterpretq_m128_u32(vmvnq_u32(
1130 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1135FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
1137 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1142FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
1144 return _mm_cmplt_ps(a, b);
1149FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
1151 return _mm_cmplt_ss(a, b);
1156FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
1158 return _mm_cmple_ps(a, b);
1163FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
1165 return _mm_cmple_ss(a, b);
1170FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
1172 return _mm_cmpgt_ps(a, b);
1177FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
1179 return _mm_cmpgt_ss(a, b);
1184FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
1186 return _mm_cmpge_ps(a, b);
1191FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
1193 return _mm_cmpge_ss(a, b);
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
{
    uint32x4_t ceqaa =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t ceqbb =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
}
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
}
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
{
    uint32x4_t f32a =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t f32b =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
}
1235FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
1237 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1247 uint32x4_t a_not_nan =
1248 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1249 uint32x4_t b_not_nan =
1250 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1251 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_eq_b =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1254 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) & 0x1;
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1264 uint32x4_t a_not_nan =
1265 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1266 uint32x4_t b_not_nan =
1267 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1268 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_ge_b =
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1271 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) & 0x1;
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1281 uint32x4_t a_not_nan =
1282 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1283 uint32x4_t b_not_nan =
1284 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1285 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_gt_b =
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1288 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) & 0x1;
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1298 uint32x4_t a_not_nan =
1299 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1300 uint32x4_t b_not_nan =
1301 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1302 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_le_b =
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1305 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) & 0x1;
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1315 uint32x4_t a_not_nan =
1316 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1317 uint32x4_t b_not_nan =
1318 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1319 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_lt_b =
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1322 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) & 0x1;
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1332 uint32x4_t a_not_nan =
1333 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1334 uint32x4_t b_not_nan =
1335 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1336 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1337 uint32x4_t a_neq_b = vmvnq_u32(
1338 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1339 return vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) & 0x1;
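/* The _mm_comi*_ss helpers above mask each comparison with "not NaN" tests,
 * so a NaN in either operand makes the ordered predicates return 0 and
 * _mm_comineq_ss return 1. */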
1353FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1355 return vreinterpretq_m128_f32(
1356 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1357 vget_high_f32(vreinterpretq_f32_m128(a))));
1369FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
1371#if defined(__aarch64__)
1372 return vreinterpret_m64_s32(
1373 vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1375 return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1376 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
}
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
{
#if defined(__aarch64__)
    return vgetq_lane_s32(
        vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), 0);
#else
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int32_t) data;
#endif
}
1419FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1421 return vreinterpretq_m128_f32(
1422 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1435FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1437 return vreinterpretq_m128_f32(
1438 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1439 vget_high_f32(vreinterpretq_f32_m128(a))));
1454FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1456 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1457 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1470FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1472 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1473 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1492FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
1494 const __m128 i16Min = _mm_set_ps1(INT16_MIN);
1495 const __m128 i16Max = _mm_set_ps1(INT16_MAX);
1496 const __m128 i32Max = _mm_set_ps1(INT32_MAX);
1497 const __m128i maxMask = _mm_castps_si128(
1498 _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max)));
1499 const __m128i betweenMask = _mm_castps_si128(
1500 _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max)));
1501 const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1502 _mm_setzero_si128());
1503 __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX));
1504 __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN));
1505 __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1506 __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1507 return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32)));
1519#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1537FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
1539 const __m128 i8Min = _mm_set_ps1(INT8_MIN);
1540 const __m128 i8Max = _mm_set_ps1(INT8_MAX);
1541 const __m128 i32Max = _mm_set_ps1(INT32_MAX);
1542 const __m128i maxMask = _mm_castps_si128(
1543 _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max)));
1544 const __m128i betweenMask = _mm_castps_si128(
1545 _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max)));
1546 const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1547 _mm_setzero_si128());
1548 __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX));
1549 __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN));
1550 __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1551 __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1552 int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32));
1553 int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
    unsigned int bitMask[2] = {static_cast<unsigned int>(0xFFFFFFFF), 0};
    int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
1557 return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0)));
1570FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1572 return vreinterpretq_m128_f32(
1573 vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1587FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1589 return vreinterpretq_m128_f32(vcvtq_f32_u32(
1590 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1601#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
}
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
{
    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}
1633#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1641FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1643#if defined(__aarch64__)
1644 return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1646 float32_t data = vgetq_lane_f32(
1647 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1648 return (int64_t) data;
1661FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1663 return vreinterpret_m64_s32(
1664 vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
{
    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
}
1687#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1695#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1703FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1705 return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1716FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1718#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
1719 return vreinterpretq_m128_f32(
1720 vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1722 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1723 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1724#if SSE2NEON_PRECISE_DIV
1726 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1728 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}
1745#define _mm_extract_pi16(a, imm) \
1746 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
FORCE_INLINE void _mm_free(void *addr)
{
    free(addr);
}
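/* The rounding-mode helpers below read and write the AArch64 FPCR (ARMv7
 * FPSCR) RMode field in bits [23:22]: 00 = nearest, 01 = toward +inf,
 * 10 = toward -inf, 11 = toward zero. */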
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    if (r.field.bit22) {
        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
    } else {
        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
    }
}
#define _mm_insert_pi16(a, b, imm)                               \
    __extension__({                                              \
        vreinterpret_m64_s16(                                    \
            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
    })
FORCE_INLINE __m128 _mm_load_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

#define _mm_load_ps1 _mm_load1_ps

FORCE_INLINE __m128 _mm_load_ss(const float *p)
{
    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
}

FORCE_INLINE __m128 _mm_load1_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
}

FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
}

FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
}

FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
{
    float32x4_t v = vrev64q_f32(vld1q_f32(p));
    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
}

FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
{
    /* For NEON, alignment does not matter, so _mm_load_ps and _mm_loadu_ps
     * are equivalent. */
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
{
    return vreinterpretq_m128i_s16(
        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
}

FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
{
    return vreinterpretq_m128i_s64(
        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
}
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
{
    void *ptr;
    if (align == 1)
        return malloc(size);
    if (align == 2 || (sizeof(void *) == 8 && align == 4))
        align = sizeof(void *);
    if (!posix_memalign(&ptr, align, size))
        return ptr;
    return NULL;
}
FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
{
    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
    __m128 b = _mm_load_ps((const float *) mem_addr);
    int8x8_t masked =
        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
    vst1_s8((int8_t *) mem_addr, masked);
}
1941#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
1952FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
1954 return vreinterpret_m64_s16(
1955 vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
1961FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
1963#if SSE2NEON_PRECISE_MINMAX
1964 float32x4_t _a = vreinterpretq_f32_m128(a);
1965 float32x4_t _b = vreinterpretq_f32_m128(b);
1966 return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
1968 return vreinterpretq_m128_f32(
1969 vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1982FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
1984 return vreinterpret_m64_u8(
1985 vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1991FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
1993 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
1994 return vreinterpretq_m128_f32(
1995 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2007FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
2009 return vreinterpret_m64_s16(
2010 vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2016FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2018#if SSE2NEON_PRECISE_MINMAX
2019 float32x4_t _a = vreinterpretq_f32_m128(a);
2020 float32x4_t _b = vreinterpretq_f32_m128(b);
2021 return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
2023 return vreinterpretq_m128_f32(
2024 vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2037FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
2039 return vreinterpret_m64_u8(
2040 vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2046FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2048 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2049 return vreinterpretq_m128_f32(
2050 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2055FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
2057 return vreinterpretq_m128_f32(
2058 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2059 vreinterpretq_f32_m128(a), 0));
2068FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
2070 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
2071 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
2072 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2081FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2083 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2084 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2085 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2093 uint8x8_t input = vreinterpret_u8_m64(a);
2094#if defined(__aarch64__)
2095 static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2096 uint8x8_t tmp = vshr_n_u8(input, 7);
2097 return vaddv_u8(vshl_u8(tmp, shift));
2100 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2101 uint32x2_t paired16 =
2102 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2103 uint8x8_t paired32 =
2104 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2105 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
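/* _mm_movemask_ps gathers the sign bit of each 32-bit lane into a 4-bit
 * integer. On AArch64 this is a shift plus horizontal add; the ARMv7 path
 * funnels the sign bits together with successive shift-and-accumulate
 * (vsra) steps instead. */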
FORCE_INLINE int _mm_movemask_ps(__m128 a)
2115 uint32x4_t input = vreinterpretq_u32_m128(a);
2116#if defined(__aarch64__)
2117 static const int32x4_t shift = {0, 1, 2, 3};
2118 uint32x4_t tmp = vshrq_n_u32(input, 31);
2119 return vaddvq_u32(vshlq_u32(tmp, shift));
2124 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2127 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2129 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2141FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2143 return vreinterpretq_m128_f32(
2144 vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2155FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2157 return _mm_move_ss(a, _mm_mul_ps(a, b));
2164FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2166 return vreinterpret_m64_u16(vshrn_n_u32(
2167 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2173FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2175 return vreinterpretq_m128_s32(
2176 vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2188#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2199#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2204#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2209#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2214#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2219#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2224#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2229#define _m_pminub(a, b) _mm_min_pu8(a, b)
2234#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2240#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
FORCE_INLINE void _mm_prefetch(const void *p, int i)
{
    (void) i;
    __builtin_prefetch(p);
}
2255#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2260#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2266FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2268 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2269 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2270#if SSE2NEON_PRECISE_DIV
2272 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2274 return vreinterpretq_m128_f32(recip);
2286FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2288 return _mm_move_ss(a, _mm_rcp_ps(a));
2295FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2297 float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2298#if SSE2NEON_PRECISE_SQRT
2301 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2303 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2305 return vreinterpretq_m128_f32(out);
2313FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2315 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2323FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2325 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2326 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2327 return vreinterpret_m64_u16(
2328 vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
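/* Note the argument order of _mm_set_ps below: _mm_set_ps(w, z, y, x) places
 * x in lane 0 and w in lane 3, while _mm_setr_ps stores its arguments in
 * memory order. */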
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
{
    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}

FORCE_INLINE __m128 _mm_set_ps1(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    switch (rounding) {
    case _MM_ROUND_TOWARD_ZERO:
        r.field.bit22 = 1;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_DOWN:
        r.field.bit22 = 0;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_UP:
        r.field.bit22 = 1;
        r.field.bit23 = 0;
        break;
    default: /* _MM_ROUND_NEAREST */
        r.field.bit22 = 0;
        r.field.bit23 = 0;
    }

#if defined(__aarch64__)
    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
#else
    asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}
FORCE_INLINE __m128 _mm_set_ss(float a)
{
    float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}

FORCE_INLINE __m128 _mm_set1_ps(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}

FORCE_INLINE void _mm_setcsr(unsigned int a)
{
    _MM_SET_ROUNDING_MODE(a);
}

FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
{
    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}

FORCE_INLINE __m128 _mm_setzero_ps(void)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(0));
}
2436#if __has_builtin(__builtin_shufflevector)
2437#define _mm_shuffle_pi16(a, imm) \
2439 vreinterpret_m64_s16(__builtin_shufflevector( \
2440 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2441 ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
2444#define _mm_shuffle_pi16(a, imm) \
2448 vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
2449 ret = vset_lane_s16( \
2450 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
2452 ret = vset_lane_s16( \
2453 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
2455 ret = vset_lane_s16( \
2456 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
2458 vreinterpret_m64_s16(ret); \
FORCE_INLINE void _mm_sfence(void)
{
    __sync_synchronize();
}
2472#if __has_builtin(__builtin_shufflevector)
2473#define _mm_shuffle_ps(a, b, imm) \
2475 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2476 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2477 float32x4_t _shuf = __builtin_shufflevector( \
2478 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2479 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2480 vreinterpretq_m128_f32(_shuf); \
2483#define _mm_shuffle_ps(a, b, imm) \
2487 case _MM_SHUFFLE(1, 0, 3, 2): \
2488 ret = _mm_shuffle_ps_1032((a), (b)); \
2490 case _MM_SHUFFLE(2, 3, 0, 1): \
2491 ret = _mm_shuffle_ps_2301((a), (b)); \
2493 case _MM_SHUFFLE(0, 3, 2, 1): \
2494 ret = _mm_shuffle_ps_0321((a), (b)); \
2496 case _MM_SHUFFLE(2, 1, 0, 3): \
2497 ret = _mm_shuffle_ps_2103((a), (b)); \
2499 case _MM_SHUFFLE(1, 0, 1, 0): \
2500 ret = _mm_movelh_ps((a), (b)); \
2502 case _MM_SHUFFLE(1, 0, 0, 1): \
2503 ret = _mm_shuffle_ps_1001((a), (b)); \
2505 case _MM_SHUFFLE(0, 1, 0, 1): \
2506 ret = _mm_shuffle_ps_0101((a), (b)); \
2508 case _MM_SHUFFLE(3, 2, 1, 0): \
2509 ret = _mm_shuffle_ps_3210((a), (b)); \
2511 case _MM_SHUFFLE(0, 0, 1, 1): \
2512 ret = _mm_shuffle_ps_0011((a), (b)); \
2514 case _MM_SHUFFLE(0, 0, 2, 2): \
2515 ret = _mm_shuffle_ps_0022((a), (b)); \
2517 case _MM_SHUFFLE(2, 2, 0, 0): \
2518 ret = _mm_shuffle_ps_2200((a), (b)); \
2520 case _MM_SHUFFLE(3, 2, 0, 2): \
2521 ret = _mm_shuffle_ps_3202((a), (b)); \
2523 case _MM_SHUFFLE(3, 2, 3, 2): \
2524 ret = _mm_movehl_ps((b), (a)); \
2526 case _MM_SHUFFLE(1, 1, 3, 3): \
2527 ret = _mm_shuffle_ps_1133((a), (b)); \
2529 case _MM_SHUFFLE(2, 0, 1, 0): \
2530 ret = _mm_shuffle_ps_2010((a), (b)); \
2532 case _MM_SHUFFLE(2, 0, 0, 1): \
2533 ret = _mm_shuffle_ps_2001((a), (b)); \
2535 case _MM_SHUFFLE(2, 0, 3, 2): \
2536 ret = _mm_shuffle_ps_2032((a), (b)); \
2539 ret = _mm_shuffle_ps_default((a), (b), (imm)); \
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
{
#if SSE2NEON_PRECISE_SQRT
    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
    /* Test for the vrsqrteq_f32(0) -> positive infinity case and change it to
     * zero, so that s * 1/sqrt(s) is zero too. */
    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
    const uint32x4_t div_by_zero =
        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
    recip = vreinterpretq_f32_u32(
        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
    /* Additional Newton-Raphson iterations for accuracy */
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);
    /* sqrt(s) = s * 1/sqrt(s) */
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
#elif defined(__aarch64__)
    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
#else
    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
    float32x4_t sq = vrecpeq_f32(recipsq);
    return vreinterpretq_m128_f32(sq);
#endif
}
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
}
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    vst1q_f32(p, vdupq_n_f32(a0));
}

FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
{
    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
}
2639#define _mm_store1_ps _mm_store_ps1
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
{
    *p = vreinterpret_m64_f32(vget_high_f32(a));
}

FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
{
    *p = vreinterpret_m64_f32(vget_low_f32(a));
}

FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
{
    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
    float32x4_t rev = vextq_f32(tmp, tmp, 2);
    vst1q_f32(p, rev);
}
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
{
    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
}

FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
{
    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
}

FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
{
    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
}

FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, (float32x4_t *) p);
#else
    vst1q_f32(p, vreinterpretq_f32_m128(a));
#endif
}
2731FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2733 return vreinterpretq_m128_f32(
2734 vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2746FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2748 return _mm_move_ss(a, _mm_sub_ps(a, b));
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
    do {                                                  \
        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
                            vget_low_f32(ROW23.val[0]));  \
        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
                            vget_low_f32(ROW23.val[1]));  \
        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
                            vget_high_f32(ROW23.val[0])); \
        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
                            vget_high_f32(ROW23.val[1])); \
    } while (0)
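/* Usage sketch for the transpose macro (names m and r0..r3 are illustrative
 * only):
 *     __m128 r0 = _mm_loadu_ps(&m[0]);
 *     __m128 r1 = _mm_loadu_ps(&m[4]);
 *     __m128 r2 = _mm_loadu_ps(&m[8]);
 *     __m128 r3 = _mm_loadu_ps(&m[12]);
 *     _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
 * transposes a row-major 4x4 float matrix held in four row registers. */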
2771#define _mm_ucomieq_ss _mm_comieq_ss
2772#define _mm_ucomige_ss _mm_comige_ss
2773#define _mm_ucomigt_ss _mm_comigt_ss
2774#define _mm_ucomile_ss _mm_comile_ss
2775#define _mm_ucomilt_ss _mm_comilt_ss
2776#define _mm_ucomineq_ss _mm_comineq_ss
FORCE_INLINE __m128i _mm_undefined_si128(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128i a;
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

FORCE_INLINE __m128 _mm_undefined_ps(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128 a;
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
2817FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
2819#if defined(__aarch64__)
2820 return vreinterpretq_m128_f32(
2821 vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2823 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2824 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2825 float32x2x2_t result = vzip_f32(a1, b1);
2826 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2839FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
2841#if defined(__aarch64__)
2842 return vreinterpretq_m128_f32(
2843 vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2845 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2846 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2847 float32x2x2_t result = vzip_f32(a1, b1);
2848 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2855FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2857 return vreinterpretq_m128_s32(
2858 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2866FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2868 return vreinterpretq_m128i_s16(
2869 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2881FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2883 return vreinterpretq_m128i_s32(
2884 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2890FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2892 return vreinterpretq_m128i_s64(
2893 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2899FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2901 return vreinterpretq_m128i_s8(
2902 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2908FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2910#if defined(__aarch64__)
2911 return vreinterpretq_m128d_f64(
2912 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1] + db[1];
    return vld1q_f32((float32_t *) c);
2931FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
2933#if defined(__aarch64__)
2934 return _mm_move_sd(a, _mm_add_pd(a, b));
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1];
    return vld1q_f32((float32_t *) c);
2950FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2952 return vreinterpret_m64_s64(
2953 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2965FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2967 return vreinterpretq_m128i_s16(
2968 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2980FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
2982 return vreinterpretq_m128i_s8(
2983 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2989FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2991 return vreinterpretq_m128i_u16(
2992 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2998FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3000 return vreinterpretq_m128i_u8(
3001 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3013FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
3015 return vreinterpretq_m128d_s64(
3016 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
3025FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3027 return vreinterpretq_m128i_s32(
3028 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3040FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
3043 return vreinterpretq_m128d_s64(
3044 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
3053FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
3055 return vreinterpretq_m128i_s32(
3056 vbicq_s32(vreinterpretq_s32_m128i(b),
3057 vreinterpretq_s32_m128i(a)));
3069FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3071 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3072 vreinterpretq_u16_m128i(b));
3084FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3086 return vreinterpretq_m128i_u8(
3087 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3093#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3098#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3103FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3105 return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3111FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3113 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3119FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3121 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3127FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3129 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3135FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3137#if defined(__aarch64__)
3138 return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3140 return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3147FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3149 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
FORCE_INLINE void _mm_clflush(void const *p)
{
    (void) p;
    /* no corollary for NEON */
}
3164FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3166 return vreinterpretq_m128i_u16(
3167 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3172FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3174 return vreinterpretq_m128i_u32(
3175 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3181FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3183 return vreinterpretq_m128i_u8(
3184 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3190FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3192#if defined(__aarch64__)
3193 return vreinterpretq_m128d_u64(
3194 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3198 uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3199 uint32x4_t swapped = vrev64q_u32(cmp);
3200 return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3208FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3210 return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3216FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3218#if defined(__aarch64__)
3219 return vreinterpretq_m128d_u64(
3220 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3222 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3223 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3224 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3225 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
3227 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3228 d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3230 return vreinterpretq_m128d_u64(vld1q_u64(d));
3238FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3240#if defined(__aarch64__)
3241 return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3244 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3245 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3246 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
3248 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
3251 return vreinterpretq_m128d_u64(vld1q_u64(d));
3264FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3266 return vreinterpretq_m128i_u16(
3267 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3273FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3275 return vreinterpretq_m128i_u32(
3276 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3288FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3290 return vreinterpretq_m128i_u8(
3291 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3297FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3299#if defined(__aarch64__)
3300 return vreinterpretq_m128d_u64(
3301 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3303 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3304 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3305 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3306 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
3308 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3309 d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3311 return vreinterpretq_m128d_u64(vld1q_u64(d));
3319FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3321#if defined(__aarch64__)
3322 return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3325 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3326 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3327 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
3329 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
3332 return vreinterpretq_m128d_u64(vld1q_u64(d));
3339FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3341#if defined(__aarch64__)
3342 return vreinterpretq_m128d_u64(
3343 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3345 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3346 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3347 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3348 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
3350 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3351 d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3353 return vreinterpretq_m128d_u64(vld1q_u64(d));
3361FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3363#if defined(__aarch64__)
3364 return _mm_move_sd(a, _mm_cmple_pd(a, b));
3367 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3368 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3369 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
3371 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
3374 return vreinterpretq_m128d_u64(vld1q_u64(d));
3387FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3389 return vreinterpretq_m128i_u16(
3390 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3397FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3399 return vreinterpretq_m128i_u32(
3400 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3406FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3408 return vreinterpretq_m128i_u8(
3409 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3415FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3417#if defined(__aarch64__)
3418 return vreinterpretq_m128d_u64(
3419 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3421 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3422 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3423 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3424 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
3426 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3427 d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3429 return vreinterpretq_m128d_u64(vld1q_u64(d));
3437FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3439#if defined(__aarch64__)
3440 return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3442 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3443 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3444 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
3446 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
3449 return vreinterpretq_m128d_u64(vld1q_u64(d));
3456FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3458#if defined(__aarch64__)
3459 return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3460 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3464 uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3465 uint32x4_t swapped = vrev64q_u32(cmp);
3466 return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3474FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3476 return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3482#define _mm_cmpnge_pd(a, b) _mm_cmplt_pd(a, b)
3488#define _mm_cmpnge_sd(a, b) _mm_cmplt_sd(a, b)
3493#define _mm_cmpngt_pd(a, b) _mm_cmple_pd(a, b)
3499#define _mm_cmpngt_sd(a, b) _mm_cmple_sd(a, b)
3504#define _mm_cmpnle_pd(a, b) _mm_cmpgt_pd(a, b)
3510#define _mm_cmpnle_sd(a, b) _mm_cmpgt_sd(a, b)
3515#define _mm_cmpnlt_pd(a, b) _mm_cmpge_pd(a, b)
3521#define _mm_cmpnlt_sd(a, b) _mm_cmpge_sd(a, b)
3526FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3528#if defined(__aarch64__)
3530 uint64x2_t not_nan_a =
3531 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3532 uint64x2_t not_nan_b =
3533 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3534 return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3536 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3537 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3538 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3539 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
3541 d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3545 d[1] = ((*(double *) &a1) == (*(double *) &a1) && (*(double *) &b1) == (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3550 return vreinterpretq_m128d_u64(vld1q_u64(d));
3558FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3560#if defined(__aarch64__)
3561 return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3563 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3564 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3565 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t d[2];
3567 d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
d[1] = a1;
3573 return vreinterpretq_m128d_u64(vld1q_u64(d));
3580FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3582#if defined(__aarch64__)
3584 uint64x2_t not_nan_a =
3585 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3586 uint64x2_t not_nan_b =
3587 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3588 return vreinterpretq_m128d_s32(
3589 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3591 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3592 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3593 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3594 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
3596 d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? UINT64_C(0) : ~UINT64_C(0);
3600 d[1] = ((*(double *) &a1) == (*(double *) &a1) && (*(double *) &b1) == (*(double *) &b1)) ? UINT64_C(0) : ~UINT64_C(0);
3605 return vreinterpretq_m128d_u64(vld1q_u64(d));
3613FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3615#if defined(__aarch64__)
3616 return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3618 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3619 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3620 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
uint64_t d[2];
3622 d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? UINT64_C(0) : ~UINT64_C(0);
d[1] = a1;
3628 return vreinterpretq_m128d_u64(vld1q_u64(d));
3635FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3637#if defined(__aarch64__)
3638 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3640 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3641 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3643 return (*(double *) &a0 >= *(double *) &b0);
3650FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3652#if defined(__aarch64__)
3653 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3655 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3656 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3658 return (*(double *) &a0 > *(double *) &b0);
3665FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3667#if defined(__aarch64__)
3668 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3670 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3671 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3673 return (*(double *) &a0 <= *(double *) &b0);
3680FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3682#if defined(__aarch64__)
3683 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3685 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3686 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3688 return (*(double *) &a0 < *(double *) &b0);
3695FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3697#if defined(__aarch64__)
3698 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3700 uint32x4_t a_not_nan =
3701 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3702 uint32x4_t b_not_nan =
3703 vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3704 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3706 uint32x4_t a_eq_b = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3707 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3708 vreinterpretq_u64_u32(a_eq_b));
3709 return !!vgetq_lane_u64(and_results, 0);
3716FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3718#if defined(__aarch64__)
3719 return !vgetq_lane_u64(vceqq_f64(a, b), 0);
3723 uint32x4_t a_eq_b = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3724 return !vgetq_lane_u64(vreinterpretq_u64_u32(a_eq_b), 0);
3738FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3740#if defined(__aarch64__)
3741 return vreinterpretq_m128d_f64(
3742 vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3744 double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3745 double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3746 return _mm_set_pd(a1, a0);
3753FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3755 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3768FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3770 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3771 double d0 = ((double *) &rnd)[0];
3772 double d1 = ((double *) &rnd)[1];
3773 return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3786FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3788 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3789 double d0 = ((double *) &rnd)[0];
3790 double d1 = ((double *) &rnd)[1];
3791 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3792 return vreinterpret_m64_s32(vld1_s32(data));
3807FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3809#if defined(__aarch64__)
3810 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3811 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3813 float a0 = (float) ((double *) &a)[0];
3814 float a1 = (float) ((double *) &a)[1];
3815 return _mm_set_ps(0, 0, a1, a0);
3829FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
3831#if defined(__aarch64__)
3832 return vreinterpretq_m128d_f64(
3833 vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
3835 double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
3836 double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
3837 return _mm_set_pd(a1, a0);
3852FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3854#if defined(__aarch64__)
3855 switch (_MM_GET_ROUNDING_MODE()) {
3856 case _MM_ROUND_NEAREST:
3857 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3858 case _MM_ROUND_DOWN:
3859 return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
case _MM_ROUND_UP:
3861 return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
default: // _MM_ROUND_TOWARD_ZERO
3863 return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
3866 float *f = (float *) &a;
3867 switch (_MM_GET_ROUNDING_MODE()) {
3868 case _MM_ROUND_NEAREST: {
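// ARMv7 has no convert-with-round-to-nearest-even instruction, so emulate
// banker's rounding: round half away from zero (r_normal), truncate
// (r_trunc), build the nearest even candidate (r_even), and select r_even
// only when the fractional part is exactly +/- 0.5.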
3869 uint32x4_t signmask = vdupq_n_u32(0x80000000);
3870 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); // +/- 0.5
3872 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(vreinterpretq_f32_m128(a), half)); // round half away from zero: [a +/- 0.5]
3874 int32x4_t r_trunc = vcvtq_s32_f32(vreinterpretq_f32_m128(a)); // truncate toward zero: [a]
3876 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); // 1 or 0
3878 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); // ([a] + {0,1}) & ~1
3880 float32x4_t delta = vsubq_f32(vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); // delta = a - [a]
3883 uint32x4_t is_delta_half = vceqq_f32(delta, half); // delta == +/- 0.5
3885 return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
3888 case _MM_ROUND_DOWN:
3889 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
case _MM_ROUND_UP:
3892 return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
default: // _MM_ROUND_TOWARD_ZERO
3895 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], (int32_t) f[0]);
3912FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
3914#if defined(__aarch64__)
3915 return vreinterpretq_m128d_f64(
3916 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
3918 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
3919 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
3920 return _mm_set_pd(a1, a0);
3929FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
3931#if defined(__aarch64__)
3932 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
3934 return ((double *) &a)[0];
3944FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
3946#if defined(__aarch64__)
3947 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3949 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3950 double ret = ((double *) &rnd)[0];
3951 return (int32_t) ret;
3961FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
3963#if defined(__aarch64__)
3964 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3966 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3967 double ret = ((double *) &rnd)[0];
3968 return (int64_t) ret;
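// Note: _mm_cvtsd_si32/_mm_cvtsd_si64 above honour the current rounding mode
// (vrndiq_f64 / _MM_FROUND_CUR_DIRECTION), whereas the _mm_cvttsd_* variants
// further down truncate toward zero.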
3978#define _mm_cvtsd_si64x _mm_cvtsd_si64
3985FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
3987#if defined(__aarch64__)
3988 return vreinterpretq_m128_f32(vsetq_lane_f32(
3989 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
3990 vreinterpretq_f32_m128(a), 0));
3992 return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
3993 vreinterpretq_f32_m128(a), 0));
4002FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4004 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4012FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4014 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4019#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4025FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4027#if defined(__aarch64__)
4028 return vreinterpretq_m128d_f64(
4029 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4031 double bf = (double) b;
4032 return vreinterpretq_m128d_s64(
4033 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4042#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4053FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4055 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4062FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4064#if defined(__aarch64__)
4065 return vreinterpretq_m128d_f64(
4066 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4068 double bf = (double) b;
4069 return vreinterpretq_m128d_s64(
4070 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4079FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4081 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4087#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4093#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4104FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4106 double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4107#if defined(__aarch64__)
4108 return vreinterpretq_m128d_f64(
4109 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4111 return vreinterpretq_m128d_s64(
4112 vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4119FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4121 double a0 = ((double *) &a)[0];
4122 double a1 = ((double *) &a)[1];
4123 return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4129FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4131 double a0 = ((double *) &a)[0];
4132 double a1 = ((double *) &a)[1];
4133 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4134 return vreinterpret_m64_s32(vld1_s32(data));
4140FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4142 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4151FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4153 double ret = *((double *) &a);
4154 return (int32_t) ret;
4163FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4165#if defined(__aarch64__)
4166 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4168 double ret = *((double *) &a);
4169 return (int64_t) ret;
4179#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4190FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4192#if defined(__aarch64__)
4193 return vreinterpretq_m128d_f64(
4194 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4196 double *da = (double *) &a;
4197 double *db = (double *) &b;
double c[2];
4199 c[0] = da[0] / db[0];
4200 c[1] = da[1] / db[1];
4201 return vld1q_f32((float32_t *) c);
4210FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4212#if defined(__aarch64__)
4214 float64x2_t tmp = vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4215 return vreinterpretq_m128d_f64(
4216 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4218 return _mm_move_sd(a, _mm_div_pd(a, b));
4226#define _mm_extract_epi16(a, imm) \
4227 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4234#define _mm_insert_epi16(a, b, imm) \
4236 vreinterpretq_m128i_s16( \
4237 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
4246FORCE_INLINE __m128d _mm_load_pd(const double *p)
4248#if defined(__aarch64__)
4249 return vreinterpretq_m128d_f64(vld1q_f64(p));
4251 const float *fp = (const float *) p;
4252 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4253 return vreinterpretq_m128d_f32(vld1q_f32(data));
4264#define _mm_load_pd1 _mm_load1_pd
4274FORCE_INLINE __m128d _mm_load_sd(const double *p)
4276#if defined(__aarch64__)
4277 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4279 const float *fp = (const float *) p;
4280 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4281 return vreinterpretq_m128d_f32(vld1q_f32(data));
4287FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4289 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4299FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4301#if defined(__aarch64__)
4302 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4304 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4316FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4318#if defined(__aarch64__)
4319 return vreinterpretq_m128d_f64(
4320 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4322 return vreinterpretq_m128d_f32(vcombine_f32(
4323 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4329FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4334 return vreinterpretq_m128i_s32(
4335 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4346FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4348#if defined(__aarch64__)
4349 return vreinterpretq_m128d_f64(
4350 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4352 return vreinterpretq_m128d_f32(
4353 vcombine_f32(vld1_f32((const float *) p),
4354 vget_high_f32(vreinterpretq_f32_m128d(a))));
4366FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4368#if defined(__aarch64__)
4369 float64x2_t v = vld1q_f64(p);
4370 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4372 int64x2_t v = vld1q_s64((const int64_t *) p);
4373 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4379FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4381 return _mm_load_pd(p);
4386FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4388 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4397FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4399 return vreinterpretq_m128i_s32(
4400 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
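// _mm_madd_epi16: multiply packed signed 16-bit elements to 32-bit products,
// then horizontally add adjacent product pairs.  Implemented below with two
// widening multiplies (low/high halves) followed by pairwise adds.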
4411FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4413 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4414 vget_low_s16(vreinterpretq_s16_m128i(b)));
4415 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4416 vget_high_s16(vreinterpretq_s16_m128i(b)));
4418 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4419 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4421 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
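// _mm_maskmoveu_si128: store a byte of 'a' only where the corresponding mask
// byte has its MSB set.  Emulated as a read-modify-write: the destination is
// loaded, blended with 'a' via vbslq_s8 using the sign-extended mask, and
// written back (so, unlike x86, the full 16 bytes are always touched).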
4429FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4431 int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4432 __m128 b = _mm_load_ps((const float *) mem_addr);
int8x16_t masked =
4434 vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4435 vreinterpretq_s8_m128(b));
4436 vst1q_s8((int8_t *) mem_addr, masked);
4442FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4444 return vreinterpretq_m128i_s16(
4445 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4451FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4453 return vreinterpretq_m128i_u8(
4454 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4460FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4462#if defined(__aarch64__)
4463 return vreinterpretq_m128d_f64(
4464 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4466 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4467 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4468 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4469 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
4471 d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4472 d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4474 return vreinterpretq_m128d_u64(vld1q_u64(d));
4482FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4484#if defined(__aarch64__)
4485 return _mm_move_sd(a, _mm_max_pd(a, b));
4487 double *da = (double *) &a;
4488 double *db = (double *) &b;
4489 double c[2] = {fmax(da[0], db[0]), da[1]};
4490 return vld1q_f32((float32_t *) c);
4497FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4499 return vreinterpretq_m128i_s16(
4500 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4506FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4508 return vreinterpretq_m128i_u8(
4509 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4515FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4517#if defined(__aarch64__)
4518 return vreinterpretq_m128d_f64(
4519 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4521 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4522 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4523 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4524 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
uint64_t d[2];
4526 d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4527 d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4528 return vreinterpretq_m128d_u64(vld1q_u64(d));
4536FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4538#if defined(__aarch64__)
4539 return _mm_move_sd(a, _mm_min_pd(a, b));
4541 double *da = (double *) &a;
4542 double *db = (double *) &b;
4543 double c[2] = {fmin(da[0], db[0]), da[1]};
4544 return vld1q_f32((float32_t *) c);
4555FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4557 return vreinterpretq_m128i_s64(
4558 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4569FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4571 return vreinterpretq_m128d_f32(
4572 vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4573 vget_high_f32(vreinterpretq_f32_m128d(a))));
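// _mm_movemask_epi8: collect the most significant bit of each of the 16
// bytes into a 16-bit integer.  The code below shifts every byte's MSB down
// to bit 0, then repeatedly shift-accumulates (vsraq_n_*) at 16-, 32- and
// 64-bit widths so the bits pile up in the low byte of each 64-bit half;
// byte 0 and byte 8 then hold the low and high 8 bits of the mask.
// Typical use (values illustrative):
//   int m = _mm_movemask_epi8(_mm_cmpeq_epi8(x, _mm_set1_epi8('\n')));
// bit i of m is set when byte i of x equals '\n'.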
4580FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4591 uint8x16_t input = vreinterpretq_u8_m128i(a);
4606 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4621 uint32x4_t paired16 =
4622 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4635 uint64x2_t paired32 =
4636 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4649 uint8x16_t paired64 =
4650 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4657 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4663FORCE_INLINE int _mm_movemask_pd(__m128d a)
4665 uint64x2_t input = vreinterpretq_u64_m128d(a);
4666 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4667 return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
4675FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4677 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4687FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4689 return vreinterpretq_m128i_s64(
4690 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4698FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4701 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4702 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4703 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4709FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4711#if defined(__aarch64__)
4712 return vreinterpretq_m128d_f64(
4713 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4715 double *da = (double *) &a;
4716 double *db = (double *) &b;
double c[2];
4718 c[0] = da[0] * db[0];
4719 c[1] = da[1] * db[1];
4720 return vld1q_f32((float32_t *) c);
4728FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4730 return _mm_move_sd(a, _mm_mul_pd(a, b));
4739FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4741 return vreinterpret_m64_u64(vget_low_u64(
4742 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
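// _mm_mulhi_epi16: keep the high 16 bits of each 32-bit product.  The code
// widens with vmull_s16 and then de-interleaves the 16-bit halves (vuzpq),
// taking val[1], which holds the upper halves of the products.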
4754FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4760 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4761 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4762 int32x4_t ab3210 = vmull_s16(a3210, b3210);
4763 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4764 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4765 int32x4_t ab7654 = vmull_s16(a7654, b7654);
4767 uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4768 return vreinterpretq_m128i_u16(r.val[1]);
4775FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4777 uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4778 uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4779 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4780#if defined(__aarch64__)
4782 uint32x4_t ab7654 = vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4783 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4784 vreinterpretq_u16_u32(ab7654));
4785 return vreinterpretq_m128i_u16(r);
4787 uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4788 uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4789 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4791 uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4792 return vreinterpretq_m128i_u16(r.val[1]);
4805FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4807 return vreinterpretq_m128i_s16(
4808 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4814FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4816 return vreinterpretq_m128d_s64(
4817 vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4825FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4827 return vreinterpretq_m128i_s32(
4828 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4834FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4836 return vreinterpretq_m128i_s8(
4837 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4838 vqmovn_s16(vreinterpretq_s16_m128i(b))));
4854FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4856 return vreinterpretq_m128i_s16(
4857 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4858 vqmovn_s32(vreinterpretq_s32_m128i(b))));
4874FORCE_INLINE __m128i _mm_packus_epi16(
const __m128i a,
const __m128i b)
4876 return vreinterpretq_m128i_u8(
4877 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4878 vqmovun_s16(vreinterpretq_s16_m128i(b))));
4886FORCE_INLINE void _mm_pause()
4888 __asm__ __volatile__("isb\n");
4896FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
4898 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4899 return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
4904FORCE_INLINE __m128i _mm_set_epi16(short i7,
short i6, short i5, short i4, short i3, short i2, short i1, short i0)
4913 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4914 return vreinterpretq_m128i_s16(vld1q_s16(data));
4919FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
4921 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
4922 return vreinterpretq_m128i_s32(vld1q_s32(data));
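// Argument-order reminder (values illustrative): _mm_set_epi32(3, 2, 1, 0)
// places 0 in lane 0 (the lowest element), while _mm_setr_epi32(3, 2, 1, 0)
// places 3 there.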
4928FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
4930 return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
4936FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
4938 return vreinterpretq_m128i_s64(
4939 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4944FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
signed char b14, signed char b13, signed char b12, signed char b11,
signed char b10, signed char b9, signed char b8, signed char b7,
signed char b6, signed char b5, signed char b4, signed char b3,
signed char b2, signed char b1, signed char b0)
4961 int8_t ALIGN_STRUCT(16)
4962 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
4963 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
4964 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
4965 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
4966 return (__m128i) vld1q_s8(data);
4972FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
4974 double ALIGN_STRUCT(16) data[2] = {e0, e1};
4975#if defined(__aarch64__)
4976 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
4978 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
4985#define _mm_set_pd1 _mm_set1_pd
4990FORCE_INLINE __m128d _mm_set_sd(double a)
4992 return _mm_set_pd(0, a);
5003FORCE_INLINE __m128i _mm_set1_epi16(short w)
5005 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
5016FORCE_INLINE __m128i _mm_set1_epi32(int _i)
5018 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
5023FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
5025 return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
5030FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
5032 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
5043FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
5045 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
5051FORCE_INLINE __m128d _mm_set1_pd(double d)
5053#if defined(__aarch64__)
5054 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
5056 return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
5067FORCE_INLINE __m128i _mm_setr_epi16(short w0,
short w1, short w2, short w3, short w4, short w5, short w6, short w7)
5076 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5077 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
5082FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
5084 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5085 return vreinterpretq_m128i_s32(vld1q_s32(data));
5090FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5092 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5097FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
signed char b1, signed char b2, signed char b3, signed char b4,
signed char b5, signed char b6, signed char b7, signed char b8,
signed char b9, signed char b10, signed char b11, signed char b12,
signed char b13, signed char b14, signed char b15)
5114 int8_t ALIGN_STRUCT(16)
5115 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5116 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5117 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5118 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5119 return (__m128i) vld1q_s8(data);
5125FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5127 return _mm_set_pd(e0, e1);
5132FORCE_INLINE __m128d _mm_setzero_pd(void)
5134#if defined(__aarch64__)
5135 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5137 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5143FORCE_INLINE __m128i _mm_setzero_si128(void)
5145 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
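// _mm_shuffle_epi32 usage sketch (values illustrative): with
// a = {10, 11, 12, 13} in lanes 0..3, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3))
// yields {13, 12, 11, 10}: each two-bit field of imm selects the source lane
// for one destination lane, lowest field first.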
5152#if __has_builtin(__builtin_shufflevector)
5153#define _mm_shuffle_epi32(a, imm) \
5155 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5156 int32x4_t _shuf = __builtin_shufflevector( \
5157 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5158 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5159 vreinterpretq_m128i_s32(_shuf); \
5162#define _mm_shuffle_epi32(a, imm) \
5166 case _MM_SHUFFLE(1, 0, 3, 2): \
5167 ret = _mm_shuffle_epi_1032((a)); \
5169 case _MM_SHUFFLE(2, 3, 0, 1): \
5170 ret = _mm_shuffle_epi_2301((a)); \
5172 case _MM_SHUFFLE(0, 3, 2, 1): \
5173 ret = _mm_shuffle_epi_0321((a)); \
5175 case _MM_SHUFFLE(2, 1, 0, 3): \
5176 ret = _mm_shuffle_epi_2103((a)); \
5178 case _MM_SHUFFLE(1, 0, 1, 0): \
5179 ret = _mm_shuffle_epi_1010((a)); \
5181 case _MM_SHUFFLE(1, 0, 0, 1): \
5182 ret = _mm_shuffle_epi_1001((a)); \
5184 case _MM_SHUFFLE(0, 1, 0, 1): \
5185 ret = _mm_shuffle_epi_0101((a)); \
5187 case _MM_SHUFFLE(2, 2, 1, 1): \
5188 ret = _mm_shuffle_epi_2211((a)); \
5190 case _MM_SHUFFLE(0, 1, 2, 2): \
5191 ret = _mm_shuffle_epi_0122((a)); \
5193 case _MM_SHUFFLE(3, 3, 3, 2): \
5194 ret = _mm_shuffle_epi_3332((a)); \
5196 case _MM_SHUFFLE(0, 0, 0, 0): \
5197 ret = _mm_shuffle_epi32_splat((a), 0); \
5199 case _MM_SHUFFLE(1, 1, 1, 1): \
5200 ret = _mm_shuffle_epi32_splat((a), 1); \
5202 case _MM_SHUFFLE(2, 2, 2, 2): \
5203 ret = _mm_shuffle_epi32_splat((a), 2); \
5205 case _MM_SHUFFLE(3, 3, 3, 3): \
5206 ret = _mm_shuffle_epi32_splat((a), 3); \
5209 ret = _mm_shuffle_epi32_default((a), (imm)); \
5223#if __has_builtin(__builtin_shufflevector)
5224#define _mm_shuffle_pd(a, b, imm8) \
5225 vreinterpretq_m128d_s64(__builtin_shufflevector( \
5226 vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
5227 ((imm8 & 0x2) >> 1) + 2))
5229#define _mm_shuffle_pd(a, b, imm8) \
5230 _mm_castsi128_pd(_mm_set_epi64x( \
5231 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5232 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5237#if __has_builtin(__builtin_shufflevector)
5238#define _mm_shufflehi_epi16(a, imm) \
5240 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5241 int16x8_t _shuf = __builtin_shufflevector( \
5242 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5243 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5244 (((imm) >> 6) & 0x3) + 4); \
5245 vreinterpretq_m128i_s16(_shuf); \
5248#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5253#if __has_builtin(__builtin_shufflevector)
5254#define _mm_shufflelo_epi16(a, imm) \
5256 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5257 int16x8_t _shuf = __builtin_shufflevector( \
5258 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5259 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5260 vreinterpretq_m128i_s16(_shuf); \
5263#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5275FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5277 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5278 if (unlikely(c > 15))
5279 return _mm_setzero_si128();
5281 int16x8_t vc = vdupq_n_s16((int16_t) c);
5282 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5294FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5296 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5297 if (unlikely(c > 31))
5298 return _mm_setzero_si128();
5300 int32x4_t vc = vdupq_n_s32((int32_t) c);
5301 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5311FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5313 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5314 if (unlikely(c > 63))
5315 return _mm_setzero_si128();
5317 int64x2_t vc = vdupq_n_s64((int64_t) c);
5318 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5330#define _mm_slli_epi16(a, imm) \
5333 if (unlikely((imm) <= 0)) { \
ret = a; \
} \
5336 if (unlikely((imm) > 15)) { \
5337 ret = _mm_setzero_si128(); \
5339 ret = vreinterpretq_m128i_s16( \
5340 vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
5349FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5351 if (unlikely(imm <= 0))
return a;
5353 if (unlikely(imm > 31))
5354 return _mm_setzero_si128();
5355 return vreinterpretq_m128i_s32(
5356 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5361FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5363 if (unlikely(imm <= 0))
return a;
5365 if (unlikely(imm > 63))
5366 return _mm_setzero_si128();
5367 return vreinterpretq_m128i_s64(
5368 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5378#define _mm_slli_si128(a, imm) \
5381 if (unlikely((imm) <= 0)) { \
ret = a; \
} \
5384 if (unlikely((imm) > 15)) { \
5385 ret = _mm_setzero_si128(); \
5387 ret = vreinterpretq_m128i_s8(vextq_s8( \
5388 vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
5396FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5398#if defined(__aarch64__)
5399 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5401 double a0 = sqrt(((double *) &a)[0]);
5402 double a1 = sqrt(((double *) &a)[1]);
5403 return _mm_set_pd(a1, a0);
5411FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5413#if defined(__aarch64__)
5414 return _mm_move_sd(a, _mm_sqrt_pd(b));
5416 return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5429FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5431 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5432 if (unlikely(c > 15))
5433 return _mm_cmplt_epi16(a, _mm_setzero_si128());
5434 return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5446FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5448 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5449 if (unlikely(c > 31))
5450 return _mm_cmplt_epi32(a, _mm_setzero_si128());
5451 return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5457FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5459 const int count = (imm & ~15) ? 15 : imm;
5460 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5477#define _mm_srai_epi32(a, imm) \
5480 if (unlikely((imm) == 0)) { \
ret = a; \
5482 } else if (likely(0 < (imm) && (imm) < 32)) { \
5483 ret = vreinterpretq_m128i_s32( \
5484 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
5486 ret = vreinterpretq_m128i_s32( \
5487 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
5501FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5503 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5504 if (unlikely(c > 15))
5505 return _mm_setzero_si128();
5507 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5508 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5520FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5522 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5523 if (unlikely(c > 31))
5524 return _mm_setzero_si128();
5526 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5527 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5537FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5539 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5540 if (unlikely(c > 63))
5541 return _mm_setzero_si128();
5543 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5544 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5560#define _mm_srli_epi16(a, imm) \
5563 if (unlikely((imm) == 0)) { \
ret = a; \
5565 } else if (likely(0 < (imm) && (imm) < 16)) { \
5566 ret = vreinterpretq_m128i_u16( \
5567 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
5569 ret = _mm_setzero_si128(); \
5588#define _mm_srli_epi32(a, imm) \
5591 if (unlikely((imm) == 0)) { \
ret = a; \
5593 } else if (likely(0 < (imm) && (imm) < 32)) { \
5594 ret = vreinterpretq_m128i_u32( \
5595 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
5597 ret = _mm_setzero_si128(); \
5615#define _mm_srli_epi64(a, imm) \
5618 if (unlikely((imm) == 0)) { \
ret = a; \
5620 } else if (likely(0 < (imm) && (imm) < 64)) { \
5621 ret = vreinterpretq_m128i_u64( \
5622 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
5624 ret = _mm_setzero_si128(); \
5636#define _mm_srli_si128(a, imm) \
5639 if (unlikely((imm) <= 0)) { \
ret = a; \
} \
5642 if (unlikely((imm) > 15)) { \
5643 ret = _mm_setzero_si128(); \
5645 ret = vreinterpretq_m128i_s8( \
5646 vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
5655FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5657#if defined(__aarch64__)
5658 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5660 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5668FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5670#if defined(__aarch64__)
5671 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5672 vst1q_f64((float64_t *) mem_addr,
5673 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5675 float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5676 vst1q_f32((float32_t *) mem_addr,
5677 vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5684FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5686#if defined(__aarch64__)
5687 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5689 vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5695FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5697 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5704#define _mm_store1_pd _mm_store_pd1
5712FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5714#if defined(__aarch64__)
5715 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5717 vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5723FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5725 uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
5726 uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
5727 *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
5736FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5738#if defined(__aarch64__)
5739 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5741 vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5753FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5755 float32x4_t f = vreinterpretq_f32_m128d(a);
5756 _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5763FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
5765 _mm_store_pd(mem_addr, a);
5770FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5772 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5777FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5779 vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5787FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5789#if __has_builtin(__builtin_nontemporal_store)
5790 __builtin_nontemporal_store(a, (float32x4_t *) p);
5791#elif defined(__aarch64__)
5792 vst1q_f64(p, vreinterpretq_f64_m128d(a));
5794 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5802FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5804#if __has_builtin(__builtin_nontemporal_store)
5805 __builtin_nontemporal_store(a, p);
5807 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5815FORCE_INLINE void _mm_stream_si32(int *p, int a)
5817 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5823FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
5825 return vreinterpretq_m128i_s16(
5826 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5838FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
5840 return vreinterpretq_m128i_s32(
5841 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5848FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
5850 return vreinterpretq_m128i_s64(
5851 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5857FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
5859 return vreinterpretq_m128i_s8(
5860 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5873FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
5875#if defined(__aarch64__)
5876 return vreinterpretq_m128d_f64(
5877 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5879 double *da = (double *) &a;
5880 double *db = (double *) &b;
double c[2];
5882 c[0] = da[0] - db[0];
5883 c[1] = da[1] - db[1];
5884 return vld1q_f32((float32_t *) c);
5893FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
5895 return _mm_move_sd(a, _mm_sub_pd(a, b));
5903FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
5905 return vreinterpret_m64_s64(
5906 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
5918FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
5920 return vreinterpretq_m128i_s16(
5921 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5933FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
5935 return vreinterpretq_m128i_s8(
5936 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5942FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
5944 return vreinterpretq_m128i_u16(
5945 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
5957FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
5959 return vreinterpretq_m128i_u8(
5960 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
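// The unordered _mm_ucomi*_sd intrinsics below are simple aliases of the
// ordered _mm_comi*_sd helpers above; the x86 distinction in NaN signalling
// behaviour is not modelled here.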
5963#define _mm_ucomieq_sd _mm_comieq_sd
5964#define _mm_ucomige_sd _mm_comige_sd
5965#define _mm_ucomigt_sd _mm_comigt_sd
5966#define _mm_ucomile_sd _mm_comile_sd
5967#define _mm_ucomilt_sd _mm_comilt_sd
5968#define _mm_ucomineq_sd _mm_comineq_sd
5972FORCE_INLINE __m128d _mm_undefined_pd(void)
5974#if defined(__GNUC__) || defined(__clang__)
5975#pragma GCC diagnostic push
5976#pragma GCC diagnostic ignored "-Wuninitialized"
__m128d a;
return a;
5980#if defined(__GNUC__) || defined(__clang__)
5981#pragma GCC diagnostic pop
5998FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
6000#if defined(__aarch64__)
6001 return vreinterpretq_m128i_s16(
6002 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6004 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
6005 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
6006 int16x4x2_t result = vzip_s16(a1, b1);
6007 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6014FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
6016#if defined(__aarch64__)
6017 return vreinterpretq_m128i_s32(
6018 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6020 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
6021 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
6022 int32x2x2_t result = vzip_s32(a1, b1);
6023 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6032FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
6034 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
6035 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
6036 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
6051FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
6053#if defined(__aarch64__)
6054 return vreinterpretq_m128i_s8(
6055 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6058 int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
6060 int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
6061 int8x8x2_t result = vzip_s8(a1, b1);
6062 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6077FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
6079#if defined(__aarch64__)
6080 return vreinterpretq_m128d_f64(
6081 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6083 return vreinterpretq_m128d_s64(
6084 vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
6085 vget_high_s64(vreinterpretq_s64_m128d(b))));
6102FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6104#if defined(__aarch64__)
6105 return vreinterpretq_m128i_s16(
6106 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6108 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6109 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6110 int16x4x2_t result = vzip_s16(a1, b1);
6111 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6124FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
6126#if defined(__aarch64__)
6127 return vreinterpretq_m128i_s32(
6128 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6130 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6131 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6132 int32x2x2_t result = vzip_s32(a1, b1);
6133 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6137FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
6139 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
6140 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
6141 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
6156FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
6158#if defined(__aarch64__)
6159 return vreinterpretq_m128i_s8(
6160 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6162 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6163 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6164 int8x8x2_t result = vzip_s8(a1, b1);
6165 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6180FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
6182#if defined(__aarch64__)
6183 return vreinterpretq_m128d_f64(
6184 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6186 return vreinterpretq_m128d_s64(
6187 vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
6188 vget_low_s64(vreinterpretq_s64_m128d(b))));
6201FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
6203 return vreinterpretq_m128d_s64(
6204 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
6209FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
6211 return vreinterpretq_m128i_s32(
6212 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
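// _mm_addsub_pd: subtract in the low lane, add in the high lane.  The
// {-1, +1} mask turns this into a single fused multiply-add (a + b * mask)
// where FMA is available, and a multiply plus add otherwise.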
6231FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
6233 __m128d mask = _mm_set_pd(1.0f, -1.0f);
6234#if defined(__aarch64__)
6235 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6236 vreinterpretq_f64_m128d(b),
6237 vreinterpretq_f64_m128d(mask)));
6239 return _mm_add_pd(_mm_mul_pd(b, mask), a);
6247FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
6249 __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
6250#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA)
6251 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
6252 vreinterpretq_f32_m128(mask),
6253 vreinterpretq_f32_m128(b)));
6255 return _mm_add_ps(_mm_mul_ps(b, mask), a);
6262FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
6264#if defined(__aarch64__)
6265 return vreinterpretq_m128d_f64(
6266 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6268 double *da = (double *) &a;
6269 double *db = (double *) &b;
6270 double c[] = {da[0] + da[1], db[0] + db[1]};
6271 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6278FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
6280#if defined(__aarch64__)
6281 return vreinterpretq_m128_f32(
6282 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6284 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
6285 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
6286 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
6287 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
6288 return vreinterpretq_m128_f32(
6289 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6296FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
6298#if defined(__aarch64__)
6299 return vreinterpretq_m128d_f64(vsubq_f64(
6300 vuzp1q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)),
6301 vuzp2q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b))));
6303 double *da = (double *) &_a;
6304 double *db = (double *) &_b;
6305 double c[] = {da[0] - da[1], db[0] - db[1]};
6306 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6313FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
6315#if defined(__aarch64__)
6316 return vreinterpretq_m128_f32(vsubq_f32(
6317 vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
6318 vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
6321 float32x4x2_t c = vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
6322 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
6333#define _mm_lddqu_si128 _mm_loadu_si128
6342#define _mm_loaddup_pd _mm_load1_pd
6347FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6350 return vreinterpretq_m128d_f64(
6351 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6353 return vreinterpretq_m128d_u64(
6354 vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6361FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6363#if __has_builtin(__builtin_shufflevector)
6364 return vreinterpretq_m128_f32(__builtin_shufflevector(
6365 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6367 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6368 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6369 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6370 return vreinterpretq_m128_f32(vld1q_f32(data));
6377FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6379#if __has_builtin(__builtin_shufflevector)
6380 return vreinterpretq_m128_f32(__builtin_shufflevector(
6381 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6383 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6384 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6385 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6386 return vreinterpretq_m128_f32(vld1q_f32(data));
6401FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6403 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6415FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6417 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6429FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6431 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6443FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6445 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6457FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6459 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6471FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6473 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
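// Concatenate a (high half) and b (low half) into a 32-byte value, shift the
// whole thing right by imm bytes, and keep the low 16 bytes. For example,
// _mm_alignr_epi8(a, b, 4) should produce bytes {b4..b15, a0..a3}; imm >= 32
// gives all zeros.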
6483#define _mm_alignr_epi8(a, b, imm) \
6486 if (unlikely((imm) >= 32)) { \
6487 ret = _mm_setzero_si128(); \
6489 uint8x16_t tmp_low, tmp_high; \
6491 const int idx = imm - 16; \
6492 tmp_low = vreinterpretq_u8_m128i(a); \
6493 tmp_high = vdupq_n_u8(0); \
6495 vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
6497 const int idx = imm; \
6498 tmp_low = vreinterpretq_u8_m128i(b); \
6499 tmp_high = vreinterpretq_u8_m128i(a); \
6501 vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
6514#define _mm_alignr_pi8(a, b, imm) \
6517 if (unlikely((imm) >= 16)) { \
6518 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6520 uint8x8_t tmp_low, tmp_high; \
6522 const int idx = imm - 8; \
6523 tmp_low = vreinterpret_u8_m64(a); \
6524 tmp_high = vdup_n_u8(0); \
6525 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6527 const int idx = imm; \
6528 tmp_low = vreinterpret_u8_m64(b); \
6529 tmp_high = vreinterpret_u8_m64(a); \
6530 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6538FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6540 int16x8_t a = vreinterpretq_s16_m128i(_a);
6541 int16x8_t b = vreinterpretq_s16_m128i(_b);
6542#if defined(__aarch64__)
6543 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6545 return vreinterpretq_m128i_s16(
6546 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6547 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6553FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6555 int32x4_t a = vreinterpretq_s32_m128i(_a);
6556 int32x4_t b = vreinterpretq_s32_m128i(_b);
6557 return vreinterpretq_m128i_s32(
6558 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6559 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6565FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6567 return vreinterpret_m64_s16(
6568 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6574FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6576 return vreinterpret_m64_s32(
6577 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6582FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6584#if defined(__aarch64__)
6585 int16x8_t a = vreinterpretq_s16_m128i(_a);
6586 int16x8_t b = vreinterpretq_s16_m128i(_b);
6587 return vreinterpretq_s64_s16(
6588 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6590 int32x4_t a = vreinterpretq_s32_m128i(_a);
6591 int32x4_t b = vreinterpretq_s32_m128i(_b);
6595 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6596 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6598 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6605FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6607 int16x4_t a = vreinterpret_s16_m64(_a);
6608 int16x4_t b = vreinterpret_s16_m64(_b);
6609#if defined(__aarch64__)
6610 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6612 int16x4x2_t res = vuzp_s16(a, b);
6613 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6619FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6621 int32x4_t a = vreinterpretq_s32_m128i(_a);
6622 int32x4_t b = vreinterpretq_s32_m128i(_b);
6626 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6627 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6629 return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
6634FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6636 int64x2_t a = vreinterpretq_s64_m128i(_a);
6637 int64x2_t b = vreinterpretq_s64_m128i(_b);
6641 int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
6642 int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
6644 return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
6650FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6653 int32x4_t ab = vcombine_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
6655 int16x4_t ab_low_bits = vmovn_s32(ab);
6656 int16x4_t ab_high_bits = vshrn_n_s32(ab, 16);
6658 return vreinterpret_m64_s16(vsub_s16(ab_low_bits, ab_high_bits));
6664FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6666#if defined(__aarch64__)
6667 int32x2_t a = vreinterpret_s32_m64(_a);
6668 int32x2_t b = vreinterpret_s32_m64(_b);
6669 return vreinterpret_m64_s32(vsub_s32(vtrn1_s32(a, b), vtrn2_s32(a, b)));
6671 int32x2x2_t trn_ab =
6672 vtrn_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
6673 return vreinterpret_m64_s32(vsub_s32(trn_ab.val[0], trn_ab.val[1]));
6680FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6682#if defined(__aarch64__)
6683 int16x8_t a = vreinterpretq_s16_m128i(_a);
6684 int16x8_t b = vreinterpretq_s16_m128i(_b);
6685 return vreinterpretq_s64_s16(
6686 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6688 int32x4_t a = vreinterpretq_s32_m128i(_a);
6689 int32x4_t b = vreinterpretq_s32_m128i(_b);
6693 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6694 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6696 return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
6703FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6705 int16x4_t a = vreinterpret_s16_m64(_a);
6706 int16x4_t b = vreinterpret_s16_m64(_b);
6707#if defined(__aarch64__)
6708 return vreinterpret_s64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6710 int16x4x2_t res = vuzp_s16(a, b);
6711 return vreinterpret_s64_s16(vqsub_s16(res.val[0], res.val[1]));
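// Vertically multiply each unsigned 8-bit integer of _a with the corresponding
// signed 8-bit integer of _b, then horizontally add adjacent pairs of the
// 16-bit products with signed saturation, so lane i of the result is roughly
// saturate16(a[2i]*b[2i] + a[2i+1]*b[2i+1]).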
6725FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
6727#if defined(__aarch64__)
6728 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6729 int8x16_t b = vreinterpretq_s8_m128i(_b);
6730 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6731 vmovl_s8(vget_low_s8(b)));
6732 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6733 vmovl_s8(vget_high_s8(b)));
6734 return vreinterpretq_m128i_s16(
6735 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6739 uint16x8_t a = vreinterpretq_u16_m128i(_a);
6740 int16x8_t b = vreinterpretq_s16_m128i(_b);
6743 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6744 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6747 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6748 int16x8_t b_odd = vshrq_n_s16(b, 8);
6751 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6752 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6755 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
6764FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6766 uint16x4_t a = vreinterpret_u16_m64(_a);
6767 int16x4_t b = vreinterpret_s16_m64(_b);
6770 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6771 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6774 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6775 int16x4_t b_odd = vshr_n_s16(b, 8);
6778 int16x4_t prod1 = vmul_s16(a_even, b_even);
6779 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6782 return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
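// Multiply packed signed 16-bit integers into 32-bit intermediates, then round
// and keep the high bits: each lane is effectively
// (int16_t)((a[i] * b[i] + 0x4000) >> 15), which the NEON path below obtains
// from vrshrn_n_s32(..., 15), a rounding narrowing shift.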
6794FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
6800 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
6801 vget_low_s16(vreinterpretq_s16_m128i(b)));
6802 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
6803 vget_high_s16(vreinterpretq_s16_m128i(b)));
6807 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6808 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6811 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
6818FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6820 int32x4_t mul_extend =
6821 vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6824 return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
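// Shuffle packed 8-bit integers in a according to the control bytes in b.
// Each output byte i is a[b[i] & 0x0f], or zero when the high bit of b[i] is
// set (hence the 0x8F mask below). A minimal illustrative use, assuming
// standard SSSE3 semantics, is reversing the 16 bytes of x:
//   __m128i rev = _mm_shuffle_epi8(
//       x, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));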
6830FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
6832 int8x16_t tbl = vreinterpretq_s8_m128i(a);
6833 uint8x16_t idx = vreinterpretq_u8_m128i(b);
6834 uint8x16_t idx_masked =
6835 vandq_u8(idx, vdupq_n_u8(0x8F));
6836#if defined(__aarch64__)
6837 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
6838#elif defined(__GNUC__)
 int8x16_t ret;
6842 __asm__ __volatile__(
6843 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
6844 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
 : [ret] "=&w"(ret)
6846 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
6847 return vreinterpretq_m128i_s8(ret);
6850 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
6851 return vreinterpretq_m128i_s8(
6852 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
6853 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
6871FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
6873 const int8x8_t controlMask =
6874 vand_s8(vreinterpret_s8_m64(b), vdup_n_s8(1 << 7 | 0x07));
6875 int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
6876 return vreinterpret_m64_s8(res);
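// Negate packed 16-bit integers in _a where the corresponding lane of _b is
// negative, zero them where _b is zero, and pass them through unchanged where
// _b is positive.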
6893FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
6895 int16x8_t a = vreinterpretq_s16_m128i(_a);
6896 int16x8_t b = vreinterpretq_s16_m128i(_b);
6900 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6902#if defined(__aarch64__)
6903 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6905 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6910 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6912 int16x8_t res = vbicq_s16(masked, zeroMask);
6913 return vreinterpretq_m128i_s16(res);
6930FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
6932 int32x4_t a = vreinterpretq_s32_m128i(_a);
6933 int32x4_t b = vreinterpretq_s32_m128i(_b);
6937 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6940#if defined(__aarch64__)
6941 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6943 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6948 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6950 int32x4_t res = vbicq_s32(masked, zeroMask);
6951 return vreinterpretq_m128i_s32(res);
6968FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
6970 int8x16_t a = vreinterpretq_s8_m128i(_a);
6971 int8x16_t b = vreinterpretq_s8_m128i(_b);
6975 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6978#if defined(__aarch64__)
6979 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6981 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6986 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6988 int8x16_t res = vbicq_s8(masked, zeroMask);
6990 return vreinterpretq_m128i_s8(res);
7009FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
7011 int16x4_t a = vreinterpret_s16_m64(_a);
7012 int16x4_t b = vreinterpret_s16_m64(_b);
7016 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
7019#if defined(__aarch64__)
7020 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
7022 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
7027 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
7029 int16x4_t res = vbic_s16(masked, zeroMask);
7031 return vreinterpret_m64_s16(res);
7050FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
7052 int32x2_t a = vreinterpret_s32_m64(_a);
7053 int32x2_t b = vreinterpret_s32_m64(_b);
7057 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
7060#if defined(__aarch64__)
7061 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
7063 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
7068 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
7070 int32x2_t res = vbic_s32(masked, zeroMask);
7072 return vreinterpret_m64_s32(res);
7091FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
7093 int8x8_t a = vreinterpret_s8_m64(_a);
7094 int8x8_t b = vreinterpret_s8_m64(_b);
7098 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
7101#if defined(__aarch64__)
7102 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
7104 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
7109 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
7111 int8x8_t res = vbic_s8(masked, zeroMask);
7113 return vreinterpret_m64_s8(res);
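// Blend packed 16-bit integers: result lane i comes from b when bit i of imm
// is set, otherwise from a; e.g. _mm_blend_epi16(a, b, 0x0F) takes the low
// four lanes from b and the high four from a.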
7131#define _mm_blend_epi16(a, b, imm) \
7133 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
7134 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
7135 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
7136 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
7137 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
7138 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
7139 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
7140 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
7141 uint16x8_t _mask_vec = vld1q_u16(_mask); \
7142 uint16x8_t _a = vreinterpretq_u16_m128i(a); \
7143 uint16x8_t _b = vreinterpretq_u16_m128i(b); \
7144 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
7150#define _mm_blend_pd(a, b, imm) \
7152 const uint64_t _mask[2] = { \
7153 ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
7154 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \
7155 uint64x2_t _mask_vec = vld1q_u64(_mask); \
7156 uint64x2_t _a = vreinterpretq_u64_m128d(a); \
7157 uint64x2_t _b = vreinterpretq_u64_m128d(b); \
7158 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
7164FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
7166 const uint32_t ALIGN_STRUCT(16)
7167 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7168 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7169 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7170 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7171 uint32x4_t mask = vld1q_u32(data);
7172 float32x4_t a = vreinterpretq_f32_m128(_a);
7173 float32x4_t b = vreinterpretq_f32_m128(_b);
7174 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7188FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
7192 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
7193 uint8x16_t a = vreinterpretq_u8_m128i(_a);
7194 uint8x16_t b = vreinterpretq_u8_m128i(_b);
7195 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
7201FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
7204 vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
7205#if defined(__aarch64__)
7206 float64x2_t a = vreinterpretq_f64_m128d(_a);
7207 float64x2_t b = vreinterpretq_f64_m128d(_b);
7208 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7210 uint64x2_t a = vreinterpretq_u64_m128d(_a);
7211 uint64x2_t b = vreinterpretq_u64_m128d(_b);
7212 return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
7219FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
7223 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
7224 float32x4_t a = vreinterpretq_f32_m128(_a);
7225 float32x4_t b = vreinterpretq_f32_m128(_b);
7226 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7233FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
7235#if defined(__aarch64__)
7236 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7238 double *f = (double *) &a;
7239 return _mm_set_pd(ceil(f[1]), ceil(f[0]));
7247FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
7249#if defined(__aarch64__)
7250 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
7252 float *f = (float *) &a;
7253 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
7262FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
7264 return _mm_move_sd(a, _mm_ceil_pd(b));
7276FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
7278 return _mm_move_ss(a, _mm_ceil_ps(b));
7283FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
7285#if defined(__aarch64__)
7286 return vreinterpretq_m128i_u64(
7287 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
7292 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
7293 uint32x4_t swapped = vrev64q_u32(cmp);
7294 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
7300FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
7302 return vreinterpretq_m128i_s32(
7303 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
7308FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
7310 int16x8_t s16x8 = vreinterpretq_s16_m128i(a);
7311 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7312 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
7313 return vreinterpretq_m128i_s64(s64x2);
7318FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
7320 return vreinterpretq_m128i_s64(
7321 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
7326FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
7328 int8x16_t s8x16 = vreinterpretq_s8_m128i(a);
7329 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7330 return vreinterpretq_m128i_s16(s16x8);
7335FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
7337 int8x16_t s8x16 = vreinterpretq_s8_m128i(a);
7338 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7339 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7340 return vreinterpretq_m128i_s32(s32x4);
7345FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
7347 int8x16_t s8x16 = vreinterpretq_s8_m128i(a);
7348 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));
7349 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8));
7350 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4));
7351 return vreinterpretq_m128i_s64(s64x2);
7356FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
7358 return vreinterpretq_m128i_u32(
7359 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
7364FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
7366 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);
7367 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7368 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
7369 return vreinterpretq_m128i_u64(u64x2);
7374FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
7376 return vreinterpretq_m128i_u64(
7377 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
7383FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
7385 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);
7386 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7387 return vreinterpretq_m128i_u16(u16x8);
7393FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
7395 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);
7396 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7397 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7398 return vreinterpretq_m128i_u32(u32x4);
7403FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
7405 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);
7406 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));
7407 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8));
7408 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4));
7409 return vreinterpretq_m128i_u64(u64x2);
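// Conditional dot product: bits [7:4] of imm select which element-wise
// products of a and b enter the sum, and bits [3:0] select which lanes of the
// result receive that sum (the remaining lanes are zeroed). For instance,
// imm == 0xFF broadcasts the full four-element dot product to every lane.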
7416FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7418#if defined(__aarch64__)
7421 return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
7424 float32x4_t m = _mm_mul_ps(a, b);
7426 return _mm_set1_ps(vaddvq_f32(m));
7431 float32x4_t f32a = vreinterpretq_f32_m128(a);
7432 float32x4_t f32b = vreinterpretq_f32_m128(b);
7438 _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
7440 _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
7442 _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
7444 _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
7448 (imm & 0x1) ? s : 0,
7449 (imm & 0x2) ? s : 0,
7450 (imm & 0x4) ? s : 0,
7451 (imm & 0x8) ? s : 0,
7453 return vreinterpretq_m128_f32(res);
7459#define _mm_extract_epi32(a, imm) \
7460 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7465#define _mm_extract_epi64(a, imm) \
7466 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7472#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7476#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7482FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7484#if defined(__aarch64__)
7485 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7487 double *f = (double *) &a;
7488 return _mm_set_pd(floor(f[1]), floor(f[0]));
7496FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7498#if defined(__aarch64__)
7499 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7501 float *f = (float *) &a;
7502 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7511FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7513 return _mm_move_sd(a, _mm_floor_pd(b));
7525FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7527 return _mm_move_ss(a, _mm_floor_ps(b));
7534#define _mm_insert_epi32(a, b, imm) \
7536 vreinterpretq_m128i_s32( \
7537 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
7544#define _mm_insert_epi64(a, b, imm) \
7546 vreinterpretq_m128i_s64( \
7547 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
7554#define _mm_insert_epi8(a, b, imm) \
7556 vreinterpretq_m128i_s8( \
7557 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
7564#define _mm_insert_ps(a, b, imm8) \
7566 float32x4_t tmp1 = vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \
7567 vreinterpretq_f32_m128(a), 0); \
7568 float32x4_t tmp2 = \
7569 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
7570 ((imm8 >> 4) & 0x3)); \
7571 const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7572 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7573 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7574 ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \
7575 uint32x4_t mask = vld1q_u32(data); \
7576 float32x4_t all_zeros = vdupq_n_f32(0); \
7578 vreinterpretq_m128_f32( \
7579 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
7593FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7595 return vreinterpretq_m128i_s32(
7596 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7602FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7604 return vreinterpretq_m128i_s8(
7605 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7611FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7613 return vreinterpretq_m128i_u16(
7614 vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7620FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7622 return vreinterpretq_m128i_u32(
7623 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7636FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7638 return vreinterpretq_m128i_s32(
7639 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7645FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7647 return vreinterpretq_m128i_s8(
7648 vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7654FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7656 return vreinterpretq_m128i_u16(
7657 vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7663FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7665 return vreinterpretq_m128i_u32(
7666 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
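// Find the minimum unsigned 16-bit element in a: lane 0 of the result holds
// the minimum value, lane 1 holds its index, and the remaining lanes are
// zeroed.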
7686FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
 __m128i dst;
7689 uint16_t min, idx = 0;
7691#if defined(__aarch64__)
7692 min = vminvq_u16(vreinterpretq_u16_m128i(a));
 __m64 tmp;
7695 tmp = vreinterpret_m64_u16(
7696 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7697 vget_high_u16(vreinterpretq_u16_m128i(a))));
7698 tmp = vreinterpret_m64_u16(
7699 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7700 tmp = vreinterpret_m64_u16(
7701 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7702 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7706 for (i = 0; i < 8; i++) {
7707 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
 idx = (uint16_t) i;
 break;
 }
7711 a = _mm_srli_si128(a, 2);
7714 dst = _mm_setzero_si128();
7715 dst = vreinterpretq_m128i_u16(
7716 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
7717 dst = vreinterpretq_m128i_u16(
7718 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
7727FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
7730 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
7731 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
7732 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
7738FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
7740 return vreinterpretq_m128i_s32(
7741 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7755FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
7757 return vreinterpretq_m128i_u16(
7758 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
7759 vqmovun_s32(vreinterpretq_s32_m128i(b))));
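// Round packed double-precision elements according to the rounding parameter:
// one of _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF,
// _MM_FROUND_TO_POS_INF or _MM_FROUND_TO_ZERO (usually combined with
// _MM_FROUND_NO_EXC), or _MM_FROUND_CUR_DIRECTION to use the mode reported by
// _MM_GET_ROUNDING_MODE().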
7766FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
7768#if defined(__aarch64__)
7770 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7771 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
7772 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7773 return _mm_floor_pd(a);
7774 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7775 return _mm_ceil_pd(a);
7776 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7777 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
7779 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
7782 double *v_double = (double *) &a;
7784 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7785 (rounding == _MM_FROUND_CUR_DIRECTION &&
7786 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7788 for (int i = 0; i < 2; i++) {
7789 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
7790 double roundDown = floor(tmp);
7791 double roundUp = ceil(tmp);
7792 double diffDown = tmp - roundDown;
7793 double diffUp = roundUp - tmp;
7794 if (diffDown < diffUp) {
7797 } else if (diffDown > diffUp) {
7803 double half = roundDown / 2;
7804 if (half != floor(half)) {
7814 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
7816 return _mm_set_pd(res[1], res[0]);
7817 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7818 (rounding == _MM_FROUND_CUR_DIRECTION &&
7819 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7820 return _mm_floor_pd(a);
7821 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7822 (rounding == _MM_FROUND_CUR_DIRECTION &&
7823 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7824 return _mm_ceil_pd(a);
7826 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
7827 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
7835FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
7837#if defined(__aarch64__)
7839 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7840 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
7841 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7842 return _mm_floor_ps(a);
7843 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7844 return _mm_ceil_ps(a);
7845 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7846 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
7848 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
7851 float *v_float = (float *) &a;
7853 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7854 (rounding == _MM_FROUND_CUR_DIRECTION &&
7855 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7856 uint32x4_t signmask = vdupq_n_u32(0x80000000);
7857 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
7859 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
7860 vreinterpretq_f32_m128(a), half));
7861 int32x4_t r_trunc = vcvtq_s32_f32(
7862 vreinterpretq_f32_m128(a));
7863 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
7864 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
7865 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
7867 float32x4_t delta = vsubq_f32(
7868 vreinterpretq_f32_m128(a),
7869 vcvtq_f32_s32(r_trunc));
7870 uint32x4_t is_delta_half =
7871 vceqq_f32(delta, half);
7872 return vreinterpretq_m128_f32(
7873 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
7874 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7875 (rounding == _MM_FROUND_CUR_DIRECTION &&
7876 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7877 return _mm_floor_ps(a);
7878 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7879 (rounding == _MM_FROUND_CUR_DIRECTION &&
7880 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7881 return _mm_ceil_ps(a);
7883 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
7884 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
7885 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
7886 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
7895FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
7897 return _mm_move_sd(a, _mm_round_pd(b, rounding));
7915FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
7917 return _mm_move_ss(a, _mm_round_ps(b, rounding));
7927FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
7929#if __has_builtin(__builtin_nontemporal_store)
7930 return __builtin_nontemporal_load(p);
7932 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
7939FORCE_INLINE int _mm_test_all_ones(__m128i a)
7941 return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == ~(uint64_t) 0;
7948FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
7950 int64x2_t a_and_mask =
7951 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
7952 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7961FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
7964 uint64x2_t zf = vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
7966 uint64x2_t cf = vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
7967 uint64x2_t result = vandq_u64(zf, cf);
7968 return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
7976FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
7979 int64x2_t s64 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
7980 vreinterpretq_s64_m128i(b));
7981 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7990#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7997FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
8000 int64x2_t s64 = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
8001 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
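// Compare packed signed 64-bit integers for a > b, producing all-ones lanes
// where true. The ARMv7 fallback relies on saturating subtraction: the sign
// bit of vqsubq_s64(b, a) is set exactly when a > b (saturation prevents
// wraparound), and an arithmetic shift right by 63 broadcasts that bit across
// the whole lane.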
8008FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
8010#if defined(__aarch64__)
8011 return vreinterpretq_m128i_u64(
8012 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
8014 return vreinterpretq_m128i_s64(vshrq_n_s64(
8015 vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), 63));
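// The _mm_crc32_* family accumulates a CRC-32C (Castagnoli) checksum, i.e.
// the reflected polynomial 0x82F63B78 used in the bit-by-bit fallback below.
// On AArch64 with the CRC32 extension, the dedicated crc32c* instructions are
// used instead.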
8023FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
8025#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8026 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
 : [c] "+r"(crc)
 : [v] "r"(v));
#else
8030 crc = _mm_crc32_u8(crc, v & 0xff);
8031 crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
8039FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
8041#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8042 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
 : [c] "+r"(crc)
 : [v] "r"(v));
#else
8046 crc = _mm_crc32_u16(crc, v & 0xffff);
8047 crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
8055FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
8057#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8058 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
 : [c] "+r"(crc)
 : [v] "r"(v));
#else
8062 crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
8063 crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
8071FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
8073#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8074 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
 : [c] "+r"(crc)
 : [v] "r"(v));
#else
 crc ^= v;
8079 for (int bit = 0; bit < 8; bit++) {
 if (crc & 1)
8081 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
 else
 crc = (crc >> 1);
8091#if !defined(__ARM_FEATURE_CRYPTO)
8093#define SSE2NEON_AES_DATA(w) \
8095 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8096 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8097 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8098 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8099 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8100 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8101 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8102 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8103 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8104 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8105 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8106 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8107 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8108 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8109 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8110 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8111 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8112 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8113 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8114 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8115 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8116 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8117 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8118 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8119 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8120 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8121 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8122 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8123 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8124 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8125 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8126 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8127 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8128 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8129 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8130 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8131 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8136#define SSE2NEON_AES_H0(x) (x)
8137static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
8138#undef SSE2NEON_AES_H0
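// _mm_aesenc_si128 performs one full AES encryption round on EncBlock:
// ShiftRows, SubBytes, MixColumns, then XOR with RoundKey. Without the ARM
// Crypto extension this is emulated with the S-box above (AArch64) or with
// the classic T-tables (ARMv7).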
8146FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
8148#if defined(__aarch64__)
8149 static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
8150 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
8151 0xc, 0x1, 0x6, 0xb};
8152 static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8153 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
 uint8x16_t v;
8156 uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
8159 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8162 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
8163 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
8164 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
8165 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
8168 w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
8169 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8170 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8173 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8176#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8177 (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
8179#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b ))
8180#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8181#define SSE2NEON_AES_U0(p) \
8182 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8183#define SSE2NEON_AES_U1(p) \
8184 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8185#define SSE2NEON_AES_U2(p) \
8186 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8187#define SSE2NEON_AES_U3(p) \
8188 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8189 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8190 SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
8191 SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
8192 SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
8193 SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
8195#undef SSE2NEON_AES_B2W
8196#undef SSE2NEON_AES_F2
8197#undef SSE2NEON_AES_F3
8198#undef SSE2NEON_AES_U0
8199#undef SSE2NEON_AES_U1
8200#undef SSE2NEON_AES_U2
8201#undef SSE2NEON_AES_U3
8203 uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
8204 uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
8205 uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
8206 uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
8208 __m128i out = _mm_set_epi32(
8209 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8210 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8211 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8212 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8213 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8214 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8215 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8216 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8218 return _mm_xor_si128(out, RoundKey);
8225FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
 uint8_t v[4][4] = {
8229 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
8230 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
8231 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
8232 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
8233 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
8234 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
8235 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
8236 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
8237 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
8238 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
8239 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
8240 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
8241 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
8242 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
8243 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
8244 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
 };
8246 for (int i = 0; i < 16; i++)
8247 vreinterpretq_nth_u8_m128i(a, i) =
8248 v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
8258FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
8260 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
8261 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
8262 for (int i = 0; i < 4; ++i) {
8263 ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
8264 ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
8266 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
8267 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8269#undef SSE2NEON_AES_DATA
8278FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
8280 return vreinterpretq_m128i_u8(
8281 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
8282 vreinterpretq_u8_m128i(b));
8286FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8288 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
8289 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8293FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
8296 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
 uint8x16_t dest = {
8300 u8[0x4], u8[0x1], u8[0xE], u8[0xB],
8301 u8[0x1], u8[0xE], u8[0xB], u8[0x4],
8302 u8[0xC], u8[0x9], u8[0x6], u8[0x3],
8303 u8[0x9], u8[0x6], u8[0x3], u8[0xC],
 };
8305 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
8306 return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
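// Carry-less (polynomial) multiplication of one 64-bit half of _a by one
// 64-bit half of _b, giving a 128-bit product: bit 0 of imm selects the half
// of _a and bit 4 selects the half of _b (e.g. imm == 0x11 multiplies the two
// high halves).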
8315FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
8317 uint64x2_t a = vreinterpretq_u64_m128i(_a);
8318 uint64x2_t b = vreinterpretq_u64_m128i(_b);
8319 switch (imm & 0x11) {
8321 return vreinterpretq_m128i_u64(
8322 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
8324 return vreinterpretq_m128i_u64(
8325 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
8327 return vreinterpretq_m128i_u64(
8328 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
8330 return vreinterpretq_m128i_u64(
8331 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
8340FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
8342#if defined(__aarch64__)
8343#if __has_builtin(__builtin_popcount)
8344 return __builtin_popcount(a);
8346 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
 uint32_t count = 0;
8350 uint8x8_t input_val, count8x8_val;
8351 uint16x4_t count16x4_val;
8352 uint32x2_t count32x2_val;
8354 input_val = vld1_u8((uint8_t *) &a);
8355 count8x8_val = vcnt_u8(input_val);
8356 count16x4_val = vpaddl_u8(count8x8_val);
8357 count32x2_val = vpaddl_u16(count16x4_val);
8359 vst1_u32(&count, count32x2_val);
 return count;
8367FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
8369#if defined(__aarch64__)
8370#if __has_builtin(__builtin_popcountll)
8371 return __builtin_popcountll(a);
8373 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
 uint64_t count = 0;
8377 uint8x8_t input_val, count8x8_val;
8378 uint16x4_t count16x4_val;
8379 uint32x2_t count32x2_val;
8380 uint64x1_t count64x1_val;
8382 input_val = vld1_u8((uint8_t *) &a);
8383 count8x8_val = vcnt_u8(input_val);
8384 count16x4_val = vpaddl_u8(count8x8_val);
8385 count32x2_val = vpaddl_u16(count16x4_val);
8386 count64x1_val = vpaddl_u32(count32x2_val);
8387 vst1_u64(&count, count64x1_val);
 return count;
8392#if defined(__GNUC__) || defined(__clang__)
8393#pragma pop_macro("ALIGN_STRUCT")
8394#pragma pop_macro("FORCE_INLINE")
8397#if defined(__GNUC__) && !defined(__clang__)
8398#pragma GCC pop_options