/* sse2neon.h */
1#ifndef SSE2NEON_H
2#define SSE2NEON_H
3
4// This header file provides a simple API translation layer
5// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
6//
7// This header file does not yet translate all of the SSE intrinsics.
8//
9// Contributors to this work are:
10// John W. Ratcliff <jratcliffscarab@gmail.com>
11// Brandon Rowlett <browlett@nvidia.com>
12// Ken Fast <kfast@gdeb.com>
13// Eric van Beurden <evanbeurden@nvidia.com>
14// Alexander Potylitsin <apotylitsin@nvidia.com>
15// Hasindu Gamaarachchi <hasindu2008@gmail.com>
16// Jim Huang <jserv@biilabs.io>
17// Mark Cheng <marktwtn@biilabs.io>
18// Malcolm James MacLeod <malcolm@gulden.com>
19// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20// Sebastian Pop <spop@amazon.com>
21// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22// Danila Kutenin <danilak@google.com>
23// François Turban (JishinMaster) <francois.turban@gmail.com>
24// Pei-Hsuan Hung <afcidk@gmail.com>
25// Yang-Hao Yuan <yanghau@biilabs.io>
26// Syoyo Fujita <syoyo@lighttransport.com>
27// Brecht Van Lommel <brecht@blender.org>
28
29/*
30 * sse2neon is freely redistributable under the MIT License.
31 *
32 * Permission is hereby granted, free of charge, to any person obtaining a copy
33 * of this software and associated documentation files (the "Software"), to deal
34 * in the Software without restriction, including without limitation the rights
35 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
36 * copies of the Software, and to permit persons to whom the Software is
37 * furnished to do so, subject to the following conditions:
38 *
39 * The above copyright notice and this permission notice shall be included in
40 * all copies or substantial portions of the Software.
41 *
42 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
47 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
48 * SOFTWARE.
49 */
50
51/* Tunable configurations */
52
53/* Enable precise implementation of math operations
54 * This would slow down the computation a bit, but gives consistent result with
55 * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result)
56 */
57/* _mm_min_ps and _mm_max_ps */
58#ifndef SSE2NEON_PRECISE_MINMAX
59#define SSE2NEON_PRECISE_MINMAX (0)
60#endif
61/* _mm_rcp_ps and _mm_div_ps */
62#ifndef SSE2NEON_PRECISE_DIV
63#define SSE2NEON_PRECISE_DIV (0)
64#endif
65/* _mm_sqrt_ps and _mm_rsqrt_ps */
66#ifndef SSE2NEON_PRECISE_SQRT
67#define SSE2NEON_PRECISE_SQRT (0)
68#endif
69
70#if defined(__GNUC__) || defined(__clang__)
71#pragma push_macro("FORCE_INLINE")
72#pragma push_macro("ALIGN_STRUCT")
73#define FORCE_INLINE static inline __attribute__((always_inline))
74#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
75#ifndef likely
76#define likely(x) __builtin_expect(!!(x), 1)
77#endif
78#ifndef unlikely
79#define unlikely(x) __builtin_expect(!!(x), 0)
80#endif
81#else
82#error "Macro name collisions may happen with unsupported compiler."
83#ifdef FORCE_INLINE
84#undef FORCE_INLINE
85#endif
86#define FORCE_INLINE static inline
87#ifndef ALIGN_STRUCT
88#define ALIGN_STRUCT(x) __declspec(align(x))
89#endif
90#endif
91#ifndef likely
92#define likely(x) (x)
93#endif
94#ifndef unlikely
95#define unlikely(x) (x)
96#endif
97
98#include <stdint.h>
99#include <stdlib.h>
100
101/* Architecture-specific build options */
102/* FIXME: #pragma GCC push_options is only available on GCC */
103#if defined(__GNUC__)
104#if defined(__arm__) && __ARM_ARCH == 7
105/* According to ARM C Language Extensions Architecture specification,
106 * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
107 * architecture supported.
108 */
109#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
110#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
111#endif
112#if !defined(__clang__)
113#pragma GCC push_options
114#pragma GCC target("fpu=neon")
115#endif
116#elif defined(__aarch64__)
117#if !defined(__clang__)
118#pragma GCC push_options
119#pragma GCC target("+simd")
120#endif
121#else
122#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
123#endif
124#endif
125
126#include <arm_neon.h>
127
128/* Rounding functions require either Aarch64 instructions or libm failback */
129#if !defined(__aarch64__)
130#include <math.h>
131#endif
132
133/* "__has_builtin" can be used to query support for built-in functions
134 * provided by gcc/clang and other compilers that support it.
135 */
136#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
137/* Compatibility with gcc <= 9 */
138#if __GNUC__ <= 9
139#define __has_builtin(x) HAS##x
140#define HAS__builtin_popcount 1
141#define HAS__builtin_popcountll 1
142#else
143#define __has_builtin(x) 0
144#endif
145#endif
146
155#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
156 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
157
158/* Rounding mode macros. */
159#define _MM_FROUND_TO_NEAREST_INT 0x00
160#define _MM_FROUND_TO_NEG_INF 0x01
161#define _MM_FROUND_TO_POS_INF 0x02
162#define _MM_FROUND_TO_ZERO 0x03
163#define _MM_FROUND_CUR_DIRECTION 0x04
164#define _MM_FROUND_NO_EXC 0x08
165#define _MM_ROUND_NEAREST 0x0000
166#define _MM_ROUND_DOWN 0x2000
167#define _MM_ROUND_UP 0x4000
168#define _MM_ROUND_TOWARD_ZERO 0x6000
169
170/* indicate immediate constant argument in a given range */
171#define __constrange(a, b) const
172
173/* A few intrinsics accept traditional data types like ints or floats, but
174 * most operate on data types that are specific to SSE.
175 * If a vector type ends in d, it contains doubles, and if it does not have
176 * a suffix, it contains floats. An integer vector type can contain any type
177 * of integer, from chars to shorts to unsigned long longs.
178 */
179typedef int64x1_t __m64;
180typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
181// On ARM 32-bit architecture, the float64x2_t is not supported.
182// The data type __m128d should be represented in a different way for related
183// intrinsic conversion.
184#if defined(__aarch64__)
185typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
186#else
187typedef float32x4_t __m128d;
188#endif
189typedef int64x2_t __m128i; /* 128-bit vector containing integers */
190
191/* type-safe casting between types */
192
193#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
194#define vreinterpretq_m128_f32(x) (x)
195#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
196
197#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
198#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
199#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
200#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
201
202#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
203#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
204#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
205#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
206
207#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
208#define vreinterpretq_f32_m128(x) (x)
209#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
210
211#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
212#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
213#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
214#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
215
216#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
217#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
218#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
219#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
220
221#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
222#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
223#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
224#define vreinterpretq_m128i_s64(x) (x)
225
226#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
227#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
228#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
229#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
230
231#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
232#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
233
234#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
235#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
236#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
237#define vreinterpretq_s64_m128i(x) (x)
238
239#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
240#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
241#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
242#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
243
244#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
245#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
246#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
247#define vreinterpret_m64_s64(x) (x)
248
249#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
250#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
251#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
252#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
253
254#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
255#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
256#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
257
258#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
259#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
260#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
261#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
262
263#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
264#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
265#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
266#define vreinterpret_s64_m64(x) (x)
267
268#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
269
270#if defined(__aarch64__)
271#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
272#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
273
274#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
275
276#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
277#define vreinterpretq_m128d_f64(x) (x)
278
279#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
280
281#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
282#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
283
284#define vreinterpretq_f64_m128d(x) (x)
285#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
286#else
287#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
288#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
289
290#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
291#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
292
293#define vreinterpretq_m128d_f32(x) (x)
294
295#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
296
297#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
298#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
299
300#define vreinterpretq_f32_m128d(x) (x)
301#endif
302
303// A struct is defined in this header file called 'SIMDVec' which can be used
304// by applications which attempt to access the contents of an _m128 struct
305// directly. It is important to note that accessing the __m128 struct directly
306// is bad coding practice by Microsoft: @see:
307// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
308//
309// However, some legacy source code may try to access the contents of an __m128
310// struct directly so the developer can use the SIMDVec as an alias for it. Any
311// casting must be done manually by the developer, as you cannot cast or
312// otherwise alias the base NEON data type for intrinsic operations.
313//
314// union intended to allow direct access to an __m128 variable using the names
315// that the MSVC compiler provides. This union should really only be used when
316// trying to access the members of the vector as integer values. GCC/clang
317// allow native access to the float members through a simple array access
318// operator (in C since 4.6, in C++ since 4.8).
319//
320// Ideally direct accesses to SIMD vectors should not be used since it can cause
321// a performance hit. If it really is needed however, the original __m128
322// variable can be aliased with a pointer to this union and used to access
323// individual components. The use of this union should be hidden behind a macro
324// that is used throughout the codebase to access the members instead of always
325// declaring this type of variable.
326typedef union ALIGN_STRUCT(16) SIMDVec {
327 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
328 int8_t m128_i8[16]; // as signed 8-bit integers.
329 int16_t m128_i16[8]; // as signed 16-bit integers.
330 int32_t m128_i32[4]; // as signed 32-bit integers.
331 int64_t m128_i64[2]; // as signed 64-bit integers.
332 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
333 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
334 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
335 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
336} SIMDVec;
337
338// casting using SIMDVec
339#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
340#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
341#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
342
343// Function declaration
344// SSE
345FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
346FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
347FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
348FORCE_INLINE __m128 _mm_set_ps1(float);
349FORCE_INLINE __m128 _mm_setzero_ps(void);
350// SSE2
351FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
352FORCE_INLINE __m128i _mm_castps_si128(__m128);
353FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
354FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
355FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
356FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
357FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
358FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
359FORCE_INLINE __m128d _mm_set_pd(double, double);
360FORCE_INLINE __m128i _mm_set1_epi32(int);
361FORCE_INLINE __m128i _mm_setzero_si128();
362// SSE4.1
363FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
364FORCE_INLINE __m128 _mm_ceil_ps(__m128);
365FORCE_INLINE __m128d _mm_floor_pd(__m128d);
366FORCE_INLINE __m128 _mm_floor_ps(__m128);
367FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
368FORCE_INLINE __m128 _mm_round_ps(__m128, int);
369// SSE4.2
370FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
371
372/* Backwards compatibility for compilers with lack of specific type support */
373
374// Older gcc does not define vld1q_u8_x4 type
375#if defined(__GNUC__) && !defined(__clang__) && \
376 ((__GNUC__ <= 10 && defined(__arm__)) || \
377 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
378 (__GNUC__ <= 9 && defined(__aarch64__)))
379FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
380{
381 uint8x16x4_t ret;
382 ret.val[0] = vld1q_u8(p + 0);
383 ret.val[1] = vld1q_u8(p + 16);
384 ret.val[2] = vld1q_u8(p + 32);
385 ret.val[3] = vld1q_u8(p + 48);
386 return ret;
387}
388#else
389// Wraps vld1q_u8_x4
390FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
391{
392 return vld1q_u8_x4(p);
393}
394#endif
395
396/* Function Naming Conventions
397 * The naming convention of SSE intrinsics is straightforward. A generic SSE
398 * intrinsic function is given as follows:
399 * _mm_<name>_<data_type>
400 *
401 * The parts of this format are given as follows:
402 * 1. <name> describes the operation performed by the intrinsic
403 * 2. <data_type> identifies the data type of the function's primary arguments
404 *
405 * This last part, <data_type>, is a little complicated. It identifies the
406 * content of the input values, and can be set to any of the following values:
407 * + ps - vectors contain floats (ps stands for packed single-precision)
408 * + pd - vectors cantain doubles (pd stands for packed double-precision)
409 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
410 * signed integers
411 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
412 * unsigned integers
413 * + si128 - unspecified 128-bit vector or 256-bit vector
414 * + m128/m128i/m128d - identifies input vector types when they are different
415 * than the type of the returned vector
416 *
417 * For example, _mm_setzero_ps. The _mm implies that the function returns
418 * a 128-bit vector. The _ps at the end implies that the argument vectors
419 * contain floats.
420 *
421 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
422 * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
423 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
424 * // Set packed 8-bit integers
425 * // 128 bits, 16 chars, per 8 bits
426 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
427 * 4, 5, 12, 13, 6, 7, 14, 15);
428 * // Shuffle packed 8-bit integers
429 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
430 *
431 * Data (Number, Binary, Byte Index):
432 +------+------+-------------+------+------+-------------+
433 | 1 | 2 | 3 | 4 | Number
434 +------+------+------+------+------+------+------+------+
435 | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
436 +------+------+------+------+------+------+------+------+
437 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
438 +------+------+------+------+------+------+------+------+
439
440 +------+------+------+------+------+------+------+------+
441 | 5 | 6 | 7 | 8 | Number
442 +------+------+------+------+------+------+------+------+
443 | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
444 +------+------+------+------+------+------+------+------+
445 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
446 +------+------+------+------+------+------+------+------+
447 * Index (Byte Index):
448 +------+------+------+------+------+------+------+------+
449 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
450 +------+------+------+------+------+------+------+------+
451
452 +------+------+------+------+------+------+------+------+
453 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
454 +------+------+------+------+------+------+------+------+
455 * Result:
456 +------+------+------+------+------+------+------+------+
457 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
458 +------+------+------+------+------+------+------+------+
459 | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
460 +------+------+------+------+------+------+------+------+
461 | 256 | 2 | 5 | 6 | Number
462 +------+------+------+------+------+------+------+------+
463
464 +------+------+------+------+------+------+------+------+
465 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
466 +------+------+------+------+------+------+------+------+
467 | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
468 +------+------+------+------+------+------+------+------+
469 | 3 | 7 | 4 | 8 | Number
470 +------+------+------+------+------+------+-------------+
471 */
472
473/* Constants for use with _mm_prefetch. */
474enum _mm_hint {
475 _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
476 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
477 _MM_HINT_T1 = 2, /* load data to L2 cache only */
478 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
479 _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
480 _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
481 _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
482 _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
483};
484
// The bit field mapping to the FPCR (floating-point control register).
// bit22 and bit23 together form the RMode (rounding mode) field; the res*
// members are reserved padding so the named bits land at the right offsets.
// NOTE(review): the closing "} fpcr_bitfield;" was missing here, leaving the
// typedef unterminated — restored to match the struct's obvious intent.
typedef struct {
    uint16_t res0;
    uint8_t res1 : 6;
    uint8_t bit22 : 1;
    uint8_t bit23 : 1;
    uint8_t res2;
#if defined(__aarch64__)
    uint32_t res3; // FPCR is 64-bit on AArch64; pad the upper word
#endif
} fpcr_bitfield;
497// Takes the upper 64 bits of a and places it in the low end of the result
498// Takes the lower 64 bits of b and places it into the high end of the result.
499FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
500{
501 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
502 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
503 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
504}
505
506// takes the lower two 32-bit values from a and swaps them and places in high
507// end of result takes the higher two 32 bit values from b and swaps them and
508// places in low end of result.
509FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
510{
511 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
512 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
513 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
514}
515
516FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
517{
518 float32x2_t a21 = vget_high_f32(
519 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
520 float32x2_t b03 = vget_low_f32(
521 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
522 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
523}
524
525FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
526{
527 float32x2_t a03 = vget_low_f32(
528 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
529 float32x2_t b21 = vget_high_f32(
530 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
531 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
532}
533
534FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
535{
536 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
537 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
538 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
539}
540
541FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
542{
543 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
544 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
545 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
546}
547
548FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
549{
550 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
551 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
552 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
553}
554
555// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
556// high
557FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
558{
559 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
560 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
561 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
562}
563
564FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
565{
566 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
567 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
568 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
569}
570
571FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
572{
573 float32x2_t a22 =
574 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
575 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
576 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
577}
578
579FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
580{
581 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
582 float32x2_t b22 =
583 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
584 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
585}
586
587FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
588{
589 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
590 float32x2_t a22 =
591 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
592 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
593 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
594 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
595}
596
597FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
598{
599 float32x2_t a33 =
600 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
601 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
602 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
603}
604
605FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
606{
607 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
608 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
609 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
610 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
611 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
612}
613
614FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
615{
616 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
617 float32_t b2 = vgetq_lane_f32(b, 2);
618 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
619 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
620 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
621}
622
623FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
624{
625 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
626 float32_t b2 = vgetq_lane_f32(b, 2);
627 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
628 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
629 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
630}
631
632// Kahan summation for accurate summation of floating-point numbers.
633// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
634FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
635{
636 y -= *c;
637 float t = *sum + y;
638 *c = (t - *sum) - y;
639 *sum = t;
640}
641
642#if defined(__ARM_FEATURE_CRYPTO)
643// Wraps vmull_p64
644FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
645{
646 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
647 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
648 return vreinterpretq_u64_p128(vmull_p64(a, b));
649}
650#else // ARMv7 polyfill
651// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
652//
653// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
654// 64-bit->128-bit polynomial multiply.
655//
656// It needs some work and is somewhat slow, but it is still faster than all
657// known scalar methods.
658//
659// Algorithm adapted to C from
660// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
661// from "Fast Software Polynomial Multiplication on ARM Processors Using the
662// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
663// (https://hal.inria.fr/hal-01506572)
// 64x64 -> 128-bit carry-less (polynomial over GF(2)) multiply built from
// eight 8x8 -> 16-bit vmull_p8 multiplies, for targets without the Crypto
// extension's vmull_p64. The partial products are combined with XOR (GF(2)
// addition) and byte-rotations stand in for bit shifts by multiples of 8.
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks used to split each cross-product sum into the part that belongs
    // below the 128-bit boundary and the part that must be folded back.
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4

    // Add (XOR) cross products that carry the same shift amount.
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products into place (vextq_u8 rotation acts as a
    // byte shift here because the discarded bytes are zero).
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
755#endif // ARMv7 polyfill
756
757// C equivalent:
758// __m128i _mm_shuffle_epi32_default(__m128i a,
759// __constrange(0, 255) int imm) {
760// __m128i ret;
761// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
762// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
763// return ret;
764// }
// Generic lane shuffle: output lane i takes input lane (imm >> (2*i)) & 3.
// NOTE: imm is expanded four times, so it must be a side-effect-free
// compile-time constant in [0, 255] (see __constrange above).
#define _mm_shuffle_epi32_default(a, imm)                                   \
    __extension__({                                                         \
        int32x4_t ret;                                                      \
        ret = vmovq_n_s32(                                                  \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
            ret, 2);                                                        \
        ret = vsetq_lane_s32(                                               \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
            ret, 3);                                                        \
        vreinterpretq_m128i_s32(ret);                                       \
    })
781
782// Takes the upper 64 bits of a and places it in the low end of the result
783// Takes the lower 64 bits of a and places it into the high end of the result.
784FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
785{
786 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
787 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
788 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
789}
790
791// takes the lower two 32-bit values from a and swaps them and places in low end
792// of result takes the higher two 32 bit values from a and swaps them and places
793// in high end of result.
794FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
795{
796 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
797 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
798 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
799}
800
801// rotates the least significant 32 bits into the most significant 32 bits, and
802// shifts the rest down
803FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
804{
805 return vreinterpretq_m128i_s32(
806 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
807}
808
809// rotates the most significant 32 bits into the least significant 32 bits, and
810// shifts the rest up
811FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
812{
813 return vreinterpretq_m128i_s32(
814 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
815}
816
817// gets the lower 64 bits of a, and places it in the upper 64 bits
818// gets the lower 64 bits of a and places it in the lower 64 bits
819FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
820{
821 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
822 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
823}
824
825// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
826// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
827FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
828{
829 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
830 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
831 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
832}
833
834// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
835// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
836// places it in the lower 64 bits
837FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
838{
839 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
840 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
841}
842
843FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
844{
845 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
846 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
847 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
848}
849
850FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
851{
852 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
853 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
854 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
855}
856
857FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
858{
859 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
860 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
861 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
862}
863
// Broadcast the 32-bit lane selected by the constant imm to all four lanes.
// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
// int imm)
#if defined(__aarch64__)
#define _mm_shuffle_epi32_splat(a, imm)                          \
    __extension__({                                              \
        vreinterpretq_m128i_s32(                                 \
            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
    })
#else
// ARMv7 has no vdupq_laneq_s32: extract the lane to a scalar, then dup it.
#define _mm_shuffle_epi32_splat(a, imm)                                      \
    __extension__({                                                          \
        vreinterpretq_m128i_s32(                                             \
            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
    })
#endif
879
// NEON does not support a general purpose permute intrinsic.
// Selects four specific single-precision, floating-point values from a and b,
// based on the mask imm; the low two result lanes come from a and the high
// two from b.  imm must be a compile-time constant so the lane indices below
// fold to literals.
//
// C equivalent:
//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
//                                 __constrange(0, 255) int imm) {
//       __m128 ret;
//       ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
#define _mm_shuffle_ps_default(a, b, imm)                                  \
    __extension__({                                                        \
        float32x4_t ret;                                                   \
        ret = vmovq_n_f32(                                                 \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
            ret, 1);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
            ret, 2);                                                       \
        ret = vsetq_lane_f32(                                              \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
            ret, 3);                                                       \
        vreinterpretq_m128_f32(ret);                                       \
    })
910
// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
// by imm; the upper four 16-bit lanes are copied through unchanged.
// imm must be a compile-time constant.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
#define _mm_shufflelo_epi16_function(a, imm)                                  \
    __extension__({                                                           \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        vreinterpretq_m128i_s16(ret);                                         \
    })
930
// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
// by imm; the lower four 16-bit lanes are copied through unchanged.
// imm must be a compile-time constant.
// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
#define _mm_shufflehi_epi16_function(a, imm)                                   \
    __extension__({                                                            \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        vreinterpretq_m128i_s16(ret);                                          \
    })
950
951/* SSE */
952
953// Adds the four single-precision, floating-point values of a and b.
954//
955// r0 := a0 + b0
956// r1 := a1 + b1
957// r2 := a2 + b2
958// r3 := a3 + b3
959//
960// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
961FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
962{
963 return vreinterpretq_m128_f32(
964 vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
965}
966
967// adds the scalar single-precision floating point values of a and b.
968// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
969FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
970{
971 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
972 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
973 // the upper values in the result must be the remnants of <a>.
974 return vreinterpretq_m128_f32(vaddq_f32(a, value));
975}
976
977// Computes the bitwise AND of the four single-precision, floating-point values
978// of a and b.
979//
980// r0 := a0 & b0
981// r1 := a1 & b1
982// r2 := a2 & b2
983// r3 := a3 & b3
984//
985// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
986FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
987{
988 return vreinterpretq_m128_s32(
989 vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
990}
991
992// Computes the bitwise AND-NOT of the four single-precision, floating-point
993// values of a and b.
994//
995// r0 := ~a0 & b0
996// r1 := ~a1 & b1
997// r2 := ~a2 & b2
998// r3 := ~a3 & b3
999//
1000// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
1001FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1002{
1003 return vreinterpretq_m128_s32(
1004 vbicq_s32(vreinterpretq_s32_m128(b),
1005 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1006}
1007
1008// Average packed unsigned 16-bit integers in a and b, and store the results in
1009// dst.
1010//
1011// FOR j := 0 to 3
1012// i := j*16
1013// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
1014// ENDFOR
1015//
1016// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
1017FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1018{
1019 return vreinterpret_m64_u16(
1020 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1021}
1022
1023// Average packed unsigned 8-bit integers in a and b, and store the results in
1024// dst.
1025//
1026// FOR j := 0 to 7
1027// i := j*8
1028// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
1029// ENDFOR
1030//
1031// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
1032FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1033{
1034 return vreinterpret_m64_u8(
1035 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1036}
1037
1038// Compares for equality.
1039// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
1040FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
1041{
1042 return vreinterpretq_m128_u32(
1043 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1044}
1045
1046// Compares for equality.
1047// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
1048FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
1049{
1050 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1051}
1052
1053// Compares for greater than or equal.
1054// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
1055FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
1056{
1057 return vreinterpretq_m128_u32(
1058 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1059}
1060
1061// Compares for greater than or equal.
1062// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
1063FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
1064{
1065 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1066}
1067
1068// Compares for greater than.
1069//
1070// r0 := (a0 > b0) ? 0xffffffff : 0x0
1071// r1 := (a1 > b1) ? 0xffffffff : 0x0
1072// r2 := (a2 > b2) ? 0xffffffff : 0x0
1073// r3 := (a3 > b3) ? 0xffffffff : 0x0
1074//
1075// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
1076FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
1077{
1078 return vreinterpretq_m128_u32(
1079 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1080}
1081
1082// Compares for greater than.
1083// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
1084FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
1085{
1086 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1087}
1088
1089// Compares for less than or equal.
1090//
1091// r0 := (a0 <= b0) ? 0xffffffff : 0x0
1092// r1 := (a1 <= b1) ? 0xffffffff : 0x0
1093// r2 := (a2 <= b2) ? 0xffffffff : 0x0
1094// r3 := (a3 <= b3) ? 0xffffffff : 0x0
1095//
1096// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
1097FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
1098{
1099 return vreinterpretq_m128_u32(
1100 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1101}
1102
1103// Compares for less than or equal.
1104// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
1105FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
1106{
1107 return _mm_move_ss(a, _mm_cmple_ps(a, b));
1108}
1109
1110// Compares for less than
1111// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
1112FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
1113{
1114 return vreinterpretq_m128_u32(
1115 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1116}
1117
1118// Compares for less than
1119// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
1120FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
1121{
1122 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1123}
1124
1125// Compares for inequality.
1126// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
1127FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
1128{
1129 return vreinterpretq_m128_u32(vmvnq_u32(
1130 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1131}
1132
1133// Compares for inequality.
1134// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
1135FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
1136{
1137 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1138}
1139
1140// Compares for not greater than or equal.
1141// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
1142FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
1143{
1144 return _mm_cmplt_ps(a, b);
1145}
1146
1147// Compares for not greater than or equal.
1148// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
1149FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
1150{
1151 return _mm_cmplt_ss(a, b);
1152}
1153
1154// Compares for not greater than.
1155// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
1156FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
1157{
1158 return _mm_cmple_ps(a, b);
1159}
1160
1161// Compares for not greater than.
1162// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1163FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
1164{
1165 return _mm_cmple_ss(a, b);
1166}
1167
1168// Compares for not less than or equal.
1169// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
1170FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
1171{
1172 return _mm_cmpgt_ps(a, b);
1173}
1174
1175// Compares for not less than or equal.
1176// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1177FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
1178{
1179 return _mm_cmpgt_ss(a, b);
1180}
1181
1182// Compares for not less than.
1183// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
1184FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
1185{
1186 return _mm_cmpge_ps(a, b);
1187}
1188
1189// Compares for not less than.
1190// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
1191FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
1192{
1193 return _mm_cmpge_ss(a, b);
1194}
1195
1196// Compares the four 32-bit floats in a and b to check if any values are NaN.
1197// Ordered compare between each value returns true for "orderable" and false for
1198// "not orderable" (NaN).
1199// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
1200// also:
1201// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1202// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1203FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
1204{
1205 // Note: NEON does not have ordered compare builtin
1206 // Need to compare a eq a and b eq b to check for NaN
1207 // Do AND of results to get final
1208 uint32x4_t ceqaa =
1209 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1210 uint32x4_t ceqbb =
1211 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1212 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1213}
1214
1215// Compares for ordered.
1216// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
1217FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
1218{
1219 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1220}
1221
1222// Compares for unordered.
1223// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
1224FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
1225{
1226 uint32x4_t f32a =
1227 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1228 uint32x4_t f32b =
1229 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1230 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1231}
1232
1233// Compares for unordered.
1234// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
1235FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
1236{
1237 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1238}
1239
1240// Compares the lower single-precision floating point scalar values of a and b
1241// using an equality operation. :
1242// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
1243FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1244{
1245 // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
1246 // vreinterpretq_f32_m128(b)), 0);
1247 uint32x4_t a_not_nan =
1248 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1249 uint32x4_t b_not_nan =
1250 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1251 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1252 uint32x4_t a_eq_b =
1253 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1254 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) & 0x1;
1255}
1256
1257// Compares the lower single-precision floating point scalar values of a and b
1258// using a greater than or equal operation. :
1259// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
1260FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1261{
1262 // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
1263 // vreinterpretq_f32_m128(b)), 0);
1264 uint32x4_t a_not_nan =
1265 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1266 uint32x4_t b_not_nan =
1267 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1268 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1269 uint32x4_t a_ge_b =
1270 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1271 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) & 0x1;
1272}
1273
1274// Compares the lower single-precision floating point scalar values of a and b
1275// using a greater than operation. :
1276// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
1277FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1278{
1279 // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
1280 // vreinterpretq_f32_m128(b)), 0);
1281 uint32x4_t a_not_nan =
1282 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1283 uint32x4_t b_not_nan =
1284 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1285 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1286 uint32x4_t a_gt_b =
1287 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1288 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) & 0x1;
1289}
1290
1291// Compares the lower single-precision floating point scalar values of a and b
1292// using a less than or equal operation. :
1293// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
1294FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1295{
1296 // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
1297 // vreinterpretq_f32_m128(b)), 0);
1298 uint32x4_t a_not_nan =
1299 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1300 uint32x4_t b_not_nan =
1301 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1302 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1303 uint32x4_t a_le_b =
1304 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1305 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) & 0x1;
1306}
1307
1308// Compares the lower single-precision floating point scalar values of a and b
1309// using a less than operation. :
1310// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
1311// note!! The documentation on MSDN is incorrect! If either of the values is a
1312// NAN the docs say you will get a one, but in fact, it will return a zero!!
1313FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1314{
1315 uint32x4_t a_not_nan =
1316 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1317 uint32x4_t b_not_nan =
1318 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1319 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1320 uint32x4_t a_lt_b =
1321 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1322 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) & 0x1;
1323}
1324
1325// Compares the lower single-precision floating point scalar values of a and b
1326// using an inequality operation. :
1327// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
1328FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1329{
1330 // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
1331 // vreinterpretq_f32_m128(b)), 0);
1332 uint32x4_t a_not_nan =
1333 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1334 uint32x4_t b_not_nan =
1335 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1336 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1337 uint32x4_t a_neq_b = vmvnq_u32(
1338 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1339 return vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) & 0x1;
1340}
1341
1342// Convert packed signed 32-bit integers in b to packed single-precision
1343// (32-bit) floating-point elements, store the results in the lower 2 elements
1344// of dst, and copy the upper 2 packed elements from a to the upper elements of
1345// dst.
1346//
1347// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1348// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1349// dst[95:64] := a[95:64]
1350// dst[127:96] := a[127:96]
1351//
1352// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
1353FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1354{
1355 return vreinterpretq_m128_f32(
1356 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1357 vget_high_f32(vreinterpretq_f32_m128(a))));
1358}
1359
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
//
//   FOR j := 0 to 1
//      i := 32*j
//      dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
{
#if defined(__aarch64__)
    // vrndiq_f32 rounds using the current FP rounding mode; the result is
    // integral, so vcvtnq's subsequent to-nearest conversion is exact.
    return vreinterpret_m64_s32(
        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
#else
    // ARMv7: round in the current direction via _mm_round_ps first; the
    // truncating vcvt then converts the already-integral values exactly.
    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
#endif
}
1379
1380// Convert the signed 32-bit integer b to a single-precision (32-bit)
1381// floating-point element, store the result in the lower element of dst, and
1382// copy the upper 3 packed elements from a to the upper elements of dst.
1383//
1384// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1385// dst[127:32] := a[127:32]
1386//
1387// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
1388FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
1389{
1390 return vreinterpretq_m128_f32(
1391 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1392}
1393
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
{
#if defined(__aarch64__)
    // vrndiq_f32 rounds using the current FP rounding mode; vcvtnq then
    // converts the already-integral value exactly.
    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                          0);
#else
    // ARMv7: round via _mm_round_ps first, then a plain (truncating) C cast
    // of the already-integral lane-0 value.
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int32_t) data;
#endif
}
1408
1409// Convert packed 16-bit integers in a to packed single-precision (32-bit)
1410// floating-point elements, and store the results in dst.
1411//
1412// FOR j := 0 to 3
1413// i := j*16
1414// m := j*32
1415// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
1416// ENDFOR
1417//
1418// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
1419FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1420{
1421 return vreinterpretq_m128_f32(
1422 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1423}
1424
1425// Convert packed 32-bit integers in b to packed single-precision (32-bit)
1426// floating-point elements, store the results in the lower 2 elements of dst,
1427// and copy the upper 2 packed elements from a to the upper elements of dst.
1428//
1429// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1430// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1431// dst[95:64] := a[95:64]
1432// dst[127:96] := a[127:96]
1433//
1434// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
1435FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1436{
1437 return vreinterpretq_m128_f32(
1438 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1439 vget_high_f32(vreinterpretq_f32_m128(a))));
1440}
1441
1442// Convert packed signed 32-bit integers in a to packed single-precision
1443// (32-bit) floating-point elements, store the results in the lower 2 elements
1444// of dst, then covert the packed signed 32-bit integers in b to
1445// single-precision (32-bit) floating-point element, and store the results in
1446// the upper 2 elements of dst.
1447//
1448// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
1449// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
1450// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
1451// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
1452//
1453// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
1454FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1455{
1456 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1457 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1458}
1459
1460// Convert the lower packed 8-bit integers in a to packed single-precision
1461// (32-bit) floating-point elements, and store the results in dst.
1462//
1463// FOR j := 0 to 3
1464// i := j*8
1465// m := j*32
1466// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
1467// ENDFOR
1468//
1469// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
1470FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1471{
1472 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1473 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1474}
1475
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 16-bit integers, and store the results in dst. Note: this intrinsic
// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
// 0x7FFFFFFF.
//
//   FOR j := 0 to 3
//     i := 16*j
//     k := 32*j
//     IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
//       dst[i+15:i] := 0x7FFF
//     ELSE
//       dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
//     FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
{
    const __m128 i16Min = _mm_set_ps1(INT16_MIN);
    const __m128 i16Max = _mm_set_ps1(INT16_MAX);
    const __m128 i32Max = _mm_set_ps1(INT32_MAX);
    // Lanes in [INT16_MAX, INT32_MAX] saturate high to INT16_MAX.
    const __m128i maxMask = _mm_castps_si128(
        _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max)));
    // Lanes strictly inside (INT16_MIN, INT16_MAX) convert normally.
    const __m128i betweenMask = _mm_castps_si128(
        _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max)));
    // Everything else (<= INT16_MIN, > INT32_MAX, or NaN — where both
    // compares above are false) maps to INT16_MIN.
    const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
                                            _mm_setzero_si128());
    // Select per-lane between the two saturation constants and the converted
    // value; exactly one of the three masks is set in each lane.
    __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX));
    __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN));
    __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
    __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
    // Narrow the saturated 32-bit lanes down to 16 bits.
    return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32)));
}
1509
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
//
//   FOR j := 0 to 1
//     i := 32*j
//     dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
// Thin alias: _mm_cvt_ps2pi performs the conversion (and honours the current
// rounding direction).
#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1520
1521// Convert packed single-precision (32-bit) floating-point elements in a to
1522// packed 8-bit integers, and store the results in lower 4 elements of dst.
1523// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1524// between 0x7F and 0x7FFFFFFF.
1525//
1526// FOR j := 0 to 3
1527// i := 8*j
1528// k := 32*j
1529// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
1530// dst[i+7:i] := 0x7F
1531// ELSE
1532// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
1533// FI
1534// ENDFOR
1535//
1536// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
1537FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
1538{
1539 const __m128 i8Min = _mm_set_ps1(INT8_MIN);
1540 const __m128 i8Max = _mm_set_ps1(INT8_MAX);
1541 const __m128 i32Max = _mm_set_ps1(INT32_MAX);
1542 const __m128i maxMask = _mm_castps_si128(
1543 _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max)));
1544 const __m128i betweenMask = _mm_castps_si128(
1545 _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max)));
1546 const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1547 _mm_setzero_si128());
1548 __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX));
1549 __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN));
1550 __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1551 __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1552 int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32));
1553 int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
1554 unsigned int bitMask[2] = {static_cast<unsigned int>(0xFFFFFFFF), 0};
1555 int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
1556
1557 return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0)));
1558}
1559
// Convert packed unsigned 16-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
//
// FOR j := 0 to 3
// i := j*16
// m := j*32
// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
{
    // Widen each u16 lane to u32, then convert lane-wise to float.
    uint16x4_t halves = vreinterpret_u16_m64(a);
    uint32x4_t widened = vmovl_u16(halves);
    return vreinterpretq_m128_f32(vcvtq_f32_u32(widened));
}
1575
// Convert the lower packed unsigned 8-bit integers in a to packed
// single-precision (32-bit) floating-point elements, and store the results in
// dst.
//
// FOR j := 0 to 3
// i := j*8
// m := j*32
// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
{
    // Two widening steps (u8 -> u16 -> u32) on the low four bytes, then
    // a lane-wise unsigned-to-float conversion.
    uint8x8_t bytes = vreinterpret_u8_m64(a);
    uint16x4_t low4 = vget_low_u16(vmovl_u8(bytes));
    return vreinterpretq_m128_f32(vcvtq_f32_u32(vmovl_u16(low4)));
}
1592
// Convert the signed 32-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
//
// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
// dst[127:32] := a[127:32]
//
// Alias: implemented in terms of _mm_cvt_si2ss.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1602
// Convert the signed 64-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
//
// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
{
    // Scalar int64 -> float conversion, inserted into lane 0 of a.
    float32x4_t upper = vreinterpretq_f32_m128(a);
    return vreinterpretq_m128_f32(vsetq_lane_f32((float) b, upper, 0));
}
1616
// Copy the lower single-precision (32-bit) floating-point element of a to dst.
//
// dst[31:0] := a[31:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
{
    // Extract lane 0 as a scalar float.
    float32x4_t v = vreinterpretq_f32_m128(a);
    return vgetq_lane_f32(v, 0);
}
1626
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
//
// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
//
// Alias: implemented in terms of _mm_cvt_ss2si.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1634
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
//
// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
{
#if defined(__aarch64__)
    // vrndiq_f32 rounds using the current FPCR rounding mode, matching the
    // MXCSR-directed rounding of CVTSS2SI; the cast then truncates the
    // already-integral value.
    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
#else
    // ARMv7 has no current-mode rounding intrinsic; round via _mm_round_ps.
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int64_t) data;
#endif
}
1651
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
//
// FOR j := 0 to 1
// i := 32*j
// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
{
    // vcvtq_s32_f32 truncates toward zero; only the low two lanes are kept.
    int32x4_t truncated = vcvtq_s32_f32(vreinterpretq_f32_m128(a));
    return vreinterpret_m64_s32(vget_low_s32(truncated));
}
1666
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
//
// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
{
    // Truncating float->int conversion of the whole vector, then extract
    // lane 0.
    int32x4_t truncated = vcvtq_s32_f32(vreinterpretq_f32_m128(a));
    return vgetq_lane_s32(truncated, 0);
}
1677
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
//
// FOR j := 0 to 1
// i := 32*j
// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
// ENDFOR
//
// Alias: implemented in terms of _mm_cvtt_ps2pi.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1688
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
//
// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
//
// Alias: implemented in terms of _mm_cvtt_ss2si.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1696
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
//
// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
{
    // The C cast from float to int64_t truncates toward zero, matching
    // CVTTSS2SI semantics for in-range values.
    float32_t lane0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    return (int64_t) lane0;
}
1707
// Divides the four single-precision, floating-point values of a and b.
//
// r0 := a0 / b0
// r1 := a1 / b1
// r2 := a2 / b2
// r3 := a3 / b3
//
// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
    // AArch64 provides a native vector divide.
    return vreinterpretq_m128_f32(
        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    // ARMv7: estimate 1/b, refine with one Newton-Raphson step, then multiply.
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
#if SSE2NEON_PRECISE_DIV
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
#endif
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
#endif
}
1731
// Divides the scalar single-precision floating point value of a by b.
// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
{
    // Divide all four lanes, then splice the low quotient back into a.
    float32_t quotient =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(quotient, vreinterpretq_f32_m128(a), 0));
}
1741
// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst. The u16 lane is zero-extended into the int32_t
// result. imm must be a compile-time constant in [0, 3].
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
#define _mm_extract_pi16(a, imm) \
    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1747
// Free aligned memory that was allocated with _mm_malloc.
// _mm_malloc delegates to malloc/posix_memalign, so plain free() releases it;
// passing NULL is a harmless no-op.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
FORCE_INLINE void _mm_free(void *addr)
{
    free(addr);
}
1754
// Macro: Get the rounding mode bits from the MXCSR control and status register.
// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
{
    // Overlay the raw FPCR/FPSCR word with the project's bitfield view so the
    // two rounding-mode bits can be read symbolically.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    // Bits [23:22] of FPCR/FPSCR encode RMode; map the four encodings onto the
    // SSE rounding-mode constants (see _MM_SET_ROUNDING_MODE for the inverse).
    if (r.field.bit22) {
        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
    } else {
        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
    }
}
1782
// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8. imm must be a compile-time constant in [0, 3].
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
#define _mm_insert_pi16(a, b, imm) \
    __extension__({ \
        vreinterpret_m64_s16( \
            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
    })
1791
// Loads four single-precision, floating-point values from a 16-byte-aligned
// address (alignment is irrelevant on NEON).
// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p)
{
    float32x4_t loaded = vld1q_f32(p);
    return vreinterpretq_m128_f32(loaded);
}
1798
// Load a single-precision (32-bit) floating-point element from memory into all
// elements of dst.
//
// dst[31:0] := MEM[mem_addr+31:mem_addr]
// dst[63:32] := MEM[mem_addr+31:mem_addr]
// dst[95:64] := MEM[mem_addr+31:mem_addr]
// dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// Alias: identical to _mm_load1_ps.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
#define _mm_load_ps1 _mm_load1_ps
1809
// Loads a single-precision, floating-point value into the low word and
// clears the upper three words.
// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p)
{
    // Start from an all-zero vector and insert *p into lane 0.
    float32x4_t zeroed = vdupq_n_f32(0);
    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, zeroed, 0));
}
1817
// Loads a single single-precision, floating-point value, copying it into all
// four words.
// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
{
    // vld1q_dup_f32 loads one float and broadcasts it to every lane.
    float32x4_t broadcast = vld1q_dup_f32(p);
    return vreinterpretq_m128_f32(broadcast);
}
1825
// Sets the upper two single-precision, floating-point values with 64
// bits of data loaded from the address p; the lower two values are passed
// through from a.
//
// r0 := a0
// r1 := a1
// r2 := *p0
// r3 := *p1
//
// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
{
    // Wrap `a` with vreinterpretq_f32_m128 (previously applied vget_low_f32
    // to the raw __m128): required for strict-type builds and consistent with
    // the rest of this file.
    return vreinterpretq_m128_f32(
        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128(a)),
                     vld1_f32((const float32_t *) p)));
}
1841
// Sets the lower two single-precision, floating-point values with 64
// bits of data loaded from the address p; the upper two values are passed
// through from a.
//
// Return Value
// r0 := *p0
// r1 := *p1
// r2 := a2
// r3 := a3
//
// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
{
    // Wrap `a` with vreinterpretq_f32_m128 (previously applied vget_high_f32
    // to the raw __m128): required for strict-type builds and consistent with
    // the rest of this file.
    return vreinterpretq_m128_f32(
        vcombine_f32(vld1_f32((const float32_t *) p),
                     vget_high_f32(vreinterpretq_f32_m128(a))));
}
1858
// Load 4 single-precision (32-bit) floating-point elements from memory into dst
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
//
// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
// dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
{
    // Reverse within each 64-bit half, then rotate by two lanes to swap the
    // halves: {a,b,c,d} -> {b,a,d,c} -> {d,c,b,a}.
    float32x4_t halvesSwapped = vrev64q_f32(vld1q_f32(p));
    return vreinterpretq_m128_f32(vextq_f32(halvesSwapped, halvesSwapped, 2));
}
1874
// Loads four single-precision, floating-point values from an unaligned
// address.
// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
{
    // NEON loads have no alignment requirement, so this is identical to
    // _mm_load_ps.
    float32x4_t loaded = vld1q_f32(p);
    return vreinterpretq_m128_f32(loaded);
}
1883
// Load unaligned 16-bit integer from memory into the first element of dst.
//
// dst[15:0] := MEM[mem_addr+15:mem_addr]
// dst[MAX:16] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
{
    // Read one int16 and drop it into lane 0 of a zeroed vector.
    int16_t word = *(const int16_t *) p;
    return vreinterpretq_m128i_s16(vsetq_lane_s16(word, vdupq_n_s16(0), 0));
}
1895
// Load unaligned 64-bit integer from memory into the first element of dst.
//
// dst[63:0] := MEM[mem_addr+63:mem_addr]
// dst[MAX:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
{
    // Low half from memory, high half zero.
    int64x1_t low = vld1_s64((const int64_t *) p);
    int64x1_t high = vdup_n_s64(0);
    return vreinterpretq_m128i_s64(vcombine_s64(low, high));
}
1907
1908// Allocate aligned blocks of memory.
1909// https://software.intel.com/en-us/
1910// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
1911FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
1912{
1913 void *ptr;
1914 if (align == 1)
1915 return malloc(size);
1916 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1917 align = sizeof(void *);
1918 if (!posix_memalign(&ptr, align, size))
1919 return ptr;
1920 return NULL;
1921}
1922
// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
{
    // Arithmetic shift replicates each byte's sign bit across the byte,
    // producing an all-ones/all-zeros select mask.
    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
    // Load only the 8 destination bytes. The previous implementation used
    // _mm_load_ps here, reading 16 bytes and overrunning the 8-byte region.
    int8x8_t b = vld1_s8((const int8_t *) mem_addr);
    // Bitwise select: take a's byte where the mask bit was set, else keep the
    // existing memory byte.
    int8x8_t masked =
        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), b);
    vst1_s8((int8_t *) mem_addr, masked);
}
1936
// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// Alias: implemented in terms of _mm_maskmove_si64.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
1942
// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
//
// FOR j := 0 to 3
// i := j*16
// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
{
    int16x4_t lhs = vreinterpret_s16_m64(a);
    int16x4_t rhs = vreinterpret_s16_m64(b);
    return vreinterpret_m64_s16(vmax_s16(lhs, rhs));
}
1957
// Computes the maximums of the four single-precision, floating-point values of
// a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
{
#if SSE2NEON_PRECISE_MINMAX
    // Select form matches MAXPS ordering for NaN operands: when the compare is
    // false (including NaN), b is returned.
    float32x4_t _a = vreinterpretq_f32_m128(a);
    float32x4_t _b = vreinterpretq_f32_m128(b);
    // Wrap the result in vreinterpretq_m128_f32 (previously returned a raw
    // float32x4_t), matching file convention and strict-type builds.
    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_b, _a), _a, _b));
#else
    return vreinterpretq_m128_f32(
        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#endif
}
1972
// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
//
// FOR j := 0 to 7
// i := j*8
// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
{
    uint8x8_t lhs = vreinterpret_u8_m64(a);
    uint8x8_t rhs = vreinterpret_u8_m64(b);
    return vreinterpret_m64_u8(vmax_u8(lhs, rhs));
}
1987
// Computes the maximum of the two lower scalar single-precision floating point
// values of a and b.
// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
{
    // Wrap the _mm_max_ps result in vreinterpretq_f32_m128 before extracting
    // the lane (previously passed the raw __m128), consistent with _mm_div_ss
    // and required for strict-type builds.
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_max_ps(a, b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}
1997
// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
//
// FOR j := 0 to 3
// i := j*16
// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
{
    int16x4_t lhs = vreinterpret_s16_m64(a);
    int16x4_t rhs = vreinterpret_s16_m64(b);
    return vreinterpret_m64_s16(vmin_s16(lhs, rhs));
}
2012
// Computes the minima of the four single-precision, floating-point values of a
// and b.
// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
{
#if SSE2NEON_PRECISE_MINMAX
    // Select form matches MINPS ordering for NaN operands: when the compare is
    // false (including NaN), b is returned.
    float32x4_t _a = vreinterpretq_f32_m128(a);
    float32x4_t _b = vreinterpretq_f32_m128(b);
    // Wrap the result in vreinterpretq_m128_f32 (previously returned a raw
    // float32x4_t), matching file convention and strict-type builds.
    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
#else
    return vreinterpretq_m128_f32(
        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#endif
}
2027
// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
//
// FOR j := 0 to 7
// i := j*8
// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
{
    uint8x8_t lhs = vreinterpret_u8_m64(a);
    uint8x8_t rhs = vreinterpret_u8_m64(b);
    return vreinterpret_m64_u8(vmin_u8(lhs, rhs));
}
2042
// Computes the minimum of the two lower scalar single-precision floating point
// values of a and b.
// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
{
    // Wrap the _mm_min_ps result in vreinterpretq_f32_m128 before extracting
    // the lane (previously passed the raw __m128), consistent with _mm_div_ss
    // and required for strict-type builds.
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_min_ps(a, b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}
2052
// Sets the low word to the single-precision, floating-point value of b.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
{
    // Pull lane 0 out of b and write it into lane 0 of a.
    float32_t low = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(low, vreinterpretq_f32_m128(a), 0));
}
2061
// Moves the upper two values of B into the lower two values of A.
//
// r3 := a3
// r2 := a2
// r1 := b3
// r0 := b2
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
{
    // Result = { high(B), high(A) }.
    float32x2_t upperA = vget_high_f32(vreinterpretq_f32_m128(__A));
    float32x2_t upperB = vget_high_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(upperB, upperA));
}
2074
// Moves the lower two values of B into the upper two values of A.
//
// r3 := b1
// r2 := b0
// r1 := a1
// r0 := a0
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
{
    // Result = { low(A), low(B) }.
    float32x2_t lowerA = vget_low_f32(vreinterpretq_f32_m128(__A));
    float32x2_t lowerB = vget_low_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(lowerA, lowerB));
}
2087
// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
{
    uint8x8_t input = vreinterpret_u8_m64(a);
#if defined(__aarch64__)
    // Isolate each sign bit (-> 0/1 per byte), shift byte j's bit into
    // position j, then horizontally add to form the 8-bit mask.
    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
    uint8x8_t tmp = vshr_n_u8(input, 7);
    return vaddv_u8(vshl_u8(tmp, shift));
#else
    // Refer the implementation of `_mm_movemask_epi8`
    // Pairwise shift-and-accumulate: each step packs twice as many sign bits
    // into the low byte of a wider lane (u16 -> u32), until lanes 0 and 4 of
    // the byte view hold bits 0-3 and 4-7 respectively.
    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
    uint32x2_t paired16 =
        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
    uint8x8_t paired32 =
        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
#endif
}
2108
// NEON does not provide this method
// Creates a 4-bit mask from the most significant bits of the four
// single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_ps(__m128 a)
{
    uint32x4_t input = vreinterpretq_u32_m128(a);
#if defined(__aarch64__)
    // Isolate each sign bit, shift lane j's bit into position j, then
    // horizontally add to form the 4-bit mask.
    static const int32x4_t shift = {0, 1, 2, 3};
    uint32x4_t tmp = vshrq_n_u32(input, 31);
    return vaddvq_u32(vshlq_u32(tmp, shift));
#else
    // Uses the exact same method as _mm_movemask_epi8, see that for details.
    // Shift out everything but the sign bits with a 32-bit unsigned shift
    // right.
    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
    // Merge the two pairs together with a 64-bit unsigned shift right + add.
    // Byte 0 now holds mask bits 0-1 and byte 8 holds bits 2-3.
    uint8x16_t paired =
        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
    // Extract the result.
    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
#endif
}
2132
// Multiplies the four single-precision, floating-point values of a and b.
//
// r0 := a0 * b0
// r1 := a1 * b1
// r2 := a2 * b2
// r3 := a3 * b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
{
    float32x4_t product =
        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(product);
}
2146
// Multiply the lower single-precision (32-bit) floating-point element in a and
// b, store the result in the lower element of dst, and copy the upper 3 packed
// elements from a to the upper elements of dst.
//
// dst[31:0] := a[31:0] * b[31:0]
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
{
    // Full vector multiply, then keep only lane 0; upper lanes come from a.
    return _mm_move_ss(a, _mm_mul_ps(a, b));
}
2159
// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
{
    // Widening multiply to 32 bits, then narrow back keeping the high halves.
    uint32x4_t wide =
        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b));
    return vreinterpret_m64_u16(vshrn_n_u32(wide, 16));
}
2169
// Computes the bitwise OR of the four single-precision, floating-point values
// of a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
{
    // Bitwise ops have no float form on NEON; view the lanes as s32.
    int32x4_t lhs = vreinterpretq_s32_m128(a);
    int32x4_t rhs = vreinterpretq_s32_m128(b);
    return vreinterpretq_m128_s32(vorrq_s32(lhs, rhs));
}
2178
// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
//
// FOR j := 0 to 7
// i := j*8
// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
// ENDFOR
//
// Alias of _mm_avg_pu8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
#define _m_pavgb(a, b) _mm_avg_pu8(a, b)

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
//
// FOR j := 0 to 3
// i := j*16
// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
// ENDFOR
//
// Alias of _mm_avg_pu16.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
#define _m_pavgw(a, b) _mm_avg_pu16(a, b)

// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst. Alias of _mm_extract_pi16.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8. Alias of _mm_insert_pi16.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)

// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst. Alias of _mm_max_pi16.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst. Alias of _mm_max_pu8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
#define _m_pmaxub(a, b) _mm_max_pu8(a, b)

// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst. Alias of _mm_min_pi16.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst. Alias of _mm_min_pu8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8(a, b)

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst. Alias of _mm_movemask_pi8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
#define _m_pmovmskb(a) _mm_movemask_pi8(a)

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst. Alias of _mm_mulhi_pu16.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2241
2242// Loads one cache line of data from address p to a location closer to the
2243// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
2244FORCE_INLINE void _mm_prefetch(const void *p, int i)
2245{
2246 (void) i;
2247 __builtin_prefetch(p);
2248}
2249
// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst. Alias of _mm_sad_pu8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)

// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst. Alias of _mm_shuffle_pi16.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2261
// Compute the approximate reciprocal of packed single-precision (32-bit)
// floating-point elements in a, and store the results in dst. The maximum
// relative error for this approximation is less than 1.5*2^-12.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
{
    // Hardware estimate refined by one Newton-Raphson step
    // (vrecpsq computes 2 - x*recip, the NR correction factor).
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#if SSE2NEON_PRECISE_DIV
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#endif
    return vreinterpretq_m128_f32(recip);
}
2276
// Compute the approximate reciprocal of the lower single-precision (32-bit)
// floating-point element in a, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst. The
// maximum relative error for this approximation is less than 1.5*2^-12.
//
// dst[31:0] := (1.0 / a[31:0])
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
{
    // Full-vector reciprocal, then keep only lane 0; upper lanes come from a.
    return _mm_move_ss(a, _mm_rcp_ps(a));
}
2290
// Computes the approximations of the reciprocal square roots of the four
// single-precision floating point values of in.
// The current precision is 1% error.
// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
    // Hardware estimate; vrsqrtsq supplies the Newton-Raphson correction
    // factor (3 - x*out^2)/2 for each refinement step below.
    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
#if SSE2NEON_PRECISE_SQRT
    // Two additional Newton-Raphson iterations for accuracy
    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
#endif
    return vreinterpretq_m128_f32(out);
}
2307
// Compute the approximate reciprocal square root of the lower single-precision
// (32-bit) floating-point element in a, store the result in the lower element
// of dst, and copy the upper 3 packed elements from a to the upper elements of
// dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
{
    // Add the vreinterpret wrappers used throughout this file (previously the
    // raw __m128 values were passed to vgetq/vsetq_lane_f32, which breaks
    // strict-type builds).
    return vreinterpretq_m128_f32(vsetq_lane_f32(
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_rsqrt_ps(in)), 0),
        vreinterpretq_f32_m128(in), 0));
}
2317
// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
{
    // Per-byte |a - b|, then three pairwise-widening adds: 8xu8 -> 1xu64.
    uint8x8_t absdiff =
        vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b));
    uint64x1_t total = vpaddl_u32(vpaddl_u16(vpaddl_u8(absdiff)));
    // The sum is at most 8*255, so it fits in the u16 placed in lane 0;
    // the remaining lanes are zeroed.
    return vreinterpret_m64_u16(
        vset_lane_u16((uint16_t) vget_lane_u64(total, 0), vdup_n_u16(0), 0));
}
2330
// Sets the four single-precision, floating-point values to the four inputs.
// Note the SSE argument order: w becomes the highest lane, x the lowest.
// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
{
    float ALIGN_STRUCT(16) values[4] = {x, y, z, w};
    return vreinterpretq_m128_f32(vld1q_f32(values));
}
2338
// Sets the four single-precision, floating-point values to w (broadcast).
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps1(float _w)
{
    float32x4_t splat = vdupq_n_f32(_w);
    return vreinterpretq_m128_f32(splat);
}
2345
// Macro: Set the rounding mode bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The rounding mode may contain any of
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// _MM_ROUND_TOWARD_ZERO
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
    // Read-modify-write of FPCR/FPSCR: only the RMode bits [23:22] are
    // changed; all other control/status bits are preserved.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    // Map the SSE rounding constant onto the ARM RMode encoding
    // (00=nearest, 01=+inf/up, 10=-inf/down, 11=toward zero).
    switch (rounding) {
    case _MM_ROUND_TOWARD_ZERO:
        r.field.bit22 = 1;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_DOWN:
        r.field.bit22 = 0;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_UP:
        r.field.bit22 = 1;
        r.field.bit23 = 0;
        break;
    default: //_MM_ROUND_NEAREST
        r.field.bit22 = 0;
        r.field.bit23 = 0;
    }

    // NOTE(review): the write passes the whole union `r` rather than
    // `r.value`; this relies on the union fitting in one register — verify
    // against the compilers this project targets.
#if defined(__aarch64__)
    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
#else
    asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}
2392
// Copy single-precision (32-bit) floating-point element a to the lower element
// of dst, and zero the upper 3 elements.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
FORCE_INLINE __m128 _mm_set_ss(float a)
{
    // Insert a into lane 0 of an all-zero vector; avoids the stack round-trip
    // of a temporary array.
    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
}
2401
2402// Sets the four single-precision, floating-point values to w.
2403//
2404// r0 := r1 := r2 := r3 := w
2405//
2406// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2407FORCE_INLINE __m128 _mm_set1_ps(float _w)
2408{
2409 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2410}
2411
// Set the MXCSR control and status register.
// NOTE(review): only the rounding-mode bits of `a` are honoured here;
// exception masks/flags and flush-to-zero are not translated.
FORCE_INLINE void _mm_setcsr(unsigned int a)
{
    _MM_SET_ROUNDING_MODE(a);
}
2416
2417// Sets the four single-precision, floating-point values to the four inputs in
2418// reverse order.
2419// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
2420FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2421{
2422 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2423 return vreinterpretq_m128_f32(vld1q_f32(data));
2424}
2425
2426// Clears the four single-precision, floating-point values.
2427// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
2428FORCE_INLINE __m128 _mm_setzero_ps(void)
2429{
2430 return vreinterpretq_m128_f32(vdupq_n_f32(0));
2431}
2432
2433// Shuffle 16-bit integers in a using the control in imm8, and store the results
2434// in dst.
2435// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
#if __has_builtin(__builtin_shufflevector)
// BUGFIX: `imm` is now fully parenthesized. The previous expansion used
// bare `imm & 0x3`, which mis-parses for compound immediate expressions
// (the fallback branch below already parenthesized correctly).
#define _mm_shuffle_pi16(a, imm)                                             \
    __extension__({                                                          \
        vreinterpret_m64_s16(__builtin_shufflevector(                        \
            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a),                \
            ((imm) & 0x3), (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3),       \
            (((imm) >> 6) & 0x3)));                                          \
    })
#else
#define _mm_shuffle_pi16(a, imm)                                             \
    __extension__({                                                          \
        int16x4_t ret;                                                       \
        ret =                                                                \
            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
        ret = vset_lane_s16(                                                 \
            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
            1);                                                              \
        ret = vset_lane_s16(                                                 \
            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
            2);                                                              \
        ret = vset_lane_s16(                                                 \
            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
            3);                                                              \
        vreinterpret_m64_s16(ret);                                           \
    })
#endif
2461
2462// Guarantees that every preceding store is globally visible before any
2463// subsequent store.
2464// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
FORCE_INLINE void _mm_sfence(void)
{
    // __sync_synchronize() emits a full memory barrier, which is stronger
    // than x86's store-only fence but preserves its ordering guarantee.
    __sync_synchronize();
}
2469
// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
// int imm)
// Select four floats: result lanes 0-1 are chosen from `a` and lanes 2-3
// from `b`, each by a 2-bit field of the immediate. `imm` must be a
// compile-time constant.
#if __has_builtin(__builtin_shufflevector)
// Clang path: a single generic shuffle; source-b lane indices are offset
// by 4 because __builtin_shufflevector indexes both inputs consecutively.
#define _mm_shuffle_ps(a, b, imm)                                \
    __extension__({                                              \
        float32x4_t _input1 = vreinterpretq_f32_m128(a);         \
        float32x4_t _input2 = vreinterpretq_f32_m128(b);         \
        float32x4_t _shuf = __builtin_shufflevector(             \
            _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
            (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128_f32(_shuf);                           \
    })
#else // generic
// Fallback: dispatch the common _MM_SHUFFLE patterns to specialized
// helpers; any other immediate takes the generic lane-by-lane
// _mm_shuffle_ps_default path.
#define _mm_shuffle_ps(a, b, imm)                          \
    __extension__({                                        \
        __m128 ret;                                        \
        switch (imm) {                                     \
        case _MM_SHUFFLE(1, 0, 3, 2):                      \
            ret = _mm_shuffle_ps_1032((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 3, 0, 1):                      \
            ret = _mm_shuffle_ps_2301((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 3, 2, 1):                      \
            ret = _mm_shuffle_ps_0321((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 1, 0, 3):                      \
            ret = _mm_shuffle_ps_2103((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(1, 0, 1, 0):                      \
            ret = _mm_movelh_ps((a), (b));                 \
            break;                                         \
        case _MM_SHUFFLE(1, 0, 0, 1):                      \
            ret = _mm_shuffle_ps_1001((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 1, 0, 1):                      \
            ret = _mm_shuffle_ps_0101((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(3, 2, 1, 0):                      \
            ret = _mm_shuffle_ps_3210((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 0, 1, 1):                      \
            ret = _mm_shuffle_ps_0011((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(0, 0, 2, 2):                      \
            ret = _mm_shuffle_ps_0022((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 2, 0, 0):                      \
            ret = _mm_shuffle_ps_2200((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(3, 2, 0, 2):                      \
            ret = _mm_shuffle_ps_3202((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(3, 2, 3, 2):                      \
            ret = _mm_movehl_ps((b), (a));                 \
            break;                                         \
        case _MM_SHUFFLE(1, 1, 3, 3):                      \
            ret = _mm_shuffle_ps_1133((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 0, 1, 0):                      \
            ret = _mm_shuffle_ps_2010((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 0, 0, 1):                      \
            ret = _mm_shuffle_ps_2001((a), (b));           \
            break;                                         \
        case _MM_SHUFFLE(2, 0, 3, 2):                      \
            ret = _mm_shuffle_ps_2032((a), (b));           \
            break;                                         \
        default:                                           \
            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
            break;                                         \
        }                                                  \
        ret;                                               \
    })
#endif
2545
2546// Computes the approximations of square roots of the four single-precision,
2547// floating-point values of a. First computes reciprocal square roots and then
2548// reciprocals of the four values.
2549//
2550// r0 := sqrt(a0)
2551// r1 := sqrt(a1)
2552// r2 := sqrt(a2)
2553// r3 := sqrt(a3)
2554//
2555// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
{
#if SSE2NEON_PRECISE_SQRT
    // Precise path: reciprocal-sqrt estimate refined by Newton-Raphson,
    // then multiplied back by `in` to obtain sqrt.
    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));

    // Test for vrsqrteq_f32(0) -> positive infinity case.
    // Change to zero, so that s * 1/sqrt(s) result is zero too.
    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
    const uint32x4_t div_by_zero =
        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
    recip = vreinterpretq_f32_u32(
        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));

    // Two additional Newton-Raphson iterations for accuracy
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);

    // sqrt(s) = s * 1/sqrt(s)
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
#elif defined(__aarch64__)
    // AArch64 has an exact vector sqrt instruction.
    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
#else
    // Fast armv7 path: estimate only (reciprocal of reciprocal-sqrt),
    // noticeably less accurate than the branches above.
    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
    float32x4_t sq = vrecpeq_f32(recipsq);
    return vreinterpretq_m128_f32(sq);
#endif
}
2587
2588// Computes the approximation of the square root of the scalar single-precision
2589// floating point value of in.
2590// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
2591FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2592{
2593 float32_t value =
2594 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2595 return vreinterpretq_m128_f32(
2596 vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2597}
2598
2599// Stores four single-precision, floating-point values.
2600// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
2601FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2602{
2603 vst1q_f32(p, vreinterpretq_f32_m128(a));
2604}
2605
2606// Store the lower single-precision (32-bit) floating-point element from a into
2607// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2608// boundary or a general-protection exception may be generated.
2609//
2610// MEM[mem_addr+31:mem_addr] := a[31:0]
2611// MEM[mem_addr+63:mem_addr+32] := a[31:0]
2612// MEM[mem_addr+95:mem_addr+64] := a[31:0]
2613// MEM[mem_addr+127:mem_addr+96] := a[31:0]
2614//
2615// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
2616FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2617{
2618 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2619 vst1q_f32(p, vdupq_n_f32(a0));
2620}
2621
2622// Stores the lower single - precision, floating - point value.
2623// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
2624FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2625{
2626 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2627}
2628
2629// Store the lower single-precision (32-bit) floating-point element from a into
2630// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2631// boundary or a general-protection exception may be generated.
2632//
2633// MEM[mem_addr+31:mem_addr] := a[31:0]
2634// MEM[mem_addr+63:mem_addr+32] := a[31:0]
2635// MEM[mem_addr+95:mem_addr+64] := a[31:0]
2636// MEM[mem_addr+127:mem_addr+96] := a[31:0]
2637//
2638// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
#define _mm_store1_ps _mm_store_ps1  // alias: identical behavior per Intel docs
2640
2641// Stores the upper two single-precision, floating-point values of a to the
2642// address p.
2643//
2644// *p0 := a2
2645// *p1 := a3
2646//
2647// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
2648FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2649{
2650 *p = vreinterpret_m64_f32(vget_high_f32(a));
2651}
2652
2653// Stores the lower two single-precision floating point values of a to the
2654// address p.
2655//
2656// *p0 := a0
2657// *p1 := a1
2658//
2659// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
2660FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2661{
2662 *p = vreinterpret_m64_f32(vget_low_f32(a));
2663}
2664
2665// Store 4 single-precision (32-bit) floating-point elements from a into memory
2666// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2667// general-protection exception may be generated.
2668//
2669// MEM[mem_addr+31:mem_addr] := a[127:96]
2670// MEM[mem_addr+63:mem_addr+32] := a[95:64]
2671// MEM[mem_addr+95:mem_addr+64] := a[63:32]
2672// MEM[mem_addr+127:mem_addr+96] := a[31:0]
2673//
2674// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
{
    // Reverse within each 64-bit pair first: {a1, a0, a3, a2} ...
    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
    // ... then rotate by two lanes to complete the reversal: {a3, a2, a1, a0}.
    float32x4_t rev = vextq_f32(tmp, tmp, 2);
    vst1q_f32(p, rev);
}
2681
2682// Stores four single-precision, floating-point values.
2683// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
2684FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2685{
2686 vst1q_f32(p, vreinterpretq_f32_m128(a));
2687}
2688
2689// Stores 16-bits of integer data a at the address p.
2690// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
2691FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2692{
2693 vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2694}
2695
2696// Stores 64-bits of integer data a at the address p.
2697// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
2698FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2699{
2700 vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2701}
2702
2703// Store 64-bits of integer data from a into memory using a non-temporal memory
2704// hint.
2705// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
2706FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2707{
2708 vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2709}
2710
2711// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2712// point elements) from a into memory using a non-temporal memory hint.
2713// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
{
#if __has_builtin(__builtin_nontemporal_store)
    // Compilers with this builtin can emit a genuine cache-bypassing store.
    __builtin_nontemporal_store(a, (float32x4_t *) p);
#else
    // Otherwise fall back to a regular store; the non-temporal hint is lost.
    vst1q_f32(p, vreinterpretq_f32_m128(a));
#endif
}
2722
2723// Subtracts the four single-precision, floating-point values of a and b.
2724//
2725// r0 := a0 - b0
2726// r1 := a1 - b1
2727// r2 := a2 - b2
2728// r3 := a3 - b3
2729//
2730// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2731FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2732{
2733 return vreinterpretq_m128_f32(
2734 vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2735}
2736
2737// Subtract the lower single-precision (32-bit) floating-point element in b from
2738// the lower single-precision (32-bit) floating-point element in a, store the
2739// result in the lower element of dst, and copy the upper 3 packed elements from
2740// a to the upper elements of dst.
2741//
2742// dst[31:0] := a[31:0] - b[31:0]
2743// dst[127:32] := a[127:32]
2744//
2745// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2746FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2747{
2748 return _mm_move_ss(a, _mm_sub_ps(a, b));
2749}
2750
2751// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
2752// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
2753// transposed matrix in these vectors (row0 now contains column 0, etc.).
2754// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
// In-place 4x4 transpose: each vtrnq interleaves a pair of rows into 2x2
// transposed sub-blocks; recombining their low/high halves yields the
// columns of the original matrix.
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
    do {                                                  \
        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
                            vget_low_f32(ROW23.val[0]));  \
        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
                            vget_low_f32(ROW23.val[1]));  \
        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
                            vget_high_f32(ROW23.val[0])); \
        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
                            vget_high_f32(ROW23.val[1])); \
    } while (0)
2768
// Per the documentation these unordered ('u') comparison intrinsics behave
// the same as the non-'u' versions, so alias them here.
// NOTE(review): true ucomi* differs from comi* only in QNaN signalling
// behavior; that distinction is not modelled by this translation.
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomige_ss _mm_comige_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomineq_ss _mm_comineq_ss
2777
2778// Return vector of type __m128i with undefined elements.
2779// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
FORCE_INLINE __m128i _mm_undefined_si128(void)
{
    // Deliberately returns an uninitialized vector: the intrinsic's contract
    // is "undefined contents". The pragmas suppress the -Wuninitialized
    // warning this would otherwise raise.
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128i a;
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
2792
2793// Return vector of type __m128 with undefined elements.
2794// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
{
    // Deliberately returns an uninitialized vector (contract: undefined
    // contents); the pragmas silence the -Wuninitialized warning.
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128 a;
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
2807
2808// Selects and interleaves the upper two single-precision, floating-point values
2809// from a and b.
2810//
2811// r0 := a2
2812// r1 := b2
2813// r2 := a3
2814// r3 := b3
2815//
2816// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__)
    // Single-instruction interleave of the upper halves: {a2, b2, a3, b3}.
    return vreinterpretq_m128_f32(
        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    // Armv7 has no full-width zip2: take each high 64-bit half, zip them
    // pairwise, and recombine into a 128-bit result.
    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}
2829
2830// Selects and interleaves the lower two single-precision, floating-point values
2831// from a and b.
2832//
2833// r0 := a0
2834// r1 := b0
2835// r2 := a1
2836// r3 := b1
2837//
2838// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__)
    // Single-instruction interleave of the lower halves: {a0, b0, a1, b1}.
    return vreinterpretq_m128_f32(
        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    // Armv7 has no full-width zip1: take each low 64-bit half, zip them
    // pairwise, and recombine into a 128-bit result.
    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}
2851
2852// Computes bitwise EXOR (exclusive-or) of the four single-precision,
2853// floating-point values of a and b.
2854// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
2855FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2856{
2857 return vreinterpretq_m128_s32(
2858 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2859}
2860
2861/* SSE2 */
2862
2863// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2864// unsigned 16-bit integers in b.
2865// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2866FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2867{
2868 return vreinterpretq_m128i_s16(
2869 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2870}
2871
2872// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2873// unsigned 32-bit integers in b.
2874//
2875// r0 := a0 + b0
2876// r1 := a1 + b1
2877// r2 := a2 + b2
2878// r3 := a3 + b3
2879//
2880// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2881FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2882{
2883 return vreinterpretq_m128i_s32(
2884 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2885}
2886
// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
2889// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2890FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2891{
2892 return vreinterpretq_m128i_s64(
2893 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2894}
2895
2896// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2897// unsigned 8-bit integers in b.
2898// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
2899FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2900{
2901 return vreinterpretq_m128i_s8(
2902 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2903}
2904
2905// Add packed double-precision (64-bit) floating-point elements in a and b, and
2906// store the results in dst.
2907// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
2908FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2909{
2910#if defined(__aarch64__)
2911 return vreinterpretq_m128d_f64(
2912 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2913#else
2914 double *da = (double *) &a;
2915 double *db = (double *) &b;
2916 double c[2];
2917 c[0] = da[0] + db[0];
2918 c[1] = da[1] + db[1];
2919 return vld1q_f32((float32_t *) c);
2920#endif
2921}
2922
2923// Add the lower double-precision (64-bit) floating-point element in a and b,
2924// store the result in the lower element of dst, and copy the upper element from
2925// a to the upper element of dst.
2926//
2927// dst[63:0] := a[63:0] + b[63:0]
2928// dst[127:64] := a[127:64]
2929//
2930// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
2931FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
2932{
2933#if defined(__aarch64__)
2934 return _mm_move_sd(a, _mm_add_pd(a, b));
2935#else
2936 double *da = (double *) &a;
2937 double *db = (double *) &b;
2938 double c[2];
2939 c[0] = da[0] + db[0];
2940 c[1] = da[1];
2941 return vld1q_f32((float32_t *) c);
2942#endif
2943}
2944
2945// Add 64-bit integers a and b, and store the result in dst.
2946//
2947// dst[63:0] := a[63:0] + b[63:0]
2948//
2949// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
2950FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2951{
2952 return vreinterpret_m64_s64(
2953 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2954}
2955
2956// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
2957// and saturates.
2958//
2959// r0 := SignedSaturate(a0 + b0)
2960// r1 := SignedSaturate(a1 + b1)
2961// ...
2962// r7 := SignedSaturate(a7 + b7)
2963//
2964// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
2965FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2966{
2967 return vreinterpretq_m128i_s16(
2968 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2969}
2970
2971// Add packed signed 8-bit integers in a and b using saturation, and store the
2972// results in dst.
2973//
2974// FOR j := 0 to 15
2975// i := j*8
2976// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
2977// ENDFOR
2978//
2979// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
2980FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
2981{
2982 return vreinterpretq_m128i_s8(
2983 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2984}
2985
2986// Add packed unsigned 16-bit integers in a and b using saturation, and store
2987// the results in dst.
2988// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
2989FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2990{
2991 return vreinterpretq_m128i_u16(
2992 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2993}
2994
// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
// b and saturates.
2997// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
2998FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
2999{
3000 return vreinterpretq_m128i_u8(
3001 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3002}
3003
3004// Compute the bitwise AND of packed double-precision (64-bit) floating-point
3005// elements in a and b, and store the results in dst.
3006//
3007// FOR j := 0 to 1
3008// i := j*64
3009// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
3010// ENDFOR
3011//
3012// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
3013FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
3014{
3015 return vreinterpretq_m128d_s64(
3016 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
3017}
3018
3019// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
3020// b.
3021//
3022// r := a & b
3023//
3024// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
3025FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3026{
3027 return vreinterpretq_m128i_s32(
3028 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3029}
3030
3031// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
3032// elements in a and then AND with b, and store the results in dst.
3033//
3034// FOR j := 0 to 1
3035// i := j*64
3036// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
3037// ENDFOR
3038//
3039// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
3040FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
3041{
3042 // *NOTE* argument swap
3043 return vreinterpretq_m128d_s64(
3044 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
3045}
3046
3047// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
3048// 128-bit value in a.
3049//
3050// r := (~a) & b
3051//
3052// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
3053FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
3054{
3055 return vreinterpretq_m128i_s32(
3056 vbicq_s32(vreinterpretq_s32_m128i(b),
3057 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
3058}
3059
3060// Computes the average of the 8 unsigned 16-bit integers in a and the 8
3061// unsigned 16-bit integers in b and rounds.
3062//
3063// r0 := (a0 + b0) / 2
3064// r1 := (a1 + b1) / 2
3065// ...
3066// r7 := (a7 + b7) / 2
3067//
3068// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
3069FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3070{
3071 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3072 vreinterpretq_u16_m128i(b));
3073}
3074
3075// Computes the average of the 16 unsigned 8-bit integers in a and the 16
3076// unsigned 8-bit integers in b and rounds.
3077//
3078// r0 := (a0 + b0) / 2
3079// r1 := (a1 + b1) / 2
3080// ...
3081// r15 := (a15 + b15) / 2
3082//
3083// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
3084FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3085{
3086 return vreinterpretq_m128i_u8(
3087 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3088}
3089
3090// Shift a left by imm8 bytes while shifting in zeros, and store the results in
3091// dst.
3092// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)  // byte-shift alias
3094
3095// Shift a right by imm8 bytes while shifting in zeros, and store the results in
3096// dst.
3097// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)  // byte-shift alias
3099
3100// Cast vector of type __m128d to type __m128. This intrinsic is only used for
3101// compilation and does not generate any instructions, thus it has zero latency.
3102// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
3103FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3104{
3105 return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3106}
3107
3108// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3109// compilation and does not generate any instructions, thus it has zero latency.
3110// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
3111FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3112{
3113 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3114}
3115
3116// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3117// compilation and does not generate any instructions, thus it has zero latency.
3118// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
3119FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3120{
3121 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3122}
3123
3124// Applies a type cast to reinterpret four 32-bit floating point values passed
3125// in as a 128-bit parameter as packed 32-bit integers.
3126// https://msdn.microsoft.com/en-us/library/bb514099.aspx
3127FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3128{
3129 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3130}
3131
3132// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3133// compilation and does not generate any instructions, thus it has zero latency.
3134// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
{
#if defined(__aarch64__)
    // AArch64 has a native float64x2_t to reinterpret into.
    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
#else
    // Armv7 lacks float64x2_t; __m128d is represented via f32 lanes here.
    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
#endif
}
3143
3144// Applies a type cast to reinterpret four 32-bit integers passed in as a
3145// 128-bit parameter as packed 32-bit floating point values.
3146// https://msdn.microsoft.com/en-us/library/bb514029.aspx
3147FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3148{
3149 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3150}
3151
3152// Cache line containing p is flushed and invalidated from all caches in the
3153// coherency domain. :
3154// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
FORCE_INLINE void _mm_clflush(void const *p)
{
    (void) p;
    // no corollary for Neon?
    // NOTE(review): deliberately a no-op -- portable user-space cache-line
    // maintenance is not generally available on Arm.
}
3160
3161// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3162// unsigned 16-bit integers in b for equality.
3163// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3164FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3165{
3166 return vreinterpretq_m128i_u16(
3167 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3168}
3169
3170// Compare packed 32-bit integers in a and b for equality, and store the results
3171// in dst
3172FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3173{
3174 return vreinterpretq_m128i_u32(
3175 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3176}
3177
3178// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3179// unsigned 8-bit integers in b for equality.
3180// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3181FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3182{
3183 return vreinterpretq_m128i_u8(
3184 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3185}
3186
3187// Compare packed double-precision (64-bit) floating-point elements in a and b
3188// for equality, and store the results in dst.
3189// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_u64(
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    // Armv7: compare the doubles as pairs of 32-bit words; a 64-bit lane is
    // equal only if both of its 32-bit halves match, so AND each half with
    // its vrev64-swapped partner to spread the combined result over the lane.
    // NOTE(review): this is a bitwise comparison, so +0.0 vs -0.0 and
    // NaN==NaN behave differently from a true floating-point compare.
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
#endif
}
3203
3204// Compare the lower double-precision (64-bit) floating-point elements in a and
3205// b for equality, store the result in the lower element of dst, and copy the
3206// upper element from a to the upper element of dst.
3207// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
3208FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3209{
3210 return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3211}
3212
3213// Compare packed double-precision (64-bit) floating-point elements in a and b
3214// for greater-than-or-equal, and store the results in dst.
3215// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_u64(
        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // Armv7 fallback: extract each 64-bit half as an integer bit pattern,
    // reinterpret it as a double, and compare on the scalar FPU; each lane
    // of the result is all-ones (true) or all-zeros (false).
    // NOTE(review): the (double *) punning of a uint64_t relies on
    // compiler-tolerated type aliasing -- confirm, or switch to memcpy if
    // strict-aliasing issues surface.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3233
3234// Compare the lower double-precision (64-bit) floating-point elements in a and
3235// b for greater-than-or-equal, store the result in the lower element of dst,
3236// and copy the upper element from a to the upper element of dst.
3237// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
#else
    // expand "_mm_cmpge_pd()" to reduce unnecessary operations:
    // compare only lane 0 (mask result) and pass lane 1 of a through
    // unchanged as raw bits.
    // NOTE(review): same (double *) type-punning caveat as _mm_cmpge_pd.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3254
3255// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3256// in b for greater than.
3257//
3258// r0 := (a0 > b0) ? 0xffff : 0x0
3259// r1 := (a1 > b1) ? 0xffff : 0x0
3260// ...
3261// r7 := (a7 > b7) ? 0xffff : 0x0
3262//
3263// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
3264FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3265{
3266 return vreinterpretq_m128i_u16(
3267 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3268}
3269
3270// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3271// in b for greater than.
3272// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
3273FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3274{
3275 return vreinterpretq_m128i_u32(
3276 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3277}
3278
3279// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3280// in b for greater than.
3281//
3282// r0 := (a0 > b0) ? 0xff : 0x0
3283// r1 := (a1 > b1) ? 0xff : 0x0
3284// ...
3285// r15 := (a15 > b15) ? 0xff : 0x0
3286//
3287// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
3288FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3289{
3290 return vreinterpretq_m128i_u8(
3291 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3292}
3293
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_u64(
        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7 fallback: scalar double compare per lane; the (double *) casts
    // type-pun the raw 64-bit lane back to double (file-wide idiom).
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3314
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for greater-than, store the result in the lower element of dst, and copy
// the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
#else
    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
    // Only the low lane is compared; a's high lane passes through unchanged.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3335
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than-or-equal, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_u64(
        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7 fallback: scalar double compare per lane; the (double *) casts
    // type-pun the raw 64-bit lane back to double (file-wide idiom).
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3356
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for less-than-or-equal, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_cmple_pd(a, b));
#else
    // expand "_mm_cmple_pd()" to reduce unnecessary operations
    // Only the low lane is compared; a's high lane passes through unchanged.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3377
3378// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3379// in b for less than.
3380//
3381// r0 := (a0 < b0) ? 0xffff : 0x0
3382// r1 := (a1 < b1) ? 0xffff : 0x0
3383// ...
3384// r7 := (a7 < b7) ? 0xffff : 0x0
3385//
3386// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
3387FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3388{
3389 return vreinterpretq_m128i_u16(
3390 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3391}
3392
3393
3394// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3395// in b for less than.
3396// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
3397FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3398{
3399 return vreinterpretq_m128i_u32(
3400 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3401}
3402
3403// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3404// in b for lesser than.
3405// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
3406FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3407{
3408 return vreinterpretq_m128i_u8(
3409 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3410}
3411
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_u64(
        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7 fallback: scalar double compare per lane; the (double *) casts
    // type-pun the raw 64-bit lane back to double (file-wide idiom).
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3432
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for less-than, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
#else
    // Only the low lane is compared; a's high lane passes through unchanged.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3452
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-equal, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
#else
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    // A 64-bit lane is "equal" only when both of its 32-bit halves match, so
    // AND each half with its 64-bit-swapped neighbour, then invert.
    // NOTE(review): this path compares raw bit patterns, not floating-point
    // values, so it disagrees with SSE for +0.0 vs -0.0 and for NaN
    // operands — TODO confirm callers do not depend on those cases.
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
#endif
}
3469
3470// Compare the lower double-precision (64-bit) floating-point elements in a and
3471// b for not-equal, store the result in the lower element of dst, and copy the
3472// upper element from a to the upper element of dst.
3473// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
3474FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3475{
3476 return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3477}
3478
// NOTE(review): each _mm_cmpn*_{pd,sd} below is aliased to the inverse
// ordered comparison. On x86 the "not-..." predicates also report true for
// unordered (NaN) operands, whereas these ordered aliases report false in
// that case — TODO confirm whether any caller relies on NaN inputs here.

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than-or-equal, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
#define _mm_cmpnge_pd(a, b) _mm_cmplt_pd(a, b)

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-greater-than-or-equal, store the result in the lower element of
// dst, and copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
#define _mm_cmpnge_sd(a, b) _mm_cmplt_sd(a, b)

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd
#define _mm_cmpngt_pd(a, b) _mm_cmple_pd(a, b)

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-greater-than, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
#define _mm_cmpngt_sd(a, b) _mm_cmple_sd(a, b)

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-less-than-or-equal, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
#define _mm_cmpnle_pd(a, b) _mm_cmpgt_pd(a, b)

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-less-than-or-equal, store the result in the lower element of dst,
// and copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
#define _mm_cmpnle_sd(a, b) _mm_cmpgt_sd(a, b)

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-less-than, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
#define _mm_cmpnlt_pd(a, b) _mm_cmpge_pd(a, b)

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-less-than, store the result in the lower element of dst, and copy
// the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
#define _mm_cmpnlt_sd(a, b) _mm_cmpge_sd(a, b)
3522
// Compare packed double-precision (64-bit) floating-point elements in a and b
// to see if neither is NaN, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    // Excluding NaNs, any two floating point numbers can be compared.
    // A NaN never compares equal to itself, so x == x is an "is not NaN"
    // test; a lane is ordered only when both inputs pass it.
    uint64x2_t not_nan_a =
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
    uint64x2_t not_nan_b =
        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
#else
    // ARMv7 fallback: same self-equality NaN test done in scalar code; the
    // (double *) casts type-pun the raw 64-bit lane back to double.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? ~UINT64_C(0)
               : UINT64_C(0);
    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
            (*(double *) &b1) == (*(double *) &b1))
               ? ~UINT64_C(0)
               : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3553
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b to see if neither is NaN, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
#else
    // Scalar "ordered" test on the low lane only (x == x is false iff x is
    // NaN); a's high lane passes through unchanged.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? ~UINT64_C(0)
               : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3576
// Compare packed double-precision (64-bit) floating-point elements in a and b
// to see if either is NaN, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    // Two NaNs are not equal in comparison operation.
    // Inverse of _mm_cmpord_pd: compute "both lanes are non-NaN" masks and
    // bitwise-NOT the AND of them.
    uint64x2_t not_nan_a =
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
    uint64x2_t not_nan_b =
        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_s32(
        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
#else
    // ARMv7 fallback: scalar self-equality NaN test per lane, with the
    // mask polarity inverted relative to _mm_cmpord_pd.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? UINT64_C(0)
               : ~UINT64_C(0);
    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
            (*(double *) &b1) == (*(double *) &b1))
               ? UINT64_C(0)
               : ~UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3608
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b to see if either is NaN, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
#else
    // Scalar "unordered" test on the low lane only (x == x is false iff x is
    // NaN); a's high lane passes through unchanged.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? UINT64_C(0)
               : ~UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3631
3632// Compare the lower double-precision (64-bit) floating-point element in a and b
3633// for greater-than-or-equal, and return the boolean result (0 or 1).
3634// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
3635FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3636{
3637#if defined(__aarch64__)
3638 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3639#else
3640 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3641 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3642
3643 return (*(double *) &a0 >= *(double *) &b0);
3644#endif
3645}
3646
3647// Compare the lower double-precision (64-bit) floating-point element in a and b
3648// for greater-than, and return the boolean result (0 or 1).
3649// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
3650FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3651{
3652#if defined(__aarch64__)
3653 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3654#else
3655 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3656 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3657
3658 return (*(double *) &a0 > *(double *) &b0);
3659#endif
3660}
3661
3662// Compare the lower double-precision (64-bit) floating-point element in a and b
3663// for less-than-or-equal, and return the boolean result (0 or 1).
3664// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
3665FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3666{
3667#if defined(__aarch64__)
3668 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3669#else
3670 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3671 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3672
3673 return (*(double *) &a0 <= *(double *) &b0);
3674#endif
3675}
3676
3677// Compare the lower double-precision (64-bit) floating-point element in a and b
3678// for less-than, and return the boolean result (0 or 1).
3679// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
3680FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3681{
3682#if defined(__aarch64__)
3683 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3684#else
3685 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3686 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3687
3688 return (*(double *) &a0 < *(double *) &b0);
3689#endif
3690}
3691
3692// Compare the lower double-precision (64-bit) floating-point element in a and b
3693// for equality, and return the boolean result (0 or 1).
3694// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
3695FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3696{
3697#if defined(__aarch64__)
3698 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3699#else
3700 uint32x4_t a_not_nan =
3701 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3702 uint32x4_t b_not_nan =
3703 vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3704 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3705 uint32x4_t a_eq_b =
3706 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3707 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3708 vreinterpretq_u64_u32(a_eq_b));
3709 return !!vgetq_lane_u64(and_results, 0);
3710#endif
3711}
3712
3713// Compare the lower double-precision (64-bit) floating-point element in a and b
3714// for not-equal, and return the boolean result (0 or 1).
3715// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
3716FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3717{
3718#if defined(__aarch64__)
3719 return !vgetq_lane_u64(vceqq_f64(a, b), 0);
3720#else
3721 // FIXME we should handle NaN condition here
3722 uint32x4_t a_eq_b =
3723 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3724 return !vgetq_lane_u64(vreinterpretq_u64_u32(a_eq_b), 0);
3725#endif
3726}
3727
// Convert packed signed 32-bit integers in a to packed double-precision
// (64-bit) floating-point elements, and store the results in dst.
//
//   FOR j := 0 to 1
//     i := j*32
//     m := j*64
//     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
{
#if defined(__aarch64__)
    // Widen the low two s32 lanes to s64, then convert to f64.
    return vreinterpretq_m128d_f64(
        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
#else
    // ARMv7 has no f64 vector conversion: convert the two low lanes in
    // scalar code and repack.
    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}
3749
3750// Converts the four signed 32-bit integer values of a to single-precision,
3751// floating-point values
3752// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
3753FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3754{
3755 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3756}
3757
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
//
//   FOR j := 0 to 1
//      i := 32*j
//      k := 64*j
//      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
{
    // Round per the current rounding mode first, so the (int32_t) casts
    // below only truncate an already-integral value. The (double *) access
    // into the vector is the header's established type-pun idiom.
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double d0 = ((double *) &rnd)[0];
    double d1 = ((double *) &rnd)[1];
    // Upper two lanes of the result are zeroed, as on x86.
    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
}
3775
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
//
//   FOR j := 0 to 1
//      i := 32*j
//      k := 64*j
//      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
{
    // Same as _mm_cvtpd_epi32 but the two results are packed into a 64-bit
    // MMX-style vector instead of the low half of a 128-bit one.
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double d0 = ((double *) &rnd)[0];
    double d1 = ((double *) &rnd)[1];
    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
    return vreinterpret_m64_s32(vld1_s32(data));
}
3794
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed single-precision (32-bit) floating-point elements, and store the
// results in dst.
//
//   FOR j := 0 to 1
//     i := 32*j
//     k := 64*j
//     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
//   ENDFOR
//   dst[127:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
{
#if defined(__aarch64__)
    // Narrow the two f64 lanes to f32, zero the upper half.
    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
#else
    // ARMv7 fallback: scalar narrowing via the header's (double *) type-pun
    // idiom; upper two lanes are zeroed as on x86.
    float a0 = (float) ((double *) &a)[0];
    float a1 = (float) ((double *) &a)[1];
    return _mm_set_ps(0, 0, a1, a0);
#endif
}
3818
// Convert packed signed 32-bit integers in a to packed double-precision
// (64-bit) floating-point elements, and store the results in dst.
//
//   FOR j := 0 to 1
//     i := j*32
//     m := j*64
//     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
{
#if defined(__aarch64__)
    // Widen the two s32 lanes to s64, then convert to f64.
    return vreinterpretq_m128d_f64(
        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
#else
    // ARMv7 has no f64 vector conversion: convert each lane in scalar code.
    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}
3840
3841// Converts the four single-precision, floating-point values of a to signed
3842// 32-bit integer values.
3843//
3844// r0 := (int) a0
3845// r1 := (int) a1
3846// r2 := (int) a2
3847// r3 := (int) a3
3848//
3849// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
3850// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
3851// does not support! It is supported on ARMv8-A however.
3852FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3853{
3854#if defined(__aarch64__)
3855 switch (_MM_GET_ROUNDING_MODE()) {
3856 case _MM_ROUND_NEAREST:
3857 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3858 case _MM_ROUND_DOWN:
3859 return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
3860 case _MM_ROUND_UP:
3861 return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
3862 default: // _MM_ROUND_TOWARD_ZERO
3863 return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
3864 }
3865#else
3866 float *f = (float *) &a;
3867 switch (_MM_GET_ROUNDING_MODE()) {
3868 case _MM_ROUND_NEAREST: {
3869 uint32x4_t signmask = vdupq_n_u32(0x80000000);
3870 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
3871 vdupq_n_f32(0.5f)); /* +/- 0.5 */
3872 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3873 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
3874 int32x4_t r_trunc = vcvtq_s32_f32(
3875 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
3876 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
3877 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
3878 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
3879 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
3880 float32x4_t delta = vsubq_f32(
3881 vreinterpretq_f32_m128(a),
3882 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
3883 uint32x4_t is_delta_half =
3884 vceqq_f32(delta, half); /* delta == +/- 0.5 */
3885 return vreinterpretq_m128i_s32(
3886 vbslq_s32(is_delta_half, r_even, r_normal));
3887 }
3888 case _MM_ROUND_DOWN:
3889 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
3890 floorf(f[0]));
3891 case _MM_ROUND_UP:
3892 return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
3893 ceilf(f[0]));
3894 default: // _MM_ROUND_TOWARD_ZERO
3895 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
3896 (int32_t) f[0]);
3897 }
3898#endif
3899}
3900
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed double-precision (64-bit) floating-point elements, and store the
// results in dst.
//
//   FOR j := 0 to 1
//     i := 64*j
//     k := 32*j
//     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
{
#if defined(__aarch64__)
    // Widen the low two f32 lanes to f64.
    return vreinterpretq_m128d_f64(
        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
#else
    // ARMv7 has no f64 vector conversion: widen the two low lanes in
    // scalar code and repack.
    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}
3923
// Copy the lower double-precision (64-bit) floating-point element of a to dst.
//
//   dst[63:0] := a[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
{
#if defined(__aarch64__)
    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
#else
    // ARMv7: read the low lane via the header's (double *) type-pun idiom.
    return ((double *) &a)[0];
#endif
}
3937
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
//
//   dst[31:0] := Convert_FP64_To_Int32(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
{
#if defined(__aarch64__)
    // vrndiq_f64 rounds using the current FP rounding mode, matching the
    // SSE behavior of cvtsd2si.
    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
#else
    // Round per the current rounding mode first, then truncate the
    // already-integral value.
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double ret = ((double *) &rnd)[0];
    return (int32_t) ret;
#endif
}
3954
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
//
//   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
{
#if defined(__aarch64__)
    // vrndiq_f64 rounds using the current FP rounding mode, matching the
    // SSE behavior of cvtsd2si.
    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
#else
    // Round per the current rounding mode first, then truncate the
    // already-integral value.
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double ret = ((double *) &rnd)[0];
    return (int64_t) ret;
#endif
}
3971
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
//
//   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
// Alias: identical behavior to _mm_cvtsd_si64 (x86 defines both names).
#define _mm_cvtsd_si64x _mm_cvtsd_si64
3979
// Convert the lower double-precision (64-bit) floating-point element in b to a
// single-precision (32-bit) floating-point element, store the result in the
// lower element of dst, and copy the upper 3 packed elements from a to the
// upper elements of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
{
#if defined(__aarch64__)
    // Narrow b's low f64 lane to f32 and insert it into a's lane 0.
    return vreinterpretq_m128_f32(vsetq_lane_f32(
        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
        vreinterpretq_f32_m128(a), 0));
#else
    // ARMv7: read b's low lane via the header's (double *) type-pun idiom.
    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
                                                 vreinterpretq_f32_m128(a), 0));
#endif
}
3996
3997// Copy the lower 32-bit integer in a to dst.
3998//
3999// dst[31:0] := a[31:0]
4000//
4001// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
4002FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4003{
4004 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4005}
4006
4007// Copy the lower 64-bit integer in a to dst.
4008//
4009// dst[63:0] := a[63:0]
4010//
4011// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
4012FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4013{
4014 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4015}
4016
// Copy the lower 64-bit integer in a to dst.
// Alias: identical behavior to _mm_cvtsi128_si64 (x86 defines both names).
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4020
// Convert the signed 32-bit integer b to a double-precision (64-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
#else
    // ARMv7: convert in scalar code, then bit-copy the double into the low
    // 64-bit lane via the header's pointer type-pun idiom.
    double bf = (double) b;
    return vreinterpretq_m128d_s64(
        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
#endif
}
4036
// Copy the lower 64-bit integer in a to dst.
//
// dst[63:0] := a[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
// NOTE(review): duplicate of the identical #define earlier in this file.
// Harmless — an identical macro redefinition is legal C — but it could be
// removed to avoid confusion.
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4043
4044// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
4045// zero extending the upper bits.
4046//
4047// r0 := a
4048// r1 := 0x0
4049// r2 := 0x0
4050// r3 := 0x0
4051//
4052// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4053FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4054{
4055 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4056}
4057
// Convert the signed 64-bit integer b to a double-precision (64-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
#else
    // ARMv7 has no f64 lanes: convert in scalar code, then bit-copy the
    // resulting double into the low 64-bit lane through an s64 view.
    double bf = (double) b;
    return vreinterpretq_m128d_s64(
        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
#endif
}
4073
4074// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
4075// zero extending the upper bits.
4076//
4077// r0 := a
4078// r1 := 0x0
4079FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4080{
4081 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4082}
4083
// Copy 64-bit integer a to the lower element of dst, and zero the upper
// element.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
// _mm_cvtsi64x_si128 is the x86-64 alias of _mm_cvtsi64_si128.
#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)

// Convert the signed 64-bit integer b to a double-precision (64-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
// _mm_cvtsi64x_sd is the x86-64 alias of _mm_cvtsi64_sd.
#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4094
// Convert the lower single-precision (32-bit) floating-point element in b to a
// double-precision (64-bit) floating-point element, store the result in the
// lower element of dst, and copy the upper element from a to the upper element
// of dst.
//
// dst[63:0] := Convert_FP32_To_FP64(b[31:0])
// dst[127:64] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
{
    // Widen b's lane 0 with an ordinary scalar conversion.
    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
#else
    // ARMv7 has no f64 lanes: bit-copy the double into the low s64 lane.
    return vreinterpretq_m128d_s64(
        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
#endif
}
4115
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
{
    // Read both doubles straight out of the vector's storage (file-wide
    // punning convention); the C cast truncates toward zero, matching
    // the "with truncation" semantics. Upper two result lanes are zeroed.
    double a0 = ((double *) &a)[0];
    double a1 = ((double *) &a)[1];
    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
}
4125
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
{
    // Scalar conversion path: C casts truncate toward zero, matching the
    // "with truncation" semantics; results go out through a 64-bit __m64.
    double a0 = ((double *) &a)[0];
    double a1 = ((double *) &a)[1];
    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
    return vreinterpret_m64_s32(vld1_s32(data));
}
4136
4137// Converts the four single-precision, floating-point values of a to signed
4138// 32-bit integer values using truncate.
4139// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4140FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4141{
4142 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4143}
4144
4145// Convert the lower double-precision (64-bit) floating-point element in a to a
4146// 32-bit integer with truncation, and store the result in dst.
4147//
4148// dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
4149//
4150// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
4151FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4152{
4153 double ret = *((double *) &a);
4154 return (int32_t) ret;
4155}
4156
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
//
// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
{
#if defined(__aarch64__)
    // vcvtq_s64_f64 rounds toward zero, i.e. truncates.
    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
#else
    // Read the low double from the vector's storage and truncate in C.
    double ret = *((double *) &a);
    return (int64_t) ret;
#endif
}
4172
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
//
// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
// _mm_cvttsd_si64x is simply the x86-64 alias of _mm_cvttsd_si64.
#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4180
// Divide packed double-precision (64-bit) floating-point elements in a by
// packed elements in b, and store the results in dst.
//
// FOR j := 0 to 1
// i := 64*j
// dst[i+63:i] := a[i+63:i] / b[i+63:i]
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7 NEON has no f64 divide: divide in scalar code and reload.
    // NOTE(review): the return has no explicit vreinterpret — it relies on
    // __m128d being float32x4_t on 32-bit ARM; confirm against the typedefs
    // at the top of this file.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] / db[0];
    c[1] = da[1] / db[1];
    return vld1q_f32((float32_t *) c);
#endif
}
4204
// Divide the lower double-precision (64-bit) floating-point element in a by the
// lower double-precision (64-bit) floating-point element in b, store the result
// in the lower element of dst, and copy the upper element from a to the upper
// element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    // Divide both lanes, then restore a's upper lane into the result.
    float64x2_t tmp =
        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
#else
    // _mm_move_sd keeps a's upper lane and takes the division's lower lane.
    return _mm_move_sd(a, _mm_div_pd(a, b));
#endif
}
4221
// Extracts the selected signed or unsigned 16-bit integer from a and zero
// extends.
// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
// imm must be a compile-time constant in [0, 7]; reading through the u16
// view gives the required zero extension.
#define _mm_extract_epi16(a, imm) \
    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4228
// Inserts the least significant 16 bits of b into the selected 16-bit integer
// of a.
// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
// __constrange(0,8) int imm)
// imm must be a compile-time constant in [0, 7]. Wrapped in a GNU statement
// expression (__extension__ suppresses the pedantic warning).
#define _mm_insert_epi16(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s16(                                     \
            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
    })
4239
// Loads two double-precision from 16-byte aligned memory, floating-point
// values.
//
// dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
FORCE_INLINE __m128d _mm_load_pd(const double *p)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vld1q_f64(p));
#else
    // ARMv7: copy the 16 bytes through four f32 lanes — only the raw bit
    // pattern is moved, no conversion takes place.
    const float *fp = (const float *) p;
    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
    return vreinterpretq_m128d_f32(vld1q_f32(data));
#endif
}
4256
// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
//
// dst[63:0] := MEM[mem_addr+63:mem_addr]
// dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
// _mm_load_pd1 is an alias of _mm_load1_pd.
#define _mm_load_pd1 _mm_load1_pd
4265
// Load a double-precision (64-bit) floating-point element from memory into the
// lower of dst, and zero the upper element. mem_addr does not need to be
// aligned on any particular boundary.
//
// dst[63:0] := MEM[mem_addr+63:mem_addr]
// dst[127:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
FORCE_INLINE __m128d _mm_load_sd(const double *p)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
#else
    // ARMv7: bit-copy the double through two f32 lanes; upper lanes zeroed.
    const float *fp = (const float *) p;
    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
    return vreinterpretq_m128d_f32(vld1q_f32(data));
#endif
}
4284
4285// Loads 128-bit value. :
4286// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4287FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4288{
4289 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4290}
4291
// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
//
// dst[63:0] := MEM[mem_addr+63:mem_addr]
// dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
#else
    // ARMv7: duplicate *p's 64-bit pattern into both s64 lanes (bit copy).
    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
#endif
}
4307
// Load a double-precision (64-bit) floating-point element from memory into the
// upper element of dst, and copy the lower element from a to dst. mem_addr does
// not need to be aligned on any particular boundary.
//
// dst[63:0] := a[63:0]
// dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
{
#if defined(__aarch64__)
    // Combine a's low half with the freshly loaded double as the high half.
    return vreinterpretq_m128d_f64(
        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
#else
    // ARMv7: same combine, moving the raw bits through f32 views.
    return vreinterpretq_m128d_f32(vcombine_f32(
        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
#endif
}
4326
4327// Load 64-bit integer from memory into the first element of dst.
4328// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
4329FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4330{
4331 /* Load the lower 64 bits of the value pointed to by p into the
4332 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4333 */
4334 return vreinterpretq_m128i_s32(
4335 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4336}
4337
// Load a double-precision (64-bit) floating-point element from memory into the
// lower element of dst, and copy the upper element from a to dst. mem_addr does
// not need to be aligned on any particular boundary.
//
// dst[63:0] := MEM[mem_addr+63:mem_addr]
// dst[127:64] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
{
#if defined(__aarch64__)
    // Combine the freshly loaded double (low half) with a's high half.
    return vreinterpretq_m128d_f64(
        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
#else
    // ARMv7: same combine, moving the raw bits through f32 views.
    return vreinterpretq_m128d_f32(
        vcombine_f32(vld1_f32((const float *) p),
                     vget_high_f32(vreinterpretq_f32_m128d(a))));
#endif
}
4357
// Load 2 double-precision (64-bit) floating-point elements from memory into dst
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
//
// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
// dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
{
#if defined(__aarch64__)
    // Load, then swap the two 64-bit lanes with a one-lane vext rotation.
    float64x2_t v = vld1q_f64(p);
    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
#else
    // ARMv7: identical lane swap performed on the s64 bit pattern.
    int64x2_t v = vld1q_s64((const int64_t *) p);
    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
#endif
}
4376
// Loads two double-precision from unaligned memory, floating-point values.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
{
    // NEON vld1q has no 16-byte alignment requirement, so the aligned
    // loader can be reused as-is.
    return _mm_load_pd(p);
}
4383
4384// Loads 128-bit value. :
4385// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4386FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4387{
4388 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4389}
4390
// Load unaligned 32-bit integer from memory into the first element of dst.
//
// dst[31:0] := MEM[mem_addr+31:mem_addr]
// dst[MAX:32] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
{
    // NOTE(review): the scalar dereference assumes p is readable as an
    // int32_t; for a truly unaligned address a memcpy-based load would be
    // safer on targets with strict alignment — confirm target behavior.
    return vreinterpretq_m128i_s32(
        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
}
4402
// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
// integers from b.
//
// r0 := (a0 * b0) + (a1 * b1)
// r1 := (a2 * b2) + (a3 * b3)
// r2 := (a4 * b4) + (a5 * b5)
// r3 := (a6 * b6) + (a7 * b7)
// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    // Widening 16x16 -> 32-bit multiplies of the low and high halves...
    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                              vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                               vget_high_s16(vreinterpretq_s16_m128i(b)));

    // ...then pairwise-add adjacent 32-bit products, which is exactly the
    // horizontal add PMADDWD performs.
    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));

    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
}
4423
4424// Conditionally store 8-bit integer elements from a into memory using mask
4425// (elements are not stored when the highest bit is not set in the corresponding
4426// element) and a non-temporal memory hint. mem_addr does not need to be aligned
4427// on any particular boundary.
4428// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
4429FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4430{
4431 int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4432 __m128 b = _mm_load_ps((const float *) mem_addr);
4433 int8x16_t masked =
4434 vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4435 vreinterpretq_s8_m128(b));
4436 vst1q_s8((int8_t *) mem_addr, masked);
4437}
4438
4439// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
4440// signed 16-bit integers from b.
4441// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
4442FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4443{
4444 return vreinterpretq_m128i_s16(
4445 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4446}
4447
4448// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
4449// 16 unsigned 8-bit integers from b.
4450// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
4451FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4452{
4453 return vreinterpretq_m128i_u8(
4454 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4455}
4456
// Compare packed double-precision (64-bit) floating-point elements in a and b,
// and store packed maximum values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7: pull each 64-bit lane out as raw bits, compare as doubles via
    // punning, and keep the winner's original bit pattern.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    // When '>' is false (including NaN operands and ties), b's element is
    // chosen — the "return second operand" behavior of MAXPD.
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
4477
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b, store the maximum value in the lower element of dst, and copy the upper
// element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_max_pd(a, b));
#else
    // NOTE(review): fmax prefers the non-NaN operand, whereas the aarch64
    // path above picks the second operand on NaN — confirm callers don't
    // depend on exact NaN semantics on 32-bit ARM.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2] = {fmax(da[0], db[0]), da[1]};
    return vld1q_f32((float32_t *) c);
#endif
}
4493
4494// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
4495// signed 16-bit integers from b.
4496// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
4497FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4498{
4499 return vreinterpretq_m128i_s16(
4500 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4501}
4502
4503// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
4504// 16 unsigned 8-bit integers from b.
4505// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
4506FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4507{
4508 return vreinterpretq_m128i_u8(
4509 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4510}
4511
// Compare packed double-precision (64-bit) floating-point elements in a and b,
// and store packed minimum values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7: pull each 64-bit lane out as raw bits, compare as doubles via
    // punning, and keep the winner's original bit pattern.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    // When '<' is false (including NaN operands and ties), b's element is
    // chosen — the "return second operand" behavior of MINPD.
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
4531
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b, store the minimum value in the lower element of dst, and copy the upper
// element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_min_pd(a, b));
#else
    // NOTE(review): fmin prefers the non-NaN operand, whereas the aarch64
    // path above picks the second operand on NaN — confirm callers don't
    // depend on exact NaN semantics on 32-bit ARM.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2] = {fmin(da[0], db[0]), da[1]};
    return vld1q_f32((float32_t *) c);
#endif
}
4547
4548// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4549// upper element.
4550//
4551// dst[63:0] := a[63:0]
4552// dst[127:64] := 0
4553//
4554// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
4555FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4556{
4557 return vreinterpretq_m128i_s64(
4558 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4559}
4560
4561// Move the lower double-precision (64-bit) floating-point element from b to the
4562// lower element of dst, and copy the upper element from a to the upper element
4563// of dst.
4564//
4565// dst[63:0] := b[63:0]
4566// dst[127:64] := a[127:64]
4567//
4568// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
4569FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4570{
4571 return vreinterpretq_m128d_f32(
4572 vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4573 vget_high_f32(vreinterpretq_f32_m128d(a))));
4574}
4575
// NEON does not provide a version of this function.
// Creates a 16-bit mask from the most significant bits of the 16 signed or
// unsigned 8-bit integers in a and zero extends the upper bits.
// Returned int has bit i set iff byte i of a has its MSB set.
// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
{
    // Use increasingly wide shifts+adds to collect the sign bits
    // together.
    // Since the widening shifts would be rather confusing to follow in little
    // endian, everything will be illustrated in big endian order instead. This
    // has a different result - the bits would actually be reversed on a big
    // endian machine.

    // Starting input (only half the elements are shown):
    // 89 ff 1d c0 00 10 99 33
    uint8x16_t input = vreinterpretq_u8_m128i(a);

    // Shift out everything but the sign bits with an unsigned shift right.
    //
    // Bytes of the vector::
    // 89 ff 1d c0 00 10 99 33
    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
    // |  |  |  |  |  |  |  |
    // 01 01 00 01 00 00 01 00
    //
    // Bits of first important lane(s):
    // 10001001 (89)
    // \______
    //        |
    // 00000001 (01)
    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));

    // Merge the even lanes together with a 16-bit unsigned shift right + add.
    // 'xx' represents garbage data which will be ignored in the final result.
    // In the important bytes, the add functions like a binary OR.
    //
    // 01 01 00 01 00 00 01 00
    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
    //    \|    \|    \|    \|
    // xx 03 xx 01 xx 00 xx 02
    //
    // 00000001 00000001 (01 01)
    //        \_______ |
    //                \|
    // xxxxxxxx xxxxxx11 (xx 03)
    uint32x4_t paired16 =
        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

    // Repeat with a wider 32-bit shift + add.
    // xx 03 xx 01 xx 00 xx 02
    //     \____ |     \____ |   paired32 = (uint64x1_t)(paired16 + (paired16 >>
    //          \|          \|   14))
    // xx xx xx 0d xx xx xx 02
    //
    // 00000011 00000001 (03 01)
    //        \\_____ ||
    //         '----.\||
    // xxxxxxxx xxxx1101 (xx 0d)
    uint64x2_t paired32 =
        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
    // lanes. xx xx xx 0d xx xx xx 02
    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
    //                      \|   28))
    // xx xx xx xx xx xx xx d2
    //
    // 00001101 00000010 (0d 02)
    //     \   \___ |  |
    //      '---.  \|  |
    // xxxxxxxx 11010010 (xx d2)
    uint8x16_t paired64 =
        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
    // xx xx xx xx xx xx xx d2
    //                      ||  return paired64[0]
    //                      d2
    // Note: Little endian would return the correct value 4b (01001011) instead.
    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
}
4659
4660// Set each bit of mask dst based on the most significant bit of the
4661// corresponding packed double-precision (64-bit) floating-point element in a.
4662// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
4663FORCE_INLINE int _mm_movemask_pd(__m128d a)
4664{
4665 uint64x2_t input = vreinterpretq_u64_m128d(a);
4666 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4667 return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
4668}
4669
4670// Copy the lower 64-bit integer in a to dst.
4671//
4672// dst[63:0] := a[63:0]
4673//
4674// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
4675FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4676{
4677 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4678}
4679
4680// Copy the 64-bit integer a to the lower element of dst, and zero the upper
4681// element.
4682//
4683// dst[63:0] := a[63:0]
4684// dst[127:64] := 0
4685//
4686// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
4687FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4688{
4689 return vreinterpretq_m128i_s64(
4690 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4691}
4692
4693// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4694// a and b, and store the unsigned 64-bit results in dst.
4695//
4696// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
4697// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
4698FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4699{
4700 // vmull_u32 upcasts instead of masking, so we downcast.
4701 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4702 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4703 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4704}
4705
// Multiply packed double-precision (64-bit) floating-point elements in a and b,
// and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7 NEON has no f64 multiply: compute in scalar code and reload.
    // NOTE(review): the return relies on __m128d being float32x4_t on
    // 32-bit ARM (no explicit vreinterpret) — same pattern as _mm_div_pd.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] * db[0];
    c[1] = da[1] * db[1];
    return vld1q_f32((float32_t *) c);
#endif
}
4723
4724// Multiply the lower double-precision (64-bit) floating-point element in a and
4725// b, store the result in the lower element of dst, and copy the upper element
4726// from a to the upper element of dst.
4727// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
4728FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4729{
4730 return _mm_move_sd(a, _mm_mul_pd(a, b));
4731}
4732
4733// Multiply the low unsigned 32-bit integers from a and b, and store the
4734// unsigned 64-bit result in dst.
4735//
4736// dst[63:0] := a[31:0] * b[31:0]
4737//
4738// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
4739FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4740{
4741 return vreinterpret_m64_u64(vget_low_u64(
4742 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4743}
4744
// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
// integers from b.
//
// r0 := (a0 * b0)[31:16]
// r1 := (a1 * b1)[31:16]
// ...
// r7 := (a7 * b7)[31:16]
//
// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    /* FIXME: issue with large values because of result saturation */
    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
    // Widening multiplies give full 32-bit products; vuzpq then de-interleaves
    // the 16-bit halves, with val[1] collecting the high halves — PMULHW.
    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
    uint16x8x2_t r =
        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
    return vreinterpretq_m128i_u16(r.val[1]);
}
4770
// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
{
    // Widening multiply of the low halves is common to both paths.
    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
#if defined(__aarch64__)
    // AArch64 can multiply the high halves directly and de-interleave with
    // vuzp2q (odd elements = high 16 bits of each product).
    uint32x4_t ab7654 =
        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
                              vreinterpretq_u16_u32(ab7654));
    return vreinterpretq_m128i_u16(r);
#else
    // ARMv7: multiply the high halves separately, then vuzpq; val[1] holds
    // the high 16 bits of every product.
    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
    uint16x8x2_t r =
        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
    return vreinterpretq_m128i_u16(r.val[1]);
#endif
}
4795
4796// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
4797// unsigned 16-bit integers from b.
4798//
4799// r0 := (a0 * b0)[15:0]
4800// r1 := (a1 * b1)[15:0]
4801// ...
4802// r7 := (a7 * b7)[15:0]
4803//
4804// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
4805FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4806{
4807 return vreinterpretq_m128i_s16(
4808 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4809}
4810
4811// Compute the bitwise OR of packed double-precision (64-bit) floating-point
4812// elements in a and b, and store the results in dst.
4813// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
4814FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4815{
4816 return vreinterpretq_m128d_s64(
4817 vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4818}
4819
4820// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
4821//
4822// r := a | b
4823//
4824// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
4825FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4826{
4827 return vreinterpretq_m128i_s32(
4828 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4829}
4830
4831// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
4832// saturates.
4833// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
4834FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4835{
4836 return vreinterpretq_m128i_s8(
4837 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4838 vqmovn_s16(vreinterpretq_s16_m128i(b))));
4839}
4840
4841// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
4842// and saturates.
4843//
4844// r0 := SignedSaturate(a0)
4845// r1 := SignedSaturate(a1)
4846// r2 := SignedSaturate(a2)
4847// r3 := SignedSaturate(a3)
4848// r4 := SignedSaturate(b0)
4849// r5 := SignedSaturate(b1)
4850// r6 := SignedSaturate(b2)
4851// r7 := SignedSaturate(b3)
4852//
4853// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
4854FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4855{
4856 return vreinterpretq_m128i_s16(
4857 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4858 vqmovn_s32(vreinterpretq_s32_m128i(b))));
4859}
4860
4861// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
4862// integers and saturates.
4863//
4864// r0 := UnsignedSaturate(a0)
4865// r1 := UnsignedSaturate(a1)
4866// ...
4867// r7 := UnsignedSaturate(a7)
4868// r8 := UnsignedSaturate(b0)
4869// r9 := UnsignedSaturate(b1)
4870// ...
4871// r15 := UnsignedSaturate(b7)
4872//
4873// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
4874FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4875{
4876 return vreinterpretq_m128i_u8(
4877 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4878 vqmovun_s16(vreinterpretq_s16_m128i(b))));
4879}
4880
4881// Pause the processor. This is typically used in spin-wait loops and depending
4882// on the x86 processor typical values are in the 40-100 cycle range. The
4883// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most
4884// Arm cores. Experience with several databases has shown has shown an 'isb' is
4885// a reasonable approximation.
4886FORCE_INLINE void _mm_pause()
4887{
4888 __asm__ __volatile__("isb\n");
4889}
4890
4891// Compute the absolute differences of packed unsigned 8-bit integers in a and
4892// b, then horizontally sum each consecutive 8 differences to produce two
4893// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
4894// 16 bits of 64-bit elements in dst.
4895// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
4896FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
4897{
4898 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4899 return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
4900}
4901
4902// Sets the 8 signed 16-bit integer values.
4903// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
4904FORCE_INLINE __m128i _mm_set_epi16(short i7,
4905 short i6,
4906 short i5,
4907 short i4,
4908 short i3,
4909 short i2,
4910 short i1,
4911 short i0)
4912{
4913 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4914 return vreinterpretq_m128i_s16(vld1q_s16(data));
4915}
4916
4917// Sets the 4 signed 32-bit integer values.
4918// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
4919FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
4920{
4921 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
4922 return vreinterpretq_m128i_s32(vld1q_s32(data));
4923}
4924
4925// Returns the __m128i structure with its two 64-bit integer values
4926// initialized to the values of the two 64-bit integers passed in.
4927// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
4928FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
4929{
4930 return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
4931}
4932
4933// Returns the __m128i structure with its two 64-bit integer values
4934// initialized to the values of the two 64-bit integers passed in.
4935// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
4936FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
4937{
4938 return vreinterpretq_m128i_s64(
4939 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4940}
4941
4942// Sets the 16 signed 8-bit integer values.
4943// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
4944FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
4945 signed char b14,
4946 signed char b13,
4947 signed char b12,
4948 signed char b11,
4949 signed char b10,
4950 signed char b9,
4951 signed char b8,
4952 signed char b7,
4953 signed char b6,
4954 signed char b5,
4955 signed char b4,
4956 signed char b3,
4957 signed char b2,
4958 signed char b1,
4959 signed char b0)
4960{
4961 int8_t ALIGN_STRUCT(16)
4962 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
4963 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
4964 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
4965 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
4966 return (__m128i) vld1q_s8(data);
4967}
4968
// Set packed double-precision (64-bit) floating-point elements in dst with the
// supplied values; e0 becomes the low lane, e1 the high lane.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
{
    double ALIGN_STRUCT(16) data[2] = {e0, e1};
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
#else
    /* Armv7 has no 64-bit float vectors: carry the byte pattern in a
     * float32x4_t (bit-exact reinterpretation, not a value conversion). */
    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
#endif
}
4981
// Broadcast double-precision (64-bit) floating-point value a to all elements of
// dst. Alias of _mm_set1_pd.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
#define _mm_set_pd1 _mm_set1_pd
4986
4987// Copy double-precision (64-bit) floating-point element a to the lower element
4988// of dst, and zero the upper element.
4989// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
4990FORCE_INLINE __m128d _mm_set_sd(double a)
4991{
4992 return _mm_set_pd(0, a);
4993}
4994
4995// Sets the 8 signed 16-bit integer values to w.
4996//
4997// r0 := w
4998// r1 := w
4999// ...
5000// r7 := w
5001//
5002// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
5003FORCE_INLINE __m128i _mm_set1_epi16(short w)
5004{
5005 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
5006}
5007
5008// Sets the 4 signed 32-bit integer values to i.
5009//
5010// r0 := i
5011// r1 := i
5012// r2 := i
5013// r3 := I
5014//
5015// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
5016FORCE_INLINE __m128i _mm_set1_epi32(int _i)
5017{
5018 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
5019}
5020
5021// Sets the 2 signed 64-bit integer values to i.
5022// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
5023FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
5024{
5025 return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
5026}
5027
5028// Sets the 2 signed 64-bit integer values to i.
5029// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
5030FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
5031{
5032 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
5033}
5034
5035// Sets the 16 signed 8-bit integer values to b.
5036//
5037// r0 := b
5038// r1 := b
5039// ...
5040// r15 := b
5041//
5042// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
5043FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
5044{
5045 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
5046}
5047
// Broadcast double-precision (64-bit) floating-point value d to all elements of
// dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
FORCE_INLINE __m128d _mm_set1_pd(double d)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
#else
    /* NOTE(review): type-puns the double through an int64_t pointer, which
     * formally violates strict aliasing; a memcpy-based pun would be safer. */
    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
#endif
}
5059
5060// Sets the 8 signed 16-bit integer values in reverse order.
5061//
5062// Return Value
5063// r0 := w0
5064// r1 := w1
5065// ...
5066// r7 := w7
5067FORCE_INLINE __m128i _mm_setr_epi16(short w0,
5068 short w1,
5069 short w2,
5070 short w3,
5071 short w4,
5072 short w5,
5073 short w6,
5074 short w7)
5075{
5076 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5077 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
5078}
5079
5080// Sets the 4 signed 32-bit integer values in reverse order
5081// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
5082FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
5083{
5084 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5085 return vreinterpretq_m128i_s32(vld1q_s32(data));
5086}
5087
5088// Set packed 64-bit integers in dst with the supplied values in reverse order.
5089// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
5090FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5091{
5092 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5093}
5094
5095// Sets the 16 signed 8-bit integer values in reverse order.
5096// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
5097FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
5098 signed char b1,
5099 signed char b2,
5100 signed char b3,
5101 signed char b4,
5102 signed char b5,
5103 signed char b6,
5104 signed char b7,
5105 signed char b8,
5106 signed char b9,
5107 signed char b10,
5108 signed char b11,
5109 signed char b12,
5110 signed char b13,
5111 signed char b14,
5112 signed char b15)
5113{
5114 int8_t ALIGN_STRUCT(16)
5115 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5116 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5117 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5118 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5119 return (__m128i) vld1q_s8(data);
5120}
5121
5122// Set packed double-precision (64-bit) floating-point elements in dst with the
5123// supplied values in reverse order.
5124// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
5125FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5126{
5127 return _mm_set_pd(e0, e1);
5128}
5129
5130// Return vector of type __m128d with all elements set to zero.
5131// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
5132FORCE_INLINE __m128d _mm_setzero_pd(void)
5133{
5134#if defined(__aarch64__)
5135 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5136#else
5137 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5138#endif
5139}
5140
5141// Sets the 128-bit value to zero
5142// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
5143FORCE_INLINE __m128i _mm_setzero_si128(void)
5144{
5145 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5146}
5147
// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
//                                        __constrange(0,255) int imm)
#if __has_builtin(__builtin_shufflevector)
// Fast path: the compiler materializes the shuffle directly from the
// immediate's four 2-bit lane selectors.
#define _mm_shuffle_epi32(a, imm)                              \
    __extension__({                                            \
        int32x4_t _input = vreinterpretq_s32_m128i(a);         \
        int32x4_t _shuf = __builtin_shufflevector(             \
            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
        vreinterpretq_m128i_s32(_shuf);                        \
    })
#else  // generic
// Portable path: dispatch the common immediates to hand-written NEON
// shuffles; anything else falls back to the lane-by-lane default helper.
#define _mm_shuffle_epi32(a, imm)                        \
    __extension__({                                      \
        __m128i ret;                                     \
        switch (imm) {                                   \
        case _MM_SHUFFLE(1, 0, 3, 2):                    \
            ret = _mm_shuffle_epi_1032((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 3, 0, 1):                    \
            ret = _mm_shuffle_epi_2301((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 3, 2, 1):                    \
            ret = _mm_shuffle_epi_0321((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 1, 0, 3):                    \
            ret = _mm_shuffle_epi_2103((a));             \
            break;                                       \
        case _MM_SHUFFLE(1, 0, 1, 0):                    \
            ret = _mm_shuffle_epi_1010((a));             \
            break;                                       \
        case _MM_SHUFFLE(1, 0, 0, 1):                    \
            ret = _mm_shuffle_epi_1001((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 1, 0, 1):                    \
            ret = _mm_shuffle_epi_0101((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 2, 1, 1):                    \
            ret = _mm_shuffle_epi_2211((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 1, 2, 2):                    \
            ret = _mm_shuffle_epi_0122((a));             \
            break;                                       \
        case _MM_SHUFFLE(3, 3, 3, 2):                    \
            ret = _mm_shuffle_epi_3332((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 0, 0, 0):                    \
            ret = _mm_shuffle_epi32_splat((a), 0);       \
            break;                                       \
        case _MM_SHUFFLE(1, 1, 1, 1):                    \
            ret = _mm_shuffle_epi32_splat((a), 1);       \
            break;                                       \
        case _MM_SHUFFLE(2, 2, 2, 2):                    \
            ret = _mm_shuffle_epi32_splat((a), 2);       \
            break;                                       \
        case _MM_SHUFFLE(3, 3, 3, 3):                    \
            ret = _mm_shuffle_epi32_splat((a), 3);       \
            break;                                       \
        default:                                         \
            ret = _mm_shuffle_epi32_default((a), (imm)); \
            break;                                       \
        }                                                \
        ret;                                             \
    })
#endif
5215
// Shuffle double-precision (64-bit) floating-point elements using the control
// in imm8, and store the results in dst.
//
// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
#if __has_builtin(__builtin_shufflevector)
#define _mm_shuffle_pd(a, b, imm8)                                            \
    vreinterpretq_m128d_s64(__builtin_shufflevector(                          \
        vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), (imm8) & 0x1, \
        (((imm8) & 0x2) >> 1) + 2))
#else
// Portable fallback: extract the selected 64-bit lanes and repack them.
#define _mm_shuffle_pd(a, b, imm8)                                       \
    _mm_castsi128_pd(_mm_set_epi64x(                                     \
        vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
        vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
#endif
5234
// Shuffle the upper four 16-bit lanes of a according to imm, leaving the
// lower four lanes unchanged.
// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
#if __has_builtin(__builtin_shufflevector)
#define _mm_shufflehi_epi16(a, imm)                             \
    __extension__({                                             \
        int16x8_t _input = vreinterpretq_s16_m128i(a);          \
        int16x8_t _shuf = __builtin_shufflevector(              \
            _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
            (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
            (((imm) >> 6) & 0x3) + 4);                          \
        vreinterpretq_m128i_s16(_shuf);                         \
    })
#else  // generic
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
#endif
5250
// Shuffle the lower four 16-bit lanes of a according to imm, leaving the
// upper four lanes unchanged.
// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
#if __has_builtin(__builtin_shufflevector)
#define _mm_shufflelo_epi16(a, imm)                                  \
    __extension__({                                                  \
        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
        int16x8_t _shuf = __builtin_shufflevector(                   \
            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
        vreinterpretq_m128i_s16(_shuf);                              \
    })
#else  // generic
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
#endif
5265
5266// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
5267// shifting in zeros.
5268//
5269// r0 := a0 << count
5270// r1 := a1 << count
5271// ...
5272// r7 := a7 << count
5273//
5274// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
5275FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5276{
5277 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5278 if (unlikely(c > 15))
5279 return _mm_setzero_si128();
5280
5281 int16x8_t vc = vdupq_n_s16((int16_t) c);
5282 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5283}
5284
5285// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
5286// shifting in zeros.
5287//
5288// r0 := a0 << count
5289// r1 := a1 << count
5290// r2 := a2 << count
5291// r3 := a3 << count
5292//
5293// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
5294FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5295{
5296 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5297 if (unlikely(c > 31))
5298 return _mm_setzero_si128();
5299
5300 int32x4_t vc = vdupq_n_s32((int32_t) c);
5301 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5302}
5303
5304// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
5305// shifting in zeros.
5306//
5307// r0 := a0 << count
5308// r1 := a1 << count
5309//
5310// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
5311FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5312{
5313 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5314 if (unlikely(c > 63))
5315 return _mm_setzero_si128();
5316
5317 int64x2_t vc = vdupq_n_s64((int64_t) c);
5318 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5319}
5320
// Shift the 8 16-bit integers in a left by imm bits, shifting in zeros. The
// vector is returned unchanged for imm <= 0 and cleared for imm > 15.
//
// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
#define _mm_slli_epi16(a, imm)                                       \
    __extension__({                                                  \
        __m128i ret;                                                 \
        if (unlikely((imm) <= 0)) {                                  \
            ret = a;                                                 \
        } else if (unlikely((imm) > 15)) {                           \
            ret = _mm_setzero_si128();                               \
        } else {                                                     \
            /* Runtime shift (like _mm_slli_epi32) so the count need \
             * not satisfy the immediate-form constraints. */        \
            ret = vreinterpretq_m128i_s16(                           \
                vshlq_s16(vreinterpretq_s16_m128i(a),                \
                          vdupq_n_s16((int16_t)(imm))));             \
        }                                                            \
        ret;                                                         \
    })
5344
5345// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
5346// shifting in zeros. :
5347// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
5348// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
5349FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5350{
5351 if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
5352 return a;
5353 if (unlikely(imm > 31))
5354 return _mm_setzero_si128();
5355 return vreinterpretq_m128i_s32(
5356 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5357}
5358
5359// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5360// store the results in dst.
5361FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5362{
5363 if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
5364 return a;
5365 if (unlikely(imm > 63))
5366 return _mm_setzero_si128();
5367 return vreinterpretq_m128i_s64(
5368 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5369}
5370
// Shift the 128-bit value in a left by imm bytes while shifting in zeros. imm
// must be an immediate. Returns a unchanged for imm <= 0 and all zeros for
// imm > 15.
//
// r := a << (imm * 8)
//
// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_si128(a, imm)                                          \
    __extension__({                                                     \
        __m128i ret;                                                    \
        if (unlikely((imm) <= 0)) {                                     \
            ret = a;                                                    \
        } else if (unlikely((imm) > 15)) {                              \
            ret = _mm_setzero_si128();                                  \
        } else {                                                        \
            /* Concatenate zeros below a and extract 16 bytes starting \
             * 16-imm bytes in, i.e. a byte-wise left shift. */         \
            ret = vreinterpretq_m128i_s8(vextq_s8(                      \
                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
        }                                                               \
        ret;                                                            \
    })
5392
// Compute the square root of packed double-precision (64-bit) floating-point
// elements in a, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
#else
    /* Armv7 has no vector double sqrt: read the two doubles out of the
     * vector's storage and use libm's sqrt lane by lane. */
    double a0 = sqrt(((double *) &a)[0]);
    double a1 = sqrt(((double *) &a)[1]);
    return _mm_set_pd(a1, a0);
#endif
}
5406
// Compute the square root of the lower double-precision (64-bit) floating-point
// element in b, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_sqrt_pd(b));
#else
    /* Scalar fallback: sqrt of b's low lane, a's high lane passed through. */
    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
#endif
}
5419
5420// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
5421// in the sign bit.
5422//
5423// r0 := a0 >> count
5424// r1 := a1 >> count
5425// ...
5426// r7 := a7 >> count
5427//
5428// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
5429FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5430{
5431 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5432 if (unlikely(c > 15))
5433 return _mm_cmplt_epi16(a, _mm_setzero_si128());
5434 return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5435}
5436
5437// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
5438// in the sign bit.
5439//
5440// r0 := a0 >> count
5441// r1 := a1 >> count
5442// r2 := a2 >> count
5443// r3 := a3 >> count
5444//
5445// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
5446FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5447{
5448 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5449 if (unlikely(c > 31))
5450 return _mm_cmplt_epi32(a, _mm_setzero_si128());
5451 return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5452}
5453
5454// Shift packed 16-bit integers in a right by imm while shifting in sign
5455// bits, and store the results in dst.
5456// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
5457FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5458{
5459 const int count = (imm & ~15) ? 15 : imm;
5460 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5461}
5462
// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
// and store the results in dst.
//
// FOR j := 0 to 3
//   i := j*32
//   IF imm8[7:0] > 31
//     dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
//   ELSE
//     dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
//   FI
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm)                                        \
    __extension__({                                                   \
        __m128i ret;                                                  \
        if (unlikely((imm) == 0)) {                                   \
            ret = a;                                                  \
        } else if (likely(0 < (imm) && (imm) < 32)) {                 \
            /* right shift = left shift by negated, parenthesized     \
             * count so expression arguments negate correctly */      \
            ret = vreinterpretq_m128i_s32(vshlq_s32(                  \
                vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm))));    \
        } else {                                                      \
            /* count >= 32: every lane becomes its sign bit */        \
            ret = vreinterpretq_m128i_s32(                            \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));         \
        }                                                             \
        ret;                                                          \
    })
5491
5492// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
5493// while shifting in zeros.
5494//
5495// r0 := srl(a0, count)
5496// r1 := srl(a1, count)
5497// ...
5498// r7 := srl(a7, count)
5499//
5500// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
5501FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5502{
5503 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5504 if (unlikely(c > 15))
5505 return _mm_setzero_si128();
5506
5507 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5508 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5509}
5510
5511// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
5512// while shifting in zeros.
5513//
5514// r0 := srl(a0, count)
5515// r1 := srl(a1, count)
5516// r2 := srl(a2, count)
5517// r3 := srl(a3, count)
5518//
5519// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
5520FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5521{
5522 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5523 if (unlikely(c > 31))
5524 return _mm_setzero_si128();
5525
5526 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5527 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5528}
5529
5530// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
5531// while shifting in zeros.
5532//
5533// r0 := srl(a0, count)
5534// r1 := srl(a1, count)
5535//
5536// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
5537FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5538{
5539 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5540 if (unlikely(c > 63))
5541 return _mm_setzero_si128();
5542
5543 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5544 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5545}
5546
// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
// FOR j := 0 to 7
//   i := j*16
//   IF imm8[7:0] > 15
//     dst[i+15:i] := 0
//   ELSE
//     dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
//   FI
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
#define _mm_srli_epi16(a, imm)                                        \
    __extension__({                                                   \
        __m128i ret;                                                  \
        if (unlikely((imm) == 0)) {                                   \
            ret = a;                                                  \
        } else if (likely(0 < (imm) && (imm) < 16)) {                 \
            /* right shift = left shift by negated, parenthesized     \
             * count so expression arguments negate correctly */      \
            ret = vreinterpretq_m128i_u16(vshlq_u16(                  \
                vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm))));    \
        } else {                                                      \
            ret = _mm_setzero_si128();                                \
        }                                                             \
        ret;                                                          \
    })
5573
// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
// FOR j := 0 to 3
//   i := j*32
//   IF imm8[7:0] > 31
//     dst[i+31:i] := 0
//   ELSE
//     dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
//   FI
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi32(a, imm)                                        \
    __extension__({                                                   \
        __m128i ret;                                                  \
        if (unlikely((imm) == 0)) {                                   \
            ret = a;                                                  \
        } else if (likely(0 < (imm) && (imm) < 32)) {                 \
            /* right shift = left shift by negated, parenthesized     \
             * count so expression arguments negate correctly */      \
            ret = vreinterpretq_m128i_u32(vshlq_u32(                  \
                vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm))));    \
        } else {                                                      \
            ret = _mm_setzero_si128();                                \
        }                                                             \
        ret;                                                          \
    })
5601
// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
// FOR j := 0 to 1
//   i := j*64
//   IF imm8[7:0] > 63
//     dst[i+63:i] := 0
//   ELSE
//     dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
//   FI
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
#define _mm_srli_epi64(a, imm)                                        \
    __extension__({                                                   \
        __m128i ret;                                                  \
        if (unlikely((imm) == 0)) {                                   \
            ret = a;                                                  \
        } else if (likely(0 < (imm) && (imm) < 64)) {                 \
            /* right shift = left shift by negated, parenthesized     \
             * count so expression arguments negate correctly */      \
            ret = vreinterpretq_m128i_u64(vshlq_u64(                  \
                vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm))));    \
        } else {                                                      \
            ret = _mm_setzero_si128();                                \
        }                                                             \
        ret;                                                          \
    })
5628
// Shift the 128-bit value in a right by imm bytes while shifting in zeros.
// imm must be an immediate. Returns a unchanged for imm <= 0 and all zeros
// for imm > 15.
//
// r := srl(a, imm*8)
//
// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_si128(a, imm)                                              \
    __extension__({                                                         \
        __m128i ret;                                                        \
        if (unlikely((imm) <= 0)) {                                         \
            ret = a;                                                        \
        } else if (unlikely((imm) > 15)) {                                  \
            ret = _mm_setzero_si128();                                      \
        } else {                                                            \
            /* Extract 16 bytes starting imm bytes into a, padding with     \
             * zeros above, i.e. a byte-wise right shift. */                \
            ret = vreinterpretq_m128i_s8(                                   \
                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
        }                                                                   \
        ret;                                                                \
    })
5650
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
// or a general-protection exception may be generated.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__)
    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
#else
    /* Armv7: the doubles live in a float32x4_t; store the raw 128 bits. */
    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
#endif
}
5663
// Store the lower double-precision (64-bit) floating-point element from a into
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
{
#if defined(__aarch64__)
    /* Duplicate the low double into both lanes, then do one 128-bit store. */
    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
    vst1q_f64((float64_t *) mem_addr,
              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
#else
    /* Armv7: duplicate the low 64 bits (as a float32x2_t view) and store. */
    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
    vst1q_f32((float32_t *) mem_addr,
              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
#endif
}
5680
// Store the lower double-precision (64-bit) floating-point element from a into
// memory. mem_addr does not need to be aligned on any particular boundary.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__)
    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
#else
    /* Copy the raw low 64 bits through a u64 view; no float conversion. */
    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
#endif
}
5692
5693// Stores four 32-bit integer values as (as a __m128i value) at the address p.
5694// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
5695FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5696{
5697 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5698}
5699
// Store the lower double-precision (64-bit) floating-point element from a into
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated. Alias of
// _mm_store_pd1.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
#define _mm_store1_pd _mm_store_pd1
5705
// Store the upper double-precision (64-bit) floating-point element from a into
// memory.
//
// MEM[mem_addr+63:mem_addr] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__)
    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
#else
    /* Store the raw upper 64 bits through a float32x2_t view (bit-exact). */
    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
#endif
}
5720
5721// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
5722// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
5723FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5724{
5725 uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
5726 uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
5727 *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
5728}
5729
// Store the lower double-precision (64-bit) floating-point element from a into
// memory.
//
// MEM[mem_addr+63:mem_addr] := a[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__)
    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
#else
    /* Store the raw lower 64 bits through a float32x2_t view (bit-exact). */
    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
#endif
}
5744
// Store 2 double-precision (64-bit) floating-point elements from a into memory
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
//
// MEM[mem_addr+63:mem_addr] := a[127:64]
// MEM[mem_addr+127:mem_addr+64] := a[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
{
    // Rotating the four 32-bit lanes by two swaps the 64-bit halves without
    // requiring f64 support (so this also works on armv7); then store.
    float32x4_t f = vreinterpretq_f32_m128d(a);
    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
}
5758
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory. mem_addr does not need to be aligned on any
// particular boundary.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
{
    // Delegates to the aligned-store helper; no separate unaligned path is
    // needed for the underlying NEON store.
    _mm_store_pd(mem_addr, a);
}
5767
5768// Stores 128-bits of integer data a at the address p.
5769// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
5770FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5771{
5772 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5773}
5774
5775// Stores 32-bits of integer data a at the address p.
5776// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
5777FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5778{
5779 vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5780}
5781
5782// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5783// elements) from a into memory using a non-temporal memory hint. mem_addr must
5784// be aligned on a 16-byte boundary or a general-protection exception may be
5785// generated.
5786// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
5787FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5788{
5789#if __has_builtin(__builtin_nontemporal_store)
5790 __builtin_nontemporal_store(a, (float32x4_t *) p);
5791#elif defined(__aarch64__)
5792 vst1q_f64(p, vreinterpretq_f64_m128d(a));
5793#else
5794 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5795#endif
5796}
5797
// Stores the data in a to the address p without polluting the caches. If the
// cache line containing address p is already in the cache, the cache will be
// updated.
// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, p);
#else
    // No non-temporal hint available; fall back to a regular 128-bit store.
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
#endif
}
5810
5811// Store 32-bit integer a into memory using a non-temporal hint to minimize
5812// cache pollution. If the cache line containing address mem_addr is already in
5813// the cache, the cache will be updated.
5814// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
5815FORCE_INLINE void _mm_stream_si32(int *p, int a)
5816{
5817 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5818}
5819
5820// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
5821// store the results in dst.
5822// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
5823FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
5824{
5825 return vreinterpretq_m128i_s16(
5826 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5827}
5828
5829// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
5830// unsigned 32-bit integers of a.
5831//
5832// r0 := a0 - b0
5833// r1 := a1 - b1
5834// r2 := a2 - b2
5835// r3 := a3 - b3
5836//
5837// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
5838FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
5839{
5840 return vreinterpretq_m128i_s32(
5841 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5842}
5843
5844// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
5845// and store the results in dst.
5846// r0 := a0 - b0
5847// r1 := a1 - b1
5848FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
5849{
5850 return vreinterpretq_m128i_s64(
5851 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5852}
5853
5854// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
5855// store the results in dst.
5856// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
5857FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
5858{
5859 return vreinterpretq_m128i_s8(
5860 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5861}
5862
5863// Subtract packed double-precision (64-bit) floating-point elements in b from
5864// packed double-precision (64-bit) floating-point elements in a, and store the
5865// results in dst.
5866//
5867// FOR j := 0 to 1
5868// i := j*64
5869// dst[i+63:i] := a[i+63:i] - b[i+63:i]
5870// ENDFOR
5871//
5872// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
5873FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
5874{
5875#if defined(__aarch64__)
5876 return vreinterpretq_m128d_f64(
5877 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5878#else
5879 double *da = (double *) &a;
5880 double *db = (double *) &b;
5881 double c[2];
5882 c[0] = da[0] - db[0];
5883 c[1] = da[1] - db[1];
5884 return vld1q_f32((float32_t *) c);
5885#endif
5886}
5887
5888// Subtract the lower double-precision (64-bit) floating-point element in b from
5889// the lower double-precision (64-bit) floating-point element in a, store the
5890// result in the lower element of dst, and copy the upper element from a to the
5891// upper element of dst.
5892// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
5893FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
5894{
5895 return _mm_move_sd(a, _mm_sub_pd(a, b));
5896}
5897
5898// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
5899//
5900// dst[63:0] := a[63:0] - b[63:0]
5901//
5902// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
5903FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
5904{
5905 return vreinterpret_m64_s64(
5906 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
5907}
5908
5909// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
5910// of a and saturates.
5911//
5912// r0 := SignedSaturate(a0 - b0)
5913// r1 := SignedSaturate(a1 - b1)
5914// ...
5915// r7 := SignedSaturate(a7 - b7)
5916//
5917// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
5918FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
5919{
5920 return vreinterpretq_m128i_s16(
5921 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5922}
5923
5924// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
5925// of a and saturates.
5926//
5927// r0 := SignedSaturate(a0 - b0)
5928// r1 := SignedSaturate(a1 - b1)
5929// ...
5930// r15 := SignedSaturate(a15 - b15)
5931//
5932// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
5933FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
5934{
5935 return vreinterpretq_m128i_s8(
5936 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5937}
5938
5939// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
5940// integers of a and saturates..
5941// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
5942FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
5943{
5944 return vreinterpretq_m128i_u16(
5945 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
5946}
5947
5948// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
5949// integers of a and saturates.
5950//
5951// r0 := UnsignedSaturate(a0 - b0)
5952// r1 := UnsignedSaturate(a1 - b1)
5953// ...
5954// r15 := UnsignedSaturate(a15 - b15)
5955//
5956// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
5957FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
5958{
5959 return vreinterpretq_m128i_u8(
5960 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
5961}
5962
// The unordered (quiet) double-precision comparisons are implemented here as
// simple aliases of their ordered counterparts.
#define _mm_ucomieq_sd _mm_comieq_sd
#define _mm_ucomige_sd _mm_comige_sd
#define _mm_ucomigt_sd _mm_comigt_sd
#define _mm_ucomile_sd _mm_comile_sd
#define _mm_ucomilt_sd _mm_comilt_sd
#define _mm_ucomineq_sd _mm_comineq_sd
5969
// Return vector of type __m128d with undefined elements.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
FORCE_INLINE __m128d _mm_undefined_pd(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    // Intentionally uninitialized: the intrinsic's contract is "undefined
    // contents", so the compiler warning is suppressed rather than "fixed".
    __m128d a;
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
5984
5985// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
5986// upper 4 signed or unsigned 16-bit integers in b.
5987//
5988// r0 := a4
5989// r1 := b4
5990// r2 := a5
5991// r3 := b5
5992// r4 := a6
5993// r5 := b6
5994// r6 := a7
5995// r7 := b7
5996//
5997// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
5998FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5999{
6000#if defined(__aarch64__)
6001 return vreinterpretq_m128i_s16(
6002 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6003#else
6004 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
6005 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
6006 int16x4x2_t result = vzip_s16(a1, b1);
6007 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6008#endif
6009}
6010
6011// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
6012// upper 2 signed or unsigned 32-bit integers in b.
6013// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
6014FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
6015{
6016#if defined(__aarch64__)
6017 return vreinterpretq_m128i_s32(
6018 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6019#else
6020 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
6021 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
6022 int32x2x2_t result = vzip_s32(a1, b1);
6023 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6024#endif
6025}
6026
6027// Interleaves the upper signed or unsigned 64-bit integer in a with the
6028// upper signed or unsigned 64-bit integer in b.
6029//
6030// r0 := a1
6031// r1 := b1
6032FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
6033{
6034 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
6035 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
6036 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
6037}
6038
6039// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
6040// 8 signed or unsigned 8-bit integers in b.
6041//
6042// r0 := a8
6043// r1 := b8
6044// r2 := a9
6045// r3 := b9
6046// ...
6047// r14 := a15
6048// r15 := b15
6049//
6050// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
6051FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
6052{
6053#if defined(__aarch64__)
6054 return vreinterpretq_m128i_s8(
6055 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6056#else
6057 int8x8_t a1 =
6058 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
6059 int8x8_t b1 =
6060 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
6061 int8x8x2_t result = vzip_s8(a1, b1);
6062 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6063#endif
6064}
6065
// Unpack and interleave double-precision (64-bit) floating-point elements from
// the high half of a and b, and store the results in dst.
//
// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
//   dst[63:0] := src1[127:64]
//   dst[127:64] := src2[127:64]
//   RETURN dst[127:0]
// }
// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // armv7: move the raw 64-bit halves around as integers (no f64 vectors).
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
                     vget_high_s64(vreinterpretq_s64_m128d(b))));
#endif
}
6088
6089// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
6090// lower 4 signed or unsigned 16-bit integers in b.
6091//
6092// r0 := a0
6093// r1 := b0
6094// r2 := a1
6095// r3 := b1
6096// r4 := a2
6097// r5 := b2
6098// r6 := a3
6099// r7 := b3
6100//
6101// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
6102FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6103{
6104#if defined(__aarch64__)
6105 return vreinterpretq_m128i_s16(
6106 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6107#else
6108 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6109 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6110 int16x4x2_t result = vzip_s16(a1, b1);
6111 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6112#endif
6113}
6114
6115// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
6116// lower 2 signed or unsigned 32 - bit integers in b.
6117//
6118// r0 := a0
6119// r1 := b0
6120// r2 := a1
6121// r3 := b1
6122//
6123// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
6124FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
6125{
6126#if defined(__aarch64__)
6127 return vreinterpretq_m128i_s32(
6128 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6129#else
6130 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6131 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6132 int32x2x2_t result = vzip_s32(a1, b1);
6133 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6134#endif
6135}
6136
// Interleaves the lower signed or unsigned 64-bit integer in a with the
// lower signed or unsigned 64-bit integer in b.
//
// r0 := a0
// r1 := b0
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
}
6143
6144// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
6145// 8 signed or unsigned 8-bit integers in b.
6146//
6147// r0 := a0
6148// r1 := b0
6149// r2 := a1
6150// r3 := b1
6151// ...
6152// r14 := a7
6153// r15 := b7
6154//
6155// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
6156FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
6157{
6158#if defined(__aarch64__)
6159 return vreinterpretq_m128i_s8(
6160 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6161#else
6162 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6163 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6164 int8x8x2_t result = vzip_s8(a1, b1);
6165 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6166#endif
6167}
6168
// Unpack and interleave double-precision (64-bit) floating-point elements from
// the low half of a and b, and store the results in dst.
//
// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
//   dst[63:0] := src1[63:0]
//   dst[127:64] := src2[63:0]
//   RETURN dst[127:0]
// }
// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // armv7: move the raw 64-bit halves around as integers (no f64 vectors).
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
                     vget_low_s64(vreinterpretq_s64_m128d(b))));
#endif
}
6191
6192// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
6193// elements in a and b, and store the results in dst.
6194//
6195// FOR j := 0 to 1
6196// i := j*64
6197// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
6198// ENDFOR
6199//
6200// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
6201FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
6202{
6203 return vreinterpretq_m128d_s64(
6204 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
6205}
6206
6207// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
6208// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
6209FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
6210{
6211 return vreinterpretq_m128i_s32(
6212 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6213}
6214
6215/* SSE3 */
6216
6217// Alternatively add and subtract packed double-precision (64-bit)
6218// floating-point elements in a to/from packed elements in b, and store the
6219// results in dst.
6220//
6221// FOR j := 0 to 1
6222// i := j*64
6223// IF ((j & 1) == 0)
6224// dst[i+63:i] := a[i+63:i] - b[i+63:i]
6225// ELSE
6226// dst[i+63:i] := a[i+63:i] + b[i+63:i]
6227// FI
6228// ENDFOR
6229//
6230// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
6231FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
6232{
6233 __m128d mask = _mm_set_pd(1.0f, -1.0f);
6234#if defined(__aarch64__)
6235 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6236 vreinterpretq_f64_m128d(b),
6237 vreinterpretq_f64_m128d(mask)));
6238#else
6239 return _mm_add_pd(_mm_mul_pd(b, mask), a);
6240#endif
6241}
6242
// Alternatively add and subtract packed single-precision (32-bit)
// floating-point elements in a to/from packed elements in b, and store the
// results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
{
    // mask = { -1, 1, -1, 1 }: even lanes subtract b, odd lanes add b.
    __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
    // vfmaq_f32(a, mask, b) computes a + mask * b in one fused operation.
    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                            vreinterpretq_f32_m128(mask),
                                            vreinterpretq_f32_m128(b)));
#else
    return _mm_add_ps(_mm_mul_ps(b, mask), a);
#endif
}
6258
// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
// elements in a and b, and pack the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // armv7: sum the lanes in scalar code and reload the raw bits as u64.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[] = {da[0] + da[1], db[0] + db[1]};
    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
#endif
}
6274
6275// Computes pairwise add of each argument as single-precision, floating-point
6276// values a and b.
6277// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
6278FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
6279{
6280#if defined(__aarch64__)
6281 return vreinterpretq_m128_f32(
6282 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6283#else
6284 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
6285 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
6286 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
6287 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
6288 return vreinterpretq_m128_f32(
6289 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6290#endif
6291}
6292
// Horizontally subtract adjacent pairs of double-precision (64-bit)
// floating-point elements in a and b, and pack the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
{
#if defined(__aarch64__)
    // uzp1 gathers the even lanes {a0, b0}, uzp2 the odd lanes {a1, b1};
    // subtracting gives {a0 - a1, b0 - b1}.
    return vreinterpretq_m128d_f64(vsubq_f64(
        vuzp1q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)),
        vuzp2q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b))));
#else
    // armv7: compute the differences in scalar code and reload the raw bits.
    double *da = (double *) &_a;
    double *db = (double *) &_b;
    double c[] = {da[0] - da[1], db[0] - db[1]};
    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
#endif
}
6309
// Horizontally subtract adjacent pairs of single-precision (32-bit)
// floating-point elements in a and b, and pack the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
{
#if defined(__aarch64__)
    // uzp1 gathers the even-indexed lanes, uzp2 the odd-indexed lanes;
    // subtracting gives {a0-a1, a2-a3, b0-b1, b2-b3}.
    return vreinterpretq_m128_f32(vsubq_f32(
        vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
        vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
#else
    // armv7: vuzpq produces the even lanes in val[0] and odd lanes in val[1].
    float32x4x2_t c =
        vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
#endif
}
6325
// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
// may perform better than _mm_loadu_si128 when the data crosses a cache line
// boundary.
//
// dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
// Implemented as a plain unaligned load; the cache-line optimization is an
// x86 detail with no NEON equivalent.
#define _mm_lddqu_si128 _mm_loadu_si128
6334
// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
//
// dst[63:0] := MEM[mem_addr+63:mem_addr]
// dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
// Alias: identical semantics to _mm_load1_pd, defined earlier in this file.
#define _mm_loaddup_pd _mm_load1_pd
6343
6344// Duplicate the low double-precision (64-bit) floating-point element from a,
6345// and store the results in dst.
6346// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
6347FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6348{
6349#if (__aarch64__)
6350 return vreinterpretq_m128d_f64(
6351 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6352#else
6353 return vreinterpretq_m128d_u64(
6354 vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6355#endif
6356}
6357
// Duplicate odd-indexed single-precision (32-bit) floating-point elements
// from a, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
{
#if __has_builtin(__builtin_shufflevector)
    // Single shuffle selecting lanes {1, 1, 3, 3}.
    return vreinterpretq_m128_f32(__builtin_shufflevector(
        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
#else
    // Fallback: extract the odd lanes and rebuild the vector through memory.
    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
    return vreinterpretq_m128_f32(vld1q_f32(data));
#endif
}
6373
// Duplicate even-indexed single-precision (32-bit) floating-point elements
// from a, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
{
#if __has_builtin(__builtin_shufflevector)
    // Single shuffle selecting lanes {0, 0, 2, 2}.
    return vreinterpretq_m128_f32(__builtin_shufflevector(
        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
#else
    // Fallback: extract the even lanes and rebuild the vector through memory.
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
    return vreinterpretq_m128_f32(vld1q_f32(data));
#endif
}
6389
6390/* SSSE3 */
6391
6392// Compute the absolute value of packed signed 16-bit integers in a, and store
6393// the unsigned results in dst.
6394//
6395// FOR j := 0 to 7
6396// i := j*16
6397// dst[i+15:i] := ABS(a[i+15:i])
6398// ENDFOR
6399//
6400// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
6401FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6402{
6403 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6404}
6405
6406// Compute the absolute value of packed signed 32-bit integers in a, and store
6407// the unsigned results in dst.
6408//
6409// FOR j := 0 to 3
6410// i := j*32
6411// dst[i+31:i] := ABS(a[i+31:i])
6412// ENDFOR
6413//
6414// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
6415FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6416{
6417 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6418}
6419
6420// Compute the absolute value of packed signed 8-bit integers in a, and store
6421// the unsigned results in dst.
6422//
6423// FOR j := 0 to 15
6424// i := j*8
6425// dst[i+7:i] := ABS(a[i+7:i])
6426// ENDFOR
6427//
6428// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
6429FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6430{
6431 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6432}
6433
6434// Compute the absolute value of packed signed 16-bit integers in a, and store
6435// the unsigned results in dst.
6436//
6437// FOR j := 0 to 3
6438// i := j*16
6439// dst[i+15:i] := ABS(a[i+15:i])
6440// ENDFOR
6441//
6442// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
6443FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6444{
6445 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6446}
6447
6448// Compute the absolute value of packed signed 32-bit integers in a, and store
6449// the unsigned results in dst.
6450//
6451// FOR j := 0 to 1
6452// i := j*32
6453// dst[i+31:i] := ABS(a[i+31:i])
6454// ENDFOR
6455//
6456// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
6457FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6458{
6459 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6460}
6461
6462// Compute the absolute value of packed signed 8-bit integers in a, and store
6463// the unsigned results in dst.
6464//
6465// FOR j := 0 to 7
6466// i := j*8
6467// dst[i+7:i] := ABS(a[i+7:i])
6468// ENDFOR
6469//
6470// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
6471FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6472{
6473 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6474}
6475
// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
// the result right by imm8 bytes, and store the low 16 bytes in dst.
//
// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
// dst[127:0] := tmp[127:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
// Three cases: imm >= 32 yields zero; 16 <= imm < 32 extracts from a padded
// with zero bytes; imm < 16 extracts across the b:a concatenation via vextq.
// Implemented as a macro (statement expression) because vextq_u8 requires a
// compile-time constant lane index.
#define _mm_alignr_epi8(a, b, imm)                                            \
    __extension__({                                                           \
        __m128i ret;                                                          \
        if (unlikely((imm) >= 32)) {                                          \
            ret = _mm_setzero_si128();                                        \
        } else {                                                              \
            uint8x16_t tmp_low, tmp_high;                                     \
            if (imm >= 16) {                                                  \
                const int idx = imm - 16;                                     \
                tmp_low = vreinterpretq_u8_m128i(a);                          \
                tmp_high = vdupq_n_u8(0);                                     \
                ret =                                                         \
                    vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
            } else {                                                          \
                const int idx = imm;                                          \
                tmp_low = vreinterpretq_u8_m128i(b);                          \
                tmp_high = vreinterpretq_u8_m128i(a);                         \
                ret =                                                         \
                    vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
            }                                                                 \
        }                                                                     \
        ret;                                                                  \
    })
6506
// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
// the result right by imm8 bytes, and store the low 8 bytes in dst.
//
// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
// dst[63:0] := tmp[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
// 64-bit analogue of _mm_alignr_epi8 above: imm >= 16 yields zero; 8 <= imm
// < 16 extracts from a padded with zero bytes; imm < 8 extracts across the
// b:a concatenation via vext_u8 (which needs a compile-time constant index,
// hence the macro).
#define _mm_alignr_pi8(a, b, imm)                                           \
    __extension__({                                                         \
        __m64 ret;                                                          \
        if (unlikely((imm) >= 16)) {                                        \
            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
        } else {                                                            \
            uint8x8_t tmp_low, tmp_high;                                    \
            if (imm >= 8) {                                                 \
                const int idx = imm - 8;                                    \
                tmp_low = vreinterpret_u8_m64(a);                           \
                tmp_high = vdup_n_u8(0);                                    \
                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
            } else {                                                        \
                const int idx = imm;                                        \
                tmp_low = vreinterpret_u8_m64(b);                           \
                tmp_high = vreinterpret_u8_m64(a);                          \
                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
            }                                                               \
        }                                                                   \
        ret;                                                                \
    })
6535
6536// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
6537// values a and b.
6538FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6539{
6540 int16x8_t a = vreinterpretq_s16_m128i(_a);
6541 int16x8_t b = vreinterpretq_s16_m128i(_b);
6542#if defined(__aarch64__)
6543 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6544#else
6545 return vreinterpretq_m128i_s16(
6546 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6547 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6548#endif
6549}
6550
6551// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
6552// values a and b.
6553FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6554{
6555 int32x4_t a = vreinterpretq_s32_m128i(_a);
6556 int32x4_t b = vreinterpretq_s32_m128i(_b);
6557 return vreinterpretq_m128i_s32(
6558 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6559 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6560}
6561
6562// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6563// signed 16-bit results in dst.
6564// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
6565FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6566{
6567 return vreinterpret_m64_s16(
6568 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6569}
6570
6571// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6572// signed 32-bit results in dst.
6573// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
6574FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6575{
6576 return vreinterpret_m64_s32(
6577 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6578}
6579
6580// Computes saturated pairwise sub of each argument as a 16-bit signed
6581// integer values a and b.
6582FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6583{
6584#if defined(__aarch64__)
6585 int16x8_t a = vreinterpretq_s16_m128i(_a);
6586 int16x8_t b = vreinterpretq_s16_m128i(_b);
6587 return vreinterpretq_s64_s16(
6588 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6589#else
6590 int32x4_t a = vreinterpretq_s32_m128i(_a);
6591 int32x4_t b = vreinterpretq_s32_m128i(_b);
6592 // Interleave using vshrn/vmovn
6593 // [a0|a2|a4|a6|b0|b2|b4|b6]
6594 // [a1|a3|a5|a7|b1|b3|b5|b7]
6595 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6596 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6597 // Saturated add
6598 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6599#endif
6600}
6601
6602// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6603// saturation, and pack the signed 16-bit results in dst.
6604// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
6605FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6606{
6607 int16x4_t a = vreinterpret_s16_m64(_a);
6608 int16x4_t b = vreinterpret_s16_m64(_b);
6609#if defined(__aarch64__)
6610 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6611#else
6612 int16x4x2_t res = vuzp_s16(a, b);
6613 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6614#endif
6615}
6616
6617// Computes pairwise difference of each argument as a 16-bit signed or unsigned
6618// integer values a and b.
6619FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6620{
6621 int32x4_t a = vreinterpretq_s32_m128i(_a);
6622 int32x4_t b = vreinterpretq_s32_m128i(_b);
6623 // Interleave using vshrn/vmovn
6624 // [a0|a2|a4|a6|b0|b2|b4|b6]
6625 // [a1|a3|a5|a7|b1|b3|b5|b7]
6626 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6627 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6628 // Subtract
6629 return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
6630}
6631
6632// Computes pairwise difference of each argument as a 32-bit signed or unsigned
6633// integer values a and b.
6634FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6635{
6636 int64x2_t a = vreinterpretq_s64_m128i(_a);
6637 int64x2_t b = vreinterpretq_s64_m128i(_b);
6638 // Interleave using vshrn/vmovn
6639 // [a0|a2|b0|b2]
6640 // [a1|a2|b1|b3]
6641 int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
6642 int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
6643 // Subtract
6644 return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
6645}
6646
6647// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6648// the signed 16-bit results in dst.
6649// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
6650FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6651{
6652 int32x4_t ab =
6653 vcombine_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
6654
6655 int16x4_t ab_low_bits = vmovn_s32(ab);
6656 int16x4_t ab_high_bits = vshrn_n_s32(ab, 16);
6657
6658 return vreinterpret_m64_s16(vsub_s16(ab_low_bits, ab_high_bits));
6659}
6660
6661// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6662// the signed 32-bit results in dst.
6663// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
6664FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6665{
6666#if defined(__aarch64__)
6667 int32x2_t a = vreinterpret_s32_m64(_a);
6668 int32x2_t b = vreinterpret_s32_m64(_b);
6669 return vreinterpret_m64_s32(vsub_s32(vtrn1_s32(a, b), vtrn2_s32(a, b)));
6670#else
6671 int32x2x2_t trn_ab =
6672 vtrn_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
6673 return vreinterpret_m64_s32(vsub_s32(trn_ab.val[0], trn_ab.val[1]));
6674#endif
6675}
6676
6677// Computes saturated pairwise difference of each argument as a 16-bit signed
6678// integer values a and b.
6679// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
6680FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6681{
6682#if defined(__aarch64__)
6683 int16x8_t a = vreinterpretq_s16_m128i(_a);
6684 int16x8_t b = vreinterpretq_s16_m128i(_b);
6685 return vreinterpretq_s64_s16(
6686 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6687#else
6688 int32x4_t a = vreinterpretq_s32_m128i(_a);
6689 int32x4_t b = vreinterpretq_s32_m128i(_b);
6690 // Interleave using vshrn/vmovn
6691 // [a0|a2|a4|a6|b0|b2|b4|b6]
6692 // [a1|a3|a5|a7|b1|b3|b5|b7]
6693 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6694 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6695 // Saturated subtract
6696 return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
6697#endif
6698}
6699
6700// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6701// using saturation, and pack the signed 16-bit results in dst.
6702// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
6703FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6704{
6705 int16x4_t a = vreinterpret_s16_m64(_a);
6706 int16x4_t b = vreinterpret_s16_m64(_b);
6707#if defined(__aarch64__)
6708 return vreinterpret_s64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6709#else
6710 int16x4x2_t res = vuzp_s16(a, b);
6711 return vreinterpret_s64_s16(vqsub_s16(res.val[0], res.val[1]));
6712#endif
6713}
6714
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
// and pack the saturated results in dst.
//
// FOR j := 0 to 7
//   i := j*16
//   dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
//                                     a[i+7:i]*b[i+7:i] )
// ENDFOR
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
{
#if defined(__aarch64__)
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);
    // Widen both halves: a is zero-extended (vmovl_u8), b sign-extended
    // (vmovl_s8); the 16-bit products then fit without overflow.
    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
                             vmovl_s8(vget_low_s8(b)));
    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
                             vmovl_s8(vget_high_s8(b)));
    // uzp1/uzp2 split even/odd products; their saturating sum is the
    // horizontal pairwise saturated add required by pmaddubsw.
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
#else
    // This would be much simpler if x86 would choose to zero extend OR sign
    // extend, not both. This could probably be optimized better.
    uint16x8_t a = vreinterpretq_u16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // Zero extend a: odd bytes via logical shift right, even bytes by
    // clearing the high byte of each 16-bit lane.
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));

    // Sign extend by shifting left then shifting right.
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);

    // multiply
    int16x8_t prod1 = vmulq_s16(a_even, b_even);
    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);

    // saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
#endif
}
6758
6759// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6760// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6761// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
6762// pack the saturated results in dst.
6763// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
6764FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6765{
6766 uint16x4_t a = vreinterpret_u16_m64(_a);
6767 int16x4_t b = vreinterpret_s16_m64(_b);
6768
6769 // Zero extend a
6770 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6771 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6772
6773 // Sign extend by shifting left then shifting right.
6774 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6775 int16x4_t b_odd = vshr_n_s16(b, 8);
6776
6777 // multiply
6778 int16x4_t prod1 = vmul_s16(a_even, b_even);
6779 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6780
6781 // saturated add
6782 return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
6783}
6784
6785// Multiply packed signed 16-bit integers in a and b, producing intermediate
6786// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
6787// the packed 16-bit integers in dst.
6788//
6789// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
6790// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
6791// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
6792// ...
6793// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
6794FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
6795{
6796 // Has issues due to saturation
6797 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
6798
6799 // Multiply
6800 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
6801 vget_low_s16(vreinterpretq_s16_m128i(b)));
6802 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
6803 vget_high_s16(vreinterpretq_s16_m128i(b)));
6804
6805 // Rounding narrowing shift right
6806 // narrow = (int16_t)((mul + 16384) >> 15);
6807 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6808 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6809
6810 // Join together
6811 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
6812}
6813
6814// Multiply packed signed 16-bit integers in a and b, producing intermediate
6815// signed 32-bit integers. Truncate each intermediate integer to the 18 most
6816// significant bits, round by adding 1, and store bits [16:1] to dst.
6817// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
6818FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6819{
6820 int32x4_t mul_extend =
6821 vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6822
6823 // Rounding narrowing shift right
6824 return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
6825}
6826
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
// corresponding 8-bit element of b, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
{
    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
    // Keep only bit 7 (the PSHUFB "zero this lane" flag) and the 4 index
    // bits. A control byte with bit 7 set becomes an index >= 0x80, which
    // is out of range for the table lookup and therefore yields 0 —
    // exactly the x86 zeroing behavior.
    uint8x16_t idx_masked =
        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
#if defined(__aarch64__)
    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
#elif defined(__GNUC__)
    int8x16_t ret;
    // %e and %f represent the even and odd D registers
    // respectively.
    __asm__ __volatile__(
        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
        : [ret] "=&w"(ret)
        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
    return vreinterpretq_m128i_s8(ret);
#else
    // use this line if testing on aarch64
    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
    return vreinterpretq_m128i_s8(
        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
#endif
}
6856
6857// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6858// corresponding 8-bit element of b, and store the results in dst.
6859//
6860// FOR j := 0 to 7
6861// i := j*8
6862// IF b[i+7] == 1
6863// dst[i+7:i] := 0
6864// ELSE
6865// index[2:0] := b[i+2:i]
6866// dst[i+7:i] := a[index*8+7:index*8]
6867// FI
6868// ENDFOR
6869//
6870// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
6871FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
6872{
6873 const int8x8_t controlMask =
6874 vand_s8(vreinterpret_s8_m64(b), vdup_n_s8(1 << 7 | 0x07));
6875 int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
6876 return vreinterpret_m64_s8(res);
6877}
6878
6879// Negate packed 16-bit integers in a when the corresponding signed
6880// 16-bit integer in b is negative, and store the results in dst.
6881// Element in dst are zeroed out when the corresponding element
6882// in b is zero.
6883//
6884// for i in 0..7
6885// if b[i] < 0
6886// r[i] := -a[i]
6887// else if b[i] == 0
6888// r[i] := 0
6889// else
6890// r[i] := a[i]
6891// fi
6892// done
6893FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
6894{
6895 int16x8_t a = vreinterpretq_s16_m128i(_a);
6896 int16x8_t b = vreinterpretq_s16_m128i(_b);
6897
6898 // signed shift right: faster than vclt
6899 // (b < 0) ? 0xFFFF : 0
6900 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6901 // (b == 0) ? 0xFFFF : 0
6902#if defined(__aarch64__)
6903 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6904#else
6905 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6906#endif
6907
6908 // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
6909 // 'a') based on ltMask
6910 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6911 // res = masked & (~zeroMask)
6912 int16x8_t res = vbicq_s16(masked, zeroMask);
6913 return vreinterpretq_m128i_s16(res);
6914}
6915
6916// Negate packed 32-bit integers in a when the corresponding signed
6917// 32-bit integer in b is negative, and store the results in dst.
6918// Element in dst are zeroed out when the corresponding element
6919// in b is zero.
6920//
6921// for i in 0..3
6922// if b[i] < 0
6923// r[i] := -a[i]
6924// else if b[i] == 0
6925// r[i] := 0
6926// else
6927// r[i] := a[i]
6928// fi
6929// done
6930FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
6931{
6932 int32x4_t a = vreinterpretq_s32_m128i(_a);
6933 int32x4_t b = vreinterpretq_s32_m128i(_b);
6934
6935 // signed shift right: faster than vclt
6936 // (b < 0) ? 0xFFFFFFFF : 0
6937 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6938
6939 // (b == 0) ? 0xFFFFFFFF : 0
6940#if defined(__aarch64__)
6941 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6942#else
6943 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6944#endif
6945
6946 // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
6947 // 'a') based on ltMask
6948 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6949 // res = masked & (~zeroMask)
6950 int32x4_t res = vbicq_s32(masked, zeroMask);
6951 return vreinterpretq_m128i_s32(res);
6952}
6953
// Negate packed 8-bit integers in a when the corresponding signed
// 8-bit integer in b is negative, and store the results in dst.
// Element in dst are zeroed out when the corresponding element
// in b is zero.
//
// for i in 0..15
//   if b[i] < 0
//     r[i] := -a[i]
//   else if b[i] == 0
//     r[i] := 0
//   else
//     r[i] := a[i]
//   fi
// done
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
{
    int8x16_t a = vreinterpretq_s8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);

    // signed shift right: faster than vclt
    // (b < 0) ? 0xFF : 0
    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));

    // (b == 0) ? 0xFF : 0
#if defined(__aarch64__)
    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
#else
    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
#endif

    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
    // 'a') based on ltMask
    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
    // res = masked & (~zeroMask)
    int8x16_t res = vbicq_s8(masked, zeroMask);

    return vreinterpretq_m128i_s8(res);
}
6992
// Negate packed 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative, and store the results in dst. Element in dst are
// zeroed out when the corresponding element in b is zero.
//
// FOR j := 0 to 3
//   i := j*16
//   IF b[i+15:i] < 0
//     dst[i+15:i] := -(a[i+15:i])
//   ELSE IF b[i+15:i] == 0
//     dst[i+15:i] := 0
//   ELSE
//     dst[i+15:i] := a[i+15:i]
//   FI
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
{
    int16x4_t a = vreinterpret_s16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);

    // signed shift right: faster than vclt
    // (b < 0) ? 0xFFFF : 0
    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));

    // (b == 0) ? 0xFFFF : 0
#if defined(__aarch64__)
    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
#else
    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
#endif

    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
    // 'a') based on ltMask
    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
    // res = masked & (~zeroMask)
    int16x4_t res = vbic_s16(masked, zeroMask);

    return vreinterpret_m64_s16(res);
}
7033
// Negate packed 32-bit integers in a when the corresponding signed 32-bit
// integer in b is negative, and store the results in dst. Element in dst are
// zeroed out when the corresponding element in b is zero.
//
// FOR j := 0 to 1
//   i := j*32
//   IF b[i+31:i] < 0
//     dst[i+31:i] := -(a[i+31:i])
//   ELSE IF b[i+31:i] == 0
//     dst[i+31:i] := 0
//   ELSE
//     dst[i+31:i] := a[i+31:i]
//   FI
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
{
    int32x2_t a = vreinterpret_s32_m64(_a);
    int32x2_t b = vreinterpret_s32_m64(_b);

    // signed shift right: faster than vclt
    // (b < 0) ? 0xFFFFFFFF : 0
    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));

    // (b == 0) ? 0xFFFFFFFF : 0
#if defined(__aarch64__)
    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
#else
    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
#endif

    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
    // 'a') based on ltMask
    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
    // res = masked & (~zeroMask)
    int32x2_t res = vbic_s32(masked, zeroMask);

    return vreinterpret_m64_s32(res);
}
7074
// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
// in b is negative, and store the results in dst. Element in dst are zeroed out
// when the corresponding element in b is zero.
//
// FOR j := 0 to 7
//   i := j*8
//   IF b[i+7:i] < 0
//     dst[i+7:i] := -(a[i+7:i])
//   ELSE IF b[i+7:i] == 0
//     dst[i+7:i] := 0
//   ELSE
//     dst[i+7:i] := a[i+7:i]
//   FI
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
{
    int8x8_t a = vreinterpret_s8_m64(_a);
    int8x8_t b = vreinterpret_s8_m64(_b);

    // signed shift right: faster than vclt
    // (b < 0) ? 0xFF : 0
    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));

    // (b == 0) ? 0xFF : 0
#if defined(__aarch64__)
    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
#else
    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
#endif

    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
    // 'a') based on ltMask
    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
    // res = masked & (~zeroMask)
    int8x8_t res = vbic_s8(masked, zeroMask);

    return vreinterpret_m64_s8(res);
}
7115
7116/* SSE4.1 */
7117
// Blend packed 16-bit integers from a and b using control mask imm8, and store
// the results in dst.
//
// FOR j := 0 to 7
//   i := j*16
//   IF imm8[j]
//     dst[i+15:i] := b[i+15:i]
//   ELSE
//     dst[i+15:i] := a[i+15:i]
//   FI
// ENDFOR
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
//                                      __constrange(0,255) int imm)
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16
// Implemented as a macro so imm can be any integral constant expression;
// each imm bit is expanded to an all-ones/all-zeros 16-bit lane mask and
// the blend is a single bitwise select (vbsl).
#define _mm_blend_epi16(a, b, imm)                                        \
    __extension__({                                                       \
        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
                                   ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
                                   ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
                                   ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
                                   ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
                                   ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
                                   ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
                                   ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
        uint16x8_t _mask_vec = vld1q_u16(_mask);                          \
        uint16x8_t _a = vreinterpretq_u16_m128i(a);                       \
        uint16x8_t _b = vreinterpretq_u16_m128i(b);                       \
        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));            \
    })
7146
// Blend packed double-precision (64-bit) floating-point elements from a and b
// using control mask imm8, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
// Implemented as a macro so imm can be any integral constant expression;
// the two imm bits become full-width 64-bit lane masks for a bitwise select.
#define _mm_blend_pd(a, b, imm)                                 \
    __extension__({                                             \
        const uint64_t _mask[2] = {                             \
            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),    \
            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};   \
        uint64x2_t _mask_vec = vld1q_u64(_mask);                \
        uint64x2_t _a = vreinterpretq_u64_m128d(a);             \
        uint64x2_t _b = vreinterpretq_u64_m128d(b);             \
        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a));  \
    })
7160
7161// Blend packed single-precision (32-bit) floating-point elements from a and b
7162// using mask, and store the results in dst.
7163// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
7164FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
7165{
7166 const uint32_t ALIGN_STRUCT(16)
7167 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7168 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7169 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7170 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7171 uint32x4_t mask = vld1q_u32(data);
7172 float32x4_t a = vreinterpretq_f32_m128(_a);
7173 float32x4_t b = vreinterpretq_f32_m128(_b);
7174 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7175}
7176
7177// Blend packed 8-bit integers from a and b using mask, and store the results in
7178// dst.
7179//
7180// FOR j := 0 to 15
7181// i := j*8
7182// IF mask[i+7]
7183// dst[i+7:i] := b[i+7:i]
7184// ELSE
7185// dst[i+7:i] := a[i+7:i]
7186// FI
7187// ENDFOR
7188FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
7189{
7190 // Use a signed shift right to create a mask with the sign bit
7191 uint8x16_t mask =
7192 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
7193 uint8x16_t a = vreinterpretq_u8_m128i(_a);
7194 uint8x16_t b = vreinterpretq_u8_m128i(_b);
7195 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
7196}
7197
7198// Blend packed double-precision (64-bit) floating-point elements from a and b
7199// using mask, and store the results in dst.
7200// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
7201FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
7202{
7203 uint64x2_t mask =
7204 vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
7205#if defined(__aarch64__)
7206 float64x2_t a = vreinterpretq_f64_m128d(_a);
7207 float64x2_t b = vreinterpretq_f64_m128d(_b);
7208 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7209#else
7210 uint64x2_t a = vreinterpretq_u64_m128d(_a);
7211 uint64x2_t b = vreinterpretq_u64_m128d(_b);
7212 return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
7213#endif
7214}
7215
7216// Blend packed single-precision (32-bit) floating-point elements from a and b
7217// using mask, and store the results in dst.
7218// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
7219FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
7220{
7221 // Use a signed shift right to create a mask with the sign bit
7222 uint32x4_t mask =
7223 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
7224 float32x4_t a = vreinterpretq_f32_m128(_a);
7225 float32x4_t b = vreinterpretq_f32_m128(_b);
7226 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7227}
7228
7229// Round the packed double-precision (64-bit) floating-point elements in a up
7230// to an integer value, and store the results as packed double-precision
7231// floating-point elements in dst.
7232// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
7233FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
7234{
7235#if defined(__aarch64__)
7236 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7237#else
7238 double *f = (double *) &a;
7239 return _mm_set_pd(ceil(f[1]), ceil(f[0]));
7240#endif
7241}
7242
7243// Round the packed single-precision (32-bit) floating-point elements in a up to
7244// an integer value, and store the results as packed single-precision
7245// floating-point elements in dst.
7246// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
7247FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
7248{
7249#if defined(__aarch64__)
7250 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
7251#else
7252 float *f = (float *) &a;
7253 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
7254#endif
7255}
7256
// Round the lower double-precision (64-bit) floating-point element in b up to
// an integer value, store the result as a double-precision floating-point
// element in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
{
    // Ceil both lanes of b, then _mm_move_sd keeps only its lower lane and
    // takes the upper lane from a.
    return _mm_move_sd(a, _mm_ceil_pd(b));
}
7266
// Round the lower single-precision (32-bit) floating-point element in b up to
// an integer value, store the result as a single-precision floating-point
// element in the lower element of dst, and copy the upper 3 packed elements
// from a to the upper elements of dst.
//
// dst[31:0] := CEIL(b[31:0])
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
{
    // Ceil all lanes of b, then _mm_move_ss keeps only its lowest lane and
    // takes the upper three lanes from a.
    return _mm_move_ss(a, _mm_ceil_ps(b));
}
7280
// Compare packed 64-bit integers in a and b for equality, and store the results
// in dst
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
#else
    // ARMv7 lacks vceqq_u64
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    // Compare as 32-bit lanes, then AND each 32-bit result with its
    // 64-bit neighbor (vrev64q swaps the two 32-bit halves within each
    // 64-bit lane) so a lane is all-ones only if both halves matched.
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
#endif
}
7297
7298// Converts the four signed 16-bit integers in the lower 64 bits to four signed
7299// 32-bit integers.
7300FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
7301{
7302 return vreinterpretq_m128i_s32(
7303 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
7304}
7305
7306// Converts the two signed 16-bit integers in the lower 32 bits two signed
7307// 32-bit integers.
7308FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
7309{
7310 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7311 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7312 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7313 return vreinterpretq_m128i_s64(s64x2);
7314}
7315
7316// Converts the two signed 32-bit integers in the lower 64 bits to two signed
7317// 64-bit integers.
7318FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
7319{
7320 return vreinterpretq_m128i_s64(
7321 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
7322}
7323
// Sign extend the eight signed 8-bit integers in the lower 64 bits to eight
// signed 16-bit integers. (Comment fixed: this is the signed widening, not
// the unsigned one.)
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
    return vreinterpretq_m128i_s16(s16x8);
}
7332
// Sign extend the four signed 8-bit integers in the lower 32 bits to four
// signed 32-bit integers. (Comment fixed: this is the signed widening, not
// the unsigned one.)
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
    return vreinterpretq_m128i_s32(s32x4);
}
7342
// Sign extend the two signed 8-bit integers in the lower 16 bits to two
// signed 64-bit integers. (Comment fixed: previously claimed "lower 32 bits
// to four" 64-bit integers.)
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
{
    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_s64(s64x2);
}
7353
7354// Converts the four unsigned 16-bit integers in the lower 64 bits to four
7355// unsigned 32-bit integers.
7356FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
7357{
7358 return vreinterpretq_m128i_u32(
7359 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
7360}
7361
7362// Converts the two unsigned 16-bit integers in the lower 32 bits to two
7363// unsigned 64-bit integers.
7364FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
7365{
7366 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7367 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7368 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7369 return vreinterpretq_m128i_u64(u64x2);
7370}
7371
7372// Converts the two unsigned 32-bit integers in the lower 64 bits to two
7373// unsigned 64-bit integers.
7374FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
7375{
7376 return vreinterpretq_m128i_u64(
7377 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
7378}
7379
7380// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
7381// and store the results in dst.
7382// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
7383FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
7384{
7385 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
7386 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
7387 return vreinterpretq_m128i_u16(u16x8);
7388}
7389
// Converts the four unsigned 8-bit integers in the lower 32 bits to four
// unsigned 32-bit integers.
// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
{
    // Zero-extend in two steps: u8 -> u16 -> u32.
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
    return vreinterpretq_m128i_u32(u32x4);
}
7400
// Converts the two unsigned 8-bit integers in the lower 16 bits to two
// unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
{
    // Zero-extend in three steps: u8 -> u16 -> u32 -> u64.
    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
    return vreinterpretq_m128i_u64(u64x2);
}
7411
// Conditionally multiply the packed single-precision (32-bit) floating-point
// elements in a and b using the high 4 bits in imm8, sum the four products,
// and conditionally store the sum in dst using the low 4 bits of imm.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
{
#if defined(__aarch64__)
    /* shortcuts */
    // imm == 0xFF: all four products summed, broadcast to all lanes.
    if (imm == 0xFF) {
        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
    }
    // imm == 0x7F: same, but lane 3 is excluded from the sum.
    if (imm == 0x7F) {
        float32x4_t m = _mm_mul_ps(a, b);
        m[3] = 0;
        return _mm_set1_ps(vaddvq_f32(m));
    }
#endif

    float s = 0, c = 0;
    float32x4_t f32a = vreinterpretq_f32_m128(a);
    float32x4_t f32b = vreinterpretq_f32_m128(b);

    /* To improve the accuracy of floating-point summation, Kahan algorithm
     * is used for each operation.
     */
    // Bits 4..7 of imm select which products participate in the sum.
    if (imm & (1 << 4))
        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
    if (imm & (1 << 5))
        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
    if (imm & (1 << 6))
        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
    if (imm & (1 << 7))
        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
    s += c;

    // Bits 0..3 of imm select which output lanes receive the sum.
    float32x4_t res = {
        (imm & 0x1) ? s : 0,
        (imm & 0x2) ? s : 0,
        (imm & 0x4) ? s : 0,
        (imm & 0x8) ? s : 0,
    };
    return vreinterpretq_m128_f32(res);
}
7455
// Extracts the selected signed or unsigned 32-bit integer from a and zero
// extends.
// imm must be a compile-time constant in [0, 3].
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
#define _mm_extract_epi32(a, imm) \
    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7461
// Extracts the selected signed or unsigned 64-bit integer from a and zero
// extends.
// imm must be a compile-time constant in [0, 1].
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
#define _mm_extract_epi64(a, imm) \
    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7467
// Extracts the selected signed or unsigned 8-bit integer from a and zero
// extends.
// imm must be a compile-time constant in [0, 15]. The unsigned lane read
// provides the zero extension when widened to int.
// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7473
// Extracts the selected single-precision (32-bit) floating-point from a.
// Note: like the SSE original, the result is the raw IEEE-754 bit pattern of
// the selected float returned as an int, not a numeric conversion.
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7477
// Round the packed double-precision (64-bit) floating-point elements in a down
// to an integer value, and store the results as packed double-precision
// floating-point elements in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
{
#if defined(__aarch64__)
    /* Single round-toward-minus-infinity instruction. */
    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
#else
    /* ARMv7 has no f64 vector rounding: floor each element in scalar code. */
    double *f = (double *) &a;
    return _mm_set_pd(floor(f[1]), floor(f[0]));
#endif
}
7491
// Round the packed single-precision (32-bit) floating-point elements in a down
// to an integer value, and store the results as packed single-precision
// floating-point elements in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
{
#if defined(__aarch64__)
    /* Single round-toward-minus-infinity instruction. */
    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
#else
    /* ARMv7 fallback: floor each of the four lanes in scalar code. */
    float *f = (float *) &a;
    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
#endif
}
7505
7506// Round the lower double-precision (64-bit) floating-point element in b down to
7507// an integer value, store the result as a double-precision floating-point
7508// element in the lower element of dst, and copy the upper element from a to the
7509// upper element of dst.
7510// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
7511FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7512{
7513 return _mm_move_sd(a, _mm_floor_pd(b));
7514}
7515
7516// Round the lower single-precision (32-bit) floating-point element in b down to
7517// an integer value, store the result as a single-precision floating-point
7518// element in the lower element of dst, and copy the upper 3 packed elements
7519// from a to the upper elements of dst.
7520//
7521// dst[31:0] := FLOOR(b[31:0])
7522// dst[127:32] := a[127:32]
7523//
7524// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
7525FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7526{
7527 return _mm_move_ss(a, _mm_floor_ps(b));
7528}
7529
// Inserts the least significant 32 bits of b into the selected 32-bit integer
// of a.
// imm must be a compile-time constant in [0, 3].
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
//                                       __constrange(0,4) int imm)
#define _mm_insert_epi32(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s32(                                     \
            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
    })
7539
// Inserts the least significant 64 bits of b into the selected 64-bit integer
// of a.
// imm must be a compile-time constant in [0, 1].
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
//                                       __constrange(0,2) int imm)
#define _mm_insert_epi64(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s64(                                     \
            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
    })
7549
// Inserts the least significant 8 bits of b into the selected 8-bit integer
// of a.
// imm must be a compile-time constant in [0, 15].
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
//                                      __constrange(0,16) int imm)
#define _mm_insert_epi8(a, b, imm)                                 \
    __extension__({                                                \
        vreinterpretq_m128i_s8(                                    \
            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
    })
7559
// Copy a to tmp, then insert a single-precision (32-bit) floating-point
// element from b into tmp using the control in imm8. Store tmp to dst using
// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
//
// Bug fix: the macro parameter is named imm8, but the lane-select expressions
// previously referenced an undeclared identifier `imm`, so any use of this
// macro failed to compile (or silently picked up an unrelated caller-scope
// variable named imm).
#define _mm_insert_ps(a, b, imm8)                                            \
    __extension__({                                                          \
        /* imm8[7:6] selects the source element of b. */                     \
        float32x4_t tmp1 =                                                   \
            vsetq_lane_f32(vgetq_lane_f32(b, ((imm8) >> 6) & 0x3),           \
                           vreinterpretq_f32_m128(a), 0);                    \
        /* imm8[5:4] selects the destination lane in a. */                   \
        float32x4_t tmp2 =                                                   \
            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0),                          \
                           vreinterpretq_f32_m128(a), ((imm8) >> 4) & 0x3);  \
        /* imm8[3:0] is the zeroing mask: set bits clear the lane. */        \
        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,      \
                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,      \
                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,      \
                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};     \
        uint32x4_t mask = vld1q_u32(data);                                   \
        float32x4_t all_zeros = vdupq_n_f32(0);                              \
                                                                             \
        vreinterpretq_m128_f32(                                              \
            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));       \
    })
7581
7582// epi versions of min/max
7583// Computes the pariwise maximums of the four signed 32-bit integer values of a
7584// and b.
7585//
7586// A 128-bit parameter that can be defined with the following equations:
7587// r0 := (a0 > b0) ? a0 : b0
7588// r1 := (a1 > b1) ? a1 : b1
7589// r2 := (a2 > b2) ? a2 : b2
7590// r3 := (a3 > b3) ? a3 : b3
7591//
7592// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
7593FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7594{
7595 return vreinterpretq_m128i_s32(
7596 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7597}
7598
7599// Compare packed signed 8-bit integers in a and b, and store packed maximum
7600// values in dst.
7601// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
7602FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7603{
7604 return vreinterpretq_m128i_s8(
7605 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7606}
7607
7608// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7609// values in dst.
7610// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
7611FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7612{
7613 return vreinterpretq_m128i_u16(
7614 vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7615}
7616
7617// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7618// values in dst.
7619// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
7620FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7621{
7622 return vreinterpretq_m128i_u32(
7623 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7624}
7625
7626// Computes the pariwise minima of the four signed 32-bit integer values of a
7627// and b.
7628//
7629// A 128-bit parameter that can be defined with the following equations:
7630// r0 := (a0 < b0) ? a0 : b0
7631// r1 := (a1 < b1) ? a1 : b1
7632// r2 := (a2 < b2) ? a2 : b2
7633// r3 := (a3 < b3) ? a3 : b3
7634//
7635// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
7636FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7637{
7638 return vreinterpretq_m128i_s32(
7639 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7640}
7641
7642// Compare packed signed 8-bit integers in a and b, and store packed minimum
7643// values in dst.
7644// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
7645FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7646{
7647 return vreinterpretq_m128i_s8(
7648 vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7649}
7650
7651// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7652// values in dst.
7653// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
7654FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7655{
7656 return vreinterpretq_m128i_u16(
7657 vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7658}
7659
7660// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7661// values in dst.
7662// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
7663FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7664{
7665 return vreinterpretq_m128i_u32(
7666 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7667}
7668
// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
// in a, store the minimum and index in dst, and zero the remaining bits in dst.
//
//   index[2:0] := 0
//   min[15:0] := a[15:0]
//   FOR j := 0 to 7
//       i := j*16
//       IF a[i+15:i] < min[15:0]
//           index[2:0] := j
//           min[15:0] := a[i+15:i]
//       FI
//   ENDFOR
//   dst[15:0] := min[15:0]
//   dst[18:16] := index[2:0]
//   dst[127:19] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
{
    __m128i dst;
    uint16_t min, idx = 0;
    // Find the minimum value
#if defined(__aarch64__)
    min = vminvq_u16(vreinterpretq_u16_m128i(a));
#else
    // ARMv7 has no across-vector minimum: fold low/high halves, then apply
    // pairwise minimums twice to reduce the eight lanes to one.
    __m64 tmp;
    tmp = vreinterpret_m64_u16(
        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
                 vget_high_u16(vreinterpretq_u16_m128i(a))));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
#endif
    // Get the index of the minimum value
    // Lane 0 is re-examined after each 2-byte right shift, so iteration i
    // inspects original element i; the first (lowest-index) match wins,
    // matching the SSE4.1 tie-breaking rule.
    int i;
    for (i = 0; i < 8; i++) {
        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
            idx = (uint16_t) i;
            break;
        }
        a = _mm_srli_si128(a, 2);
    }
    // Generate result
    dst = _mm_setzero_si128();
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
    return dst;
}
7721
7722// Multiply the low signed 32-bit integers from each packed 64-bit element in
7723// a and b, and store the signed 64-bit results in dst.
7724//
7725// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
7726// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
7727FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
7728{
7729 // vmull_s32 upcasts instead of masking, so we downcast.
7730 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
7731 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
7732 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
7733}
7734
7735// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
7736// unsigned 32-bit integers from b.
7737// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
7738FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
7739{
7740 return vreinterpretq_m128i_s32(
7741 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7742}
7743
7744// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
7745// integers and saturates.
7746//
7747// r0 := UnsignedSaturate(a0)
7748// r1 := UnsignedSaturate(a1)
7749// r2 := UnsignedSaturate(a2)
7750// r3 := UnsignedSaturate(a3)
7751// r4 := UnsignedSaturate(b0)
7752// r5 := UnsignedSaturate(b1)
7753// r6 := UnsignedSaturate(b2)
7754// r7 := UnsignedSaturate(b3)
7755FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
7756{
7757 return vreinterpretq_m128i_u16(
7758 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
7759 vqmovun_s32(vreinterpretq_s32_m128i(b))));
7760}
7761
// Round the packed double-precision (64-bit) floating-point elements in a using
// the rounding parameter, and store the results as packed double-precision
// floating-point elements in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
{
#if defined(__aarch64__)
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_pd(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_pd(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
    default: //_MM_FROUND_CUR_DIRECTION
        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
    }
#else
    double *v_double = (double *) &a;

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        /* Scalar round-half-to-even (banker's rounding), computed on the
         * magnitude with the sign restored afterwards. */
        double res[2], tmp;
        for (int i = 0; i < 2; i++) {
            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
            double roundDown = floor(tmp); // Round down value
            double roundUp = ceil(tmp);    // Round up value
            double diffDown = tmp - roundDown;
            double diffUp = roundUp - tmp;
            if (diffDown < diffUp) {
                /* If it's closer to the round down value, then use it */
                res[i] = roundDown;
            } else if (diffDown > diffUp) {
                /* If it's closer to the round up value, then use it */
                res[i] = roundUp;
            } else {
                /* If it's equidistant between round up and round down value,
                 * pick the one which is an even number */
                double half = roundDown / 2;
                if (half != floor(half)) {
                    /* If the round down value is odd, return the round up value
                     */
                    res[i] = roundUp;
                } else {
                    /* If the round up value is odd, return the round down value
                     */
                    res[i] = roundDown;
                }
            }
            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
        }
        return _mm_set_pd(res[1], res[0]);
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_pd(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_pd(a);
    }
    /* Remaining cases truncate toward zero. */
    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
#endif
}
7830
// Round the packed single-precision (32-bit) floating-point elements in a using
// the rounding parameter, and store the results as packed single-precision
// floating-point elements in dst.
// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
{
#if defined(__aarch64__)
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_ps(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_ps(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
    default: //_MM_FROUND_CUR_DIRECTION
        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
    }
#else
    float *v_float = (float *) &a;

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        /* Vectorized round-half-to-even: compute both the naive "add +/-0.5
         * then truncate" result and the nearest even integer, and select the
         * even one only when the input is exactly halfway.
         * NOTE(review): relies on f32->s32 conversion, so values outside
         * int32 range follow the vcvtq saturation behavior — confirm callers
         * stay in range. */
        uint32x4_t signmask = vdupq_n_u32(0x80000000);
        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
        int32x4_t r_trunc = vcvtq_s32_f32(
            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
        float32x4_t delta = vsubq_f32(
            vreinterpretq_f32_m128(a),
            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
        uint32x4_t is_delta_half =
            vceqq_f32(delta, half); /* delta == +/- 0.5 */
        return vreinterpretq_m128_f32(
            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_ps(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_ps(a);
    }
    /* Remaining cases truncate toward zero. */
    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
#endif
}
7889
7890// Round the lower double-precision (64-bit) floating-point element in b using
7891// the rounding parameter, store the result as a double-precision floating-point
7892// element in the lower element of dst, and copy the upper element from a to the
7893// upper element of dst.
7894// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
7895FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
7896{
7897 return _mm_move_sd(a, _mm_round_pd(b, rounding));
7898}
7899
7900// Round the lower single-precision (32-bit) floating-point element in b using
7901// the rounding parameter, store the result as a single-precision floating-point
7902// element in the lower element of dst, and copy the upper 3 packed elements
7903// from a to the upper elements of dst. Rounding is done according to the
7904// rounding[3:0] parameter, which can be one of:
7905// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
7906// suppress exceptions
7907// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and
7908// suppress exceptions
7909// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress
7910// exceptions
7911// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress
7912// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
7913// _MM_SET_ROUNDING_MODE
7914// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
7915FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
7916{
7917 return _mm_move_ss(a, _mm_round_ps(b, rounding));
7918}
7919
7920// Load 128-bits of integer data from memory into dst using a non-temporal
7921// memory hint. mem_addr must be aligned on a 16-byte boundary or a
7922// general-protection exception may be generated.
7923//
7924// dst[127:0] := MEM[mem_addr+127:mem_addr]
7925//
7926// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
7927FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
7928{
7929#if __has_builtin(__builtin_nontemporal_store)
7930 return __builtin_nontemporal_load(p);
7931#else
7932 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
7933#endif
7934}
7935
7936// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
7937// all 1's, and return 1 if the result is zero, otherwise return 0.
7938// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
7939FORCE_INLINE int _mm_test_all_ones(__m128i a)
7940{
7941 return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
7942 ~(uint64_t) 0;
7943}
7944
7945// Compute the bitwise AND of 128 bits (representing integer data) in a and
7946// mask, and return 1 if the result is zero, otherwise return 0.
7947// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
7948FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
7949{
7950 int64x2_t a_and_mask =
7951 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
7952 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7953}
7954
7955// Compute the bitwise AND of 128 bits (representing integer data) in a and
7956// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
7957// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
7958// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7959// otherwise return 0.
7960// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero
7961FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
7962{
7963 uint64x2_t zf =
7964 vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
7965 uint64x2_t cf =
7966 vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
7967 uint64x2_t result = vandq_u64(zf, cf);
7968 return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
7969}
7970
7971// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7972// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7973// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7974// otherwise set CF to 0. Return the CF value.
7975// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
7976FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
7977{
7978 int64x2_t s64 =
7979 vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
7980 vreinterpretq_s64_m128i(b));
7981 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7982}
7983
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// Alias: the "not zero and not carry" test is exactly the mixed-ones-zeros
// predicate implemented above.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7991
7992// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7993// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7994// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7995// otherwise set CF to 0. Return the ZF value.
7996// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
7997FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
7998{
7999 int64x2_t s64 =
8000 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
8001 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
8002}
8003
8004/* SSE4.2 */
8005
// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
// in b for greater than.
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    // ARMv7 lacks a 64-bit vector compare. Saturating subtraction preserves
    // the sign of b - a even on overflow, so an arithmetic right shift of the
    // sign bit across the whole lane yields all-ones when a > b, else zero.
    return vreinterpretq_m128i_s64(vshrq_n_s64(
        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
        63));
#endif
}
8019
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 16-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    /* Hardware CRC32-C (Castagnoli) halfword instruction. */
    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    /* Software fallback: feed the two bytes in little-endian order. */
    crc = _mm_crc32_u8(crc, v & 0xff);
    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
#endif
    return crc;
}
8035
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 32-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    /* Hardware CRC32-C (Castagnoli) word instruction. */
    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    /* Software fallback: feed the two halfwords in little-endian order. */
    crc = _mm_crc32_u16(crc, v & 0xffff);
    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
#endif
    return crc;
}
8051
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 64-bit integer v.
// Note: as with the x86 instruction, only the low 32 bits of crc participate;
// the result is zero-extended to 64 bits.
// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    /* Hardware CRC32-C (Castagnoli) doubleword instruction. */
    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    /* Software fallback: feed the two words in little-endian order. */
    crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
    crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
#endif
    return crc;
}
8067
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 8-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    /* Hardware CRC32-C (Castagnoli) byte instruction. */
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    /* Bitwise software CRC-32C: 0x82f63b78 is the reflected form of the
     * Castagnoli polynomial 0x1EDC6F41 used by the SSE4.2 crc32 instruction. */
    crc ^= v;
    for (int bit = 0; bit < 8; bit++) {
        if (crc & 1)
            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
        else
            crc = (crc >> 1);
    }
#endif
    return crc;
}
8088
8089/* AES */
8090
#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
/* The 256-entry AES forward S-box, expressed through the X-macro parameter
 * `w` so the same data can emit either raw bytes (identity wrapper below) or
 * premultiplied 32-bit table words (see _mm_aesenc_si128's ARMv7 path). */
#define SSE2NEON_AES_DATA(w)                                           \
    {                                                                  \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
    }
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
/* Identity wrapper: instantiates the table above as the raw S-box bytes. */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0
8139
// In the absence of crypto extensions, implement aesenc using regular neon
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information Reproduced with permission of the author.
//
// Performs one full AES encryption round: ShiftRows, SubBytes, MixColumns,
// AddRoundKey (in the order the table-based formulation allows).
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
                                         0xc, 0x1, 0x6, 0xb};
    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    // The 256-byte S-box is looked up in four 64-byte chunks; the index is
    // rebased (w - 0x40, ...) for each successive chunk.
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);

    // mix columns
    // GF(2^8) doubling: shift left and conditionally XOR the reduction 0x1b.
    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
     (b0))
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
    /* Four T-tables: S-box output premultiplied by the MixColumns matrix,
     * one rotation per table. */
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));

    /* The byte selection pattern folds ShiftRows into the table lookups. */
    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}
8221
8222// Perform the last round of an AES encryption flow on data (state) in a using
8223// the round key in RoundKey, and store the result in dst.
8224// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
8225FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8226{
8227 /* FIXME: optimized for NEON */
8228 uint8_t v[4][4] = {
8229 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
8230 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
8231 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
8232 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
8233 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
8234 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
8235 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
8236 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
8237 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
8238 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
8239 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
8240 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
8241 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
8242 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
8243 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
8244 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
8245 };
8246 for (int i = 0; i < 16; i++)
8247 vreinterpretq_nth_u8_m128i(a, i) =
8248 v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
8249 return a;
8250}
8251
8252// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
8253// This instruction generates a round key for AES encryption. See
8254// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
8255// for details.
8256//
8257// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
8258FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
8259{
8260 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
8261 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
8262 for (int i = 0; i < 4; ++i) {
8263 ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
8264 ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
8265 }
8266 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
8267 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8268}
8269#undef SSE2NEON_AES_DATA
8270
8271#else /* __ARM_FEATURE_CRYPTO */
8272// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
8273// AESMC and then manually applying the real key as an xor operation. This
8274// unfortunately means an additional xor op; the compiler should be able to
8275// optimize this away for repeated calls however. See
8276// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
8277// for more details.
8278FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
8279{
8280 return vreinterpretq_m128i_u8(
8281 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
8282 vreinterpretq_u8_m128i(b));
8283}
8284
8285// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
8286FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8287{
8288 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
8289 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8290 RoundKey);
8291}
8292
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    // AESE does ShiftRows and SubBytes on A
    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

    // After AESE, byte j of u8 holds SubBytes applied to the input byte that
    // ShiftRows moved into position j.  The index lists below pick out the
    // positions where the bytes of word 1 (a[4..7]) and word 3 (a[12..15])
    // landed -- undoing ShiftRows -- and also lay each word down rotated
    // right by one byte (RotWord).
    uint8x16_t dest = {
        // Undo ShiftRows step from AESE and extract X1 and X3
        u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
    };
    // XOR the round constant into the two rotated words (lanes 1 and 3 only).
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
}
8308#endif
8309
8310/* Others */
8311
8312// Perform a carry-less multiplication of two 64-bit integers, selected from a
8313// and b according to imm8, and store the results in dst.
8314// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
8315FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
8316{
8317 uint64x2_t a = vreinterpretq_u64_m128i(_a);
8318 uint64x2_t b = vreinterpretq_u64_m128i(_b);
8319 switch (imm & 0x11) {
8320 case 0x00:
8321 return vreinterpretq_m128i_u64(
8322 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
8323 case 0x01:
8324 return vreinterpretq_m128i_u64(
8325 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
8326 case 0x10:
8327 return vreinterpretq_m128i_u64(
8328 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
8329 case 0x11:
8330 return vreinterpretq_m128i_u64(
8331 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
8332 default:
8333 abort();
8334 }
8335}
8336
8337// Count the number of bits set to 1 in unsigned 32-bit integer a, and
8338// return that count in dst.
8339// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
8340FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
8341{
8342#if defined(__aarch64__)
8343#if __has_builtin(__builtin_popcount)
8344 return __builtin_popcount(a);
8345#else
8346 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
8347#endif
8348#else
8349 uint32_t count = 0;
8350 uint8x8_t input_val, count8x8_val;
8351 uint16x4_t count16x4_val;
8352 uint32x2_t count32x2_val;
8353
8354 input_val = vld1_u8((uint8_t *) &a);
8355 count8x8_val = vcnt_u8(input_val);
8356 count16x4_val = vpaddl_u8(count8x8_val);
8357 count32x2_val = vpaddl_u16(count16x4_val);
8358
8359 vst1_u32(&count, count32x2_val);
8360 return count;
8361#endif
8362}
8363
8364// Count the number of bits set to 1 in unsigned 64-bit integer a, and
8365// return that count in dst.
8366// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
8367FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
8368{
8369#if defined(__aarch64__)
8370#if __has_builtin(__builtin_popcountll)
8371 return __builtin_popcountll(a);
8372#else
8373 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
8374#endif
8375#else
8376 uint64_t count = 0;
8377 uint8x8_t input_val, count8x8_val;
8378 uint16x4_t count16x4_val;
8379 uint32x2_t count32x2_val;
8380 uint64x1_t count64x1_val;
8381
8382 input_val = vld1_u8((uint8_t *) &a);
8383 count8x8_val = vcnt_u8(input_val);
8384 count16x4_val = vpaddl_u8(count8x8_val);
8385 count32x2_val = vpaddl_u16(count16x4_val);
8386 count64x1_val = vpaddl_u32(count32x2_val);
8387 vst1_u64(&count, count64x1_val);
8388 return count;
8389#endif
8390}
8391
8392#if defined(__GNUC__) || defined(__clang__)
8393#pragma pop_macro("ALIGN_STRUCT")
8394#pragma pop_macro("FORCE_INLINE")
8395#endif
8396
8397#if defined(__GNUC__) && !defined(__clang__)
8398#pragma GCC pop_options
8399#endif
8400
8401#endif
Definition sse2neon.h:486