4 // This header file provides a simple API translation layer
5 // from SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
7 // This header file does not yet translate all of the SSE intrinsics.
9 // Contributors to this work are:
10 // John W. Ratcliff <jratcliffscarab@gmail.com>
11 // Brandon Rowlett <browlett@nvidia.com>
12 // Ken Fast <kfast@gdeb.com>
13 // Eric van Beurden <evanbeurden@nvidia.com>
14 // Alexander Potylitsin <apotylitsin@nvidia.com>
15 // Hasindu Gamaarachchi <hasindu2008@gmail.com>
16 // Jim Huang <jserv@biilabs.io>
17 // Mark Cheng <marktwtn@biilabs.io>
18 // Malcolm James MacLeod <malcolm@gulden.com>
19 // Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20 // Sebastian Pop <spop@amazon.com>
21 // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22 // Danila Kutenin <danilak@google.com>
23 // François Turban (JishinMaster) <francois.turban@gmail.com>
24 // Pei-Hsuan Hung <afcidk@gmail.com>
25 // Yang-Hao Yuan <yanghau@biilabs.io>
26 // Syoyo Fujita <syoyo@lighttransport.com>
27 // Brecht Van Lommel <brecht@blender.org>
30 * sse2neon is freely redistributable under the MIT License.
32 * Permission is hereby granted, free of charge, to any person obtaining a copy
33 * of this software and associated documentation files (the "Software"), to deal
34 * in the Software without restriction, including without limitation the rights
35 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
36 * copies of the Software, and to permit persons to whom the Software is
37 * furnished to do so, subject to the following conditions:
39 * The above copyright notice and this permission notice shall be included in
40 * all copies or substantial portions of the Software.
42 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
47 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
51 /* Tunable configurations */
53 /* Enable precise implementation of math operations
54 * This slows the computation down a bit, but gives results consistent with
55 * x86 SSE (e.g. it avoids a hole or NaN pixel in a rendering result)
57 /* _mm_min|max_ps|ss|pd|sd */
58 #ifndef SSE2NEON_PRECISE_MINMAX
59 #define SSE2NEON_PRECISE_MINMAX (0)
61 /* _mm_rcp_ps and _mm_div_ps */
62 #ifndef SSE2NEON_PRECISE_DIV
63 #define SSE2NEON_PRECISE_DIV (0)
65 /* _mm_sqrt_ps and _mm_rsqrt_ps */
66 #ifndef SSE2NEON_PRECISE_SQRT
67 #define SSE2NEON_PRECISE_SQRT (0)
70 #ifndef SSE2NEON_PRECISE_DP
71 #define SSE2NEON_PRECISE_DP (0)
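// Usage sketch (user code, not part of this header): any of the knobs above
// may be enabled before the header is included, or via the compiler command
// line (e.g. -DSSE2NEON_PRECISE_DIV=1). For example:
//   #define SSE2NEON_PRECISE_MINMAX 1
//   #define SSE2NEON_PRECISE_SQRT 1
//   #include "sse2neon.h"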
74 /* compiler specific definitions */
75 #if defined(__GNUC__) || defined(__clang__)
76 #pragma push_macro("FORCE_INLINE")
77 #pragma push_macro("ALIGN_STRUCT")
78 #define FORCE_INLINE static inline __attribute__((always_inline))
79 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
80 #define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
81 #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
82 #else /* non-GNU / non-clang compilers */
83 #warning "Macro name collisions may happen with unsupported compiler."
85 #define FORCE_INLINE static inline
88 #define ALIGN_STRUCT(x) __declspec(align(x))
90 #define _sse2neon_likely(x) (x)
91 #define _sse2neon_unlikely(x) (x)
97 /* Architecture-specific build options */
98 /* FIXME: #pragma GCC push_options is only available on GCC */
100 #if defined(__arm__) && __ARM_ARCH == 7
101 /* According to the ARM C Language Extensions (ACLE) specification,
102 * __ARM_NEON is defined to a value indicating whether the Advanced SIMD
103 * (NEON) architecture is supported.
105 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
106 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
108 #if !defined(__clang__)
109 #pragma GCC push_options
110 #pragma GCC target("fpu=neon")
112 #elif defined(__aarch64__)
113 #if !defined(__clang__)
114 #pragma GCC push_options
115 #pragma GCC target("+simd")
118 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
122 #include <arm_neon.h>
124 /* Rounding functions require either Aarch64 instructions or libm fallback */
125 #if !defined(__aarch64__)
129 /* "__has_builtin" can be used to query support for built-in functions
130 * provided by gcc/clang and other compilers that support it.
132 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
133 /* Compatibility with gcc <= 9 */
134 #if defined(__GNUC__) && (__GNUC__ <= 9)
135 #define __has_builtin(x) HAS##x
136 #define HAS__builtin_popcount 1
137 #define HAS__builtin_popcountll 1
139 #define __has_builtin(x) 0
144 * MACRO for shuffle parameter for _mm_shuffle_ps().
145 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
146 * of _mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
147 * for fp2 in result. fp1 is a digit[0123] that represents the fp from
148 * argument "a" of _mm_shuffle_ps that will be placed in fp1 of result.
149 * fp0 is the same for fp0 of result.
151 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
152 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
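// As an illustration (v below is a hypothetical __m128): the macro packs four
// 2-bit lane selectors into one byte, so _MM_SHUFFLE(3, 2, 1, 0) is 0xE4 (the
// identity order), and the lanes of a vector can be reversed with
//   __m128 reversed = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));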
154 /* Rounding mode macros. */
155 #define _MM_FROUND_TO_NEAREST_INT 0x00
156 #define _MM_FROUND_TO_NEG_INF 0x01
157 #define _MM_FROUND_TO_POS_INF 0x02
158 #define _MM_FROUND_TO_ZERO 0x03
159 #define _MM_FROUND_CUR_DIRECTION 0x04
160 #define _MM_FROUND_NO_EXC 0x08
161 #define _MM_FROUND_RAISE_EXC 0x00
162 #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
163 #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
164 #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
165 #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
166 #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
167 #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
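// For example (illustrative only; v is a hypothetical __m128), rounding each
// lane to the nearest integer without raising exceptions:
//   __m128 r = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);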
168 #define _MM_ROUND_NEAREST 0x0000
169 #define _MM_ROUND_DOWN 0x2000
170 #define _MM_ROUND_UP 0x4000
171 #define _MM_ROUND_TOWARD_ZERO 0x6000
172 /* Flush zero mode macros. */
173 #define _MM_FLUSH_ZERO_MASK 0x8000
174 #define _MM_FLUSH_ZERO_ON 0x8000
175 #define _MM_FLUSH_ZERO_OFF 0x0000
176 /* Denormals are zeros mode macros. */
177 #define _MM_DENORMALS_ZERO_MASK 0x0040
178 #define _MM_DENORMALS_ZERO_ON 0x0040
179 #define _MM_DENORMALS_ZERO_OFF 0x0000
181 /* indicate immediate constant argument in a given range */
182 #define __constrange(a, b) const
184 /* A few intrinsics accept traditional data types like ints or floats, but
185 * most operate on data types that are specific to SSE.
186 * If a vector type ends in d, it contains doubles, and if it does not have
187 * a suffix, it contains floats. An integer vector type can contain any type
188 * of integer, from chars to shorts to unsigned long longs.
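 * For instance (an illustrative sketch, not part of the original text):
 *   __m128  f = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // four packed floats
 *   __m128i i = _mm_set1_epi32(42);                 // four packed 32-bit ints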
190 typedef int64x1_t __m64;
191 typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
192 // On the 32-bit Arm architecture, float64x2_t is not supported.
193 // The data type __m128d therefore has to be represented differently for the
194 // related conversion intrinsics.
195 #if defined(__aarch64__)
196 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
198 typedef float32x4_t __m128d;
200 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
202 // __int64 is defined in the Intrinsics Guide and maps to a different data
203 // type depending on the data model
204 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
205 #if (defined(__x86_64__) || defined(__i386__))
206 #define __int64 long long
208 #define __int64 int64_t
212 /* type-safe casting between types */
214 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
215 #define vreinterpretq_m128_f32(x) (x)
216 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
218 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
219 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
220 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
221 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
223 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
224 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
225 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
226 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
228 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
229 #define vreinterpretq_f32_m128(x) (x)
230 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
232 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
233 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
234 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
235 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
237 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
238 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
239 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
240 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
242 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
243 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
244 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
245 #define vreinterpretq_m128i_s64(x) (x)
247 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
248 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
249 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
250 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
252 #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
253 #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
255 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
256 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
257 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
258 #define vreinterpretq_s64_m128i(x) (x)
260 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
261 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
262 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
263 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
265 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
266 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
267 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
268 #define vreinterpret_m64_s64(x) (x)
270 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
271 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
272 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
273 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
275 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
276 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
277 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
279 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
280 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
281 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
282 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
284 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
285 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
286 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
287 #define vreinterpret_s64_m64(x) (x)
289 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
291 #if defined(__aarch64__)
292 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
293 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
295 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
297 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
298 #define vreinterpretq_m128d_f64(x) (x)
300 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
302 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
303 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
305 #define vreinterpretq_f64_m128d(x) (x)
306 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
308 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
309 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
311 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
312 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
314 #define vreinterpretq_m128d_f32(x) (x)
316 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
318 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
319 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
321 #define vreinterpretq_f32_m128d(x) (x)
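// To illustrate how these casts are used (a sketch only; example_xor_ps is
// hypothetical and not defined by this header): a bitwise operation on __m128
// is typically expressed by viewing the lanes as integers, operating, and then
// viewing the result as floats again:
//   FORCE_INLINE __m128 example_xor_ps(__m128 a, __m128 b)
//   {
//       return vreinterpretq_m128_s32(
//           veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
//   }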
324 // A struct is defined in this header file called 'SIMDVec' which can be used
325 // by applications which attempt to access the contents of an __m128 struct
326 // directly. It is important to note that accessing the __m128 struct directly
327 // is considered bad coding practice by Microsoft: @see:
328 // https://docs.microsoft.com/en-us/cpp/cpp/m128
330 // However, some legacy source code may try to access the contents of an __m128
331 // struct directly so the developer can use the SIMDVec as an alias for it. Any
332 // casting must be done manually by the developer, as you cannot cast or
333 // otherwise alias the base NEON data type for intrinsic operations.
335 // union intended to allow direct access to an __m128 variable using the names
336 // that the MSVC compiler provides. This union should really only be used when
337 // trying to access the members of the vector as integer values. GCC/clang
338 // allow native access to the float members through a simple array access
339 // operator (in C since 4.6, in C++ since 4.8).
341 // Ideally, direct accesses to SIMD vectors should be avoided since they can cause
342 // a performance hit. If it really is needed however, the original __m128
343 // variable can be aliased with a pointer to this union and used to access
344 // individual components. The use of this union should be hidden behind a macro
345 // that is used throughout the codebase to access the members instead of always
346 // declaring this type of variable.
347 typedef union ALIGN_STRUCT(16) SIMDVec {
348 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
349 int8_t m128_i8[16]; // as signed 8-bit integers.
350 int16_t m128_i16[8]; // as signed 16-bit integers.
351 int32_t m128_i32[4]; // as signed 32-bit integers.
352 int64_t m128_i64[2]; // as signed 64-bit integers.
353 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
354 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
355 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
356 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
359 // casting using SIMDVec
360 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
361 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
362 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
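// Example (user code, illustrative only): lane 1 of an __m128i can be read
// through the macro above instead of an extract intrinsic:
//   __m128i v = _mm_set_epi32(3, 2, 1, 0);
//   uint32_t lane1 = vreinterpretq_nth_u32_m128i(v, 1); // yields 1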
365 #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
366 #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
367 #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
368 #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
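// Illustrative use (user code): flush-to-zero and denormals-are-zero can be
// enabled with the mode macros defined earlier in this file:
//   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
//   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);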
370 // Function declaration
372 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
373 FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
374 FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
375 FORCE_INLINE __m128 _mm_set_ps1(float);
376 FORCE_INLINE __m128 _mm_setzero_ps(void);
378 FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
379 FORCE_INLINE __m128i _mm_castps_si128(__m128);
380 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
381 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
382 FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
383 FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
384 FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
385 FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
386 FORCE_INLINE __m128d _mm_set_pd(double, double);
387 FORCE_INLINE __m128i _mm_set1_epi32(int);
388 FORCE_INLINE __m128i _mm_setzero_si128();
390 FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
391 FORCE_INLINE __m128 _mm_ceil_ps(__m128);
392 FORCE_INLINE __m128d _mm_floor_pd(__m128d);
393 FORCE_INLINE __m128 _mm_floor_ps(__m128);
394 FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
395 FORCE_INLINE __m128 _mm_round_ps(__m128, int);
397 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
399 /* Backwards compatibility for compilers lacking specific type support */
401 // Older gcc does not define the vld1q_u8_x4 intrinsic
402 #if defined(__GNUC__) && !defined(__clang__) && \
403 ((__GNUC__ <= 10 && defined(__arm__)) || \
404 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
405 (__GNUC__ <= 9 && defined(__aarch64__)))
406 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
409 ret.val[0] = vld1q_u8(p + 0);
410 ret.val[1] = vld1q_u8(p + 16);
411 ret.val[2] = vld1q_u8(p + 32);
412 ret.val[3] = vld1q_u8(p + 48);
417 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
419 return vld1q_u8_x4(p);
423 /* Function Naming Conventions
424 * The naming convention of SSE intrinsics is straightforward. A generic SSE
425 * intrinsic function is given as follows:
426 * _mm_<name>_<data_type>
428 * The parts of this format are given as follows:
429 * 1. <name> describes the operation performed by the intrinsic
430 * 2. <data_type> identifies the data type of the function's primary arguments
432 * This last part, <data_type>, is a little complicated. It identifies the
433 * content of the input values, and can be set to any of the following values:
434 * + ps - vectors contain floats (ps stands for packed single-precision)
435 * + pd - vectors contain doubles (pd stands for packed double-precision)
436 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
438 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
440 * + si128 - unspecified 128-bit vector or 256-bit vector
441 * + m128/m128i/m128d - identifies input vector types when they are different
442 * than the type of the returned vector
444 * For example, _mm_setzero_ps. The _mm implies that the function returns
445 * a 128-bit vector. The _ps at the end implies that the argument vectors contain floats.
448 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
449 * // Set packed 16-bit integers: 128 bits, 8 shorts, 16 bits each
450 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
451 * // Set packed 8-bit integers
452 * // 128 bits, 16 chars, 8 bits each
453 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
454 * 4, 5, 12, 13, 6, 7, 14, 15);
455 * // Shuffle packed 8-bit integers
456 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
458 * Data (Number, Binary, Byte Index):
459 +------+------+-------------+------+------+-------------+
460 | 1 | 2 | 3 | 4 | Number
461 +------+------+------+------+------+------+------+------+
462 | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
463 +------+------+------+------+------+------+------+------+
464 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
465 +------+------+------+------+------+------+------+------+
467 +------+------+------+------+------+------+------+------+
468 | 5 | 6 | 7 | 8 | Number
469 +------+------+------+------+------+------+------+------+
470 | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
471 +------+------+------+------+------+------+------+------+
472 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
473 +------+------+------+------+------+------+------+------+
474 * Index (Byte Index):
475 +------+------+------+------+------+------+------+------+
476 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
477 +------+------+------+------+------+------+------+------+
479 +------+------+------+------+------+------+------+------+
480 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
481 +------+------+------+------+------+------+------+------+
483 +------+------+------+------+------+------+------+------+
484 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
485 +------+------+------+------+------+------+------+------+
486 | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
487 +------+------+------+------+------+------+------+------+
488 | 256 | 2 | 5 | 6 | Number
489 +------+------+------+------+------+------+------+------+
491 +------+------+------+------+------+------+------+------+
492 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
493 +------+------+------+------+------+------+------+------+
494 | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
495 +------+------+------+------+------+------+------+------+
496 | 3 | 7 | 4 | 8 | Number
497 +------+------+------+------+------+------+-------------+
500 /* Constants for use with _mm_prefetch. */
502 _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
503 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
504 _MM_HINT_T1 = 2, /* load data to L2 cache only */
505 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
506 _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
507 _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
508 _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
509 _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
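// Illustrative use (user code; ptr is a hypothetical pointer): the hints above
// are passed as the second argument of _mm_prefetch, e.g.
//   _mm_prefetch((const char *) ptr, _MM_HINT_T0);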
512 // The bit field mapping to the FPCR (floating-point control register)
520 #if defined(__aarch64__)
525 // Takes the upper 64 bits of a and places it in the low end of the result
526 // Takes the lower 64 bits of b and places it into the high end of the result.
527 FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
529 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
530 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
531 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
534 // takes the lower two 32-bit values from a and swaps them and places them in
535 // the low end of the result; takes the higher two 32-bit values from b and
536 // swaps them and places them in the high end of the result.
537 FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
539 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
540 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
541 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
544 FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
546 float32x2_t a21 = vget_high_f32(
547 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
548 float32x2_t b03 = vget_low_f32(
549 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
550 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
553 FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
555 float32x2_t a03 = vget_low_f32(
556 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
557 float32x2_t b21 = vget_high_f32(
558 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
559 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
562 FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
564 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
565 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
566 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
569 FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
571 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
572 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
573 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
576 FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
578 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
579 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
580 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
583 // keeps the low 64 bits of a in the low half and puts the high 64 bits of b in the high half
585 FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
587 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
588 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
589 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
592 FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
594 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
595 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
596 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
599 FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
602 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
603 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
604 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
607 FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
609 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
611 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
612 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
615 FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
617 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
619 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
620 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
621 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
622 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
625 FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
628 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
629 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
630 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
633 FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
635 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
636 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
637 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
638 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
639 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
642 FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
644 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
645 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
646 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
647 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
648 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
651 FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
653 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
654 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
655 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
656 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
657 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
660 // Kahan summation for accurate summation of floating-point numbers.
661 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
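// In outline, the standard compensated-summation step is:
//   y -= *c; t = *sum + y; *c = (t - *sum) - y; *sum = t;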
662 FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
670 #if defined(__ARM_FEATURE_CRYPTO)
672 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
674 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
675 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
676 return vreinterpretq_u64_p128(vmull_p64(a, b));
678 #else // ARMv7 polyfill
679 // ARMv7 and some AArch64 configurations lack vmull_p64, but they do have vmull_p8.
681 // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
682 // 64-bit->128-bit polynomial multiply.
684 // It needs some work and is somewhat slow, but it is still faster than all
685 // known scalar methods.
687 // Algorithm adapted to C from
688 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
689 // from "Fast Software Polynomial Multiplication on ARM Processors Using the
690 // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
691 // (https://hal.inria.fr/hal-01506572)
692 static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
694 poly8x8_t a = vreinterpret_p8_u64(_a);
695 poly8x8_t b = vreinterpret_p8_u64(_b);
698 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
699 vcreate_u8(0x00000000ffffffff));
700 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
701 vcreate_u8(0x0000000000000000));
703 // Do the multiplies, rotating with vext to get all combinations
704 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
706 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
708 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
710 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
712 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
714 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
716 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
718 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
720 // Add cross products
721 uint8x16_t l = veorq_u8(e, f); // L = E + F
722 uint8x16_t m = veorq_u8(g, h); // M = G + H
723 uint8x16_t n = veorq_u8(i, j); // N = I + J
725 // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL instructions.
727 #if defined(__aarch64__)
728 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
729 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
730 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
731 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
732 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
733 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
734 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
735 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
737 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
738 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
739 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
740 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
742 // t0 = (L) (P0 + P1) << 8
743 // t1 = (M) (P2 + P3) << 16
744 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
745 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
746 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
748 // t2 = (N) (P4 + P5) << 24
749 // t3 = (K) (P6 + P7) << 32
750 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
751 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
752 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
755 #if defined(__aarch64__)
756 uint8x16_t t0 = vreinterpretq_u8_u64(
757 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
758 uint8x16_t t1 = vreinterpretq_u8_u64(
759 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
760 uint8x16_t t2 = vreinterpretq_u8_u64(
761 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
762 uint8x16_t t3 = vreinterpretq_u8_u64(
763 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
765 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
766 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
767 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
768 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
770 // Shift the cross products
771 uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
772 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
773 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
774 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
776 // Accumulate the products
777 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
778 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
779 uint8x16_t mix = veorq_u8(d, cross1);
780 uint8x16_t r = veorq_u8(mix, cross2);
781 return vreinterpretq_u64_u8(r);
783 #endif // ARMv7 polyfill
786 // __m128i _mm_shuffle_epi32_default(__m128i a,
787 // __constrange(0, 255) int imm) {
789 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
790 // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
793 #define _mm_shuffle_epi32_default(a, imm) \
797 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
798 ret = vsetq_lane_s32( \
799 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
801 ret = vsetq_lane_s32( \
802 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
804 ret = vsetq_lane_s32( \
805 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
807 vreinterpretq_m128i_s32(ret); \
810 // Takes the upper 64 bits of a and places it in the low end of the result
811 // Takes the lower 64 bits of a and places it into the high end of the result.
812 FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
814 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
815 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
816 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
819 // takes the lower two 32-bit values from a and swaps them and places in low end
820 // of result takes the higher two 32 bit values from a and swaps them and places
821 // in high end of result.
822 FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
824 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
825 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
826 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
829 // rotates the least significant 32 bits into the most significant 32 bits, and
830 // shifts the rest down
831 FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
833 return vreinterpretq_m128i_s32(
834 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
837 // rotates the most significant 32 bits into the least significant 32 bits, and
838 // shifts the rest up
839 FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
841 return vreinterpretq_m128i_s32(
842 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
845 // gets the lower 64 bits of a, and places it in the upper 64 bits
846 // gets the lower 64 bits of a and places it in the lower 64 bits
847 FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
849 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
850 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
853 // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
854 // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
855 FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
857 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
858 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
859 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
862 // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
863 // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
864 // places it in the lower 64 bits
865 FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
867 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
868 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
871 FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
873 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
874 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
875 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
878 FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
880 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
881 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
882 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
885 FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
887 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
888 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
889 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
892 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
894 #if defined(__aarch64__)
895 #define _mm_shuffle_epi32_splat(a, imm) \
897 vreinterpretq_m128i_s32( \
898 vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
901 #define _mm_shuffle_epi32_splat(a, imm) \
903 vreinterpretq_m128i_s32( \
904 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
908 // NEON does not support a general purpose permute intrinsic
909 // Selects four specific single-precision, floating-point values from a and b,
910 // based on the mask i.
913 // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
914 // __constrange(0, 255) int imm) {
916 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
917 // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
921 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
922 #define _mm_shuffle_ps_default(a, b, imm) \
926 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
927 ret = vsetq_lane_f32( \
928 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
930 ret = vsetq_lane_f32( \
931 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
933 ret = vsetq_lane_f32( \
934 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
936 vreinterpretq_m128_f32(ret); \
939 // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified by imm.
941 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
942 // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
943 // __constrange(0,255) int
945 #define _mm_shufflelo_epi16_function(a, imm) \
947 int16x8_t ret = vreinterpretq_s16_m128i(a); \
948 int16x4_t lowBits = vget_low_s16(ret); \
949 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
950 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
952 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
954 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
956 vreinterpretq_m128i_s16(ret); \
959 // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified by imm.
961 // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
962 // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
963 // __constrange(0,255) int
965 #define _mm_shufflehi_epi16_function(a, imm) \
967 int16x8_t ret = vreinterpretq_s16_m128i(a); \
968 int16x4_t highBits = vget_high_s16(ret); \
969 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
970 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
972 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
974 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
976 vreinterpretq_m128i_s16(ret); \
981 // _mm_empty is a no-op on ARM
982 FORCE_INLINE void _mm_empty(void) {}
986 // Adds the four single-precision, floating-point values of a and b.
993 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
994 FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
996 return vreinterpretq_m128_f32(
997 vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1000 // adds the scalar single-precision floating point values of a and b.
1001 // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
1002 FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
1004 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
1005 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1006 // the upper values in the result must be the remnants of <a>.
1007 return vreinterpretq_m128_f32(vaddq_f32(vreinterpretq_f32_m128(a), value));
1010 // Computes the bitwise AND of the four single-precision, floating-point values
1018 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
1019 FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1021 return vreinterpretq_m128_s32(
1022 vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1025 // Computes the bitwise AND-NOT of the four single-precision, floating-point
1026 // values of a and b.
1033 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
1034 FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1036 return vreinterpretq_m128_s32(
1037 vbicq_s32(vreinterpretq_s32_m128(b),
1038 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1041 // Average packed unsigned 16-bit integers in a and b, and store the results in dst.
1046 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
1049 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
1050 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1052 return vreinterpret_m64_u16(
1053 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1056 // Average packed unsigned 8-bit integers in a and b, and store the results in dst.
1061 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
1064 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
1065 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1067 return vreinterpret_m64_u8(
1068 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1071 // Compares for equality.
1072 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
1073 FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
1075 return vreinterpretq_m128_u32(
1076 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1079 // Compares for equality.
1080 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
1081 FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
1083 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1086 // Compares for greater than or equal.
1087 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
1088 FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
1090 return vreinterpretq_m128_u32(
1091 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1094 // Compares for greater than or equal.
1095 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
1096 FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
1098 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1101 // Compares for greater than.
1103 // r0 := (a0 > b0) ? 0xffffffff : 0x0
1104 // r1 := (a1 > b1) ? 0xffffffff : 0x0
1105 // r2 := (a2 > b2) ? 0xffffffff : 0x0
1106 // r3 := (a3 > b3) ? 0xffffffff : 0x0
1108 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
1109 FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
1111 return vreinterpretq_m128_u32(
1112 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1115 // Compares for greater than.
1116 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
1117 FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
1119 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1122 // Compares for less than or equal.
1124 // r0 := (a0 <= b0) ? 0xffffffff : 0x0
1125 // r1 := (a1 <= b1) ? 0xffffffff : 0x0
1126 // r2 := (a2 <= b2) ? 0xffffffff : 0x0
1127 // r3 := (a3 <= b3) ? 0xffffffff : 0x0
1129 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
1130 FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
1132 return vreinterpretq_m128_u32(
1133 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1136 // Compares for less than or equal.
1137 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
1138 FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
1140 return _mm_move_ss(a, _mm_cmple_ps(a, b));
1143 // Compares for less than
1144 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
1145 FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
1147 return vreinterpretq_m128_u32(
1148 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1151 // Compares for less than
1152 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
1153 FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
1155 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1158 // Compares for inequality.
1159 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
1160 FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
1162 return vreinterpretq_m128_u32(vmvnq_u32(
1163 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1166 // Compares for inequality.
1167 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
1168 FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
1170 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1173 // Compares for not greater than or equal.
1174 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
1175 FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
1177 return vreinterpretq_m128_u32(vmvnq_u32(
1178 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1181 // Compares for not greater than or equal.
1182 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
1183 FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
1185 return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
1188 // Compares for not greater than.
1189 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
1190 FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
1192 return vreinterpretq_m128_u32(vmvnq_u32(
1193 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1196 // Compares for not greater than.
1197 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1198 FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
1200 return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
1203 // Compares for not less than or equal.
1204 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
1205 FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
1207 return vreinterpretq_m128_u32(vmvnq_u32(
1208 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1211 // Compares for not less than or equal.
1212 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1213 FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
1215 return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
1218 // Compares for not less than.
1219 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
1220 FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
1222 return vreinterpretq_m128_u32(vmvnq_u32(
1223 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1226 // Compares for not less than.
1227 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
1228 FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
1230 return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
1233 // Compares the four 32-bit floats in a and b to check if any values are NaN.
1234 // Ordered compare between each value returns true for "orderable" and false for
1235 // "not orderable" (NaN).
1236 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
1238 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1239 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1240 FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
1242 // Note: NEON does not have ordered compare builtin
1243 // Need to compare a eq a and b eq b to check for NaN
1244 // Do AND of results to get final
1246 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1248 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1249 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
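// For example (illustrative only): _mm_cmpord_ps(_mm_set_ps1(NAN), x) produces
// all-zero lanes for any x, while two ordinary (non-NaN) inputs produce
// all-ones lanes.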
1252 // Compares for ordered.
1253 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
1254 FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
1256 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1259 // Compares for unordered.
1260 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
1261 FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
1264 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1266 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1267 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1270 // Compares for unordered.
1271 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
1272 FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
1274 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1277 // Compares the lower single-precision floating point scalar values of a and b
1278 // using an equality operation. :
1279 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
1280 FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1283 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1284 return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1287 // Compares the lower single-precision floating point scalar values of a and b
1288 // using a greater than or equal operation. :
1289 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
1290 FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1293 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1294 return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1297 // Compares the lower single-precision floating point scalar values of a and b
1298 // using a greater than operation. :
1299 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
1300 FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1303 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1304 return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1307 // Compares the lower single-precision floating point scalar values of a and b
1308 // using a less than or equal operation. :
1309 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
1310 FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1313 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1314 return vgetq_lane_u32(a_le_b, 0) & 0x1;
1317 // Compares the lower single-precision floating point scalar values of a and b
1318 // using a less than operation. :
1319 // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
1320 // note!! The documentation on MSDN is incorrect! If either of the values is a
1321 // NAN the docs say you will get a one, but in fact, it will return a zero!!
1322 FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1325 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1326 return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1329 // Compares the lower single-precision floating point scalar values of a and b
1330 // using an inequality operation. :
1331 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
1332 FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1334 return !_mm_comieq_ss(a, b);
1337 // Convert packed signed 32-bit integers in b to packed single-precision
1338 // (32-bit) floating-point elements, store the results in the lower 2 elements
1339 // of dst, and copy the upper 2 packed elements from a to the upper elements of dst.
1342 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1343 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1344 // dst[95:64] := a[95:64]
1345 // dst[127:96] := a[127:96]
1347 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
1348 FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1350 return vreinterpretq_m128_f32(
1351 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1352 vget_high_f32(vreinterpretq_f32_m128(a))));
1355 // Convert packed single-precision (32-bit) floating-point elements in a to
1356 // packed 32-bit integers, and store the results in dst.
1360 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1363 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
1364 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
1366 #if defined(__aarch64__)
1367 return vreinterpret_m64_s32(
1368 vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1370 return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1371 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
1375 // Convert the signed 32-bit integer b to a single-precision (32-bit)
1376 // floating-point element, store the result in the lower element of dst, and
1377 // copy the upper 3 packed elements from a to the upper elements of dst.
1379 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1380 // dst[127:32] := a[127:32]
1382 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
1383 FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
1385 return vreinterpretq_m128_f32(
1386 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1389 // Convert the lower single-precision (32-bit) floating-point element in a to a
1390 // 32-bit integer, and store the result in dst.
1391 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
1392 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
1394 #if defined(__aarch64__)
1395 return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1398 float32_t data = vgetq_lane_f32(
1399 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1400 return (int32_t) data;
1404 // Convert packed 16-bit integers in a to packed single-precision (32-bit)
1405 // floating-point elements, and store the results in dst.
1410 // dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
1413 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
1414 FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1416 return vreinterpretq_m128_f32(
1417 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1420 // Convert packed 32-bit integers in b to packed single-precision (32-bit)
1421 // floating-point elements, store the results in the lower 2 elements of dst,
1422 // and copy the upper 2 packed elements from a to the upper elements of dst.
1424 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1425 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1426 // dst[95:64] := a[95:64]
1427 // dst[127:96] := a[127:96]
1429 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
1430 FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1432 return vreinterpretq_m128_f32(
1433 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1434 vget_high_f32(vreinterpretq_f32_m128(a))));
1437 // Convert packed signed 32-bit integers in a to packed single-precision
1438 // (32-bit) floating-point elements, store the results in the lower 2 elements
1439 // of dst, then convert the packed signed 32-bit integers in b to
1440 // single-precision (32-bit) floating-point element, and store the results in
1441 // the upper 2 elements of dst.
1443 // dst[31:0] := Convert_Int32_To_FP32(a[31:0])
1444 // dst[63:32] := Convert_Int32_To_FP32(a[63:32])
1445 // dst[95:64] := Convert_Int32_To_FP32(b[31:0])
1446 // dst[127:96] := Convert_Int32_To_FP32(b[63:32])
1448 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
1449 FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1451 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1452 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1455 // Convert the lower packed 8-bit integers in a to packed single-precision
1456 // (32-bit) floating-point elements, and store the results in dst.
1461 // dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
1464 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
1465 FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1467 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1468 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1471 // Convert packed single-precision (32-bit) floating-point elements in a to
1472 // packed 16-bit integers, and store the results in dst. Note: this intrinsic
1473 // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF.
1479 // IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
1480 // dst[i+15:i] := 0x7FFF
1482 // dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
1486 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
1487 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
1489 const __m128 i16Min = _mm_set_ps1((float) INT16_MIN);
1490 const __m128 i16Max = _mm_set_ps1((float) INT16_MAX);
1491 const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
1492 const __m128i maxMask = _mm_castps_si128(
1493 _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max)));
1494 const __m128i betweenMask = _mm_castps_si128(
1495 _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max)));
1496 const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1497 _mm_setzero_si128());
1498 __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX));
1499 __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN));
1500 __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1501 __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1502 return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32)));
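// Illustrative sketch (not part of the original header): the clamping above
// mirrors x86 saturation behaviour. Assuming round-to-nearest is active,
//   _mm_cvtps_pi16(_mm_setr_ps(-40000.0f, 1.0f, 70000.0f, 123.0f))
// yields the packed 16-bit values {-32768, 1, 32767, 123}: out-of-range
// negatives fall into minMask, large positives into maxMask, and in-range
// values are converted through _mm_cvtps_epi32.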
1505 // Convert packed single-precision (32-bit) floating-point elements in a to
1506 // packed 32-bit integers, and store the results in dst.
1510 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1513 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
1514 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1516 // Convert packed single-precision (32-bit) floating-point elements in a to
1517 // packed 8-bit integers, and store the results in lower 4 elements of dst.
1518 // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1519 // between 0x7F and 0x7FFFFFFF.
1524 // IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
1525 // dst[i+7:i] := 0x7F
1527 // dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
1531 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
1532 FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
1534 const __m128 i8Min = _mm_set_ps1((float) INT8_MIN);
1535 const __m128 i8Max = _mm_set_ps1((float) INT8_MAX);
1536 const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
1537 const __m128i maxMask = _mm_castps_si128(
1538 _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max)));
1539 const __m128i betweenMask = _mm_castps_si128(
1540 _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max)));
1541 const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1542 _mm_setzero_si128());
1543 __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX));
1544 __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN));
1545 __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1546 __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1547 int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32));
1548 int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
1549 static const uint32_t bitMask[2] = {0xFFFFFFFF, 0};
1550 int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
1552 return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0)));
1555 // Convert packed unsigned 16-bit integers in a to packed single-precision
1556 // (32-bit) floating-point elements, and store the results in dst.
1561 // dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
1564 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
1565 FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1567 return vreinterpretq_m128_f32(
1568 vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1571 // Convert the lower packed unsigned 8-bit integers in a to packed
1572 // single-precision (32-bit) floating-point elements, and store the results in dst.
1578 // dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
1581 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
1582 FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1584 return vreinterpretq_m128_f32(vcvtq_f32_u32(
1585 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1588 // Convert the signed 32-bit integer b to a single-precision (32-bit)
1589 // floating-point element, store the result in the lower element of dst, and
1590 // copy the upper 3 packed elements from a to the upper elements of dst.
1592 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1593 // dst[127:32] := a[127:32]
1595 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
1596 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1598 // Convert the signed 64-bit integer b to a single-precision (32-bit)
1599 // floating-point element, store the result in the lower element of dst, and
1600 // copy the upper 3 packed elements from a to the upper elements of dst.
1602 // dst[31:0] := Convert_Int64_To_FP32(b[63:0])
1603 // dst[127:32] := a[127:32]
1605 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
1606 FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1608 return vreinterpretq_m128_f32(
1609 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1612 // Copy the lower single-precision (32-bit) floating-point element of a to dst.
1614 // dst[31:0] := a[31:0]
1616 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
1617 FORCE_INLINE float _mm_cvtss_f32(__m128 a)
1619 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1622 // Convert the lower single-precision (32-bit) floating-point element in a to a
1623 // 32-bit integer, and store the result in dst.
1625 // dst[31:0] := Convert_FP32_To_Int32(a[31:0])
1627 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
1628 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1630 // Convert the lower single-precision (32-bit) floating-point element in a to a
1631 // 64-bit integer, and store the result in dst.
1633 // dst[63:0] := Convert_FP32_To_Int64(a[31:0])
1635 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
1636 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1638 #if defined(__aarch64__)
1639 return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1641 float32_t data = vgetq_lane_f32(
1642 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1643 return (int64_t) data;
1647 // Convert packed single-precision (32-bit) floating-point elements in a to
1648 // packed 32-bit integers with truncation, and store the results in dst.
1652 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1655 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
1656 FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1658 return vreinterpret_m64_s32(
1659 vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1662 // Convert the lower single-precision (32-bit) floating-point element in a to a
1663 // 32-bit integer with truncation, and store the result in dst.
1665 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1667 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
1668 FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
1670 return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1673 // Convert packed single-precision (32-bit) floating-point elements in a to
1674 // packed 32-bit integers with truncation, and store the results in dst.
1678 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1681 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
1682 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1684 // Convert the lower single-precision (32-bit) floating-point element in a to a
1685 // 32-bit integer with truncation, and store the result in dst.
1687 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1689 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
1690 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1692 // Convert the lower single-precision (32-bit) floating-point element in a to a
1693 // 64-bit integer with truncation, and store the result in dst.
1695 // dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
1697 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
1698 FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1700 return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1703 // Divides the four single-precision, floating-point values of a and b.
1710 // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
1711 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1713 #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
1714 return vreinterpretq_m128_f32(
1715 vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1717 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1718 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1719 #if SSE2NEON_PRECISE_DIV
1720 // Additional Newton-Raphson iteration for accuracy
1721 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1723 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1727 // Divides the scalar single-precision floating point value of a by b.
1728 // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
1729 FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1732 float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1733 return vreinterpretq_m128_f32(
1734 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1737 // Extract a 16-bit integer from a, selected with imm8, and store the result in
1738 // the lower element of dst.
1739 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
1740 #define _mm_extract_pi16(a, imm) \
1741 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1743 #ifndef __MM_MALLOC_H
1744 // Free aligned memory that was allocated with _mm_malloc.
1745 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
1746 FORCE_INLINE void _mm_free(void *addr)
1752 // Macro: Get the flush zero bits from the MXCSR control and status register.
1753 // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
1754 // _MM_FLUSH_ZERO_OFF
1755 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
1756 FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
1759 fpcr_bitfield field;
1760 #if defined(__aarch64__)
1767 #if defined(__aarch64__)
1768 asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
1770 asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1773 return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
1776 // Macro: Get the rounding mode bits from the MXCSR control and status register.
1777 // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
1778 // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
1779 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
1780 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
1783 fpcr_bitfield field;
1784 #if defined(__aarch64__)
1791 #if defined(__aarch64__)
1792 asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
1794 asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1797 if (r.field.bit22) {
1798 return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
1800 return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
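// For reference (assumption based on the Arm FPCR/FPSCR RMode encoding in
// bits [23:22]): 0b00 = round to nearest (_MM_ROUND_NEAREST), 0b01 = toward
// +infinity (_MM_ROUND_UP), 0b10 = toward -infinity (_MM_ROUND_DOWN),
// 0b11 = toward zero (_MM_ROUND_TOWARD_ZERO), which is exactly the
// bit22/bit23 decoding performed above.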
1804 // Copy a to dst, and insert the 16-bit integer i into dst at the location
1805 // specified by imm8.
1806 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
1807 #define _mm_insert_pi16(a, b, imm) \
1809 vreinterpret_m64_s16( \
1810 vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
1813 // Loads four single-precision, floating-point values.
1814 // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
1815 FORCE_INLINE __m128 _mm_load_ps(const float *p)
1817 return vreinterpretq_m128_f32(vld1q_f32(p));
1820 // Load a single-precision (32-bit) floating-point element from memory into all elements of dst.
1823 // dst[31:0] := MEM[mem_addr+31:mem_addr]
1824 // dst[63:32] := MEM[mem_addr+31:mem_addr]
1825 // dst[95:64] := MEM[mem_addr+31:mem_addr]
1826 // dst[127:96] := MEM[mem_addr+31:mem_addr]
1828 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
1829 #define _mm_load_ps1 _mm_load1_ps
1831 // Loads a single-precision, floating-point value into the low word and
1832 // clears the upper three words.
1833 // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
1834 FORCE_INLINE __m128 _mm_load_ss(const float *p)
1836 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1839 // Loads a single single-precision, floating-point value, copying it into all four words.
1841 // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
1842 FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1844 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1847 // Sets the upper two single-precision, floating-point values with 64
1848 // bits of data loaded from the address p; the lower two values are passed through from a.
1856 // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
1857 FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1859 return vreinterpretq_m128_f32(
1860 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1863 // Sets the lower two single-precision, floating-point values with 64
1864 // bits of data loaded from the address p; the upper two values are passed through from a.
1873 // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
1874 FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1876 return vreinterpretq_m128_f32(
1877 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1880 // Load 4 single-precision (32-bit) floating-point elements from memory into dst
1881 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1882 // general-protection exception may be generated.
1884 // dst[31:0] := MEM[mem_addr+127:mem_addr+96]
1885 // dst[63:32] := MEM[mem_addr+95:mem_addr+64]
1886 // dst[95:64] := MEM[mem_addr+63:mem_addr+32]
1887 // dst[127:96] := MEM[mem_addr+31:mem_addr]
1889 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
1890 FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1892 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1893 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1896 // Loads four single-precision, floating-point values.
1897 // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
1898 FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1900 // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
1901 // equivalent for neon
1902 return vreinterpretq_m128_f32(vld1q_f32(p));
1905 // Load unaligned 16-bit integer from memory into the first element of dst.
1907 // dst[15:0] := MEM[mem_addr+15:mem_addr]
1910 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
1911 FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1913 return vreinterpretq_m128i_s16(
1914 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1917 // Load unaligned 64-bit integer from memory into the first element of dst.
1919 // dst[63:0] := MEM[mem_addr+63:mem_addr]
1922 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
1923 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1925 return vreinterpretq_m128i_s64(
1926 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1929 #ifndef __MM_MALLOC_H
1930 // Allocate aligned blocks of memory.
1931 // https://software.intel.com/en-us/
1932 // cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
1933 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
1937 return malloc(size);
1938 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1939 align = sizeof(void *);
1940 if (!posix_memalign(&ptr, align, size))
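// Illustrative usage sketch (not part of the original header): pairing
// _mm_malloc with _mm_free for a 16-byte-aligned buffer suitable for
// _mm_load_ps/_mm_store_ps.
//
//   float *buf = (float *) _mm_malloc(4 * sizeof(float), 16);
//   if (buf) {
//       _mm_store_ps(buf, _mm_set1_ps(1.0f));
//       __m128 v = _mm_load_ps(buf); /* aligned load is safe here */
//       (void) v;
//       _mm_free(buf);
//   }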
1946 // Conditionally store 8-bit integer elements from a into memory using mask
1947 // (elements are not stored when the highest bit is not set in the corresponding
1948 // element) and a non-temporal memory hint.
1949 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
1950 FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
1952 int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
1953 __m128 b = _mm_load_ps((const float *) mem_addr);
1955 int8x8_t masked = vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
1956 vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
1957 vst1_s8((int8_t *) mem_addr, masked);
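// Illustrative sketch (not part of the original header, assuming _mm_set_pi8
// is available as usual): only bytes whose mask byte has its high bit set are
// written back.
//
//   char buf[8] = {0};
//   __m64 data = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);      /* byte 0 = 1 */
//   __m64 mask = _mm_set_pi8(0, 0, 0, (char) 0x80, 0, 0, 0, (char) 0x80);
//   _mm_maskmove_si64(data, mask, buf);
//   /* buf is now {1, 0, 0, 0, 5, 0, 0, 0}; untouched bytes keep their value */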
1960 // Conditionally store 8-bit integer elements from a into memory using mask
1961 // (elements are not stored when the highest bit is not set in the corresponding
1962 // element) and a non-temporal memory hint.
1963 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
1964 #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
1966 // Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst.
1971 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
1974 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
1975 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
1977 return vreinterpret_m64_s16(
1978 vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
1981 // Computes the maximums of the four single-precision, floating-point values of a and b.
1983 // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
1984 FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
1986 #if SSE2NEON_PRECISE_MINMAX
1987 float32x4_t _a = vreinterpretq_f32_m128(a);
1988 float32x4_t _b = vreinterpretq_f32_m128(b);
1989 return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
1991 return vreinterpretq_m128_f32(
1992 vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1996 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.
2001 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
2004 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
2005 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
2007 return vreinterpret_m64_u8(
2008 vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2011 // Computes the maximum of the two lower scalar single-precision floating point
2012 // values of a and b.
2013 // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
2014 FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
2016 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2017 return vreinterpretq_m128_f32(
2018 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2021 // Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst.
2026 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
2029 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
2030 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
2032 return vreinterpret_m64_s16(
2033 vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2036 // Computes the minima of the four single-precision, floating-point values of a and b.
2038 // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
2039 FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2041 #if SSE2NEON_PRECISE_MINMAX
2042 float32x4_t _a = vreinterpretq_f32_m128(a);
2043 float32x4_t _b = vreinterpretq_f32_m128(b);
2044 return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
2046 return vreinterpretq_m128_f32(
2047 vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2051 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.
2056 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
2059 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
2060 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
2062 return vreinterpret_m64_u8(
2063 vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2066 // Computes the minimum of the two lower scalar single-precision floating point
2067 // values of a and b.
2068 // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
2069 FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2071 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2072 return vreinterpretq_m128_f32(
2073 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2076 // Sets the low word to the single-precision, floating-point value of b
2077 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
2078 FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
2080 return vreinterpretq_m128_f32(
2081 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2082 vreinterpretq_f32_m128(a), 0));
2085 // Moves the upper two values of B into the lower two values of A.
2091 FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
2093 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
2094 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
2095 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2098 // Moves the lower two values of B into the upper two values of A.
2104 FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2106 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2107 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2108 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2111 // Create mask from the most significant bit of each 8-bit element in a, and
2112 // store the result in dst.
2113 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
2114 FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2116 uint8x8_t input = vreinterpret_u8_m64(a);
2117 #if defined(__aarch64__)
2118 static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2119 uint8x8_t tmp = vshr_n_u8(input, 7);
2120 return vaddv_u8(vshl_u8(tmp, shift));
2122 // Refer the implementation of `_mm_movemask_epi8`
2123 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2124 uint32x2_t paired16 =
2125 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2126 uint8x8_t paired32 =
2127 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2128 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
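// Illustrative sketch (not part of the original header): the mask collects
// one sign bit per byte, so for byte values {0x80, 0x01, 0xFF, 0x00, 0x7F,
// 0x80, 0x00, 0xFF} (byte 0 first) the result is 0b10100101 = 0xA5.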
2132 // NEON does not provide this method
2133 // Creates a 4-bit mask from the most significant bits of the four
2134 // single-precision, floating-point values.
2135 // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2136 FORCE_INLINE int _mm_movemask_ps(__m128 a)
2138 uint32x4_t input = vreinterpretq_u32_m128(a);
2139 #if defined(__aarch64__)
2140 static const int32x4_t shift = {0, 1, 2, 3};
2141 uint32x4_t tmp = vshrq_n_u32(input, 31);
2142 return vaddvq_u32(vshlq_u32(tmp, shift));
2144 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2145 // Shift out everything but the sign bits with a 32-bit unsigned shift
2147 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2148 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2150 uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2151 // Extract the result.
2152 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
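// Illustrative sketch (not part of the original header): each lane whose sign
// bit is set contributes its bit, e.g. for {-1.0f, 2.0f, -3.0f, 4.0f} (lane 0
// first) the result is 0b0101 = 5.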
2156 // Multiplies the four single-precision, floating-point values of a and b.
2163 // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
2164 FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2166 return vreinterpretq_m128_f32(
2167 vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2170 // Multiply the lower single-precision (32-bit) floating-point element in a and
2171 // b, store the result in the lower element of dst, and copy the upper 3 packed
2172 // elements from a to the upper elements of dst.
2174 // dst[31:0] := a[31:0] * b[31:0]
2175 // dst[127:32] := a[127:32]
2177 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
2178 FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2180 return _mm_move_ss(a, _mm_mul_ps(a, b));
2183 // Multiply the packed unsigned 16-bit integers in a and b, producing
2184 // intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
2186 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
2187 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2189 return vreinterpret_m64_u16(vshrn_n_u32(
2190 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2193 // Computes the bitwise OR of the four single-precision, floating-point values of a and b.
2195 // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
2196 FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2198 return vreinterpretq_m128_s32(
2199 vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2202 // Average packed unsigned 8-bit integers in a and b, and store the results in dst.
2207 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2210 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
2211 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2213 // Average packed unsigned 16-bit integers in a and b, and store the results in dst.
2218 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2221 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
2222 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2224 // Extract a 16-bit integer from a, selected with imm8, and store the result in
2225 // the lower element of dst.
2226 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
2227 #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2229 // Copy a to dst, and insert the 16-bit integer i into dst at the location
2230 // specified by imm8.
2231 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
2232 #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2234 // Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst.
2236 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
2237 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2239 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.
2241 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
2242 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2244 // Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst.
2246 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
2247 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
2249 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.
2251 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
2252 #define _m_pminub(a, b) _mm_min_pu8(a, b)
2254 // Create mask from the most significant bit of each 8-bit element in a, and
2255 // store the result in dst.
2256 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
2257 #define _m_pmovmskb(a) _mm_movemask_pi8(a)
2259 // Multiply the packed unsigned 16-bit integers in a and b, producing
2260 // intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
2262 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
2263 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2265 // Loads one cache line of data from address p to a location closer to the
2266 // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
2267 FORCE_INLINE void _mm_prefetch(const void *p, int i)
2270 __builtin_prefetch(p);
2273 // Compute the absolute differences of packed unsigned 8-bit integers in a and
2274 // b, then horizontally sum each consecutive 8 differences to produce four
2275 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of dst.
2277 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
2278 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2280 // Shuffle 16-bit integers in a using the control in imm8, and store the results in dst.
2282 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
2283 #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2285 // Compute the approximate reciprocal of packed single-precision (32-bit)
2286 // floating-point elements in a, and store the results in dst. The maximum
2287 // relative error for this approximation is less than 1.5*2^-12.
2288 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
2289 FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2291 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2292 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2293 #if SSE2NEON_PRECISE_DIV
2294 // Additional Newton-Raphson iteration for accuracy
2295 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2297 return vreinterpretq_m128_f32(recip);
2300 // Compute the approximate reciprocal of the lower single-precision (32-bit)
2301 // floating-point element in a, store the result in the lower element of dst,
2302 // and copy the upper 3 packed elements from a to the upper elements of dst. The
2303 // maximum relative error for this approximation is less than 1.5*2^-12.
2305 // dst[31:0] := (1.0 / a[31:0])
2306 // dst[127:32] := a[127:32]
2308 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
2309 FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2311 return _mm_move_ss(a, _mm_rcp_ps(a));
2314 // Computes the approximations of the reciprocal square roots of the four
2315 // single-precision floating point values of in.
2316 // The current precision is 1% error.
2317 // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
2318 FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2320 float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2321 #if SSE2NEON_PRECISE_SQRT
2322 // Additional Newton-Raphson iteration for accuracy
2324 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2326 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2328 return vreinterpretq_m128_f32(out);
2331 // Compute the approximate reciprocal square root of the lower single-precision
2332 // (32-bit) floating-point element in a, store the result in the lower element
2333 // of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
2335 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
2336 FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2338 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2341 // Compute the absolute differences of packed unsigned 8-bit integers in a and
2342 // b, then horizontally sum each consecutive 8 differences to produce four
2343 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of dst.
2345 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
2346 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2348 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2349 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2350 return vreinterpret_m64_u16(
2351 vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
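// Illustrative sketch (not part of the original header, assuming _mm_set_pi8
// is available as usual): the absolute differences are summed into the low
// 16-bit lane of the result.
//
//   __m64 a = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
//   __m64 b = _mm_set_pi8(1, 2, 3, 4, 5, 6, 7, 8);
//   __m64 r = _mm_sad_pu8(a, b); /* low 16 bits hold |1-8|+|2-7|+...+|8-1| = 32 */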
2354 // Macro: Set the flush zero bits of the MXCSR control and status register to
2355 // the value in unsigned 32-bit integer a. The flush zero may contain any of the
2356 // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
2357 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
2358 FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
2360 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
2361 // regardless of the value of the FZ bit.
2363 fpcr_bitfield field;
2364 #if defined(__aarch64__)
2371 #if defined(__aarch64__)
2372 asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
2374 asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2377 r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
2379 #if defined(__aarch64__)
2380 asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
2382 asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
2386 // Sets the four single-precision, floating-point values to the four inputs.
2387 // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
2388 FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2390 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2391 return vreinterpretq_m128_f32(vld1q_f32(data));
2394 // Sets the four single-precision, floating-point values to w.
2395 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2396 FORCE_INLINE __m128 _mm_set_ps1(float _w)
2398 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2401 // Macro: Set the rounding mode bits of the MXCSR control and status register to
2402 // the value in unsigned 32-bit integer a. The rounding mode may contain any of
2403 // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
2404 // _MM_ROUND_TOWARD_ZERO
2405 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
2406 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
2409 fpcr_bitfield field;
2410 #if defined(__aarch64__)
2417 #if defined(__aarch64__)
2418 asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
2420 asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2424 case _MM_ROUND_TOWARD_ZERO:
2428 case _MM_ROUND_DOWN:
2436 default: //_MM_ROUND_NEAREST
2441 #if defined(__aarch64__)
2442 asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
2444 asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
2448 // Copy single-precision (32-bit) floating-point element a to the lower element
2449 // of dst, and zero the upper 3 elements.
2450 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
2451 FORCE_INLINE __m128 _mm_set_ss(float a)
2453 float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
2454 return vreinterpretq_m128_f32(vld1q_f32(data));
2457 // Sets the four single-precision, floating-point values to w.
2459 // r0 := r1 := r2 := r3 := w
2461 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2462 FORCE_INLINE __m128 _mm_set1_ps(float _w)
2464 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2467 // FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2468 FORCE_INLINE void _mm_setcsr(unsigned int a)
2470 _MM_SET_ROUNDING_MODE(a);
2473 // FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2474 FORCE_INLINE unsigned int _mm_getcsr()
2476 return _MM_GET_ROUNDING_MODE();
2479 // Sets the four single-precision, floating-point values to the four inputs in reverse order.
2481 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
2482 FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2484 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2485 return vreinterpretq_m128_f32(vld1q_f32(data));
2488 // Clears the four single-precision, floating-point values.
2489 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
2490 FORCE_INLINE __m128 _mm_setzero_ps(void)
2492 return vreinterpretq_m128_f32(vdupq_n_f32(0));
2495 // Shuffle 16-bit integers in a using the control in imm8, and store the results in dst.
2497 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
2498 #if __has_builtin(__builtin_shufflevector)
2499 #define _mm_shuffle_pi16(a, imm) \
2501 vreinterpret_m64_s16(__builtin_shufflevector( \
2502 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2503 ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
2506 #define _mm_shuffle_pi16(a, imm) \
2510 int16x4_t ret = vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
2511 ret = vset_lane_s16( \
2512 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
2514 ret = vset_lane_s16( \
2515 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
2517 ret = vset_lane_s16( \
2518 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
2520 vreinterpret_m64_s16(ret); \
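// Illustrative sketch (not part of the original header): with the usual
// _MM_SHUFFLE encoding, _mm_shuffle_pi16(a, _MM_SHUFFLE(0, 1, 2, 3)) reverses
// the four 16-bit lanes of a.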
2524 // Guarantees that every preceding store is globally visible before any
2525 // subsequent store.
2526 // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
2527 FORCE_INLINE void _mm_sfence(void)
2529 __sync_synchronize();
2532 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) int imm)
2534 #if __has_builtin(__builtin_shufflevector)
2535 #define _mm_shuffle_ps(a, b, imm) \
2537 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2538 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2539 float32x4_t _shuf = __builtin_shufflevector( \
2540 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2541 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2542 vreinterpretq_m128_f32(_shuf); \
2545 #define _mm_shuffle_ps(a, b, imm) \
2549 case _MM_SHUFFLE(1, 0, 3, 2): \
2550 ret = _mm_shuffle_ps_1032((a), (b)); \
2552 case _MM_SHUFFLE(2, 3, 0, 1): \
2553 ret = _mm_shuffle_ps_2301((a), (b)); \
2555 case _MM_SHUFFLE(0, 3, 2, 1): \
2556 ret = _mm_shuffle_ps_0321((a), (b)); \
2558 case _MM_SHUFFLE(2, 1, 0, 3): \
2559 ret = _mm_shuffle_ps_2103((a), (b)); \
2561 case _MM_SHUFFLE(1, 0, 1, 0): \
2562 ret = _mm_movelh_ps((a), (b)); \
2564 case _MM_SHUFFLE(1, 0, 0, 1): \
2565 ret = _mm_shuffle_ps_1001((a), (b)); \
2567 case _MM_SHUFFLE(0, 1, 0, 1): \
2568 ret = _mm_shuffle_ps_0101((a), (b)); \
2570 case _MM_SHUFFLE(3, 2, 1, 0): \
2571 ret = _mm_shuffle_ps_3210((a), (b)); \
2573 case _MM_SHUFFLE(0, 0, 1, 1): \
2574 ret = _mm_shuffle_ps_0011((a), (b)); \
2576 case _MM_SHUFFLE(0, 0, 2, 2): \
2577 ret = _mm_shuffle_ps_0022((a), (b)); \
2579 case _MM_SHUFFLE(2, 2, 0, 0): \
2580 ret = _mm_shuffle_ps_2200((a), (b)); \
2582 case _MM_SHUFFLE(3, 2, 0, 2): \
2583 ret = _mm_shuffle_ps_3202((a), (b)); \
2585 case _MM_SHUFFLE(3, 2, 3, 2): \
2586 ret = _mm_movehl_ps((b), (a)); \
2588 case _MM_SHUFFLE(1, 1, 3, 3): \
2589 ret = _mm_shuffle_ps_1133((a), (b)); \
2591 case _MM_SHUFFLE(2, 0, 1, 0): \
2592 ret = _mm_shuffle_ps_2010((a), (b)); \
2594 case _MM_SHUFFLE(2, 0, 0, 1): \
2595 ret = _mm_shuffle_ps_2001((a), (b)); \
2597 case _MM_SHUFFLE(2, 0, 3, 2): \
2598 ret = _mm_shuffle_ps_2032((a), (b)); \
2601 ret = _mm_shuffle_ps_default((a), (b), (imm)); \
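// Illustrative sketch (not part of the original header): with the usual
// _MM_SHUFFLE encoding,
//   __m128 a = _mm_setr_ps(1, 2, 3, 4);
//   __m128 b = _mm_setr_ps(5, 6, 7, 8);
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2));
// selects lanes {a[2], a[3], b[0], b[1]}, i.e. r = {3, 4, 5, 6}, matching the
// _mm_shuffle_ps_1032 case above.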
2608 // Computes the approximations of square roots of the four single-precision,
2609 // floating-point values of a. First computes reciprocal square roots and then
2610 // reciprocals of the four values.
2617 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
2618 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2620 #if SSE2NEON_PRECISE_SQRT
2621 float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2623 // Test for vrsqrteq_f32(0) -> positive infinity case.
2624 // Change to zero, so that s * 1/sqrt(s) result is zero too.
2625 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2626 const uint32x4_t div_by_zero =
2627 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2628 recip = vreinterpretq_f32_u32(
2629 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2631 // Additional Newton-Raphson iteration for accuracy
2633 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2636 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2639 // sqrt(s) = s * 1/sqrt(s)
2640 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2641 #elif defined(__aarch64__)
2642 return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2644 float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2645 float32x4_t sq = vrecpeq_f32(recipsq);
2646 return vreinterpretq_m128_f32(sq);
2650 // Computes the approximation of the square root of the scalar single-precision
2651 // floating point value of in.
2652 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
2653 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2656 float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2657 return vreinterpretq_m128_f32(
2658 vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2661 // Stores four single-precision, floating-point values.
2662 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
2663 FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2665 vst1q_f32(p, vreinterpretq_f32_m128(a));
2668 // Store the lower single-precision (32-bit) floating-point element from a into
2669 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2670 // boundary or a general-protection exception may be generated.
2672 // MEM[mem_addr+31:mem_addr] := a[31:0]
2673 // MEM[mem_addr+63:mem_addr+32] := a[31:0]
2674 // MEM[mem_addr+95:mem_addr+64] := a[31:0]
2675 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2677 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
2678 FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2680 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2681 vst1q_f32(p, vdupq_n_f32(a0));
2684 // Stores the lower single-precision, floating-point value.
2685 // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
2686 FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2688 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2691 // Store the lower single-precision (32-bit) floating-point element from a into
2692 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2693 // boundary or a general-protection exception may be generated.
2695 // MEM[mem_addr+31:mem_addr] := a[31:0]
2696 // MEM[mem_addr+63:mem_addr+32] := a[31:0]
2697 // MEM[mem_addr+95:mem_addr+64] := a[31:0]
2698 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2700 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
2701 #define _mm_store1_ps _mm_store_ps1
2703 // Stores the upper two single-precision, floating-point values of a to the address p.
2709 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
2710 FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2712 *p = vreinterpret_m64_f32(vget_high_f32(a));
2715 // Stores the lower two single-precision floating point values of a to the address p.
2721 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
2722 FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2724 *p = vreinterpret_m64_f32(vget_low_f32(a));
2727 // Store 4 single-precision (32-bit) floating-point elements from a into memory
2728 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2729 // general-protection exception may be generated.
2731 // MEM[mem_addr+31:mem_addr] := a[127:96]
2732 // MEM[mem_addr+63:mem_addr+32] := a[95:64]
2733 // MEM[mem_addr+95:mem_addr+64] := a[63:32]
2734 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2736 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
2737 FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
2739 float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2740 float32x4_t rev = vextq_f32(tmp, tmp, 2);
2744 // Stores four single-precision, floating-point values.
2745 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
2746 FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2748 vst1q_f32(p, vreinterpretq_f32_m128(a));
2751 // Stores 16-bits of integer data a at the address p.
2752 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
2753 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2755 vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2758 // Stores 64-bits of integer data a at the address p.
2759 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
2760 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2762 vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2765 // Store 64-bits of integer data from a into memory using a non-temporal memory hint.
2767 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
2768 FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2770 vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2773 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2774 // point elements) from a into memory using a non-temporal memory hint.
2775 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
2776 FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
2778 #if __has_builtin(__builtin_nontemporal_store)
2779 __builtin_nontemporal_store(a, (float32x4_t *) p);
2781 vst1q_f32(p, vreinterpretq_f32_m128(a));
2785 // Subtracts the four single-precision, floating-point values of a and b.
2792 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2793 FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2795 return vreinterpretq_m128_f32(
2796 vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2799 // Subtract the lower single-precision (32-bit) floating-point element in b from
2800 // the lower single-precision (32-bit) floating-point element in a, store the
2801 // result in the lower element of dst, and copy the upper 3 packed elements from
2802 // a to the upper elements of dst.
2804 // dst[31:0] := a[31:0] - b[31:0]
2805 // dst[127:32] := a[127:32]
2807 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2808 FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2810 return _mm_move_ss(a, _mm_sub_ps(a, b));
2813 // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
2814 // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
2815 // transposed matrix in these vectors (row0 now contains column 0, etc.).
2816 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
2817 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2819 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2820 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2821 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2822 vget_low_f32(ROW23.val[0])); \
2823 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2824 vget_low_f32(ROW23.val[1])); \
2825 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2826 vget_high_f32(ROW23.val[0])); \
2827 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2828 vget_high_f32(ROW23.val[1])); \
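// Illustrative usage sketch (not part of the original header): transposing a
// 4x4 matrix held in four __m128 rows in place.
//
//   __m128 r0 = _mm_setr_ps(1, 2, 3, 4);
//   __m128 r1 = _mm_setr_ps(5, 6, 7, 8);
//   __m128 r2 = _mm_setr_ps(9, 10, 11, 12);
//   __m128 r3 = _mm_setr_ps(13, 14, 15, 16);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
//   /* r0 = {1, 5, 9, 13}, r1 = {2, 6, 10, 14}, r2 = {3, 7, 11, 15},
//      r3 = {4, 8, 12, 16} */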
2831 // according to the documentation, these intrinsics behave the same as the
2832 // non-'u' versions. We'll just alias them here.
2833 #define _mm_ucomieq_ss _mm_comieq_ss
2834 #define _mm_ucomige_ss _mm_comige_ss
2835 #define _mm_ucomigt_ss _mm_comigt_ss
2836 #define _mm_ucomile_ss _mm_comile_ss
2837 #define _mm_ucomilt_ss _mm_comilt_ss
2838 #define _mm_ucomineq_ss _mm_comineq_ss
2840 // Return vector of type __m128i with undefined elements.
2841 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
2842 FORCE_INLINE __m128i _mm_undefined_si128(void)
2844 #if defined(__GNUC__) || defined(__clang__)
2845 #pragma GCC diagnostic push
2846 #pragma GCC diagnostic ignored "-Wuninitialized"
2850 #if defined(__GNUC__) || defined(__clang__)
2851 #pragma GCC diagnostic pop
2855 // Return vector of type __m128 with undefined elements.
2856 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
2857 FORCE_INLINE __m128 _mm_undefined_ps(void)
2859 #if defined(__GNUC__) || defined(__clang__)
2860 #pragma GCC diagnostic push
2861 #pragma GCC diagnostic ignored "-Wuninitialized"
2865 #if defined(__GNUC__) || defined(__clang__)
2866 #pragma GCC diagnostic pop
2870 // Selects and interleaves the upper two single-precision, floating-point values from a and b.
2878 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
2879 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
2881 #if defined(__aarch64__)
2882 return vreinterpretq_m128_f32(
2883 vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2885 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2886 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2887 float32x2x2_t result = vzip_f32(a1, b1);
2888 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2892 // Selects and interleaves the lower two single-precision, floating-point values from a and b.
2900 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
2901 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
2903 #if defined(__aarch64__)
2904 return vreinterpretq_m128_f32(
2905 vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2907 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2908 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2909 float32x2x2_t result = vzip_f32(a1, b1);
2910 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2914 // Computes bitwise EXOR (exclusive-or) of the four single-precision,
2915 // floating-point values of a and b.
2916 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
2917 FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2919 return vreinterpretq_m128_s32(
2920 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2925 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2926 // unsigned 16-bit integers in b.
2927 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2928 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2930 return vreinterpretq_m128i_s16(
2931 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2934 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2935 // unsigned 32-bit integers in b.
2942 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2943 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2945 return vreinterpretq_m128i_s32(
2946 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2949 // Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2950 // unsigned 64-bit integers in b.
2951 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2952 FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2954 return vreinterpretq_m128i_s64(
2955 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2958 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2959 // unsigned 8-bit integers in b.
2960 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
2961 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2963 return vreinterpretq_m128i_s8(
2964 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2967 // Add packed double-precision (64-bit) floating-point elements in a and b, and
2968 // store the results in dst.
2969 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
2970 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2972 #if defined(__aarch64__)
2973 return vreinterpretq_m128d_f64(
2974 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2976 double *da = (double *) &a;
2977 double *db = (double *) &b;
2979 c[0] = da[0] + db[0];
2980 c[1] = da[1] + db[1];
2981 return vld1q_f32((float32_t *) c);
2985 // Add the lower double-precision (64-bit) floating-point element in a and b,
2986 // store the result in the lower element of dst, and copy the upper element from
2987 // a to the upper element of dst.
2989 // dst[63:0] := a[63:0] + b[63:0]
2990 // dst[127:64] := a[127:64]
2992 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
2993 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
2995 #if defined(__aarch64__)
2996 return _mm_move_sd(a, _mm_add_pd(a, b));
2998 double *da = (double *) &a;
2999 double *db = (double *) &b;
3001 c[0] = da[0] + db[0];
3003 return vld1q_f32((float32_t *) c);
3007 // Add 64-bit integers a and b, and store the result in dst.
3009 // dst[63:0] := a[63:0] + b[63:0]
3011 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
3012 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
3014 return vreinterpret_m64_s64(
3015 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
3018 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b and saturates.
3021 // r0 := SignedSaturate(a0 + b0)
3022 // r1 := SignedSaturate(a1 + b1)
3024 // r7 := SignedSaturate(a7 + b7)
3026 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
3027 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
3029 return vreinterpretq_m128i_s16(
3030 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3033 // Add packed signed 8-bit integers in a and b using saturation, and store the results in dst.
3038 // dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
3041 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
3042 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3044 return vreinterpretq_m128i_s8(
3045 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3048 // Add packed unsigned 16-bit integers in a and b using saturation, and store
3049 // the results in dst.
3050 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
3051 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
3053 return vreinterpretq_m128i_u16(
3054 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3057 // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
3058 // b and saturates.
3059 // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
3060 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3062 return vreinterpretq_m128i_u8(
3063 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3066 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
3067 // elements in a and b, and store the results in dst.
3071 // dst[i+63:i] := a[i+63:i] AND b[i+63:i]
3074 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
3075 FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
3077 return vreinterpretq_m128d_s64(
3078 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
3081 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b.
3086 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
3087 FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3089 return vreinterpretq_m128i_s32(
3090 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3093 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
3094 // elements in a and then AND with b, and store the results in dst.
3098 // dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
3101 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
3102 FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
3104 // *NOTE* argument swap
3105 return vreinterpretq_m128d_s64(
3106 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
3109 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
3110 // 128-bit value in a.
3114 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
3115 FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
3117 return vreinterpretq_m128i_s32(
3118 vbicq_s32(vreinterpretq_s32_m128i(b),
3119 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
3122 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
3123 // unsigned 16-bit integers in b and rounds.
3125 // r0 := (a0 + b0) / 2
3126 // r1 := (a1 + b1) / 2
3128 // r7 := (a7 + b7) / 2
3130 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
3131 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3133 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3134 vreinterpretq_u16_m128i(b));
3137 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
3138 // unsigned 8-bit integers in b and rounds.
3140 // r0 := (a0 + b0) / 2
3141 // r1 := (a1 + b1) / 2
3143 // r15 := (a15 + b15) / 2
3145 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
3146 FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3148 return vreinterpretq_m128i_u8(
3149 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3152 // Shift a left by imm8 bytes while shifting in zeros, and store the results in dst.
3154 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
3155 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3157 // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
3159 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
3160 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3162 // Cast vector of type __m128d to type __m128. This intrinsic is only used for
3163 // compilation and does not generate any instructions, thus it has zero latency.
3164 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
3165 FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3167 return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3170 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3171 // compilation and does not generate any instructions, thus it has zero latency.
3172 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
3173 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3175 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3178 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3179 // compilation and does not generate any instructions, thus it has zero latency.
3180 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
3181 FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3183 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3186 // Applies a type cast to reinterpret four 32-bit floating point values passed
3187 // in as a 128-bit parameter as packed 32-bit integers.
3188 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
3189 FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3191 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3194 // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3195 // compilation and does not generate any instructions, thus it has zero latency.
3196 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
3197 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3199 #if defined(__aarch64__)
3200 return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3202 return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3206 // Applies a type cast to reinterpret four 32-bit integers passed in as a
3207 // 128-bit parameter as packed 32-bit floating point values.
3208 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
3209 FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3211 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
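// Illustrative sketch (hypothetical helper, not part of sse2neon): the
// _mm_cast* family only relabels the vector type; the bits are untouched, so
// the integer pattern 0x3F800000 reads back as the float 1.0f.
FORCE_INLINE int _sse2neon_example_castsi128_ps(void)
{
    __m128i bits = vreinterpretq_m128i_s32(vdupq_n_s32(0x3F800000));
    __m128 f = _mm_castsi128_ps(bits); // reinterpret only, no conversion
    return vgetq_lane_f32(vreinterpretq_f32_m128(f), 0) == 1.0f;
}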
3214 // Cache line containing p is flushed and invalidated from all caches in the
3215 // coherency domain.
3216 // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
3217 FORCE_INLINE void _mm_clflush(void const *p)
3220 // no direct NEON equivalent, so this is a no-op
3223 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3224 // unsigned 16-bit integers in b for equality.
3225 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3226 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3228 return vreinterpretq_m128i_u16(
3229 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3232 // Compare packed 32-bit integers in a and b for equality, and store the results in dst.
3234 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3236 return vreinterpretq_m128i_u32(
3237 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3240 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3241 // unsigned 8-bit integers in b for equality.
3242 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3243 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3245 return vreinterpretq_m128i_u8(
3246 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3249 // Compare packed double-precision (64-bit) floating-point elements in a and b
3250 // for equality, and store the results in dst.
3251 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
3252 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3254 #if defined(__aarch64__)
3255 return vreinterpretq_m128d_u64(
3256 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3258 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3260 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3261 uint32x4_t swapped = vrev64q_u32(cmp);
3262 return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3266 // Compare the lower double-precision (64-bit) floating-point elements in a and
3267 // b for equality, store the result in the lower element of dst, and copy the
3268 // upper element from a to the upper element of dst.
3269 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
3270 FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3272 return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3275 // Compare packed double-precision (64-bit) floating-point elements in a and b
3276 // for greater-than-or-equal, and store the results in dst.
3277 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
3278 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3280 #if defined(__aarch64__)
3281 return vreinterpretq_m128d_u64(
3282 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3284 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3285 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3286 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3287 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3289 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3290 d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3292 return vreinterpretq_m128d_u64(vld1q_u64(d));
3296 // Compare the lower double-precision (64-bit) floating-point elements in a and
3297 // b for greater-than-or-equal, store the result in the lower element of dst,
3298 // and copy the upper element from a to the upper element of dst.
3299 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
3300 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3302 #if defined(__aarch64__)
3303 return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3305 // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3306 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3307 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3308 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3310 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3313 return vreinterpretq_m128d_u64(vld1q_u64(d));
3317 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3318 // in b for greater than.
3320 // r0 := (a0 > b0) ? 0xffff : 0x0
3321 // r1 := (a1 > b1) ? 0xffff : 0x0
3323 // r7 := (a7 > b7) ? 0xffff : 0x0
3325 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
3326 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3328 return vreinterpretq_m128i_u16(
3329 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3332 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3333 // in b for greater than.
3334 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
3335 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3337 return vreinterpretq_m128i_u32(
3338 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3341 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3342 // in b for greater than.
3344 // r0 := (a0 > b0) ? 0xff : 0x0
3345 // r1 := (a1 > b1) ? 0xff : 0x0
3347 // r15 := (a15 > b15) ? 0xff : 0x0
3349 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
3350 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3352 return vreinterpretq_m128i_u8(
3353 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
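// Illustrative sketch (hypothetical helper, not part of sse2neon): the compare
// intrinsics produce a per-lane mask, all ones where the predicate holds and
// all zeros where it does not, rather than a single boolean.
FORCE_INLINE int _sse2neon_example_cmpgt_epi8(void)
{
    __m128i a = vreinterpretq_m128i_s8(vdupq_n_s8(5));
    __m128i b = vreinterpretq_m128i_s8(vdupq_n_s8(3));
    __m128i m = _mm_cmpgt_epi8(a, b); // 5 > 3, so every lane is 0xff
    return vgetq_lane_u8(vreinterpretq_u8_m128i(m), 0) == 0xff;
}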
3356 // Compare packed double-precision (64-bit) floating-point elements in a and b
3357 // for greater-than, and store the results in dst.
3358 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
3359 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3361 #if defined(__aarch64__)
3362 return vreinterpretq_m128d_u64(
3363 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3365 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3366 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3367 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3368 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3370 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3371 d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3373 return vreinterpretq_m128d_u64(vld1q_u64(d));
3377 // Compare the lower double-precision (64-bit) floating-point elements in a and
3378 // b for greater-than, store the result in the lower element of dst, and copy
3379 // the upper element from a to the upper element of dst.
3380 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
3381 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3383 #if defined(__aarch64__)
3384 return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3386 // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
3387 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3388 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3389 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3391 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3394 return vreinterpretq_m128d_u64(vld1q_u64(d));
3398 // Compare packed double-precision (64-bit) floating-point elements in a and b
3399 // for less-than-or-equal, and store the results in dst.
3400 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
3401 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3403 #if defined(__aarch64__)
3404 return vreinterpretq_m128d_u64(
3405 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3407 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3408 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3409 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3410 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3412 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3413 d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3415 return vreinterpretq_m128d_u64(vld1q_u64(d));
3419 // Compare the lower double-precision (64-bit) floating-point elements in a and
3420 // b for less-than-or-equal, store the result in the lower element of dst, and
3421 // copy the upper element from a to the upper element of dst.
3422 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
3423 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3425 #if defined(__aarch64__)
3426 return _mm_move_sd(a, _mm_cmple_pd(a, b));
3428 // expand "_mm_cmple_pd()" to reduce unnecessary operations
3429 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3430 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3431 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3433 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3436 return vreinterpretq_m128d_u64(vld1q_u64(d));
3440 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3441 // in b for less than.
3443 // r0 := (a0 < b0) ? 0xffff : 0x0
3444 // r1 := (a1 < b1) ? 0xffff : 0x0
3446 // r7 := (a7 < b7) ? 0xffff : 0x0
3448 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
3449 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3451 return vreinterpretq_m128i_u16(
3452 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3456 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3457 // in b for less than.
3458 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
3459 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3461 return vreinterpretq_m128i_u32(
3462 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3465 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3466 // in b for less than.
3467 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
3468 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3470 return vreinterpretq_m128i_u8(
3471 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3474 // Compare packed double-precision (64-bit) floating-point elements in a and b
3475 // for less-than, and store the results in dst.
3476 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
3477 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3479 #if defined(__aarch64__)
3480 return vreinterpretq_m128d_u64(
3481 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3483 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3484 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3485 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3486 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3488 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3489 d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3491 return vreinterpretq_m128d_u64(vld1q_u64(d));
3495 // Compare the lower double-precision (64-bit) floating-point elements in a and
3496 // b for less-than, store the result in the lower element of dst, and copy the
3497 // upper element from a to the upper element of dst.
3498 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
3499 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3501 #if defined(__aarch64__)
3502 return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3504 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3505 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3506 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3508 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3511 return vreinterpretq_m128d_u64(vld1q_u64(d));
3515 // Compare packed double-precision (64-bit) floating-point elements in a and b
3516 // for not-equal, and store the results in dst.
3517 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
3518 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3520 #if defined(__aarch64__)
3521 return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3522 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3524 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3526 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3527 uint32x4_t swapped = vrev64q_u32(cmp);
3528 return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3532 // Compare the lower double-precision (64-bit) floating-point elements in a and
3533 // b for not-equal, store the result in the lower element of dst, and copy the
3534 // upper element from a to the upper element of dst.
3535 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
3536 FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3538 return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3541 // Compare packed double-precision (64-bit) floating-point elements in a and b
3542 // for not-greater-than-or-equal, and store the results in dst.
3543 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
3544 FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
3546 #if defined(__aarch64__)
3547 return vreinterpretq_m128d_u64(veorq_u64(
3548 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3549 vdupq_n_u64(UINT64_MAX)));
3551 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3552 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3553 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3554 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3557 !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3559 !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3561 return vreinterpretq_m128d_u64(vld1q_u64(d));
3565 // Compare the lower double-precision (64-bit) floating-point elements in a and
3566 // b for not-greater-than-or-equal, store the result in the lower element of
3567 // dst, and copy the upper element from a to the upper element of dst.
3568 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
3569 FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
3571 return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
3574 // Compare packed double-precision (64-bit) floating-point elements in a and b
3575 // for not-greater-than, and store the results in dst.
3576 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_pd
3577 FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
3579 #if defined(__aarch64__)
3580 return vreinterpretq_m128d_u64(veorq_u64(
3581 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3582 vdupq_n_u64(UINT64_MAX)));
3584 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3585 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3586 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3587 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3590 !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3592 !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3594 return vreinterpretq_m128d_u64(vld1q_u64(d));
3598 // Compare the lower double-precision (64-bit) floating-point elements in a and
3599 // b for not-greater-than, store the result in the lower element of dst, and
3600 // copy the upper element from a to the upper element of dst.
3601 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
3602 FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
3604 return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
3607 // Compare packed double-precision (64-bit) floating-point elements in a and b
3608 // for not-less-than-or-equal, and store the results in dst.
3609 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
3610 FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
3612 #if defined(__aarch64__)
3613 return vreinterpretq_m128d_u64(veorq_u64(
3614 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3615 vdupq_n_u64(UINT64_MAX)));
3617 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3618 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3619 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3620 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3623 !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3625 !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3627 return vreinterpretq_m128d_u64(vld1q_u64(d));
3631 // Compare the lower double-precision (64-bit) floating-point elements in a and
3632 // b for not-less-than-or-equal, store the result in the lower element of dst,
3633 // and copy the upper element from a to the upper element of dst.
3634 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
3635 FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
3637 return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
3640 // Compare packed double-precision (64-bit) floating-point elements in a and b
3641 // for not-less-than, and store the results in dst.
3642 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
3643 FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
3645 #if defined(__aarch64__)
3646 return vreinterpretq_m128d_u64(veorq_u64(
3647 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3648 vdupq_n_u64(UINT64_MAX)));
3650 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3651 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3652 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3653 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3656 !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3658 !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3660 return vreinterpretq_m128d_u64(vld1q_u64(d));
3664 // Compare the lower double-precision (64-bit) floating-point elements in a and
3665 // b for not-less-than, store the result in the lower element of dst, and copy
3666 // the upper element from a to the upper element of dst.
3667 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
3668 FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
3670 return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
3673 // Compare packed double-precision (64-bit) floating-point elements in a and b
3674 // to see if neither is NaN, and store the results in dst.
3675 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
3676 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3678 #if defined(__aarch64__)
3679 // Excluding NaNs, a floating-point value always compares equal to itself.
3680 uint64x2_t not_nan_a =
3681 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3682 uint64x2_t not_nan_b =
3683 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3684 return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3686 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3687 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3688 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3689 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3691 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3692 (*(double *) &b0) == (*(double *) &b0))
3695 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3696 (*(double *) &b1) == (*(double *) &b1))
3700 return vreinterpretq_m128d_u64(vld1q_u64(d));
3704 // Compare the lower double-precision (64-bit) floating-point elements in a and
3705 // b to see if neither is NaN, store the result in the lower element of dst, and
3706 // copy the upper element from a to the upper element of dst.
3707 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
3708 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3710 #if defined(__aarch64__)
3711 return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3713 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3714 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3715 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3717 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3718 (*(double *) &b0) == (*(double *) &b0))
3723 return vreinterpretq_m128d_u64(vld1q_u64(d));
3727 // Compare packed double-precision (64-bit) floating-point elements in a and b
3728 // to see if either is NaN, and store the results in dst.
3729 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
3730 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3732 #if defined(__aarch64__)
3733 // A NaN never compares equal to anything, not even to itself.
3734 uint64x2_t not_nan_a =
3735 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3736 uint64x2_t not_nan_b =
3737 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3738 return vreinterpretq_m128d_s32(
3739 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3741 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3742 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3743 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3744 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3746 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3747 (*(double *) &b0) == (*(double *) &b0))
3750 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3751 (*(double *) &b1) == (*(double *) &b1))
3755 return vreinterpretq_m128d_u64(vld1q_u64(d));
3759 // Compare the lower double-precision (64-bit) floating-point elements in a and
3760 // b to see if either is NaN, store the result in the lower element of dst, and
3761 // copy the upper element from a to the upper element of dst.
3762 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
3763 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3765 #if defined(__aarch64__)
3766 return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3768 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3769 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3770 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3772 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3773 (*(double *) &b0) == (*(double *) &b0))
3778 return vreinterpretq_m128d_u64(vld1q_u64(d));
3782 // Compare the lower double-precision (64-bit) floating-point element in a and b
3783 // for greater-than-or-equal, and return the boolean result (0 or 1).
3784 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
3785 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3787 #if defined(__aarch64__)
3788 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3790 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3791 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3793 return (*(double *) &a0 >= *(double *) &b0);
3797 // Compare the lower double-precision (64-bit) floating-point element in a and b
3798 // for greater-than, and return the boolean result (0 or 1).
3799 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
3800 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3802 #if defined(__aarch64__)
3803 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3805 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3806 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3808 return (*(double *) &a0 > *(double *) &b0);
3812 // Compare the lower double-precision (64-bit) floating-point element in a and b
3813 // for less-than-or-equal, and return the boolean result (0 or 1).
3814 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
3815 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3817 #if defined(__aarch64__)
3818 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3820 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3821 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3823 return (*(double *) &a0 <= *(double *) &b0);
3827 // Compare the lower double-precision (64-bit) floating-point element in a and b
3828 // for less-than, and return the boolean result (0 or 1).
3829 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
3830 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3832 #if defined(__aarch64__)
3833 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3835 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3836 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3838 return (*(double *) &a0 < *(double *) &b0);
3842 // Compare the lower double-precision (64-bit) floating-point element in a and b
3843 // for equality, and return the boolean result (0 or 1).
3844 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
3845 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3847 #if defined(__aarch64__)
3848 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3850 uint32x4_t a_not_nan =
3851 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3852 uint32x4_t b_not_nan =
3853 vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3854 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3856 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3857 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3858 vreinterpretq_u64_u32(a_eq_b));
3859 return vgetq_lane_u64(and_results, 0) & 0x1;
3863 // Compare the lower double-precision (64-bit) floating-point element in a and b
3864 // for not-equal, and return the boolean result (0 or 1).
3865 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
3866 FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3868 return !_mm_comieq_sd(a, b);
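// Illustrative sketch (hypothetical helper, not part of sse2neon): unlike the
// _mm_cmp*_sd intrinsics above, the _mm_comi* family reduces the lower-lane
// comparison to a plain int of 0 or 1. The doubles are loaded through a
// uint64_t view so the example works on both AArch64 and ARMv7-A.
FORCE_INLINE int _sse2neon_example_comieq_sd(void)
{
    double va[2] = {1.0, 9.0};
    double vb[2] = {1.0, -9.0};
    __m128d a = vreinterpretq_m128d_u64(vld1q_u64((const uint64_t *) va));
    __m128d b = vreinterpretq_m128d_u64(vld1q_u64((const uint64_t *) vb));
    // lower lanes are equal; the upper lanes are ignored
    return _mm_comieq_sd(a, b) == 1 && _mm_comineq_sd(a, b) == 0;
}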
3871 // Convert packed signed 32-bit integers in a to packed double-precision
3872 // (64-bit) floating-point elements, and store the results in dst.
3877 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3880 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
3881 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3883 #if defined(__aarch64__)
3884 return vreinterpretq_m128d_f64(
3885 vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3887 double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3888 double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3889 return _mm_set_pd(a1, a0);
3893 // Converts the four signed 32-bit integer values of a to single-precision,
3894 // floating-point values
3895 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
3896 FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3898 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
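// Illustrative sketch (hypothetical helper, not part of sse2neon): this is a
// genuine value conversion, in contrast to the _mm_cast* reinterprets above,
// so the integer 3 becomes the float 3.0f in every lane.
FORCE_INLINE int _sse2neon_example_cvtepi32_ps(void)
{
    __m128i a = vreinterpretq_m128i_s32(vdupq_n_s32(3));
    __m128 f = _mm_cvtepi32_ps(a);
    return vgetq_lane_f32(vreinterpretq_f32_m128(f), 0) == 3.0f;
}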
3901 // Convert packed double-precision (64-bit) floating-point elements in a to
3902 // packed 32-bit integers, and store the results in dst.
3907 // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3910 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
3911 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3913 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3914 double d0 = ((double *) &rnd)[0];
3915 double d1 = ((double *) &rnd)[1];
3916 return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3919 // Convert packed double-precision (64-bit) floating-point elements in a to
3920 // packed 32-bit integers, and store the results in dst.
3925 // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3928 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
3929 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3931 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3932 double d0 = ((double *) &rnd)[0];
3933 double d1 = ((double *) &rnd)[1];
3934 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3935 return vreinterpret_m64_s32(vld1_s32(data));
3938 // Convert packed double-precision (64-bit) floating-point elements in a to
3939 // packed single-precision (32-bit) floating-point elements, and store the results in dst.
3945 // dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
3949 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
3950 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3952 #if defined(__aarch64__)
3953 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3954 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3956 float a0 = (float) ((double *) &a)[0];
3957 float a1 = (float) ((double *) &a)[1];
3958 return _mm_set_ps(0, 0, a1, a0);
3962 // Convert packed signed 32-bit integers in a to packed double-precision
3963 // (64-bit) floating-point elements, and store the results in dst.
3968 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3971 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
3972 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
3974 #if defined(__aarch64__)
3975 return vreinterpretq_m128d_f64(
3976 vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
3978 double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
3979 double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
3980 return _mm_set_pd(a1, a0);
3984 // Converts the four single-precision, floating-point values of a to signed
3985 // 32-bit integer values.
3992 // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
3993 // *NOTE* The default rounding mode on SSE is 'round to nearest even', which
3994 // ARMv7-A does not support. It is supported on ARMv8-A.
3995 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3997 #if defined(__aarch64__)
3998 switch (_MM_GET_ROUNDING_MODE()) {
3999 case _MM_ROUND_NEAREST:
4000 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4001 case _MM_ROUND_DOWN:
4002 return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
4004 return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
4005 default: // _MM_ROUND_TOWARD_ZERO
4006 return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
4009 float *f = (float *) &a;
4010 switch (_MM_GET_ROUNDING_MODE()) {
4011 case _MM_ROUND_NEAREST: {
4012 uint32x4_t signmask = vdupq_n_u32(0x80000000);
4013 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4014 vdupq_n_f32(0.5f)); /* +/- 0.5 */
4015 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4016 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4017 int32x4_t r_trunc = vcvtq_s32_f32(
4018 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4019 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4020 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4021 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4022 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4023 float32x4_t delta = vsubq_f32(
4024 vreinterpretq_f32_m128(a),
4025 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4026 uint32x4_t is_delta_half =
4027 vceqq_f32(delta, half); /* delta == +/- 0.5 */
4028 return vreinterpretq_m128i_s32(
4029 vbslq_s32(is_delta_half, r_even, r_normal));
4031 case _MM_ROUND_DOWN:
4032 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
4035 return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
4037 default: // _MM_ROUND_TOWARD_ZERO
4038 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
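// Illustrative sketch (hypothetical helper, not part of sse2neon), assuming
// the rounding mode is still at its default of round-to-nearest-even: exact
// halves round to the nearest even integer, so 0.5 -> 0, 1.5 -> 2, 2.5 -> 2.
FORCE_INLINE int _sse2neon_example_cvtps_epi32(void)
{
    float vals[4] = {0.5f, 1.5f, 2.5f, -1.5f};
    __m128i r = _mm_cvtps_epi32(vreinterpretq_m128_f32(vld1q_f32(vals)));
    return vgetq_lane_s32(vreinterpretq_s32_m128i(r), 0) == 0 &&
           vgetq_lane_s32(vreinterpretq_s32_m128i(r), 1) == 2 &&
           vgetq_lane_s32(vreinterpretq_s32_m128i(r), 2) == 2 &&
           vgetq_lane_s32(vreinterpretq_s32_m128i(r), 3) == -2;
}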
4044 // Convert packed single-precision (32-bit) floating-point elements in a to
4045 // packed double-precision (64-bit) floating-point elements, and store the results in dst.
4051 // dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4054 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
4055 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4057 #if defined(__aarch64__)
4058 return vreinterpretq_m128d_f64(
4059 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4061 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4062 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4063 return _mm_set_pd(a1, a0);
4067 // Copy the lower double-precision (64-bit) floating-point element of a to dst.
4069 // dst[63:0] := a[63:0]
4071 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
4072 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4074 #if defined(__aarch64__)
4075 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4077 return ((double *) &a)[0];
4081 // Convert the lower double-precision (64-bit) floating-point element in a to a
4082 // 32-bit integer, and store the result in dst.
4084 // dst[31:0] := Convert_FP64_To_Int32(a[63:0])
4086 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
4087 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
4089 #if defined(__aarch64__)
4090 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4092 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4093 double ret = ((double *) &rnd)[0];
4094 return (int32_t) ret;
4098 // Convert the lower double-precision (64-bit) floating-point element in a to a
4099 // 64-bit integer, and store the result in dst.
4101 // dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4103 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
4104 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
4106 #if defined(__aarch64__)
4107 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4109 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4110 double ret = ((double *) &rnd)[0];
4111 return (int64_t) ret;
4115 // Convert the lower double-precision (64-bit) floating-point element in a to a
4116 // 64-bit integer, and store the result in dst.
4118 // dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4120 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
4121 #define _mm_cvtsd_si64x _mm_cvtsd_si64
4123 // Convert the lower double-precision (64-bit) floating-point element in b to a
4124 // single-precision (32-bit) floating-point element, store the result in the
4125 // lower element of dst, and copy the upper 3 packed elements from a to the
4126 // upper elements of dst.
4127 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
4128 FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4130 #if defined(__aarch64__)
4131 return vreinterpretq_m128_f32(vsetq_lane_f32(
4132 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4133 vreinterpretq_f32_m128(a), 0));
4135 return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4136 vreinterpretq_f32_m128(a), 0));
4140 // Copy the lower 32-bit integer in a to dst.
4142 // dst[31:0] := a[31:0]
4144 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
4145 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4147 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4150 // Copy the lower 64-bit integer in a to dst.
4152 // dst[63:0] := a[63:0]
4154 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
4155 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4157 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4160 // Copy the lower 64-bit integer in a to dst.
4161 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4162 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4164 // Convert the signed 32-bit integer b to a double-precision (64-bit)
4165 // floating-point element, store the result in the lower element of dst, and
4166 // copy the upper element from a to the upper element of dst.
4167 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
4168 FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4170 #if defined(__aarch64__)
4171 return vreinterpretq_m128d_f64(
4172 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4174 double bf = (double) b;
4175 return vreinterpretq_m128d_s64(
4176 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4180 // Copy the lower 64-bit integer in a to dst.
4182 // dst[63:0] := a[63:0]
4184 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4185 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4187 // Moves 32-bit integer a to the least significant 32 bits of an __m128i
4188 // object, zero-extending the upper bits.
4195 // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4196 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4198 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
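// Illustrative sketch (hypothetical helper, not part of sse2neon): only lane 0
// receives the scalar; the three upper lanes are zeroed, and the value can be
// read back with _mm_cvtsi128_si32.
FORCE_INLINE int _sse2neon_example_cvtsi32_si128(void)
{
    __m128i v = _mm_cvtsi32_si128(42);
    return _mm_cvtsi128_si32(v) == 42 &&
           vgetq_lane_s32(vreinterpretq_s32_m128i(v), 3) == 0;
}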
4201 // Convert the signed 64-bit integer b to a double-precision (64-bit)
4202 // floating-point element, store the result in the lower element of dst, and
4203 // copy the upper element from a to the upper element of dst.
4204 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
4205 FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4207 #if defined(__aarch64__)
4208 return vreinterpretq_m128d_f64(
4209 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4211 double bf = (double) b;
4212 return vreinterpretq_m128d_s64(
4213 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4217 // Moves 64-bit integer a to the least significant 64 bits of an __m128i
4218 // object, zero-extending the upper bits.
4222 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4224 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4227 // Copy 64-bit integer a to the lower element of dst, and zero the upper element.
4229 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
4230 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4232 // Convert the signed 64-bit integer b to a double-precision (64-bit)
4233 // floating-point element, store the result in the lower element of dst, and
4234 // copy the upper element from a to the upper element of dst.
4235 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
4236 #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4238 // Convert the lower single-precision (32-bit) floating-point element in b to a
4239 // double-precision (64-bit) floating-point element, store the result in the
4240 // lower element of dst, and copy the upper element from a to the upper element of dst.
4243 // dst[63:0] := Convert_FP32_To_FP64(b[31:0])
4244 // dst[127:64] := a[127:64]
4246 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
4247 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4249 double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4250 #if defined(__aarch64__)
4251 return vreinterpretq_m128d_f64(
4252 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4254 return vreinterpretq_m128d_s64(
4255 vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4259 // Convert packed double-precision (64-bit) floating-point elements in a to
4260 // packed 32-bit integers with truncation, and store the results in dst.
4261 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
4262 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4264 double a0 = ((double *) &a)[0];
4265 double a1 = ((double *) &a)[1];
4266 return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4269 // Convert packed double-precision (64-bit) floating-point elements in a to
4270 // packed 32-bit integers with truncation, and store the results in dst.
4271 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
4272 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4274 double a0 = ((double *) &a)[0];
4275 double a1 = ((double *) &a)[1];
4276 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4277 return vreinterpret_m64_s32(vld1_s32(data));
4280 // Converts the four single-precision, floating-point values of a to signed
4281 // 32-bit integer values using truncate.
4282 // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4283 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4285 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4288 // Convert the lower double-precision (64-bit) floating-point element in a to a
4289 // 32-bit integer with truncation, and store the result in dst.
4291 // dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
4293 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
4294 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4296 double ret = *((double *) &a);
4297 return (int32_t) ret;
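// Illustrative sketch (hypothetical helper, not part of sse2neon): the extra
// 't' in _mm_cvttsd_si32 means truncation toward zero, so -1.7 becomes -1
// regardless of the current rounding mode. The double is loaded through a
// uint64_t view so the example works on both AArch64 and ARMv7-A.
FORCE_INLINE int _sse2neon_example_cvttsd_si32(void)
{
    double v[2] = {-1.7, 8.0};
    __m128d a = vreinterpretq_m128d_u64(vld1q_u64((const uint64_t *) v));
    return _mm_cvttsd_si32(a) == -1; // truncated, not rounded to -2
}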
4300 // Convert the lower double-precision (64-bit) floating-point element in a to a
4301 // 64-bit integer with truncation, and store the result in dst.
4303 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4305 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
4306 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4308 #if defined(__aarch64__)
4309 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4311 double ret = *((double *) &a);
4312 return (int64_t) ret;
4316 // Convert the lower double-precision (64-bit) floating-point element in a to a
4317 // 64-bit integer with truncation, and store the result in dst.
4319 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4321 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
4322 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4324 // Divide packed double-precision (64-bit) floating-point elements in a by
4325 // packed elements in b, and store the results in dst.
4329 // dst[i+63:i] := a[i+63:i] / b[i+63:i]
4332 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
4333 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4335 #if defined(__aarch64__)
4336 return vreinterpretq_m128d_f64(
4337 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4339 double *da = (double *) &a;
4340 double *db = (double *) &b;
4342 c[0] = da[0] / db[0];
4343 c[1] = da[1] / db[1];
4344 return vld1q_f32((float32_t *) c);
4348 // Divide the lower double-precision (64-bit) floating-point element in a by the
4349 // lower double-precision (64-bit) floating-point element in b, store the result
4350 // in the lower element of dst, and copy the upper element from a to the upper element of dst.
4352 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
4353 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4355 #if defined(__aarch64__)
4357 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4358 return vreinterpretq_m128d_f64(
4359 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4361 return _mm_move_sd(a, _mm_div_pd(a, b));
4365 // Extracts the selected signed or unsigned 16-bit integer from a and zero-extends it.
4367 // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
4368 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4369 #define _mm_extract_epi16(a, imm) \
4370 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
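// Illustrative sketch (hypothetical helper, not part of sse2neon): imm selects
// the lane and the 16-bit value comes back zero-extended, so extracting -1
// yields 0xffff rather than -1.
FORCE_INLINE int _sse2neon_example_extract_epi16(void)
{
    __m128i a = vreinterpretq_m128i_s16(vdupq_n_s16(-1));
    return _mm_extract_epi16(a, 3) == 0xFFFF;
}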
4372 // Inserts the least significant 16 bits of b into the selected 16-bit integer of a.
4374 // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
4375 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4376 // __constrange(0,8) int imm)
4377 #define _mm_insert_epi16(a, b, imm) \
4379 vreinterpretq_m128i_s16( \
4380 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
4383 // Loads two double-precision (64-bit) floating-point values from 16-byte aligned memory.
4386 // dst[127:0] := MEM[mem_addr+127:mem_addr]
4388 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
4389 FORCE_INLINE __m128d _mm_load_pd(const double *p)
4391 #if defined(__aarch64__)
4392 return vreinterpretq_m128d_f64(vld1q_f64(p));
4394 const float *fp = (const float *) p;
4395 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4396 return vreinterpretq_m128d_f32(vld1q_f32(data));
4400 // Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
4403 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4404 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4406 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
4407 #define _mm_load_pd1 _mm_load1_pd
4409 // Load a double-precision (64-bit) floating-point element from memory into the
4410 // lower of dst, and zero the upper element. mem_addr does not need to be
4411 // aligned on any particular boundary.
4413 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4416 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
4417 FORCE_INLINE __m128d _mm_load_sd(const double *p)
4419 #if defined(__aarch64__)
4420 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4422 const float *fp = (const float *) p;
4423 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4424 return vreinterpretq_m128d_f32(vld1q_f32(data));
4428 // Loads a 128-bit value.
4429 // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4430 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4432 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4435 // Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
4438 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4439 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4441 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
4442 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4444 #if defined(__aarch64__)
4445 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4447 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4451 // Load a double-precision (64-bit) floating-point element from memory into the
4452 // upper element of dst, and copy the lower element from a to dst. mem_addr does
4453 // not need to be aligned on any particular boundary.
4455 // dst[63:0] := a[63:0]
4456 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4458 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
4459 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4461 #if defined(__aarch64__)
4462 return vreinterpretq_m128d_f64(
4463 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4465 return vreinterpretq_m128d_f32(vcombine_f32(
4466 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4470 // Load 64-bit integer from memory into the first element of dst.
4471 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
4472 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4474 /* Load the lower 64 bits of the value pointed to by p into the
4475 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4477 return vreinterpretq_m128i_s32(
4478 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4481 // Load a double-precision (64-bit) floating-point element from memory into the
4482 // lower element of dst, and copy the upper element from a to dst. mem_addr does
4483 // not need to be aligned on any particular boundary.
4485 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4486 // dst[127:64] := a[127:64]
4488 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
4489 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4491 #if defined(__aarch64__)
4492 return vreinterpretq_m128d_f64(
4493 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4495 return vreinterpretq_m128d_f32(
4496 vcombine_f32(vld1_f32((const float *) p),
4497 vget_high_f32(vreinterpretq_f32_m128d(a))));
4501 // Load 2 double-precision (64-bit) floating-point elements from memory into dst
4502 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4503 // general-protection exception may be generated.
4505 // dst[63:0] := MEM[mem_addr+127:mem_addr+64]
4506 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4508 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
4509 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4511 #if defined(__aarch64__)
4512 float64x2_t v = vld1q_f64(p);
4513 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4515 int64x2_t v = vld1q_s64((const int64_t *) p);
4516 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4520 // Loads two double-precision (64-bit) floating-point values from unaligned memory.
4521 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
4522 FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4524 return _mm_load_pd(p);
4527 // Loads a 128-bit value from unaligned memory.
4528 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4529 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4531 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4534 // Load unaligned 32-bit integer from memory into the first element of dst.
4536 // dst[31:0] := MEM[mem_addr+31:mem_addr]
4539 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
4540 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4542 return vreinterpretq_m128i_s32(
4543 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4546 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b.
4549 // r0 := (a0 * b0) + (a1 * b1)
4550 // r1 := (a2 * b2) + (a3 * b3)
4551 // r2 := (a4 * b4) + (a5 * b5)
4552 // r3 := (a6 * b6) + (a7 * b7)
4553 // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
4554 FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4556 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4557 vget_low_s16(vreinterpretq_s16_m128i(b)));
4558 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4559 vget_high_s16(vreinterpretq_s16_m128i(b)));
4561 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4562 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4564 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
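// Illustrative sketch (hypothetical helper, not part of sse2neon): each 32-bit
// output lane is the sum of two adjacent 16-bit products, so with every a lane
// equal to 2 and every b lane equal to 3 each output lane is 2*3 + 2*3 = 12.
FORCE_INLINE int _sse2neon_example_madd_epi16(void)
{
    __m128i a = vreinterpretq_m128i_s16(vdupq_n_s16(2));
    __m128i b = vreinterpretq_m128i_s16(vdupq_n_s16(3));
    __m128i r = _mm_madd_epi16(a, b);
    return vgetq_lane_s32(vreinterpretq_s32_m128i(r), 0) == 12;
}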
4567 // Conditionally store 8-bit integer elements from a into memory using mask
4568 // (elements are not stored when the highest bit is not set in the corresponding
4569 // element) and a non-temporal memory hint. mem_addr does not need to be aligned
4570 // on any particular boundary.
4571 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
4572 FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4574 int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4575 __m128 b = _mm_load_ps((const float *) mem_addr);
4577 vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4578 vreinterpretq_s8_m128(b));
4579 vst1q_s8((int8_t *) mem_addr, masked);
4582 // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
4583 // signed 16-bit integers from b.
4584 // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
4585 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4587 return vreinterpretq_m128i_s16(
4588 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4591 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
4592 // 16 unsigned 8-bit integers from b.
4593 // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
4594 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4596 return vreinterpretq_m128i_u8(
4597 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4600 // Compare packed double-precision (64-bit) floating-point elements in a and b,
4601 // and store packed maximum values in dst.
4602 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
4603 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4605 #if defined(__aarch64__)
4606 #if SSE2NEON_PRECISE_MINMAX
4607 float64x2_t _a = vreinterpretq_f64_m128d(a);
4608 float64x2_t _b = vreinterpretq_f64_m128d(b);
4609 return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4611 return vreinterpretq_m128d_f64(
4612 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4615 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4616 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4617 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4618 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4620 d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4621 d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4623 return vreinterpretq_m128d_u64(vld1q_u64(d));
4627 // Compare the lower double-precision (64-bit) floating-point elements in a and
4628 // b, store the maximum value in the lower element of dst, and copy the upper
4629 // element from a to the upper element of dst.
4630 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
4631 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4633 #if defined(__aarch64__)
4634 return _mm_move_sd(a, _mm_max_pd(a, b));
4636 double *da = (double *) &a;
4637 double *db = (double *) &b;
4638 double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4639 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4643 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
4644 // signed 16-bit integers from b.
4645 // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
4646 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4648 return vreinterpretq_m128i_s16(
4649 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4652 // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
4653 // 16 unsigned 8-bit integers from b.
4654 // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
4655 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4657 return vreinterpretq_m128i_u8(
4658 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4661 // Compare packed double-precision (64-bit) floating-point elements in a and b,
4662 // and store packed minimum values in dst.
4663 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
4664 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4666 #if defined(__aarch64__)
4667 #if SSE2NEON_PRECISE_MINMAX
4668 float64x2_t _a = vreinterpretq_f64_m128d(a);
4669 float64x2_t _b = vreinterpretq_f64_m128d(b);
4670 return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4672 return vreinterpretq_m128d_f64(
4673 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4676 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4677 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4678 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4679 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4681 d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4682 d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4683 return vreinterpretq_m128d_u64(vld1q_u64(d));
4687 // Compare the lower double-precision (64-bit) floating-point elements in a and
4688 // b, store the minimum value in the lower element of dst, and copy the upper
4689 // element from a to the upper element of dst.
4690 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
4691 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4693 #if defined(__aarch64__)
4694 return _mm_move_sd(a, _mm_min_pd(a, b));
4696 double *da = (double *) &a;
4697 double *db = (double *) &b;
4698 double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4699 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4703 // Copy the lower 64-bit integer in a to the lower element of dst, and zero the upper element.
4706 // dst[63:0] := a[63:0]
4709 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
4710 FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4712 return vreinterpretq_m128i_s64(
4713 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4716 // Move the lower double-precision (64-bit) floating-point element from b to the
4717 // lower element of dst, and copy the upper element from a to the upper element of dst.
4720 // dst[63:0] := b[63:0]
4721 // dst[127:64] := a[127:64]
4723 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
4724 FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4726 return vreinterpretq_m128d_f32(
4727 vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4728 vget_high_f32(vreinterpretq_f32_m128d(a))));
4731 // NEON does not provide a version of this function.
4732 // Creates a 16-bit mask from the most significant bits of the 16 signed or
4733 // unsigned 8-bit integers in a and zero extends the upper bits.
4734 // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
4735 FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4737 // Use increasingly wide shifts+adds to collect the sign bits
4739 // Since the widening shifts would be rather confusing to follow in little
4740 // endian, everything will be illustrated in big endian order instead. This
4741 // has a different result - the bits would actually be reversed on a big endian machine.
4744 // Starting input (only half the elements are shown):
4745 // 89 ff 1d c0 00 10 99 33
4746 uint8x16_t input = vreinterpretq_u8_m128i(a);
4748 // Shift out everything but the sign bits with an unsigned shift right.
4750 // Bytes of the vector:
4751 // 89 ff 1d c0 00 10 99 33
4752 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4754 // 01 01 00 01 00 00 01 00
4756 // Bits of first important lane(s):
4761 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4763 // Merge the even lanes together with a 16-bit unsigned shift right + add.
4764 // 'xx' represents garbage data which will be ignored in the final result.
4765 // In the important bytes, the add functions like a binary OR.
4767 // 01 01 00 01 00 00 01 00
4768 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4770 // xx 03 xx 01 xx 00 xx 02
4772 // 00000001 00000001 (01 01)
4775 // xxxxxxxx xxxxxx11 (xx 03)
4776 uint32x4_t paired16 =
4777 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4779 // Repeat with a wider 32-bit shift + add.
4780 // xx 03 xx 01 xx 00 xx 02
4781 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
4784 // xx xx xx 0d xx xx xx 02
4786 // 00000011 00000001 (03 01)
4789 // xxxxxxxx xxxx1101 (xx 0d)
4790 uint64x2_t paired32 =
4791 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4793 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4794 // lanes. xx xx xx 0d xx xx xx 02
4795 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
4798 // xx xx xx xx xx xx xx d2
4800 // 00001101 00000010 (0d 02)
4803 // xxxxxxxx 11010010 (xx d2)
4804 uint8x16_t paired64 =
4805 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4807 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4808 // xx xx xx xx xx xx xx d2
4809 // || return paired64[0]
4811 // Note: Little endian would return the correct value 4b (01001011) instead.
4812 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
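// Illustrative usage sketch (hypothetical helper, not part of the original
// header): the returned 16-bit mask has one bit per byte, so a non-zero mask
// means at least one byte in v has its most significant bit set.
FORCE_INLINE int _sse2neon_example_any_byte_msb(__m128i v)
{
    return _mm_movemask_epi8(v) != 0;
}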
4815 // Set each bit of mask dst based on the most significant bit of the
4816 // corresponding packed double-precision (64-bit) floating-point element in a.
4817 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
4818 FORCE_INLINE int _mm_movemask_pd(__m128d a)
4820 uint64x2_t input = vreinterpretq_u64_m128d(a);
4821 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4822 return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
4825 // Copy the lower 64-bit integer in a to dst.
4827 // dst[63:0] := a[63:0]
4829 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
4830 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4832 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4835 // Copy the 64-bit integer a to the lower element of dst, and zero the upper element.
4838 // dst[63:0] := a[63:0]
4841 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
4842 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4844 return vreinterpretq_m128i_s64(
4845 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4848 // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4849 // a and b, and store the unsigned 64-bit results in dst.
4851 // r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
4852 // r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
4853 FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4855 // vmull_u32 upcasts instead of masking, so we downcast.
4856 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4857 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4858 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4861 // Multiply packed double-precision (64-bit) floating-point elements in a and b,
4862 // and store the results in dst.
4863 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
4864 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4866 #if defined(__aarch64__)
4867 return vreinterpretq_m128d_f64(
4868 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4870 double *da = (double *) &a;
4871 double *db = (double *) &b;
4872 double c[2];
4873 c[0] = da[0] * db[0];
4874 c[1] = da[1] * db[1];
4875 return vld1q_f32((float32_t *) c);
4879 // Multiply the lower double-precision (64-bit) floating-point element in a and
4880 // b, store the result in the lower element of dst, and copy the upper element
4881 // from a to the upper element of dst.
4882 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
4883 FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4885 return _mm_move_sd(a, _mm_mul_pd(a, b));
4888 // Multiply the low unsigned 32-bit integers from a and b, and store the
4889 // unsigned 64-bit result in dst.
4891 // dst[63:0] := a[31:0] * b[31:0]
4893 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
4894 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4896 return vreinterpret_m64_u64(vget_low_u64(
4897 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4900 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b.
4903 // r0 := (a0 * b0)[31:16]
4904 // r1 := (a1 * b1)[31:16]
4906 // r7 := (a7 * b7)[31:16]
4908 // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
4909 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4911 /* FIXME: issue with large values because of result saturation */
4912 // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4913 // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4914 // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4915 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4916 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4917 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4918 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4919 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4920 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4921 uint16x8x2_t r =
4922 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4923 return vreinterpretq_m128i_u16(r.val[1]);
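// Illustrative usage sketch (hypothetical helper, not part of the original
// header): keeping only the high 16 bits of the 32-bit product applies a
// Q15-style scale factor; with 16384 (0.25 * 65536) each signed lane ends up
// arithmetically shifted right by two.
FORCE_INLINE __m128i _sse2neon_example_mulhi_quarter(__m128i a)
{
    __m128i k = vreinterpretq_m128i_s16(vdupq_n_s16(16384));
    return _mm_mulhi_epi16(a, k);
}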
4926 // Multiply the packed unsigned 16-bit integers in a and b, producing
4927 // intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
4929 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
4930 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4932 uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4933 uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4934 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4935 #if defined(__aarch64__)
4936 uint32x4_t ab7654 =
4937 vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4938 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4939 vreinterpretq_u16_u32(ab7654));
4940 return vreinterpretq_m128i_u16(r);
4942 uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4943 uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4944 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4945 uint16x8x2_t r =
4946 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4947 return vreinterpretq_m128i_u16(r.val[1]);
4951 // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
4952 // unsigned 16-bit integers from b.
4954 // r0 := (a0 * b0)[15:0]
4955 // r1 := (a1 * b1)[15:0]
4957 // r7 := (a7 * b7)[15:0]
4959 // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
4960 FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4962 return vreinterpretq_m128i_s16(
4963 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4966 // Compute the bitwise OR of packed double-precision (64-bit) floating-point
4967 // elements in a and b, and store the results in dst.
4968 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
4969 FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4971 return vreinterpretq_m128d_s64(
4972 vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4975 // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
4979 // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
4980 FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4982 return vreinterpretq_m128i_s32(
4983 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4986 // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates.
4988 // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
4989 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4991 return vreinterpretq_m128i_s8(
4992 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4993 vqmovn_s16(vreinterpretq_s16_m128i(b))));
4996 // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates.
4999 // r0 := SignedSaturate(a0)
5000 // r1 := SignedSaturate(a1)
5001 // r2 := SignedSaturate(a2)
5002 // r3 := SignedSaturate(a3)
5003 // r4 := SignedSaturate(b0)
5004 // r5 := SignedSaturate(b1)
5005 // r6 := SignedSaturate(b2)
5006 // r7 := SignedSaturate(b3)
5008 // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
5009 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
5011 return vreinterpretq_m128i_s16(
5012 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
5013 vqmovn_s32(vreinterpretq_s32_m128i(b))));
5016 // Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
5017 // integers and saturates.
5019 // r0 := UnsignedSaturate(a0)
5020 // r1 := UnsignedSaturate(a1)
5022 // r7 := UnsignedSaturate(a7)
5023 // r8 := UnsignedSaturate(b0)
5024 // r9 := UnsignedSaturate(b1)
5026 // r15 := UnsignedSaturate(b7)
5028 // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
5029 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
5031 return vreinterpretq_m128i_u8(
5032 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
5033 vqmovun_s16(vreinterpretq_s16_m128i(b))));
5036 // Pause the processor. This is typically used in spin-wait loops and depending
5037 // on the x86 processor, typical values are in the 40-100 cycle range. The
5038 // 'yield' instruction isn't a good fit because it's effectively a nop on most
5039 // Arm cores. Experience with several databases has shown that an 'isb' is
5040 // a reasonable approximation.
5041 FORCE_INLINE void _mm_pause()
5043 __asm__ __volatile__("isb\n");
5046 // Compute the absolute differences of packed unsigned 8-bit integers in a and
5047 // b, then horizontally sum each consecutive 8 differences to produce two
5048 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
5049 // 16 bits of 64-bit elements in dst.
5050 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
5051 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
5053 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
5054 return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
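// Illustrative usage sketch (hypothetical helper, not part of the original
// header): the total sum of absolute differences over all 16 bytes is the sum
// of the two 64-bit lane results produced by _mm_sad_epu8.
FORCE_INLINE uint32_t _sse2neon_example_sad16_total(__m128i a, __m128i b)
{
    __m128i s = _mm_sad_epu8(a, b);
    return (uint32_t) (vgetq_lane_u64(vreinterpretq_u64_m128i(s), 0) +
                       vgetq_lane_u64(vreinterpretq_u64_m128i(s), 1));
}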
5057 // Sets the 8 signed 16-bit integer values.
5058 // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
5059 FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4,
                                       short i3, short i2, short i1, short i0)
5068 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
5069 return vreinterpretq_m128i_s16(vld1q_s16(data));
5072 // Sets the 4 signed 32-bit integer values.
5073 // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
5074 FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
5076 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
5077 return vreinterpretq_m128i_s32(vld1q_s32(data));
5080 // Returns the __m128i structure with its two 64-bit integer values
5081 // initialized to the values of the two 64-bit integers passed in.
5082 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
5083 FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
5085 return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
5088 // Returns the __m128i structure with its two 64-bit integer values
5089 // initialized to the values of the two 64-bit integers passed in.
5090 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
5091 FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
5093 return vreinterpretq_m128i_s64(
5094 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
5097 // Sets the 16 signed 8-bit integer values.
5098 // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
5099 FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13,
                                      signed char b12, signed char b11, signed char b10,
                                      signed char b9, signed char b8, signed char b7,
                                      signed char b6, signed char b5, signed char b4,
                                      signed char b3, signed char b2, signed char b1,
                                      signed char b0)
5116 int8_t ALIGN_STRUCT(16)
5117 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5118 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5119 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5120 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5121 return (__m128i) vld1q_s8(data);
5124 // Set packed double-precision (64-bit) floating-point elements in dst with the supplied values.
5126 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
5127 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
5129 double ALIGN_STRUCT(16) data[2] = {e0, e1};
5130 #if defined(__aarch64__)
5131 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
5133 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
5137 // Broadcast double-precision (64-bit) floating-point value a to all elements of dst.
5139 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
5140 #define _mm_set_pd1 _mm_set1_pd
5142 // Copy double-precision (64-bit) floating-point element a to the lower element
5143 // of dst, and zero the upper element.
5144 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
5145 FORCE_INLINE __m128d _mm_set_sd(double a)
5147 return _mm_set_pd(0, a);
5150 // Sets the 8 signed 16-bit integer values to w.
5157 // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
5158 FORCE_INLINE __m128i _mm_set1_epi16(short w)
5160 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
5163 // Sets the 4 signed 32-bit integer values to i.
5170 // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
5171 FORCE_INLINE __m128i _mm_set1_epi32(int _i)
5173 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
5176 // Sets the 2 signed 64-bit integer values to i.
5177 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
5178 FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
5180 return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
5183 // Sets the 2 signed 64-bit integer values to i.
5184 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
5185 FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
5187 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
5190 // Sets the 16 signed 8-bit integer values to b.
5197 // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
5198 FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
5200 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
5203 // Broadcast double-precision (64-bit) floating-point value a to all elements of dst.
5205 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
5206 FORCE_INLINE __m128d _mm_set1_pd(double d)
5208 #if defined(__aarch64__)
5209 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
5211 return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
5215 // Sets the 8 signed 16-bit integer values in reverse order.
5222 FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3,
                                        short w4, short w5, short w6, short w7)
5231 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5232 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
5235 // Sets the 4 signed 32-bit integer values in reverse order
5236 // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
5237 FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
5239 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5240 return vreinterpretq_m128i_s32(vld1q_s32(data));
5243 // Set packed 64-bit integers in dst with the supplied values in reverse order.
5244 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
5245 FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5247 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5250 // Sets the 16 signed 8-bit integer values in reverse order.
5251 // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
5252 FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2,
                                       signed char b3, signed char b4, signed char b5,
                                       signed char b6, signed char b7, signed char b8,
                                       signed char b9, signed char b10, signed char b11,
                                       signed char b12, signed char b13, signed char b14,
                                       signed char b15)
5269 int8_t ALIGN_STRUCT(16)
5270 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5271 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5272 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5273 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5274 return (__m128i) vld1q_s8(data);
5277 // Set packed double-precision (64-bit) floating-point elements in dst with the
5278 // supplied values in reverse order.
5279 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
5280 FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5282 return _mm_set_pd(e0, e1);
5285 // Return vector of type __m128d with all elements set to zero.
5286 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
5287 FORCE_INLINE __m128d _mm_setzero_pd(void)
5289 #if defined(__aarch64__)
5290 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5292 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5296 // Sets the 128-bit value to zero
5297 // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
5298 FORCE_INLINE __m128i _mm_setzero_si128(void)
5300 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5303 // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
5304 // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
5305 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5306 // __constrange(0,255) int imm)
5307 #if __has_builtin(__builtin_shufflevector)
5308 #define _mm_shuffle_epi32(a, imm) \
5310 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5311 int32x4_t _shuf = __builtin_shufflevector( \
5312 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5313 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5314 vreinterpretq_m128i_s32(_shuf); \
5317 #define _mm_shuffle_epi32(a, imm) \
5321 case _MM_SHUFFLE(1, 0, 3, 2): \
5322 ret = _mm_shuffle_epi_1032((a)); \
5324 case _MM_SHUFFLE(2, 3, 0, 1): \
5325 ret = _mm_shuffle_epi_2301((a)); \
5327 case _MM_SHUFFLE(0, 3, 2, 1): \
5328 ret = _mm_shuffle_epi_0321((a)); \
5330 case _MM_SHUFFLE(2, 1, 0, 3): \
5331 ret = _mm_shuffle_epi_2103((a)); \
5333 case _MM_SHUFFLE(1, 0, 1, 0): \
5334 ret = _mm_shuffle_epi_1010((a)); \
5336 case _MM_SHUFFLE(1, 0, 0, 1): \
5337 ret = _mm_shuffle_epi_1001((a)); \
5339 case _MM_SHUFFLE(0, 1, 0, 1): \
5340 ret = _mm_shuffle_epi_0101((a)); \
5342 case _MM_SHUFFLE(2, 2, 1, 1): \
5343 ret = _mm_shuffle_epi_2211((a)); \
5345 case _MM_SHUFFLE(0, 1, 2, 2): \
5346 ret = _mm_shuffle_epi_0122((a)); \
5348 case _MM_SHUFFLE(3, 3, 3, 2): \
5349 ret = _mm_shuffle_epi_3332((a)); \
5351 case _MM_SHUFFLE(0, 0, 0, 0): \
5352 ret = _mm_shuffle_epi32_splat((a), 0); \
5354 case _MM_SHUFFLE(1, 1, 1, 1): \
5355 ret = _mm_shuffle_epi32_splat((a), 1); \
5357 case _MM_SHUFFLE(2, 2, 2, 2): \
5358 ret = _mm_shuffle_epi32_splat((a), 2); \
5360 case _MM_SHUFFLE(3, 3, 3, 3): \
5361 ret = _mm_shuffle_epi32_splat((a), 3); \
5364 ret = _mm_shuffle_epi32_default((a), (imm)); \
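// Illustrative usage sketch (hypothetical helper, not part of the original
// header): reverse the four 32-bit lanes with _mm_shuffle_epi32.
FORCE_INLINE __m128i _sse2neon_example_reverse_epi32(__m128i a)
{
    // _MM_SHUFFLE(0, 1, 2, 3) selects source lanes 3, 2, 1, 0 for positions 0..3
    return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
}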
5371 // Shuffle double-precision (64-bit) floating-point elements using the control
5372 // in imm8, and store the results in dst.
5374 // dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
5375 // dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
5377 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
5378 #if __has_builtin(__builtin_shufflevector)
5379 #define _mm_shuffle_pd(a, b, imm8) \
5380 vreinterpretq_m128d_s64(__builtin_shufflevector( \
5381 vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
5382 ((imm8 & 0x2) >> 1) + 2))
5384 #define _mm_shuffle_pd(a, b, imm8) \
5385 _mm_castsi128_pd(_mm_set_epi64x( \
5386 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5387 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
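// Illustrative usage sketch (hypothetical helper, not part of the original
// header): swap the two double-precision lanes with _mm_shuffle_pd.
FORCE_INLINE __m128d _sse2neon_example_swap_pd(__m128d a)
{
    // imm8 = 1 selects a[127:64] for the low lane and a[63:0] for the high lane
    return _mm_shuffle_pd(a, a, 1);
}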
5390 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5391 // __constrange(0,255) int imm)
5392 #if __has_builtin(__builtin_shufflevector)
5393 #define _mm_shufflehi_epi16(a, imm) \
5395 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5396 int16x8_t _shuf = __builtin_shufflevector( \
5397 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5398 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5399 (((imm) >> 6) & 0x3) + 4); \
5400 vreinterpretq_m128i_s16(_shuf); \
5403 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5406 // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5407 // __constrange(0,255) int imm)
5408 #if __has_builtin(__builtin_shufflevector)
5409 #define _mm_shufflelo_epi16(a, imm) \
5411 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5412 int16x8_t _shuf = __builtin_shufflevector( \
5413 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5414 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5415 vreinterpretq_m128i_s16(_shuf); \
5418 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5421 // Shift packed 16-bit integers in a left by count while shifting in zeros, and
5422 // store the results in dst.
5426 // IF count[63:0] > 15
5429 // dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])
5433 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16
5434 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5436 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5437 if (_sse2neon_unlikely(c & ~15))
5438 return _mm_setzero_si128();
5440 int16x8_t vc = vdupq_n_s16((int16_t) c);
5441 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5444 // Shift packed 32-bit integers in a left by count while shifting in zeros, and
5445 // store the results in dst.
5449 // IF count[63:0] > 31
5452 // dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])
5456 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32
5457 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5459 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5460 if (_sse2neon_unlikely(c & ~31))
5461 return _mm_setzero_si128();
5463 int32x4_t vc = vdupq_n_s32((int32_t) c);
5464 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5467 // Shift packed 64-bit integers in a left by count while shifting in zeros, and
5468 // store the results in dst.
5472 // IF count[63:0] > 63
5475 // dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])
5479 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64
5480 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5482 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5483 if (_sse2neon_unlikely(c & ~63))
5484 return _mm_setzero_si128();
5486 int64x2_t vc = vdupq_n_s64((int64_t) c);
5487 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5490 // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
5491 // store the results in dst.
5495 // IF imm8[7:0] > 15
5498 // dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])
5502 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16
5503 FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
5505 if (_sse2neon_unlikely(imm & ~15))
5506 return _mm_setzero_si128();
5507 return vreinterpretq_m128i_s16(
5508 vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
5511 // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
5512 // store the results in dst.
5516 // IF imm8[7:0] > 31
5519 // dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])
5523 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32
5524 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5526 if (_sse2neon_unlikely(imm & ~31))
5527 return _mm_setzero_si128();
5528 return vreinterpretq_m128i_s32(
5529 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
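// Illustrative usage sketch (hypothetical helper, not part of the original
// header): shifting each 32-bit lane left by 4 multiplies it by 16 (mod 2^32).
FORCE_INLINE __m128i _sse2neon_example_times16_epi32(__m128i a)
{
    return _mm_slli_epi32(a, 4);
}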
5532 // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5533 // store the results in dst.
5537 // IF imm8[7:0] > 63
5540 // dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])
5544 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64
5545 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5547 if (_sse2neon_unlikely(imm & ~63))
5548 return _mm_setzero_si128();
5549 return vreinterpretq_m128i_s64(
5550 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5553 // Shift a left by imm8 bytes while shifting in zeros, and store the results in dst.
5560 // dst[127:0] := a[127:0] << (tmp*8)
5562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128
5563 FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm)
5565 if (_sse2neon_unlikely(imm & ~15))
5566 return _mm_setzero_si128();
5567 uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)};
5568 return vreinterpretq_m128i_u8(
5569 vld1q_u8(((uint8_t const *) tmp) + (16 - imm)));
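// Illustrative usage sketch (hypothetical helper, not part of the original
// header): a byte shift of 4 moves every 32-bit lane up by one position and
// fills the vacated low lane with zero, i.e. {a0, a1, a2, a3} -> {0, a0, a1, a2}.
FORCE_INLINE __m128i _sse2neon_example_lanes_up_one(__m128i a)
{
    return _mm_slli_si128(a, 4);
}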
5572 // Compute the square root of packed double-precision (64-bit) floating-point
5573 // elements in a, and store the results in dst.
5574 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
5575 FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5577 #if defined(__aarch64__)
5578 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5580 double a0 = sqrt(((double *) &a)[0]);
5581 double a1 = sqrt(((double *) &a)[1]);
5582 return _mm_set_pd(a1, a0);
5586 // Compute the square root of the lower double-precision (64-bit) floating-point
5587 // element in b, store the result in the lower element of dst, and copy the
5588 // upper element from a to the upper element of dst.
5589 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
5590 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5592 #if defined(__aarch64__)
5593 return _mm_move_sd(a, _mm_sqrt_pd(b));
5595 return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5599 // Shift packed 16-bit integers in a right by count while shifting in sign bits,
5600 // and store the results in dst.
5604 // IF count[63:0] > 15
5605 // dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
5607 // dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])
5611 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16
5612 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5614 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5615 if (_sse2neon_unlikely(c & ~15))
5616 return _mm_cmplt_epi16(a, _mm_setzero_si128());
5617 return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5620 // Shift packed 32-bit integers in a right by count while shifting in sign bits,
5621 // and store the results in dst.
5625 // IF count[63:0] > 31
5626 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5628 // dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])
5632 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32
5633 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5635 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5636 if (_sse2neon_unlikely(c & ~31))
5637 return _mm_cmplt_epi32(a, _mm_setzero_si128());
5638 return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5641 // Shift packed 16-bit integers in a right by imm8 while shifting in sign
5642 // bits, and store the results in dst.
5646 // IF imm8[7:0] > 15
5647 // dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
5649 // dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])
5653 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
5654 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5656 const int count = (imm & ~15) ? 15 : imm;
5657 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
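// Illustrative usage sketch (hypothetical helper, not part of the original
// header): an arithmetic right shift by 8 divides each signed 16-bit lane by
// 256, rounding toward negative infinity and preserving the sign.
FORCE_INLINE __m128i _sse2neon_example_div256_epi16(__m128i a)
{
    return _mm_srai_epi16(a, 8);
}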
5660 // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
5661 // and store the results in dst.
5665 // IF imm8[7:0] > 31
5666 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5668 // dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
5672 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
5673 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
5674 #define _mm_srai_epi32(a, imm) \
5677 if (_sse2neon_unlikely((imm) == 0)) { \
5678 ret = (a); \
5679 } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5680 ret = vreinterpretq_m128i_s32( \
5681 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
5683 ret = vreinterpretq_m128i_s32( \
5684 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
5689 // Shift packed 16-bit integers in a right by count while shifting in zeros, and
5690 // store the results in dst.
5694 // IF count[63:0] > 15
5697 // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])
5701 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16
5702 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5704 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5705 if (_sse2neon_unlikely(c & ~15))
5706 return _mm_setzero_si128();
5708 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5709 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5712 // Shift packed 32-bit integers in a right by count while shifting in zeros, and
5713 // store the results in dst.
5717 // IF count[63:0] > 31
5720 // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])
5724 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32
5725 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5727 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5728 if (_sse2neon_unlikely(c & ~31))
5729 return _mm_setzero_si128();
5731 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5732 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5735 // Shift packed 64-bit integers in a right by count while shifting in zeros, and
5736 // store the results in dst.
5740 // IF count[63:0] > 63
5743 // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])
5747 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64
5748 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5750 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5751 if (_sse2neon_unlikely(c & ~63))
5752 return _mm_setzero_si128();
5754 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5755 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5758 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5759 // store the results in dst.
5763 // IF imm8[7:0] > 15
5766 // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
5770 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
5771 #define _mm_srli_epi16(a, imm) \
5774 if (_sse2neon_unlikely((imm) & ~15)) { \
5775 ret = _mm_setzero_si128(); \
5777 ret = vreinterpretq_m128i_u16( \
5778 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
5783 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
5784 // store the results in dst.
5788 // IF imm8[7:0] > 31
5791 // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
5795 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
5796 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
5797 #define _mm_srli_epi32(a, imm) \
5800 if (_sse2neon_unlikely((imm) & ~31)) { \
5801 ret = _mm_setzero_si128(); \
5803 ret = vreinterpretq_m128i_u32( \
5804 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
5809 // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
5810 // store the results in dst.
5814 // IF imm8[7:0] > 63
5817 // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
5821 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
5822 #define _mm_srli_epi64(a, imm) \
5825 if (_sse2neon_unlikely((imm) & ~63)) { \
5826 ret = _mm_setzero_si128(); \
5828 ret = vreinterpretq_m128i_u64( \
5829 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
5834 // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
5841 // dst[127:0] := a[127:0] >> (tmp*8)
5843 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128
5844 FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
5846 if (_sse2neon_unlikely(imm & ~15))
5847 return _mm_setzero_si128();
5848 uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)};
5849 return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm));
5852 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5853 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5854 // or a general-protection exception may be generated.
5855 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
5856 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5858 #if defined(__aarch64__)
5859 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5861 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5865 // Store the lower double-precision (64-bit) floating-point element from a into
5866 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5867 // boundary or a general-protection exception may be generated.
5868 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
5869 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5871 #if defined(__aarch64__)
5872 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5873 vst1q_f64((float64_t *) mem_addr,
5874 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5876 float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5877 vst1q_f32((float32_t *) mem_addr,
5878 vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5882 // Store the lower double-precision (64-bit) floating-point element from a into
5883 // memory. mem_addr does not need to be aligned on any particular boundary.
5884 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
5885 FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5887 #if defined(__aarch64__)
5888 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5890 vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5894 // Stores four 32-bit integer values (as a __m128i value) at the address p.
5895 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
5896 FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5898 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5901 // Store the lower double-precision (64-bit) floating-point element from a into
5902 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5903 // boundary or a general-protection exception may be generated.
5904 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
5905 #define _mm_store1_pd _mm_store_pd1
5907 // Store the upper double-precision (64-bit) floating-point element from a into memory.
5910 // MEM[mem_addr+63:mem_addr] := a[127:64]
5912 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
5913 FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5915 #if defined(__aarch64__)
5916 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5918 vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5922 // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
5923 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
5924 FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5926 uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
5927 uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
5928 *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
5931 // Store the lower double-precision (64-bit) floating-point element from a into memory.
5934 // MEM[mem_addr+63:mem_addr] := a[63:0]
5936 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
5937 FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5939 #if defined(__aarch64__)
5940 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5942 vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5946 // Store 2 double-precision (64-bit) floating-point elements from a into memory
5947 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5948 // general-protection exception may be generated.
5950 // MEM[mem_addr+63:mem_addr] := a[127:64]
5951 // MEM[mem_addr+127:mem_addr+64] := a[63:0]
5953 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
5954 FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5956 float32x4_t f = vreinterpretq_f32_m128d(a);
5957 _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5960 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5961 // elements) from a into memory. mem_addr does not need to be aligned on any
5962 // particular boundary.
5963 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
5964 FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
5966 _mm_store_pd(mem_addr, a);
5969 // Stores 128-bits of integer data a at the address p.
5970 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
5971 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5973 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5976 // Stores 32-bits of integer data a at the address p.
5977 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
5978 FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5980 vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5983 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5984 // elements) from a into memory using a non-temporal memory hint. mem_addr must
5985 // be aligned on a 16-byte boundary or a general-protection exception may be generated.
5987 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
5988 FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5990 #if __has_builtin(__builtin_nontemporal_store)
5991 __builtin_nontemporal_store(a, (float32x4_t *) p);
5992 #elif defined(__aarch64__)
5993 vst1q_f64(p, vreinterpretq_f64_m128d(a));
5995 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5999 // Stores the data in a to the address p without polluting the caches. If the
6000 // cache line containing address p is already in the cache, the cache will be updated.
6002 // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
6003 FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
6005 #if __has_builtin(__builtin_nontemporal_store)
6006 __builtin_nontemporal_store(a, p);
6008 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
6012 // Store 32-bit integer a into memory using a non-temporal hint to minimize
6013 // cache pollution. If the cache line containing address mem_addr is already in
6014 // the cache, the cache will be updated.
6015 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
6016 FORCE_INLINE void _mm_stream_si32(int *p, int a)
6018 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
6021 // Store 64-bit integer a into memory using a non-temporal hint to minimize
6022 // cache pollution. If the cache line containing address mem_addr is already in
6023 // the cache, the cache will be updated.
6024 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64
6025 FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
6027 vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
6030 // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
6031 // store the results in dst.
6032 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
6033 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
6035 return vreinterpretq_m128i_s16(
6036 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6039 // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
6040 // unsigned 32-bit integers of a.
6047 // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
6048 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
6050 return vreinterpretq_m128i_s32(
6051 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6054 // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
6055 // and store the results in dst.
6058 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
6060 return vreinterpretq_m128i_s64(
6061 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
6064 // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
6065 // store the results in dst.
6066 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
6067 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
6069 return vreinterpretq_m128i_s8(
6070 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6073 // Subtract packed double-precision (64-bit) floating-point elements in b from
6074 // packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
6079 // dst[i+63:i] := a[i+63:i] - b[i+63:i]
6082 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
6083 FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
6085 #if defined(__aarch64__)
6086 return vreinterpretq_m128d_f64(
6087 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6089 double *da = (double *) &a;
6090 double *db = (double *) &b;
6091 double c[2];
6092 c[0] = da[0] - db[0];
6093 c[1] = da[1] - db[1];
6094 return vld1q_f32((float32_t *) c);
6098 // Subtract the lower double-precision (64-bit) floating-point element in b from
6099 // the lower double-precision (64-bit) floating-point element in a, store the
6100 // result in the lower element of dst, and copy the upper element from a to the
6101 // upper element of dst.
6102 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
6103 FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
6105 return _mm_move_sd(a, _mm_sub_pd(a, b));
6108 // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
6110 // dst[63:0] := a[63:0] - b[63:0]
6112 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
6113 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
6115 return vreinterpret_m64_s64(
6116 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
6119 // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
6120 // of a and saturates.
6122 // r0 := SignedSaturate(a0 - b0)
6123 // r1 := SignedSaturate(a1 - b1)
6125 // r7 := SignedSaturate(a7 - b7)
6127 // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
6128 FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
6130 return vreinterpretq_m128i_s16(
6131 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6134 // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
6135 // of a and saturates.
6137 // r0 := SignedSaturate(a0 - b0)
6138 // r1 := SignedSaturate(a1 - b1)
6140 // r15 := SignedSaturate(a15 - b15)
6142 // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
6143 FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
6145 return vreinterpretq_m128i_s8(
6146 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6149 // Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
6150 // integers of a and saturates.
6151 // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
6152 FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
6154 return vreinterpretq_m128i_u16(
6155 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
6158 // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
6159 // integers of a and saturates.
6161 // r0 := UnsignedSaturate(a0 - b0)
6162 // r1 := UnsignedSaturate(a1 - b1)
6164 // r15 := UnsignedSaturate(a15 - b15)
6166 // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
6167 FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
6169 return vreinterpretq_m128i_u8(
6170 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
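// Illustrative usage sketch (hypothetical helper, not part of the original
// header): per-byte absolute difference built from saturating subtraction; for
// each byte one direction saturates to zero and the other yields |a - b|.
FORCE_INLINE __m128i _sse2neon_example_absdiff_epu8(__m128i a, __m128i b)
{
    return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}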
6173 #define _mm_ucomieq_sd _mm_comieq_sd
6174 #define _mm_ucomige_sd _mm_comige_sd
6175 #define _mm_ucomigt_sd _mm_comigt_sd
6176 #define _mm_ucomile_sd _mm_comile_sd
6177 #define _mm_ucomilt_sd _mm_comilt_sd
6178 #define _mm_ucomineq_sd _mm_comineq_sd
6180 // Return vector of type __m128d with undefined elements.
6181 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
6182 FORCE_INLINE __m128d _mm_undefined_pd(void)
6184 #if defined(__GNUC__) || defined(__clang__)
6185 #pragma GCC diagnostic push
6186 #pragma GCC diagnostic ignored "-Wuninitialized"
6188 __m128d a;
6189 return a;
6190 #if defined(__GNUC__) || defined(__clang__)
6191 #pragma GCC diagnostic pop
6195 // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
6196 // upper 4 signed or unsigned 16-bit integers in b.
6207 // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
6208 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
6210 #if defined(__aarch64__)
6211 return vreinterpretq_m128i_s16(
6212 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6214 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
6215 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
6216 int16x4x2_t result = vzip_s16(a1, b1);
6217 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6221 // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
6222 // upper 2 signed or unsigned 32-bit integers in b.
6223 // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
6224 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
6226 #if defined(__aarch64__)
6227 return vreinterpretq_m128i_s32(
6228 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6230 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
6231 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
6232 int32x2x2_t result = vzip_s32(a1, b1);
6233 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6237 // Interleaves the upper signed or unsigned 64-bit integer in a with the
6238 // upper signed or unsigned 64-bit integer in b.
6242 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
6244 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
6245 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
6246 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
6249 // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
6250 // 8 signed or unsigned 8-bit integers in b.
6260 // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
6261 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
6263 #if defined(__aarch64__)
6264 return vreinterpretq_m128i_s8(
6265 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6267 int8x8_t a1 =
6268 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
6269 int8x8_t b1 =
6270 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
6271 int8x8x2_t result = vzip_s8(a1, b1);
6272 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6276 // Unpack and interleave double-precision (64-bit) floating-point elements from
6277 // the high half of a and b, and store the results in dst.
6279 // DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
6280 // dst[63:0] := src1[127:64]
6281 // dst[127:64] := src2[127:64]
6282 // RETURN dst[127:0]
6284 // dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
6286 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
6287 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
6289 #if defined(__aarch64__)
6290 return vreinterpretq_m128d_f64(
6291 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6293 return vreinterpretq_m128d_s64(
6294 vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
6295 vget_high_s64(vreinterpretq_s64_m128d(b))));
6299 // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
6300 // lower 4 signed or unsigned 16-bit integers in b.
6311 // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
6312 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6314 #if defined(__aarch64__)
6315 return vreinterpretq_m128i_s16(
6316 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6318 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6319 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6320 int16x4x2_t result = vzip_s16(a1, b1);
6321 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6325 // Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
6326 // lower 2 signed or unsigned 32-bit integers in b.
6333 // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
6334 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
6336 #if defined(__aarch64__)
6337 return vreinterpretq_m128i_s32(
6338 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6340 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6341 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6342 int32x2x2_t result = vzip_s32(a1, b1);
6343 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6347 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
6349 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
6350 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
6351 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
6354 // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
6355 // 8 signed or unsigned 8-bit integers in b.
6365 // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
6366 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
6368 #if defined(__aarch64__)
6369 return vreinterpretq_m128i_s8(
6370 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6372 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6373 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6374 int8x8x2_t result = vzip_s8(a1, b1);
6375 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
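// Illustrative usage sketch (hypothetical helper, not part of the original
// header): zero-extend the low eight bytes of a to 16-bit lanes by
// interleaving them with a zero vector.
FORCE_INLINE __m128i _sse2neon_example_cvtepu8_lo_epi16(__m128i a)
{
    return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}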
6379 // Unpack and interleave double-precision (64-bit) floating-point elements from
6380 // the low half of a and b, and store the results in dst.
6382 // DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
6383 // dst[63:0] := src1[63:0]
6384 // dst[127:64] := src2[63:0]
6385 // RETURN dst[127:0]
6387 // dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
6389 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
6390 FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
6392 #if defined(__aarch64__)
6393 return vreinterpretq_m128d_f64(
6394 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6396 return vreinterpretq_m128d_s64(
6397 vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
6398 vget_low_s64(vreinterpretq_s64_m128d(b))));
6402 // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
6403 // elements in a and b, and store the results in dst.
6407 // dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
6410 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
6411 FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
6413 return vreinterpretq_m128d_s64(
6414 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
6417 // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
6418 // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
6419 FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
6421 return vreinterpretq_m128i_s32(
6422 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6427 // Alternatively add and subtract packed double-precision (64-bit)
6428 // floating-point elements in a to/from packed elements in b, and store the results in dst.
6433 // IF ((j & 1) == 0)
6434 // dst[i+63:i] := a[i+63:i] - b[i+63:i]
6436 // dst[i+63:i] := a[i+63:i] + b[i+63:i]
6440 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
6441 FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
6443 static const __m128d mask = _mm_set_pd(1.0f, -1.0f);
6444 #if defined(__aarch64__)
6445 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6446 vreinterpretq_f64_m128d(b),
6447 vreinterpretq_f64_m128d(mask)));
6449 return _mm_add_pd(_mm_mul_pd(b, mask), a);
6453 // Alternatively add and subtract packed single-precision (32-bit)
6454 // floating-point elements in a to/from packed elements in b, and store the results in dst.
6456 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
6457 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
6459 static const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
6460 #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
6461 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
6462 vreinterpretq_f32_m128(mask),
6463 vreinterpretq_f32_m128(b)));
6465 return _mm_add_ps(_mm_mul_ps(b, mask), a);
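// Illustrative usage sketch (hypothetical helper, not part of the original
// header): the classic use of addsub is interleaved complex multiplication.
// For a = {ar0, ai0, ar1, ai1} and b = {br0, bi0, br1, bi1} this returns the
// two complex products {re0, im0, re1, im1}.
FORCE_INLINE __m128 _sse2neon_example_cmul_ps(__m128 a, __m128 b)
{
    __m128 re = _mm_mul_ps(a, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0)));
    __m128 im = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)),
                           _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1)));
    return _mm_addsub_ps(re, im);
}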
6469 // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
6470 // elements in a and b, and pack the results in dst.
6471 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
6472 FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
6474 #if defined(__aarch64__)
6475 return vreinterpretq_m128d_f64(
6476 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6478 double *da = (double *) &a;
6479 double *db = (double *) &b;
6480 double c[] = {da[0] + da[1], db[0] + db[1]};
6481 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6485 // Computes pairwise add of each argument as single-precision, floating-point values a and b.
6487 // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
6488 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
6490 #if defined(__aarch64__)
6491 return vreinterpretq_m128_f32(
6492 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6494 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
6495 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
6496 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
6497 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
6498 return vreinterpretq_m128_f32(
6499 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
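// Illustrative example (assumed inputs): with a = {1, 2, 3, 4} and
// b = {5, 6, 7, 8}, _mm_hadd_ps returns {1+2, 3+4, 5+6, 7+8} = {3, 7, 11, 15}.
// On AArch64 this maps directly onto vpaddq_f32; on ARMv7 the same pairwise
// sums are built from two vpadd_f32 operations on the 64-bit halves.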
6503 // Horizontally subtract adjacent pairs of double-precision (64-bit)
6504 // floating-point elements in a and b, and pack the results in dst.
6505 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
6506 FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
6508 #if defined(__aarch64__)
6509 float64x2_t a = vreinterpretq_f64_m128d(_a);
6510 float64x2_t b = vreinterpretq_f64_m128d(_b);
6511 return vreinterpretq_m128d_f64(
6512 vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
6514 double *da = (double *) &_a;
6515 double *db = (double *) &_b;
6516 double c[] = {da[0] - da[1], db[0] - db[1]};
6517 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6521 // Horizontally subtract adjacent pairs of single-precision (32-bit)
6522 // floating-point elements in a and b, and pack the results in dst.
6523 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
6524 FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
6526 float32x4_t a = vreinterpretq_f32_m128(_a);
6527 float32x4_t b = vreinterpretq_f32_m128(_b);
6528 #if defined(__aarch64__)
6529 return vreinterpretq_m128_f32(
6530 vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
6532 float32x4x2_t c = vuzpq_f32(a, b);
6533 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
6537 // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
6538 // may perform better than _mm_loadu_si128 when the data crosses a cache line boundary.
6541 // dst[127:0] := MEM[mem_addr+127:mem_addr]
6543 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
6544 #define _mm_lddqu_si128 _mm_loadu_si128
6546 // Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
6549 // dst[63:0] := MEM[mem_addr+63:mem_addr]
6550 // dst[127:64] := MEM[mem_addr+63:mem_addr]
6552 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
6553 #define _mm_loaddup_pd _mm_load1_pd
6555 // Duplicate the low double-precision (64-bit) floating-point element from a,
6556 // and store the results in dst.
6557 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
6558 FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6560 #if defined(__aarch64__)
6561 return vreinterpretq_m128d_f64(
6562 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6564 return vreinterpretq_m128d_u64(
6565 vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6569 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
6570 // from a, and store the results in dst.
6571 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
6572 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6574 #if __has_builtin(__builtin_shufflevector)
6575 return vreinterpretq_m128_f32(__builtin_shufflevector(
6576 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6578 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6579 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6580 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6581 return vreinterpretq_m128_f32(vld1q_f32(data));
6585 // Duplicate even-indexed single-precision (32-bit) floating-point elements
6586 // from a, and store the results in dst.
6587 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
6588 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6590 #if __has_builtin(__builtin_shufflevector)
6591 return vreinterpretq_m128_f32(__builtin_shufflevector(
6592 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6594 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6595 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6596 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6597 return vreinterpretq_m128_f32(vld1q_f32(data));
6603 // Compute the absolute value of packed signed 16-bit integers in a, and store
6604 // the unsigned results in dst.
6608 // dst[i+15:i] := ABS(a[i+15:i])
6611 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
6612 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6614 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6617 // Compute the absolute value of packed signed 32-bit integers in a, and store
6618 // the unsigned results in dst.
6622 // dst[i+31:i] := ABS(a[i+31:i])
6625 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
6626 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6628 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6631 // Compute the absolute value of packed signed 8-bit integers in a, and store
6632 // the unsigned results in dst.
6636 // dst[i+7:i] := ABS(a[i+7:i])
6639 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
6640 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6642 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6645 // Compute the absolute value of packed signed 16-bit integers in a, and store
6646 // the unsigned results in dst.
6650 // dst[i+15:i] := ABS(a[i+15:i])
6653 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
6654 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6656 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6659 // Compute the absolute value of packed signed 32-bit integers in a, and store
6660 // the unsigned results in dst.
6664 // dst[i+31:i] := ABS(a[i+31:i])
6667 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
6668 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6670 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6673 // Compute the absolute value of packed signed 8-bit integers in a, and store
6674 // the unsigned results in dst.
6678 // dst[i+7:i] := ABS(a[i+7:i])
6681 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
6682 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6684 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6687 // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
6688 // the result right by imm8 bytes, and store the low 16 bytes in dst.
6690 // tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
6691 // dst[127:0] := tmp[127:0]
6693 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
6694 FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm)
6696 if (_sse2neon_unlikely(imm & ~31))
6697 return _mm_setzero_si128();
6702 tmp[0] = vreinterpretq_u8_m128i(a);
6703 tmp[1] = vdupq_n_u8(0);
6706 tmp[0] = vreinterpretq_u8_m128i(b);
6707 tmp[1] = vreinterpretq_u8_m128i(a);
6709 return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx));
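// Illustrative sketch: _mm_alignr_epi8(a, b, 4) forms the 32-byte value a:b
// (a in the high half), shifts it right by 4 bytes, and keeps the low 16
// bytes, i.e. bytes 4..15 of b followed by bytes 0..3 of a. For imm >= 16 the
// concatenation degenerates to 0:a, which is why tmp[] holds {a, 0} in that
// branch above.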
6712 // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6713 // the result right by imm8 bytes, and store the low 8 bytes in dst.
6715 // tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
6716 // dst[63:0] := tmp[63:0]
6718 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
6719 #define _mm_alignr_pi8(a, b, imm) \
6722 if (_sse2neon_unlikely((imm) >= 16)) { \
6723 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6725 uint8x8_t tmp_low, tmp_high; \
6727 const int idx = (imm) -8; \
6728 tmp_low = vreinterpret_u8_m64(a); \
6729 tmp_high = vdup_n_u8(0); \
6730 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6732 const int idx = (imm); \
6733 tmp_low = vreinterpret_u8_m64(b); \
6734 tmp_high = vreinterpret_u8_m64(a); \
6735 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6741 // Computes pairwise add of each argument as 16-bit signed or unsigned integer values a and b.
6743 FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6745 int16x8_t a = vreinterpretq_s16_m128i(_a);
6746 int16x8_t b = vreinterpretq_s16_m128i(_b);
6747 #if defined(__aarch64__)
6748 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6750 return vreinterpretq_m128i_s16(
6751 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6752 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6756 // Computes pairwise add of each argument as 32-bit signed or unsigned integer values a and b.
6758 FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6760 int32x4_t a = vreinterpretq_s32_m128i(_a);
6761 int32x4_t b = vreinterpretq_s32_m128i(_b);
6762 return vreinterpretq_m128i_s32(
6763 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6764 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6767 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6768 // signed 16-bit results in dst.
6769 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
6770 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6772 return vreinterpret_m64_s16(
6773 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6776 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6777 // signed 32-bit results in dst.
6778 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
6779 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6781 return vreinterpret_m64_s32(
6782 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6785 // Computes saturated pairwise add of each argument as 16-bit signed
6786 // integer values a and b.
6787 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6789 #if defined(__aarch64__)
6790 int16x8_t a = vreinterpretq_s16_m128i(_a);
6791 int16x8_t b = vreinterpretq_s16_m128i(_b);
6792 return vreinterpretq_s64_s16(
6793 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6795 int32x4_t a = vreinterpretq_s32_m128i(_a);
6796 int32x4_t b = vreinterpretq_s32_m128i(_b);
6797 // Interleave using vshrn/vmovn
6798 // [a0|a2|a4|a6|b0|b2|b4|b6]
6799 // [a1|a3|a5|a7|b1|b3|b5|b7]
6800 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6801 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6803 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6807 // Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6808 // saturation, and pack the signed 16-bit results in dst.
6809 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
6810 FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6812 int16x4_t a = vreinterpret_s16_m64(_a);
6813 int16x4_t b = vreinterpret_s16_m64(_b);
6814 #if defined(__aarch64__)
6815 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6817 int16x4x2_t res = vuzp_s16(a, b);
6818 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6822 // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6823 // the signed 16-bit results in dst.
6824 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16
6825 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6827 int16x8_t a = vreinterpretq_s16_m128i(_a);
6828 int16x8_t b = vreinterpretq_s16_m128i(_b);
6829 #if defined(__aarch64__)
6830 return vreinterpretq_m128i_s16(
6831 vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6833 int16x8x2_t c = vuzpq_s16(a, b);
6834 return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
6838 // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6839 // the signed 32-bit results in dst.
6840 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32
6841 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6843 int32x4_t a = vreinterpretq_s32_m128i(_a);
6844 int32x4_t b = vreinterpretq_s32_m128i(_b);
6845 #if defined(__aarch64__)
6846 return vreinterpretq_m128i_s32(
6847 vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6849 int32x4x2_t c = vuzpq_s32(a, b);
6850 return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
6854 // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6855 // the signed 16-bit results in dst.
6856 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
6857 FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6859 int16x4_t a = vreinterpret_s16_m64(_a);
6860 int16x4_t b = vreinterpret_s16_m64(_b);
6861 #if defined(__aarch64__)
6862 return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6864 int16x4x2_t c = vuzp_s16(a, b);
6865 return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
6869 // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6870 // the signed 32-bit results in dst.
6871 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
6872 FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6874 int32x2_t a = vreinterpret_s32_m64(_a);
6875 int32x2_t b = vreinterpret_s32_m64(_b);
6876 #if defined(__aarch64__)
6877 return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
6879 int32x2x2_t c = vuzp_s32(a, b);
6880 return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
6884 // Computes saturated pairwise difference of each argument as 16-bit signed
6885 // integer values a and b.
6886 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
6887 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6889 int16x8_t a = vreinterpretq_s16_m128i(_a);
6890 int16x8_t b = vreinterpretq_s16_m128i(_b);
6891 #if defined(__aarch64__)
6892 return vreinterpretq_m128i_s16(
6893 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6895 int16x8x2_t c = vuzpq_s16(a, b);
6896 return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
6900 // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6901 // using saturation, and pack the signed 16-bit results in dst.
6902 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
6903 FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6905 int16x4_t a = vreinterpret_s16_m64(_a);
6906 int16x4_t b = vreinterpret_s16_m64(_b);
6907 #if defined(__aarch64__)
6908 return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6910 int16x4x2_t c = vuzp_s16(a, b);
6911 return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
6915 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
6916 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6917 // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
6918 // and pack the saturated results in dst.
6922 // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
6923 // a[i+7:i]*b[i+7:i] )
6925 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
6927 #if defined(__aarch64__)
6928 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6929 int8x16_t b = vreinterpretq_s8_m128i(_b);
6930 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6931 vmovl_s8(vget_low_s8(b)));
6932 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6933 vmovl_s8(vget_high_s8(b)));
6934 return vreinterpretq_m128i_s16(
6935 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6937 // This would be much simpler if x86 would choose to zero extend OR sign
6938 // extend, not both. This could probably be optimized better.
6939 uint16x8_t a = vreinterpretq_u16_m128i(_a);
6940 int16x8_t b = vreinterpretq_s16_m128i(_b);
6943 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6944 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6946 // Sign extend by shifting left then shifting right.
6947 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6948 int16x8_t b_odd = vshrq_n_s16(b, 8);
6951 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6952 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6955 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
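// Worked example (illustrative only): for the first output lane, with
// unsigned bytes a = {2, 3, ...} and signed bytes b = {10, -20, ...},
// _mm_maddubs_epi16 computes saturate16(2*10 + 3*(-20)) = -40. Saturation
// only matters near the extremes, e.g. 255*127 + 255*127 = 64770 exceeds
// 32767 and is clamped to 32767.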
6959 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
6960 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6961 // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
6962 // pack the saturated results in dst.
6963 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
6964 FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6966 uint16x4_t a = vreinterpret_u16_m64(_a);
6967 int16x4_t b = vreinterpret_s16_m64(_b);
6970 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6971 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6973 // Sign extend by shifting left then shifting right.
6974 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6975 int16x4_t b_odd = vshr_n_s16(b, 8);
6978 int16x4_t prod1 = vmul_s16(a_even, b_even);
6979 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6982 return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
6985 // Multiply packed signed 16-bit integers in a and b, producing intermediate
6986 // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
6987 // the packed 16-bit integers in dst.
6989 // r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
6990 // r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
6991 // r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
6993 // r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
6994 FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
6996 // Has issues due to saturation
6997 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
7000 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
7001 vget_low_s16(vreinterpretq_s16_m128i(b)));
7002 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
7003 vget_high_s16(vreinterpretq_s16_m128i(b)));
7005 // Rounding narrowing shift right
7006 // narrow = (int16_t)((mul + 16384) >> 15);
7007 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
7008 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
7011 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
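// Worked example (illustrative only): with a0 = b0 = 0x4000 (0.5 in Q15), the
// widened product is 0x10000000; adding the rounding bias (1 << 14) and
// shifting right by 15 gives 0x2000 (0.25 in Q15), which is what
// vrshrn_n_s32(mul, 15) produces above.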
7014 // Multiply packed signed 16-bit integers in a and b, producing intermediate
7015 // signed 32-bit integers. Truncate each intermediate integer to the 18 most
7016 // significant bits, round by adding 1, and store bits [16:1] to dst.
7017 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
7018 FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
7020 int32x4_t mul_extend =
7021 vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
7023 // Rounding narrowing shift right
7024 return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
7027 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
7028 // corresponding 8-bit element of b, and store the results in dst.
7029 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
7030 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
7032 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
7033 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
7034 uint8x16_t idx_masked =
7035 vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
7036 #if defined(__aarch64__)
7037 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
7038 #elif defined(__GNUC__)
7040 // %e and %f represent the even and odd D registers
7042 __asm__ __volatile__(
7043 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
7044 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
7046 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
7047 return vreinterpretq_m128i_s8(ret);
7049 // Generic fallback using vtbl2_s8 on the split table (also usable when testing on aarch64)
7050 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
7051 return vreinterpretq_m128i_s8(
7052 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
7053 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
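// Illustrative example: with b = {3, 2, 1, 0, 0x80, ...}, the first four bytes
// of the result are a[3], a[2], a[1], a[0], and the fifth byte is 0 because
// index values with the high bit set select zero. The 0x8F mask above keeps
// exactly the bits that matter: the zeroing bit and the 4-bit byte index.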
7057 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
7058 // corresponding 8-bit element of b, and store the results in dst.
7065 // index[2:0] := b[i+2:i]
7066 // dst[i+7:i] := a[index*8+7:index*8]
7070 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
7071 FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
7073 const int8x8_t controlMask =
7074 vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t)(0x1 << 7 | 0x07)));
7075 int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
7076 return vreinterpret_m64_s8(res);
7079 // Negate packed 16-bit integers in a when the corresponding signed
7080 // 16-bit integer in b is negative, and store the results in dst.
7081 // Elements in dst are zeroed out when the corresponding element in b is zero.
7087 // else if b[i] == 0
7093 FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
7095 int16x8_t a = vreinterpretq_s16_m128i(_a);
7096 int16x8_t b = vreinterpretq_s16_m128i(_b);
7098 // signed shift right: faster than vclt
7099 // (b < 0) ? 0xFFFF : 0
7100 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
7101 // (b == 0) ? 0xFFFF : 0
7102 #if defined(__aarch64__)
7103 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
7105 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
7108 // bitwise select either a or negative 'a' (vnegq_s16(a) returns negative
7109 // 'a') based on ltMask
7110 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
7111 // res = masked & (~zeroMask)
7112 int16x8_t res = vbicq_s16(masked, zeroMask);
7113 return vreinterpretq_m128i_s16(res);
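// Illustrative example: with a = {5, 6, 7, 8, ...} and b = {-1, 0, 2, -3, ...},
// _mm_sign_epi16 returns {-5, 0, 7, -8, ...}: elements of a are negated where
// b is negative, zeroed where b is zero, and passed through where b is
// positive.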
7116 // Negate packed 32-bit integers in a when the corresponding signed
7117 // 32-bit integer in b is negative, and store the results in dst.
7118 // Elements in dst are zeroed out when the corresponding element in b is zero.
7124 // else if b[i] == 0
7130 FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
7132 int32x4_t a = vreinterpretq_s32_m128i(_a);
7133 int32x4_t b = vreinterpretq_s32_m128i(_b);
7135 // signed shift right: faster than vclt
7136 // (b < 0) ? 0xFFFFFFFF : 0
7137 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
7139 // (b == 0) ? 0xFFFFFFFF : 0
7140 #if defined(__aarch64__)
7141 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
7143 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
7146 // bitwise select either a or negative 'a' (vnegq_s32(a) returns negative
7147 // 'a') based on ltMask
7148 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
7149 // res = masked & (~zeroMask)
7150 int32x4_t res = vbicq_s32(masked, zeroMask);
7151 return vreinterpretq_m128i_s32(res);
7154 // Negate packed 8-bit integers in a when the corresponding signed
7155 // 8-bit integer in b is negative, and store the results in dst.
7156 // Elements in dst are zeroed out when the corresponding element in b is zero.
7162 // else if b[i] == 0
7168 FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
7170 int8x16_t a = vreinterpretq_s8_m128i(_a);
7171 int8x16_t b = vreinterpretq_s8_m128i(_b);
7173 // signed shift right: faster than vclt
7174 // (b < 0) ? 0xFF : 0
7175 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
7177 // (b == 0) ? 0xFF : 0
7178 #if defined(__aarch64__)
7179 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
7181 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
7184 // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative 'a') based on ltMask
7186 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
7187 // res = masked & (~zeroMask)
7188 int8x16_t res = vbicq_s8(masked, zeroMask);
7190 return vreinterpretq_m128i_s8(res);
7193 // Negate packed 16-bit integers in a when the corresponding signed 16-bit
7194 // integer in b is negative, and store the results in dst. Elements in dst are
7195 // zeroed out when the corresponding element in b is zero.
7200 // dst[i+15:i] := -(a[i+15:i])
7201 // ELSE IF b[i+15:i] == 0
7204 // dst[i+15:i] := a[i+15:i]
7208 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
7209 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
7211 int16x4_t a = vreinterpret_s16_m64(_a);
7212 int16x4_t b = vreinterpret_s16_m64(_b);
7214 // signed shift right: faster than vclt
7215 // (b < 0) ? 0xFFFF : 0
7216 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
7218 // (b == 0) ? 0xFFFF : 0
7219 #if defined(__aarch64__)
7220 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
7222 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
7225 // bitwise select either a or negative 'a' (vneg_s16(a) returns negative 'a') based on ltMask
7227 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
7228 // res = masked & (~zeroMask)
7229 int16x4_t res = vbic_s16(masked, zeroMask);
7231 return vreinterpret_m64_s16(res);
7234 // Negate packed 32-bit integers in a when the corresponding signed 32-bit
7235 // integer in b is negative, and store the results in dst. Elements in dst are
7236 // zeroed out when the corresponding element in b is zero.
7241 // dst[i+31:i] := -(a[i+31:i])
7242 // ELSE IF b[i+31:i] == 0
7245 // dst[i+31:i] := a[i+31:i]
7249 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
7250 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
7252 int32x2_t a = vreinterpret_s32_m64(_a);
7253 int32x2_t b = vreinterpret_s32_m64(_b);
7255 // signed shift right: faster than vclt
7256 // (b < 0) ? 0xFFFFFFFF : 0
7257 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
7259 // (b == 0) ? 0xFFFFFFFF : 0
7260 #if defined(__aarch64__)
7261 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
7263 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
7266 // bitwise select either a or negative 'a' (vneg_s32(a) returns negative 'a') based on ltMask
7268 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
7269 // res = masked & (~zeroMask)
7270 int32x2_t res = vbic_s32(masked, zeroMask);
7272 return vreinterpret_m64_s32(res);
7275 // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
7276 // in b is negative, and store the results in dst. Elements in dst are zeroed out
7277 // when the corresponding element in b is zero.
7282 // dst[i+7:i] := -(a[i+7:i])
7283 // ELSE IF b[i+7:i] == 0
7286 // dst[i+7:i] := a[i+7:i]
7290 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
7291 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
7293 int8x8_t a = vreinterpret_s8_m64(_a);
7294 int8x8_t b = vreinterpret_s8_m64(_b);
7296 // signed shift right: faster than vclt
7297 // (b < 0) ? 0xFF : 0
7298 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
7300 // (b == 0) ? 0xFF : 0
7301 #if defined(__aarch64__)
7302 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
7304 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
7307 // bitwise select either a or negative 'a' (vneg_s8(a) returns negative 'a') based on ltMask
7309 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
7310 // res = masked & (~zeroMask)
7311 int8x8_t res = vbic_s8(masked, zeroMask);
7313 return vreinterpret_m64_s8(res);
7318 // Blend packed 16-bit integers from a and b using control mask imm8, and store
7319 // the results in dst.
7324 // dst[i+15:i] := b[i+15:i]
7326 // dst[i+15:i] := a[i+15:i]
7329 // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
7330 // __constrange(0,255) int imm)
7331 #define _mm_blend_epi16(a, b, imm) \
7333 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
7334 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
7335 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
7336 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
7337 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
7338 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
7339 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
7340 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
7341 uint16x8_t _mask_vec = vld1q_u16(_mask); \
7342 uint16x8_t _a = vreinterpretq_u16_m128i(a); \
7343 uint16x8_t _b = vreinterpretq_u16_m128i(b); \
7344 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
7347 // Blend packed double-precision (64-bit) floating-point elements from a and b
7348 // using control mask imm8, and store the results in dst.
7349 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
7350 #define _mm_blend_pd(a, b, imm) \
7352 const uint64_t _mask[2] = { \
7353 ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
7354 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \
7355 uint64x2_t _mask_vec = vld1q_u64(_mask); \
7356 uint64x2_t _a = vreinterpretq_u64_m128d(a); \
7357 uint64x2_t _b = vreinterpretq_u64_m128d(b); \
7358 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
7361 // Blend packed single-precision (32-bit) floating-point elements from a and b
7362 // using mask, and store the results in dst.
7363 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
7364 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
7366 const uint32_t ALIGN_STRUCT(16)
7367 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7368 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7369 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7370 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7371 uint32x4_t mask = vld1q_u32(data);
7372 float32x4_t a = vreinterpretq_f32_m128(_a);
7373 float32x4_t b = vreinterpretq_f32_m128(_b);
7374 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
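// Usage note (illustrative): bit i of the immediate selects lane i from b, so
// _mm_blend_ps(a, b, 0x5) = {b0, a1, b2, a3}. The same convention applies to
// _mm_blend_epi16 and _mm_blend_pd above, with one mask bit per 16-bit or
// 64-bit lane respectively.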
7377 // Blend packed 8-bit integers from a and b using mask, and store the results in dst.
7383 // dst[i+7:i] := b[i+7:i]
7385 // dst[i+7:i] := a[i+7:i]
7388 FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
7390 // Use a signed shift right to create a mask with the sign bit
7392 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
7393 uint8x16_t a = vreinterpretq_u8_m128i(_a);
7394 uint8x16_t b = vreinterpretq_u8_m128i(_b);
7395 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
7398 // Blend packed double-precision (64-bit) floating-point elements from a and b
7399 // using mask, and store the results in dst.
7400 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
7401 FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
7404 vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
7405 #if defined(__aarch64__)
7406 float64x2_t a = vreinterpretq_f64_m128d(_a);
7407 float64x2_t b = vreinterpretq_f64_m128d(_b);
7408 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7410 uint64x2_t a = vreinterpretq_u64_m128d(_a);
7411 uint64x2_t b = vreinterpretq_u64_m128d(_b);
7412 return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
7416 // Blend packed single-precision (32-bit) floating-point elements from a and b
7417 // using mask, and store the results in dst.
7418 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
7419 FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
7421 // Use a signed shift right to create a mask with the sign bit
7423 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
7424 float32x4_t a = vreinterpretq_f32_m128(_a);
7425 float32x4_t b = vreinterpretq_f32_m128(_b);
7426 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
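// Note (illustrative): the variable blends above only consult the sign bit of
// each mask lane, as SSE4.1 specifies. The arithmetic shift right by 31 (or
// 63/7 for the other element widths) broadcasts that sign bit across the lane
// so that vbslq can act as a full-width bitwise select.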
7429 // Round the packed double-precision (64-bit) floating-point elements in a up
7430 // to an integer value, and store the results as packed double-precision
7431 // floating-point elements in dst.
7432 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
7433 FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
7435 #if defined(__aarch64__)
7436 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7438 double *f = (double *) &a;
7439 return _mm_set_pd(ceil(f[1]), ceil(f[0]));
7443 // Round the packed single-precision (32-bit) floating-point elements in a up to
7444 // an integer value, and store the results as packed single-precision
7445 // floating-point elements in dst.
7446 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
7447 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
7449 #if defined(__aarch64__)
7450 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
7452 float *f = (float *) &a;
7453 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
7457 // Round the lower double-precision (64-bit) floating-point element in b up to
7458 // an integer value, store the result as a double-precision floating-point
7459 // element in the lower element of dst, and copy the upper element from a to the
7460 // upper element of dst.
7461 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
7462 FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
7464 return _mm_move_sd(a, _mm_ceil_pd(b));
7467 // Round the lower single-precision (32-bit) floating-point element in b up to
7468 // an integer value, store the result as a single-precision floating-point
7469 // element in the lower element of dst, and copy the upper 3 packed elements
7470 // from a to the upper elements of dst.
7472 // dst[31:0] := CEIL(b[31:0])
7473 // dst[127:32] := a[127:32]
7475 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
7476 FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
7478 return _mm_move_ss(a, _mm_ceil_ps(b));
7481 // Compare packed 64-bit integers in a and b for equality, and store the results in dst.
7483 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
7485 #if defined(__aarch64__)
7486 return vreinterpretq_m128i_u64(
7487 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
7489 // ARMv7 lacks vceqq_u64
7490 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
7492 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
7493 uint32x4_t swapped = vrev64q_u32(cmp);
7494 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
7498 // Converts the four signed 16-bit integers in the lower 64 bits to four signed 32-bit integers.
7500 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
7502 return vreinterpretq_m128i_s32(
7503 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
7506 // Converts the two signed 16-bit integers in the lower 32 bits to two signed 64-bit integers.
7508 FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
7510 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7511 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7512 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7513 return vreinterpretq_m128i_s64(s64x2);
7516 // Converts the two signed 32-bit integers in the lower 64 bits to two signed 64-bit integers.
7518 FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
7520 return vreinterpretq_m128i_s64(
7521 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
7524 // Converts the eight signed 8-bit integers in the lower 64 bits to eight
7525 // signed 16-bit integers.
7526 FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
7528 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7529 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7530 return vreinterpretq_m128i_s16(s16x8);
7533 // Converts the four signed 8-bit integers in the lower 32 bits to four
7534 // signed 32-bit integers.
7535 FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
7537 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7538 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7539 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
7540 return vreinterpretq_m128i_s32(s32x4);
7543 // Converts the two signed 8-bit integers in the lower 16 bits to two
7544 // signed 64-bit integers.
7545 FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
7547 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
7548 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7549 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7550 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7551 return vreinterpretq_m128i_s64(s64x2);
7554 // Converts the four unsigned 16-bit integers in the lower 64 bits to four
7555 // unsigned 32-bit integers.
7556 FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
7558 return vreinterpretq_m128i_u32(
7559 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
7562 // Converts the two unsigned 16-bit integers in the lower 32 bits to two
7563 // unsigned 64-bit integers.
7564 FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
7566 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7567 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7568 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7569 return vreinterpretq_m128i_u64(u64x2);
7572 // Converts the two unsigned 32-bit integers in the lower 64 bits to two
7573 // unsigned 64-bit integers.
7574 FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
7576 return vreinterpretq_m128i_u64(
7577 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
7580 // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
7581 // and store the results in dst.
7582 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
7583 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
7585 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
7586 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
7587 return vreinterpretq_m128i_u16(u16x8);
7590 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
7591 // unsigned 32-bit integers.
7592 // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
7593 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
7595 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
7596 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7597 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
7598 return vreinterpretq_m128i_u32(u32x4);
7601 // Converts the two unsigned 8-bit integers in the lower 16 bits to two
7602 // unsigned 64-bit integers.
7603 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
7605 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
7606 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7607 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7608 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7609 return vreinterpretq_m128i_u64(u64x2);
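// Illustrative comparison of the sign- and zero-extending conversions above:
// for an input byte 0xFF, _mm_cvtepi8_epi32 yields -1 (0xFFFFFFFF) while
// _mm_cvtepu8_epi32 yields 255 (0x000000FF). The vmovl_s8/vmovl_u8 chains
// used here simply widen one step at a time (8 -> 16 -> 32 -> 64 bits) until
// the target width is reached.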
7612 // Conditionally multiply the packed double-precision (64-bit) floating-point
7613 // elements in a and b using the high 4 bits in imm8, sum the two products, and
7614 // conditionally store the sum in dst using the low 4 bits of imm8.
7615 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd
7616 FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
7618 // Generate mask value from constant immediate bit value
7619 const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
7620 const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
7621 #if !SSE2NEON_PRECISE_DP
7622 const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
7623 const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
7625 // Conditional multiplication
7626 #if !SSE2NEON_PRECISE_DP
7627 __m128d mul = _mm_mul_pd(a, b);
7628 const __m128d mulMask =
7629 _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
7630 __m128d tmp = _mm_and_pd(mul, mulMask);
7632 #if defined(__aarch64__)
7633 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
7634 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
7636 double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
7637 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
7640 double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
7641 double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
7643 __m128d tmp = _mm_set_pd(d1, d0);
7646 #if defined(__aarch64__)
7647 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
7649 double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
7651 // Conditionally store the sum
7652 const __m128d sumMask =
7653 _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
7654 __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
7658 // Conditionally multiply the packed single-precision (32-bit) floating-point
7659 // elements in a and b using the high 4 bits in imm8, sum the four products,
7660 // and conditionally store the sum in dst using the low 4 bits of imm.
7661 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
7662 FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7664 #if defined(__aarch64__)
7667 return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
7670 float32x4_t m = _mm_mul_ps(a, b);
7672 return _mm_set1_ps(vaddvq_f32(m));
7677 float32x4_t f32a = vreinterpretq_f32_m128(a);
7678 float32x4_t f32b = vreinterpretq_f32_m128(b);
7680 /* To improve the accuracy of floating-point summation, the Kahan summation
7681 * algorithm is used for each operation.
7684 _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
7686 _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
7688 _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
7690 _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
7694 (imm & 0x1) ? s : 0,
7695 (imm & 0x2) ? s : 0,
7696 (imm & 0x4) ? s : 0,
7697 (imm & 0x8) ? s : 0,
7699 return vreinterpretq_m128_f32(res);
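// Illustrative example: in the immediate, bits 4-7 choose which products
// a[i]*b[i] enter the sum and bits 0-3 choose which result lanes receive it.
// _mm_dp_ps(a, b, 0x71) therefore sums a0*b0 + a1*b1 + a2*b2 and stores the
// sum only in lane 0, with the remaining lanes set to 0.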
7702 // Extracts the selected signed or unsigned 32-bit integer from a and zero extends.
7704 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7705 #define _mm_extract_epi32(a, imm) \
7706 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7708 // Extracts the selected signed or unsigned 64-bit integer from a and zero extends.
7710 // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7711 #define _mm_extract_epi64(a, imm) \
7712 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7714 // Extracts the selected signed or unsigned 8-bit integer from a and zero extends.
7716 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
7717 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
7718 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7720 // Extracts the selected single-precision (32-bit) floating-point element from a.
7721 // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7722 #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7724 // Round the packed double-precision (64-bit) floating-point elements in a down
7725 // to an integer value, and store the results as packed double-precision
7726 // floating-point elements in dst.
7727 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
7728 FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7730 #if defined(__aarch64__)
7731 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7733 double *f = (double *) &a;
7734 return _mm_set_pd(floor(f[1]), floor(f[0]));
7738 // Round the packed single-precision (32-bit) floating-point elements in a down
7739 // to an integer value, and store the results as packed single-precision
7740 // floating-point elements in dst.
7741 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
7742 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7744 #if defined(__aarch64__)
7745 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7747 float *f = (float *) &a;
7748 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7752 // Round the lower double-precision (64-bit) floating-point element in b down to
7753 // an integer value, store the result as a double-precision floating-point
7754 // element in the lower element of dst, and copy the upper element from a to the
7755 // upper element of dst.
7756 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
7757 FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7759 return _mm_move_sd(a, _mm_floor_pd(b));
7762 // Round the lower single-precision (32-bit) floating-point element in b down to
7763 // an integer value, store the result as a single-precision floating-point
7764 // element in the lower element of dst, and copy the upper 3 packed elements
7765 // from a to the upper elements of dst.
7767 // dst[31:0] := FLOOR(b[31:0])
7768 // dst[127:32] := a[127:32]
7770 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
7771 FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7773 return _mm_move_ss(a, _mm_floor_ps(b));
7776 // Inserts the least significant 32 bits of b into the selected 32-bit integer of a.
7778 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7779 // __constrange(0,4) int imm)
7780 #define _mm_insert_epi32(a, b, imm) \
7782 vreinterpretq_m128i_s32( \
7783 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
7786 // Inserts the least significant 64 bits of b into the selected 64-bit integer of a.
7788 // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7789 // __constrange(0,2) int imm)
7790 #define _mm_insert_epi64(a, b, imm) \
7792 vreinterpretq_m128i_s64( \
7793 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
7796 // Inserts the least significant 8 bits of b into the selected 8-bit integer of a.
7798 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7799 // __constrange(0,16) int imm)
7800 #define _mm_insert_epi8(a, b, imm) \
7802 vreinterpretq_m128i_s8( \
7803 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
7806 // Copy a to tmp, then insert a single-precision (32-bit) floating-point
7807 // element from b into tmp using the control in imm8. Store tmp to dst using
7808 // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7809 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
7810 #define _mm_insert_ps(a, b, imm8) \
7812 float32x4_t tmp1 = \
7813 vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \
7814 vreinterpretq_f32_m128(a), 0); \
7815 float32x4_t tmp2 = \
7816 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
7817 ((imm8 >> 4) & 0x3)); \
7818 const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7819 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7820 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7821 ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \
7822 uint32x4_t mask = vld1q_u32(data); \
7823 float32x4_t all_zeros = vdupq_n_f32(0); \
7825 vreinterpretq_m128_f32( \
7826 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
7829 // epi versions of min/max
7830 // Computes the pairwise maximums of the four signed 32-bit integer values of a and b.
7833 // A 128-bit parameter that can be defined with the following equations:
7834 // r0 := (a0 > b0) ? a0 : b0
7835 // r1 := (a1 > b1) ? a1 : b1
7836 // r2 := (a2 > b2) ? a2 : b2
7837 // r3 := (a3 > b3) ? a3 : b3
7839 // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
7840 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7842 return vreinterpretq_m128i_s32(
7843 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7846 // Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst.
7848 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
7849 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7851 return vreinterpretq_m128i_s8(
7852 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7855 // Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.
7857 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
7858 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7860 return vreinterpretq_m128i_u16(
7861 vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7864 // Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.
7866 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
7867 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7869 return vreinterpretq_m128i_u32(
7870 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7873 // Computes the pairwise minima of the four signed 32-bit integer values of a and b.
7876 // A 128-bit parameter that can be defined with the following equations:
7877 // r0 := (a0 < b0) ? a0 : b0
7878 // r1 := (a1 < b1) ? a1 : b1
7879 // r2 := (a2 < b2) ? a2 : b2
7880 // r3 := (a3 < b3) ? a3 : b3
7882 // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
7883 FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7885 return vreinterpretq_m128i_s32(
7886 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7889 // Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst.
7891 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
7892 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7894 return vreinterpretq_m128i_s8(
7895 vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7898 // Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
7900 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
7901 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7903 return vreinterpretq_m128i_u16(
7904 vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7907 // Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
7909 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
7910 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7912 return vreinterpretq_m128i_u32(
7913 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7916 // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7917 // in a, store the minimum and index in dst, and zero the remaining bits in dst.
7920 // min[15:0] := a[15:0]
7923 // IF a[i+15:i] < min[15:0]
7925 // min[15:0] := a[i+15:i]
7928 // dst[15:0] := min[15:0]
7929 // dst[18:16] := index[2:0]
7932 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
7933 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7936 uint16_t min, idx = 0;
7937 // Find the minimum value
7938 #if defined(__aarch64__)
7939 min = vminvq_u16(vreinterpretq_u16_m128i(a));
7942 tmp = vreinterpret_m64_u16(
7943 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7944 vget_high_u16(vreinterpretq_u16_m128i(a))));
7945 tmp = vreinterpret_m64_u16(
7946 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7947 tmp = vreinterpret_m64_u16(
7948 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7949 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7951 // Get the index of the minimum value
7953 for (i = 0; i < 8; i++) {
7954 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7958 a = _mm_srli_si128(a, 2);
7961 dst = _mm_setzero_si128();
7962 dst = vreinterpretq_m128i_u16(
7963 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
7964 dst = vreinterpretq_m128i_u16(
7965 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
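// Illustrative example: for a = {9, 3, 7, 3, 11, 200, 50, 6},
// _mm_minpos_epu16 returns the minimum value 3 in lane 0 and the index of its
// first occurrence (1) in lane 1, with all remaining lanes zeroed.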
7969 // Compute the sum of absolute differences (SADs) of quadruplets of unsigned
7970 // 8-bit integers in a compared to those in b, and store the 16-bit results in
7971 // dst. Eight SADs are performed using one quadruplet from b and eight
7972 // quadruplets from a. One quadruplet is selected from b starting at on the
7973 // offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
7974 // integers selected from a starting at the offset specified in imm8.
7975 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8
7976 FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
7980 switch (imm & 0x4) {
7983 _a = vreinterpretq_u8_m128i(a);
7986 _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
7987 vreinterpretq_u32_m128i(a), 1));
7990 #if defined(__GNUC__) || defined(__clang__)
7991 __builtin_unreachable();
7996 switch (imm & 0x3) {
7998 _b = vreinterpretq_u8_u32(
7999 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
8002 _b = vreinterpretq_u8_u32(
8003 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
8006 _b = vreinterpretq_u8_u32(
8007 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
8010 _b = vreinterpretq_u8_u32(
8011 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
8014 #if defined(__GNUC__) || defined(__clang__)
8015 __builtin_unreachable();
8020 int16x8_t c04, c15, c26, c37;
8021 uint8x8_t low_b = vget_low_u8(_b);
8022 c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8023 _a = vextq_u8(_a, _a, 1);
8024 c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8025 _a = vextq_u8(_a, _a, 1);
8026 c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8027 _a = vextq_u8(_a, _a, 1);
8028 c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8029 #if defined(__aarch64__)
8031 c04 = vpaddq_s16(c04, c26);
8033 c15 = vpaddq_s16(c15, c37);
8036 vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8038 vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8039 return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
8040 vreinterpretq_s16_s32(trn2_c)));
8042 int16x4_t c01, c23, c45, c67;
8043 c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
8044 c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
8045 c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
8046 c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
8048 return vreinterpretq_m128i_s16(
8049 vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
8053 // Multiply the low signed 32-bit integers from each packed 64-bit element in
8054 // a and b, and store the signed 64-bit results in dst.
8056 // r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
8057 // r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
8058 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
8060 // vmull_s32 upcasts instead of masking, so we downcast.
8061 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
8062 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
8063 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
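// Usage note (illustrative): only the even 32-bit lanes participate, so with
// a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3} the result is the two 64-bit
// products {(int64_t) a0 * b0, (int64_t) a2 * b2}; vmovn_s64 above extracts
// exactly those even lanes.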
8066 // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
8067 // unsigned 32-bit integers from b.
8068 // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
8069 FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
8071 return vreinterpretq_m128i_s32(
8072 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
8075 // Packs the 8 signed 32-bit integers from a and b into unsigned 16-bit
8076 // integers and saturates.
8078 // r0 := UnsignedSaturate(a0)
8079 // r1 := UnsignedSaturate(a1)
8080 // r2 := UnsignedSaturate(a2)
8081 // r3 := UnsignedSaturate(a3)
8082 // r4 := UnsignedSaturate(b0)
8083 // r5 := UnsignedSaturate(b1)
8084 // r6 := UnsignedSaturate(b2)
8085 // r7 := UnsignedSaturate(b3)
8086 FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
8088 return vreinterpretq_m128i_u16(
8089 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
8090 vqmovun_s32(vreinterpretq_s32_m128i(b))));
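// Illustrative example: _mm_packus_epi32 applied to {-1, 70000, 0, 32768} and
// {1, 2, 3, 4} yields {0, 65535, 0, 32768, 1, 2, 3, 4}: negative inputs clamp
// to 0 and inputs above 65535 clamp to 65535, which is exactly what
// vqmovun_s32 implements.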

// Round the packed double-precision (64-bit) floating-point elements in a
// using the rounding parameter, and store the results as packed
// double-precision floating-point elements in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
{
#if defined(__aarch64__)
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_pd(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_pd(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
    default:  //_MM_FROUND_CUR_DIRECTION
        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
    }
#else
    double *v_double = (double *) &a;

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        double res[2], tmp;
        for (int i = 0; i < 2; i++) {
            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
            double roundDown = floor(tmp);  // Round down value
            double roundUp = ceil(tmp);     // Round up value
            double diffDown = tmp - roundDown;
            double diffUp = roundUp - tmp;
            if (diffDown < diffUp) {
                /* If it's closer to the round down value, then use it */
                res[i] = roundDown;
            } else if (diffDown > diffUp) {
                /* If it's closer to the round up value, then use it */
                res[i] = roundUp;
            } else {
                /* If it's equidistant between round up and round down value,
                 * pick the one which is an even number */
                double half = roundDown / 2;
                if (half != floor(half)) {
                    /* If the round down value is odd, return the round up
                     * value */
                    res[i] = roundUp;
                } else {
                    /* If the round up value is odd, return the round down
                     * value */
                    res[i] = roundDown;
                }
            }
            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
        }
        return _mm_set_pd(res[1], res[0]);
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_pd(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_pd(a);
    }
    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
#endif
}
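
// Illustrative usage (a sketch with made-up values): in the nearest-int mode
// ties round to the even neighbour.
//
//   __m128d v = _mm_set_pd(2.5, 1.5);
//   __m128d r =
//       _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   // r = {2.0, 2.0}: 1.5 and 2.5 both round to 2.0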

// Round the packed single-precision (32-bit) floating-point elements in a
// using the rounding parameter, and store the results as packed
// single-precision floating-point elements in dst.
// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
{
#if defined(__aarch64__)
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_ps(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_ps(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
    default:  //_MM_FROUND_CUR_DIRECTION
        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
    }
#else
    float *v_float = (float *) &a;

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        uint32x4_t signmask = vdupq_n_u32(0x80000000);
        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5] */
        int32x4_t r_trunc = vcvtq_s32_f32(
            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
        float32x4_t delta = vsubq_f32(
            vreinterpretq_f32_m128(a),
            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
        uint32x4_t is_delta_half =
            vceqq_f32(delta, half); /* delta == +/- 0.5 */
        return vreinterpretq_m128_f32(
            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_ps(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_ps(a);
    }
    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
#endif
}
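
// Illustrative usage (a sketch with made-up values): the TO_ZERO mode simply
// drops the fractional part.
//
//   __m128 v = _mm_set_ps(-2.5f, -1.5f, 0.5f, 1.5f);
//   __m128 r = _mm_round_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
//   // 32-bit lanes 0..3: 1.0f, 0.0f, -1.0f, -2.0f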

// Round the lower double-precision (64-bit) floating-point element in b using
// the rounding parameter, store the result as a double-precision
// floating-point element in the lower element of dst, and copy the upper
// element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
{
    return _mm_move_sd(a, _mm_round_pd(b, rounding));
}

// Round the lower single-precision (32-bit) floating-point element in b using
// the rounding parameter, store the result as a single-precision
// floating-point element in the lower element of dst, and copy the upper 3
// packed elements from a to the upper elements of dst. Rounding is done
// according to the rounding[3:0] parameter, which can be one of:
//     (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest,
//                                                     // suppress exceptions
//     (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down,
//                                                     // suppress exceptions
//     (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up,
//                                                     // suppress exceptions
//     (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate,
//                                                     // suppress exceptions
//     _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
{
    return _mm_move_ss(a, _mm_round_ps(b, rounding));
}

// Load 128-bits of integer data from memory into dst using a non-temporal
// memory hint. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
//
//   dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
{
#if __has_builtin(__builtin_nontemporal_store)
    return __builtin_nontemporal_load(p);
#else
    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
#endif
}

// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
// all 1's, and return 1 if the result is zero, otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
FORCE_INLINE int _mm_test_all_ones(__m128i a)
{
    return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
           ~(uint64_t) 0;
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and return 1 if the result is zero, otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
{
    int64x2_t a_and_mask =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
}
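
// Illustrative usage (a sketch with made-up values):
//
//   __m128i v    = _mm_set_epi32(0, 0, 0, 8);  // binary 1000 in lane 0
//   __m128i mask = _mm_set_epi32(0, 0, 0, 7);  // binary 0111 in lane 0
//   int r = _mm_test_all_zeros(v, mask);       // 1, since (v & mask) == 0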

// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result
// is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are
// zero, otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
{
    uint64x2_t zf =
        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
    uint64x2_t cf =
        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
    // ZF == 0 requires (a & mask) to be non-zero, and CF == 0 requires
    // (~a & mask) to be non-zero; the result is 1 only when both hold.
    // Note: AND-ing zf and cf together would always yield zero, since no bit
    // can be set in both a and ~a, so the two halves are tested separately.
    return ((vgetq_lane_u64(zf, 0) | vgetq_lane_u64(zf, 1)) != 0) &&
           ((vgetq_lane_u64(cf, 0) | vgetq_lane_u64(cf, 1)) != 0);
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return the CF value.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
{
    int64x2_t s64 =
        vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
                  vreinterpretq_s64_m128i(b));
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return the ZF value.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
{
    int64x2_t s64 =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
}
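
// Illustrative usage (a sketch with made-up values): _mm_testz_si128 reports
// whether a and b share no set bits, while _mm_testc_si128 reports whether
// every bit set in b is also set in a.
//
//   __m128i a = _mm_set_epi32(0, 0, 0, 0x0f0);
//   __m128i b = _mm_set_epi32(0, 0, 0, 0x00f);
//   int z = _mm_testz_si128(a, b);  // 1: (a & b) == 0
//   int c = _mm_testc_si128(a, b);  // 0: (~a & b) != 0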

// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
// in b for greater than.
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    return vreinterpretq_m128i_s64(vshrq_n_s64(
        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
        63));
#endif
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 16-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc = _mm_crc32_u8(crc, v & 0xff);
    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 32-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc = _mm_crc32_u16(crc, v & 0xffff);
    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 64-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
    crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 8-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc ^= v;
    for (int bit = 0; bit < 8; bit++) {
        if (crc & 1)
            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
        else
            crc = (crc >> 1);
    }
#endif
    return crc;
}
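
// Illustrative usage (a sketch, with the usual CRC32-C init/finalize
// convention assumed on the caller's side):
//
//   uint32_t crc = 0xFFFFFFFF;
//   const uint8_t msg[] = {'a', 'b', 'c'};
//   for (size_t i = 0; i < sizeof(msg); i++)
//       crc = _mm_crc32_u8(crc, msg[i]);
//   crc ^= 0xFFFFFFFF;  // final xor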

#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
#define SSE2NEON_AES_DATA(w)                                           \
    {                                                                  \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
    }
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

// In the absence of crypto extensions, implement aesenc using regular NEON
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information.
// Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
                                         0xc, 0x1, 0x6, 0xb};
    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);

    // mix columns
    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
     (uint32_t)(b0))
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));

    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    /* FIXME: optimize for NEON */
    uint8_t v[4][4] = {
        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
    };
    for (int i = 0; i < 16; i++)
        vreinterpretq_nth_u8_m128i(a, i) =
            v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
    return a;
}
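
// Illustrative usage (a sketch): a full AES-128 block encryption expressed
// with these intrinsics, assuming rk[0..10] already holds the expanded key
// schedule (key expansion itself is not shown here).
//
//   __m128i state = _mm_xor_si128(plaintext, rk[0]);
//   for (int round = 1; round < 10; round++)
//       state = _mm_aesenc_si128(state, rk[round]);
//   state = _mm_aesenclast_si128(state, rk[10]);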

// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
// This instruction generates a round key for AES encryption. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
//
// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
{
    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
    for (int i = 0; i < 4; ++i) {
        ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
        ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
    }
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
}
#undef SSE2NEON_AES_DATA

#else /* __ARM_FEATURE_CRYPTO */
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
// AESMC and then manually applying the real key as an xor operation. This
// unfortunately means an additional xor op; the compiler should be able to
// optimize this away for repeated calls however. See
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
// for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
        vreinterpretq_u8_m128i(b));
}

// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
                         RoundKey);
}

FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    // AESE does ShiftRows and SubBytes on A
    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

    uint8x16_t dest = {
        // Undo ShiftRows step from AESE and extract X1 and X3
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
    };
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
}
#endif /* __ARM_FEATURE_CRYPTO */

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:  // low halves of a and b
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:  // high half of a, low half of b
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:  // low half of a, high half of b
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:  // high halves of a and b
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}
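
// Illustrative usage (a sketch with made-up values): carry-less multiplication
// XORs shifted copies of one operand instead of adding them.
//
//   __m128i a = _mm_set_epi64x(0, 0x3);  // low operand: 0b011
//   __m128i b = _mm_set_epi64x(0, 0x5);  // low operand: 0b101
//   __m128i r = _mm_clmulepi64_si128(a, b, 0x00);
//   // low 64 bits of r: 0xF, since (0b101 << 1) ^ (0b101 << 0) = 0b1111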

FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
}

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    vst1_u32(&count, count32x2_val);
    return count;
#endif
}
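
// Illustrative usage (a sketch with a made-up value):
//
//   int n = _mm_popcnt_u32(0xF0F0u);  // n == 8, eight bits are set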

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}

FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;

#if defined(__aarch64__)
    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
#else
    asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}
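
// Illustrative usage (a sketch): these helpers are normally reached through
// the _MM_SET_DENORMALS_ZERO_MODE/_MM_GET_DENORMALS_ZERO_MODE wrapper macros,
// assuming those macros are defined earlier in this header.
//
//   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
//   unsigned int mode = _MM_GET_DENORMALS_ZERO_MODE();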

#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

#endif