src/sse2neon.h

   1 #ifndef SSE2NEON_H
   2 #define SSE2NEON_H
   3
   4 #include <stdlib.h> // For aligned_malloc
   5
// This header file provides a simple API translation layer
// from SSE intrinsics to their corresponding Arm/AArch64 NEON versions
   8 //
   9 // This header file does not yet translate all of the SSE intrinsics.
  10 //
  11 // Contributors to this work are:
  12 //   John W. Ratcliff <jratcliffscarab@gmail.com>
  13 //   Brandon Rowlett <browlett@nvidia.com>
  14 //   Ken Fast <kfast@gdeb.com>
  15 //   Eric van Beurden <evanbeurden@nvidia.com>
  16 //   Alexander Potylitsin <apotylitsin@nvidia.com>
  17 //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
  18 //   Jim Huang <jserv@biilabs.io>
  19 //   Mark Cheng <marktwtn@biilabs.io>
  20 //   Malcolm James MacLeod <malcolm@gulden.com>
  21 //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
  22 //   Sebastian Pop <spop@amazon.com>
  23 //   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
  24 //   Danila Kutenin <danilak@google.com>
  25 //   François Turban (JishinMaster) <francois.turban@gmail.com>
  26 //   Pei-Hsuan Hung <afcidk@gmail.com>
  27 //   Yang-Hao Yuan <yanghau@biilabs.io>
  28 //   Syoyo Fujita <syoyo@lighttransport.com>
  29 //   Brecht Van Lommel <brecht@blender.org>
  30
  31 /*
  32  * sse2neon is freely redistributable under the MIT License.
  33  *
  34  * Permission is hereby granted, free of charge, to any person obtaining a copy
  35  * of this software and associated documentation files (the "Software"), to deal
  36  * in the Software without restriction, including without limitation the rights
  37  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  38  * copies of the Software, and to permit persons to whom the Software is
  39  * furnished to do so, subject to the following conditions:
  40  *
  41  * The above copyright notice and this permission notice shall be included in
  42  * all copies or substantial portions of the Software.
  43  *
  44  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  45  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  46  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  47  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  48  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  49  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  50  * SOFTWARE.
  51  */
  52
  53 /* Tunable configurations */
  54
/* Enable precise implementation of math operations.
 * Each switch below defaults to (0) and is only defined here when the user has
 * not already defined it. Enabling a switch would slow down the computation a
 * bit, but gives results consistent with x86 SSE (e.g. it would solve a hole
 * or NaN pixel in the rendering result).
 */
/* Affects _mm_min|max_ps|ss|pd|sd */
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
/* Affects _mm_rcp_ps and _mm_div_ps */
#ifndef SSE2NEON_PRECISE_DIV
#define SSE2NEON_PRECISE_DIV (0)
#endif
/* Affects _mm_sqrt_ps and _mm_rsqrt_ps */
#ifndef SSE2NEON_PRECISE_SQRT
#define SSE2NEON_PRECISE_SQRT (0)
#endif
/* Affects _mm_dp_pd */
#ifndef SSE2NEON_PRECISE_DP
#define SSE2NEON_PRECISE_DP (0)
#endif
  75
/* compiler specific definitions
 *
 * FORCE_INLINE        - storage class / inlining specifier put in front of
 *                       the functions of this header.
 * ALIGN_STRUCT(x)     - requests x-byte alignment for a struct or union.
 * _sse2neon_likely    - branch hint: the condition is expected to be true.
 * _sse2neon_unlikely  - branch hint: the condition is expected to be false.
 */
#if defined(__GNUC__) || defined(__clang__)
/* Save any FORCE_INLINE / ALIGN_STRUCT already defined by the includer before
 * they are redefined below. NOTE(review): the matching pop_macro is presumably
 * at the end of the file, which is not visible here. */
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
/* !!(x) normalizes the condition to 0 or 1 before it reaches the builtin. */
#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
#else /* non-GNU / non-clang compilers */
#warning "Macro name collisions may happen with unsupported compiler."
/* Existing definitions are kept here; only missing ones are supplied. */
#ifndef FORCE_INLINE
#define FORCE_INLINE static inline
#endif
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
/* No branch-hint builtin is assumed: the hints are plain pass-throughs. */
#define _sse2neon_likely(x) (x)
#define _sse2neon_unlikely(x) (x)
#endif
  95
  96 #include <stdint.h>
  97 #include <stdlib.h>
  98
  99 /* Architecture-specific build options */
 100 /* FIXME: #pragma GCC push_options is only available on GCC */
 101 #if defined(__GNUC__)
 102 #if defined(__arm__) && __ARM_ARCH == 7
 103 /* According to ARM C Language Extensions Architecture specification,
 104  * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
 105  * architecture supported.
 106  */
 107 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
 108 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
 109 #endif
 110 #if !defined(__clang__)
 111 #pragma GCC push_options
 112 #pragma GCC target("fpu=neon")
 113 #endif
 114 #elif defined(__aarch64__)
 115 #if !defined(__clang__)
 116 #pragma GCC push_options
 117 #pragma GCC target("+simd")
 118 #endif
 119 #else
 120 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
 121 #endif
 122 #endif
 123
 124 #include <arm_neon.h>
 125
/* Rounding functions require either Aarch64 instructions or libm fallback */
 127 #if !defined(__aarch64__)
 128 #include <math.h>
 129 #endif
 130
/* "__has_builtin" can be used to query support for built-in functions
 * provided by gcc/clang and other compilers that support it.
 * When the compiler does not provide it, a substitute is defined below.
 */
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9: __has_builtin(name) is pasted into HAS<name>,
 * so only the builtins that have a HAS... macro below expand to 1. */
#if defined(__GNUC__) && (__GNUC__ <= 9)
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1
#else
/* Any other compiler: every query expands to 0. */
#define __has_builtin(x) 0
#endif
#endif
 144
/**
 * MACRO for shuffle parameter for _mm_shuffle_ps().
 * Each argument is a digit in [0, 3] that selects one source element.
 * Argument fp3 selects the element of argument "b" of _mm_shuffle_ps that
 * will be placed in fp3 of the result; fp2 does the same for fp2 of the
 * result. Argument fp1 selects the element of argument "a" of _mm_shuffle_ps
 * that will be placed in fp1 of the result; fp0 does the same for fp0 of the
 * result.
 *
 * The four selectors are packed two bits each into one integer, with fp3 in
 * bits 7:6 and fp0 in bits 1:0.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
 155
/* Rounding mode macros. */
/* _MM_FROUND_*: a rounding direction (low bits) that can be OR-ed with an
 * exception-control flag (_MM_FROUND_NO_EXC or _MM_FROUND_RAISE_EXC). */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
#define _MM_FROUND_RAISE_EXC 0x00
/* Ready-made combinations of a direction and an exception flag. */
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
/* _MM_ROUND_*: a separate set of rounding-mode values.
 * NOTE(review): presumably the values exchanged with _MM_GET_ROUNDING_MODE
 * and its setter - confirm against their definitions. */
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
/* Flush zero mode macros. */
#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000
/* Denormals are zeros mode macros. */
#define _MM_DENORMALS_ZERO_MASK 0x0040
#define _MM_DENORMALS_ZERO_ON 0x0040
#define _MM_DENORMALS_ZERO_OFF 0x0000

/* indicate immediate constant argument in a given range */
/* The range (a, b) is documentation only: the macro expands to plain
 * "const" and the bounds are not checked. */
#define __constrange(a, b) const
 185
/* A few intrinsics accept traditional data types like ints or floats, but
 * most operate on data types that are specific to SSE.
 * If a vector type ends in d, it contains doubles, and if it does not have
 * a suffix, it contains floats. An integer vector type can contain any type
 * of integer, from chars to shorts to unsigned long longs.
 *
 * Each SSE type is an alias of one fixed NEON vector type; the
 * vreinterpret*_m128* / *_m64 macros convert to and from the other NEON
 * element types.
 */
typedef int64x1_t __m64;    /* 64-bit vector, stored as one int64 lane */
typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On ARM 32-bit architecture, the float64x2_t is not supported.
// The data type __m128d should be represented in a different way for related
// intrinsic conversion.
#if defined(__aarch64__)
typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
#else
typedef float32x4_t __m128d; /* same 128 bits, carried as 4 float lanes */
#endif
typedef int64x2_t __m128i; /* 128-bit vector containing integers */
 203
// __int64 is defined in the Intrinsics Guide, and maps to a different data
// type depending on the data model.
// It is only defined here when not building for Windows and when __int64 is
// not already a macro.
#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
#if (defined(__x86_64__) || defined(__i386__))
#define __int64 long long
#else
#define __int64 int64_t
#endif
#endif
 213
/* type-safe casting between types
 *
 * Naming: vreinterpret[q]_<to>_<from>(x). The SSE names (m128, m128i, m64)
 * stand for the NEON type each SSE type is aliased to, so the cast to or from
 * that exact type is the identity "(x)" and every other cast forwards to the
 * matching NEON vreinterpret intrinsic.
 */

/* NEON vector -> __m128 (float32x4_t) */
#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)

#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)

/* __m128 (float32x4_t) -> NEON vector */
#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)

#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)

/* NEON vector -> __m128i (int64x2_t) */
#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)

#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)

/* __m128i (int64x2_t) -> NEON vector */
#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)

#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
#define vreinterpretq_s64_m128i(x) (x)

#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)

/* NEON 64-bit vector -> __m64 (int64x1_t) */
#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
#define vreinterpret_m64_s64(x) (x)

#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)

#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)

/* __m64 (int64x1_t) -> NEON 64-bit vector */
#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)

#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
#define vreinterpret_s64_m64(x) (x)

#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
 292
 293 #if defined(__aarch64__)
 294 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
 295 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
 296
 297 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
 298
 299 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
 300 #define vreinterpretq_m128d_f64(x) (x)
 301
 302 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
 303
 304 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
 305 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
 306
 307 #define vreinterpretq_f64_m128d(x) (x)
 308 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
 309 #else
 310 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
 311 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
 312
 313 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
 314 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
 315
 316 #define vreinterpretq_m128d_f32(x) (x)
 317
 318 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
 319
 320 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
 321 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
 322
 323 #define vreinterpretq_f32_m128d(x) (x)
 324 #endif
 325
// A union called 'SIMDVec' is defined in this header file. It can be used by
// applications which attempt to access the contents of an __m128 struct
// directly.  It is important to note that accessing the __m128 struct directly
// is considered bad coding practice by Microsoft: @see:
// https://docs.microsoft.com/en-us/cpp/cpp/m128
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly so the developer can use the SIMDVec as an alias for it.  Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// The union is intended to allow direct access to an __m128 variable using the
// names that the MSVC compiler provides.  It should really only be used when
// trying to access the members of the vector as integer values.  GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
//
// Ideally direct accesses to SIMD vectors should not be used since it can cause
// a performance hit.  If it really is needed however, the original __m128
// variable can be aliased with a pointer to this union and used to access
// individual components.  The use of this union should be hidden behind a macro
// that is used throughout the codebase to access the members instead of always
// declaring this type of variable.
//
// Every member views the same 16 bytes; the union is 16-byte aligned.
typedef union ALIGN_STRUCT(16) SIMDVec {
    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
    int8_t m128_i8[16];    // as signed 8-bit integers.
    int16_t m128_i16[8];   // as signed 16-bit integers.
    int32_t m128_i32[4];   // as signed 32-bit integers.
    int64_t m128_i64[2];   // as signed 64-bit integers.
    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
} SIMDVec;
 360
 361 // casting using SIMDVec
 362 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
 363 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
 364 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
 365
/* SSE macros */
/* The SSE mode getter/setter names are aliases for the _sse2neon_mm_*
 * functions of this header. */
#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
 371
 372 // Function declaration
 373 // SSE
 374 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
 375 FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
 376 FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
 377 FORCE_INLINE __m128 _mm_set_ps1(float);
 378 FORCE_INLINE __m128 _mm_setzero_ps(void);
 379 // SSE2
 380 FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
 381 FORCE_INLINE __m128i _mm_castps_si128(__m128);
 382 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
 383 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
 384 FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
 385 FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
 386 FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
 387 FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
 388 FORCE_INLINE __m128d _mm_set_pd(double, double);
 389 FORCE_INLINE __m128i _mm_set1_epi32(int);
 390 FORCE_INLINE __m128i _mm_setzero_si128();
 391 // SSE4.1
 392 FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
 393 FORCE_INLINE __m128 _mm_ceil_ps(__m128);
 394 FORCE_INLINE __m128d _mm_floor_pd(__m128d);
 395 FORCE_INLINE __m128 _mm_floor_ps(__m128);
 396 FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
 397 FORCE_INLINE __m128 _mm_round_ps(__m128, int);
 398 // SSE4.2
 399 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
 400
 401 /* Backwards compatibility for compilers with lack of specific type support */
 402
 403 // Older gcc does not define vld1q_u8_x4 type
 404 #if defined(__GNUC__) && !defined(__clang__) &&                        \
 405     ((__GNUC__ <= 10 && defined(__arm__)) ||                           \
 406      (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
 407      (__GNUC__ <= 9 && defined(__aarch64__)))
 408 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 409 {
 410     uint8x16x4_t ret;
 411     ret.val[0] = vld1q_u8(p + 0);
 412     ret.val[1] = vld1q_u8(p + 16);
 413     ret.val[2] = vld1q_u8(p + 32);
 414     ret.val[3] = vld1q_u8(p + 48);
 415     return ret;
 416 }
 417 #else
 418 // Wraps vld1q_u8_x4
 419 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 420 {
 421     return vld1q_u8_x4(p);
 422 }
 423 #endif
 424
 425 /* Function Naming Conventions
 426  * The naming convention of SSE intrinsics is straightforward. A generic SSE
 427  * intrinsic function is given as follows:
 428  *   _mm_<name>_<data_type>
 429  *
 430  * The parts of this format are given as follows:
 431  * 1. <name> describes the operation performed by the intrinsic
 432  * 2. <data_type> identifies the data type of the function's primary arguments
 433  *
 434  * This last part, <data_type>, is a little complicated. It identifies the
 435  * content of the input values, and can be set to any of the following values:
 436  * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
 438  * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 439  *                            signed integers
 440  * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 441  *                            unsigned integers
 442  * + si128 - unspecified 128-bit vector or 256-bit vector
 443  * + m128/m128i/m128d - identifies input vector types when they are different
 444  *                      than the type of the returned vector
 445  *
 446  * For example, _mm_setzero_ps. The _mm implies that the function returns
 447  * a 128-bit vector. The _ps at the end implies that the argument vectors
 448  * contain floats.
 449  *
 450  * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 451  *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
 452  *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 453  *   // Set packed 8-bit integers
 454  *   // 128 bits, 16 chars, per 8 bits
 455  *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
 456  *                                  4, 5, 12, 13, 6, 7, 14, 15);
 457  *   // Shuffle packed 8-bit integers
 458  *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
 459  *
 460  * Data (Number, Binary, Byte Index):
 461     +------+------+-------------+------+------+-------------+
 462     |      1      |      2      |      3      |      4      | Number
 463     +------+------+------+------+------+------+------+------+
 464     | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
 465     +------+------+------+------+------+------+------+------+
 466     |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
 467     +------+------+------+------+------+------+------+------+
 468
 469     +------+------+------+------+------+------+------+------+
 470     |      5      |      6      |      7      |      8      | Number
 471     +------+------+------+------+------+------+------+------+
 472     | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
 473     +------+------+------+------+------+------+------+------+
 474     |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
 475     +------+------+------+------+------+------+------+------+
 476  * Index (Byte Index):
 477     +------+------+------+------+------+------+------+------+
 478     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
 479     +------+------+------+------+------+------+------+------+
 480
 481     +------+------+------+------+------+------+------+------+
 482     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
 483     +------+------+------+------+------+------+------+------+
 484  * Result:
 485     +------+------+------+------+------+------+------+------+
 486     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
 487     +------+------+------+------+------+------+------+------+
 488     | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
 489     +------+------+------+------+------+------+------+------+
 490     |     256     |      2      |      5      |      6      | Number
 491     +------+------+------+------+------+------+------+------+
 492
 493     +------+------+------+------+------+------+------+------+
 494     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
 495     +------+------+------+------+------+------+------+------+
 496     | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
 497     +------+------+------+------+------+------+------+------+
 498     |      3      |      7      |      4      |      8      | Number
 499     +------+------+------+------+------+------+-------------+
 500  */
 501
 502 /* Constants for use with _mm_prefetch.  */
 503 enum _mm_hint {
 504     _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
 505     _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
 506     _MM_HINT_T1 = 2,   /* load data to L2 cache only */
 507     _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
 508     _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
 509     _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
 510     _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
 511     _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
 512 };
 513
// The bit field mapping to the FPCR (floating-point control register).
// Only bits 22, 23 and 24 are named; the res* members pad out the rest so
// that the named fields land on their bit positions (this relies on the
// compiler allocating bit-fields from the least significant bit upward).
// NOTE(review): bits 22-23 are presumably the rounding-mode field and bit 24
// the flush-to-zero flag - confirm against the functions that use this struct.
typedef struct {
    uint16_t res0;      // bits 0-15, not interpreted here
    uint8_t res1 : 6;   // bits 16-21, not interpreted here
    uint8_t bit22 : 1;  // bit 22
    uint8_t bit23 : 1;  // bit 23
    uint8_t bit24 : 1;  // bit 24
    uint8_t res2 : 7;   // bits 25-31, not interpreted here
#if defined(__aarch64__)
    uint32_t res3;      // widens the struct to 64 bits on AArch64
#endif
} fpcr_bitfield;
 526
 527 // Takes the upper 64 bits of a and places it in the low end of the result
 528 // Takes the lower 64 bits of b and places it into the high end of the result.
 529 FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
 530 {
 531     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
 532     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
 533     return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
 534 }
 535
 536 // takes the lower two 32-bit values from a and swaps them and places in high
 537 // end of result takes the higher two 32 bit values from b and swaps them and
 538 // places in low end of result.
 539 FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
 540 {
 541     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
 542     float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
 543     return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
 544 }
 545
 546 FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
 547 {
 548     float32x2_t a21 = vget_high_f32(
 549         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
 550     float32x2_t b03 = vget_low_f32(
 551         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
 552     return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
 553 }
 554
 555 FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
 556 {
 557     float32x2_t a03 = vget_low_f32(
 558         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
 559     float32x2_t b21 = vget_high_f32(
 560         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
 561     return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
 562 }
 563
 564 FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
 565 {
 566     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
 567     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
 568     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
 569 }
 570
 571 FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
 572 {
 573     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
 574     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
 575     return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
 576 }
 577
 578 FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
 579 {
 580     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
 581     float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
 582     return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
 583 }
 584
 585 // keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
 586 // high
 587 FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
 588 {
 589     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
 590     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
 591     return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
 592 }
 593
 594 FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
 595 {
 596     float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
 597     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
 598     return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
 599 }
 600
 601 FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
 602 {
 603     float32x2_t a22 =
 604         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
 605     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
 606     return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
 607 }
 608
 609 FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
 610 {
 611     float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
 612     float32x2_t b22 =
 613         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
 614     return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
 615 }
 616
 617 FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
 618 {
 619     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
 620     float32x2_t a22 =
 621         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
 622     float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
 623     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
 624     return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
 625 }
 626
 627 FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
 628 {
 629     float32x2_t a33 =
 630         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
 631     float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
 632     return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
 633 }
 634
 635 FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
 636 {
 637     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
 638     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
 639     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
 640     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
 641     return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
 642 }
 643
 644 FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
 645 {
 646     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
 647     float32_t b2 = vgetq_lane_f32(b, 2);
 648     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
 649     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
 650     return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
 651 }
 652
 653 FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
 654 {
 655     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
 656     float32_t b2 = vgetq_lane_f32(b, 2);
 657     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
 658     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
 659     return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
 660 }
 661
 662 // Kahan summation for accurate summation of floating-point numbers.
 663 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
 664 FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
 665 {
 666     y -= *c;
 667     float t = *sum + y;
 668     *c = (t - *sum) - y;
 669     *sum = t;
 670 }
 671
#if defined(__ARM_FEATURE_CRYPTO)
// Wraps vmull_p64: multiplies lane 0 of _a by lane 0 of _b as 64-bit
// polynomials and returns the 128-bit product reinterpreted as uint64x2_t.
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
    return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#else  // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
//
// Same interface as the vmull_p64 wrapper above: takes two 64-bit operands
// and returns a uint64x2_t.
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    // View both operands as eight 8-bit polynomials.
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4

    // Add cross products (addition of polynomials over GF(2) is XOR)
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    // Same pairing built from 64-bit halves: *_p0 holds the low halves,
    // *_p1 the high halves.
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products (byte rotation of the 128-bit vector via vext)
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill
 786
 787 // C equivalent:
 788 //   __m128i _mm_shuffle_epi32_default(__m128i a,
 789 //                                     __constrange(0, 255) int imm) {
 790 //       __m128i ret;
 791 //       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
 792 //       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
 793 //       return ret;
 794 //   }
 795 #define _mm_shuffle_epi32_default(a, imm)                                   \
 796     __extension__({                                                         \
 797         int32x4_t ret;                                                      \
 798         ret = vmovq_n_s32(                                                  \
 799             vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
 800         ret = vsetq_lane_s32(                                               \
 801             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
 802             ret, 1);                                                        \
 803         ret = vsetq_lane_s32(                                               \
 804             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
 805             ret, 2);                                                        \
 806         ret = vsetq_lane_s32(                                               \
 807             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
 808             ret, 3);                                                        \
 809         vreinterpretq_m128i_s32(ret);                                       \
 810     })
 811
 812 // Takes the upper 64 bits of a and places it in the low end of the result
 813 // Takes the lower 64 bits of a and places it into the high end of the result.
 814 FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
 815 {
 816     int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
 817     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
 818     return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
 819 }
 820
 821 // takes the lower two 32-bit values from a and swaps them and places in low end
 822 // of result takes the higher two 32 bit values from a and swaps them and places
 823 // in high end of result.
 824 FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
 825 {
 826     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
 827     int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
 828     return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
 829 }
 830
 831 // rotates the least significant 32 bits into the most significant 32 bits, and
 832 // shifts the rest down
 833 FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
 834 {
 835     return vreinterpretq_m128i_s32(
 836         vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
 837 }
 838
 839 // rotates the most significant 32 bits into the least significant 32 bits, and
 840 // shifts the rest up
 841 FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
 842 {
 843     return vreinterpretq_m128i_s32(
 844         vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
 845 }
 846
 847 // gets the lower 64 bits of a, and places it in the upper 64 bits
 848 // gets the lower 64 bits of a and places it in the lower 64 bits
 849 FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
 850 {
 851     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
 852     return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
 853 }
 854
 855 // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
 856 // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
 857 FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
 858 {
 859     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
 860     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
 861     return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
 862 }
 863
 864 // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
 865 // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
 866 // places it in the lower 64 bits
 867 FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
 868 {
 869     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
 870     return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
 871 }
 872
 873 FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
 874 {
 875     int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
 876     int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
 877     return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
 878 }
 879
 880 FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
 881 {
 882     int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
 883     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
 884     return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
 885 }
 886
 887 FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
 888 {
 889     int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
 890     int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
 891     return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
 892 }
 893
 894 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
 895 // int imm)
 896 #if defined(__aarch64__)
 897 #define _mm_shuffle_epi32_splat(a, imm)                          \
 898     __extension__({                                              \
 899         vreinterpretq_m128i_s32(                                 \
 900             vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
 901     })
 902 #else
 903 #define _mm_shuffle_epi32_splat(a, imm)                                      \
 904     __extension__({                                                          \
 905         vreinterpretq_m128i_s32(                                             \
 906             vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
 907     })
 908 #endif
 909
 910 // NEON does not support a general purpose permute intrinsic
 911 // Selects four specific single-precision, floating-point values from a and b,
 912 // based on the mask i.
 913 //
 914 // C equivalent:
 915 //   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
 916 //                                 __constrange(0, 255) int imm) {
 917 //       __m128 ret;
 918 //       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
 919 //       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
 920 //       return ret;
 921 //   }
 922 //
 923 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
 924 #define _mm_shuffle_ps_default(a, b, imm)                                  \
 925     __extension__({                                                        \
 926         float32x4_t ret;                                                   \
 927         ret = vmovq_n_f32(                                                 \
 928             vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
 929         ret = vsetq_lane_f32(                                              \
 930             vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
 931             ret, 1);                                                       \
 932         ret = vsetq_lane_f32(                                              \
 933             vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
 934             ret, 2);                                                       \
 935         ret = vsetq_lane_f32(                                              \
 936             vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
 937             ret, 3);                                                       \
 938         vreinterpretq_m128_f32(ret);                                       \
 939     })
 940
// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
//
// The result starts as a copy of a. Lanes 0..3 are then overwritten, lane j
// receiving the lane of the original low half selected by bits [2j+1:2j] of
// imm. Lanes 4..7 are left as they were in a.
// The low half is captured in lowBits before any lane is overwritten, so
// every selection reads the original values. a is expanded once.
#define _mm_shufflelo_epi16_function(a, imm)                                  \
    __extension__({                                                           \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        vreinterpretq_m128i_s16(ret);                                         \
    })
 960
// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
//
// The result starts as a copy of a. Lanes 4..7 are then overwritten, lane
// 4+j receiving the lane of the original high half selected by bits
// [2j+1:2j] of imm. Lanes 0..3 are left as they were in a.
// The high half is captured in highBits before any lane is overwritten, so
// every selection reads the original values. a is expanded once.
#define _mm_shufflehi_epi16_function(a, imm)                                   \
    __extension__({                                                            \
        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        vreinterpretq_m128i_s16(ret);                                          \
    })
 980
 981 /* MMX */
 982
 983 //_mm_empty is a no-op on arm
 984 FORCE_INLINE void _mm_empty(void) {}
 985
 986 /* SSE */
 987
 988 // Adds the four single-precision, floating-point values of a and b.
 989 //
 990 //   r0 := a0 + b0
 991 //   r1 := a1 + b1
 992 //   r2 := a2 + b2
 993 //   r3 := a3 + b3
 994 //
 995 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
 996 FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
 997 {
 998     return vreinterpretq_m128_f32(
 999         vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1000 }
1001
1002 // adds the scalar single-precision floating point values of a and b.
1003 // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
1004 FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
1005 {
1006     float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
1007     float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1008     // the upper values in the result must be the remnants of <a>.
1009     return vreinterpretq_m128_f32(vaddq_f32(a, value));
1010 }
1011
1012 // Computes the bitwise AND of the four single-precision, floating-point values
1013 // of a and b.
1014 //
1015 //   r0 := a0 & b0
1016 //   r1 := a1 & b1
1017 //   r2 := a2 & b2
1018 //   r3 := a3 & b3
1019 //
1020 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
1021 FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1022 {
1023     return vreinterpretq_m128_s32(
1024         vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1025 }
1026
1027 // Computes the bitwise AND-NOT of the four single-precision, floating-point
1028 // values of a and b.
1029 //
1030 //   r0 := ~a0 & b0
1031 //   r1 := ~a1 & b1
1032 //   r2 := ~a2 & b2
1033 //   r3 := ~a3 & b3
1034 //
1035 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
1036 FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1037 {
1038     return vreinterpretq_m128_s32(
1039         vbicq_s32(vreinterpretq_s32_m128(b),
1040                   vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
1041 }
1042
1043 // Average packed unsigned 16-bit integers in a and b, and store the results in
1044 // dst.
1045 //
1046 //   FOR j := 0 to 3
1047 //     i := j*16
1048 //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
1049 //   ENDFOR
1050 //
1051 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
1052 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1053 {
1054     return vreinterpret_m64_u16(
1055         vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1056 }
1057
1058 // Average packed unsigned 8-bit integers in a and b, and store the results in
1059 // dst.
1060 //
1061 //   FOR j := 0 to 7
1062 //     i := j*8
1063 //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
1064 //   ENDFOR
1065 //
1066 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
1067 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1068 {
1069     return vreinterpret_m64_u8(
1070         vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1071 }
1072
1073 // Compares for equality.
1074 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
1075 FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
1076 {
1077     return vreinterpretq_m128_u32(
1078         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1079 }
1080
1081 // Compares for equality.
1082 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
1083 FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
1084 {
1085     return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1086 }
1087
1088 // Compares for greater than or equal.
1089 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
1090 FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
1091 {
1092     return vreinterpretq_m128_u32(
1093         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1094 }
1095
1096 // Compares for greater than or equal.
1097 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
1098 FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
1099 {
1100     return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1101 }
1102
1103 // Compares for greater than.
1104 //
1105 //   r0 := (a0 > b0) ? 0xffffffff : 0x0
1106 //   r1 := (a1 > b1) ? 0xffffffff : 0x0
1107 //   r2 := (a2 > b2) ? 0xffffffff : 0x0
1108 //   r3 := (a3 > b3) ? 0xffffffff : 0x0
1109 //
1110 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
1111 FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
1112 {
1113     return vreinterpretq_m128_u32(
1114         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1115 }
1116
1117 // Compares for greater than.
1118 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
1119 FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
1120 {
1121     return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1122 }
1123
1124 // Compares for less than or equal.
1125 //
1126 //   r0 := (a0 <= b0) ? 0xffffffff : 0x0
1127 //   r1 := (a1 <= b1) ? 0xffffffff : 0x0
1128 //   r2 := (a2 <= b2) ? 0xffffffff : 0x0
1129 //   r3 := (a3 <= b3) ? 0xffffffff : 0x0
1130 //
1131 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
1132 FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
1133 {
1134     return vreinterpretq_m128_u32(
1135         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1136 }
1137
1138 // Compares for less than or equal.
1139 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
1140 FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
1141 {
1142     return _mm_move_ss(a, _mm_cmple_ps(a, b));
1143 }
1144
1145 // Compares for less than
1146 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
1147 FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
1148 {
1149     return vreinterpretq_m128_u32(
1150         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1151 }
1152
1153 // Compares for less than
1154 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
1155 FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
1156 {
1157     return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1158 }
1159
1160 // Compares for inequality.
1161 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
1162 FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
1163 {
1164     return vreinterpretq_m128_u32(vmvnq_u32(
1165         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1166 }
1167
1168 // Compares for inequality.
1169 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
1170 FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
1171 {
1172     return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1173 }
1174
1175 // Compares for not greater than or equal.
1176 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
1177 FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
1178 {
1179     return vreinterpretq_m128_u32(vmvnq_u32(
1180         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1181 }
1182
1183 // Compares for not greater than or equal.
1184 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
1185 FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
1186 {
1187     return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
1188 }
1189
1190 // Compares for not greater than.
1191 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
1192 FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
1193 {
1194     return vreinterpretq_m128_u32(vmvnq_u32(
1195         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1196 }
1197
1198 // Compares for not greater than.
1199 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1200 FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
1201 {
1202     return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
1203 }
1204
1205 // Compares for not less than or equal.
1206 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
1207 FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
1208 {
1209     return vreinterpretq_m128_u32(vmvnq_u32(
1210         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1211 }
1212
1213 // Compares for not less than or equal.
1214 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1215 FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
1216 {
1217     return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
1218 }
1219
1220 // Compares for not less than.
1221 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
1222 FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
1223 {
1224     return vreinterpretq_m128_u32(vmvnq_u32(
1225         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1226 }
1227
1228 // Compares for not less than.
1229 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
1230 FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
1231 {
1232     return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
1233 }
1234
1235 // Compares the four 32-bit floats in a and b to check if any values are NaN.
1236 // Ordered compare between each value returns true for "orderable" and false for
1237 // "not orderable" (NaN).
1238 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
1239 // also:
1240 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1241 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1242 FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
1243 {
1244     // Note: NEON does not have ordered compare builtin
1245     // Need to compare a eq a and b eq b to check for NaN
1246     // Do AND of results to get final
1247     uint32x4_t ceqaa =
1248         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1249     uint32x4_t ceqbb =
1250         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1251     return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1252 }
1253
1254 // Compares for ordered.
1255 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
1256 FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
1257 {
1258     return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1259 }
1260
1261 // Compares for unordered.
1262 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
1263 FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
1264 {
1265     uint32x4_t f32a =
1266         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1267     uint32x4_t f32b =
1268         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1269     return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1270 }
1271
1272 // Compares for unordered.
1273 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
1274 FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
1275 {
1276     return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1277 }
1278
1279 // Compares the lower single-precision floating point scalar values of a and b
1280 // using an equality operation. :
1281 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
1282 FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1283 {
1284     uint32x4_t a_eq_b =
1285         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1286     return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1287 }
1288
1289 // Compares the lower single-precision floating point scalar values of a and b
1290 // using a greater than or equal operation. :
1291 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
1292 FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1293 {
1294     uint32x4_t a_ge_b =
1295         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1296     return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1297 }
1298
1299 // Compares the lower single-precision floating point scalar values of a and b
1300 // using a greater than operation. :
1301 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
1302 FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1303 {
1304     uint32x4_t a_gt_b =
1305         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1306     return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1307 }
1308
1309 // Compares the lower single-precision floating point scalar values of a and b
1310 // using a less than or equal operation. :
1311 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
1312 FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1313 {
1314     uint32x4_t a_le_b =
1315         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1316     return vgetq_lane_u32(a_le_b, 0) & 0x1;
1317 }
1318
1319 // Compares the lower single-precision floating point scalar values of a and b
1320 // using a less than operation. :
1321 // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
1322 // note!! The documentation on MSDN is incorrect!  If either of the values is a
1323 // NAN the docs say you will get a one, but in fact, it will return a zero!!
1324 FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1325 {
1326     uint32x4_t a_lt_b =
1327         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1328     return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1329 }
1330
1331 // Compares the lower single-precision floating point scalar values of a and b
1332 // using an inequality operation. :
1333 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
1334 FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1335 {
1336     return !_mm_comieq_ss(a, b);
1337 }
1338
1339 // Convert packed signed 32-bit integers in b to packed single-precision
1340 // (32-bit) floating-point elements, store the results in the lower 2 elements
1341 // of dst, and copy the upper 2 packed elements from a to the upper elements of
1342 // dst.
1343 //
1344 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1345 //   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1346 //   dst[95:64] := a[95:64]
1347 //   dst[127:96] := a[127:96]
1348 //
1349 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
1350 FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1351 {
1352     return vreinterpretq_m128_f32(
1353         vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1354                      vget_high_f32(vreinterpretq_f32_m128(a))));
1355 }
1356
1357 // Convert packed single-precision (32-bit) floating-point elements in a to
1358 // packed 32-bit integers, and store the results in dst.
1359 //
1360 //   FOR j := 0 to 1
1361 //       i := 32*j
1362 //       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1363 //   ENDFOR
1364 //
1365 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
1366 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
1367 {
1368 #if defined(__aarch64__)
1369     return vreinterpret_m64_s32(
1370         vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1371 #else
1372     return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1373         vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
1374 #endif
1375 }
1376
1377 // Convert the signed 32-bit integer b to a single-precision (32-bit)
1378 // floating-point element, store the result in the lower element of dst, and
1379 // copy the upper 3 packed elements from a to the upper elements of dst.
1380 //
1381 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1382 //   dst[127:32] := a[127:32]
1383 //
1384 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
1385 FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
1386 {
1387     return vreinterpretq_m128_f32(
1388         vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1389 }
1390
1391 // Convert the lower single-precision (32-bit) floating-point element in a to a
1392 // 32-bit integer, and store the result in dst.
1393 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
1394 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
1395 {
1396 #if defined(__aarch64__)
1397     return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1398                           0);
1399 #else
1400     float32_t data = vgetq_lane_f32(
1401         vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1402     return (int32_t) data;
1403 #endif
1404 }
1405
1406 // Convert packed 16-bit integers in a to packed single-precision (32-bit)
1407 // floating-point elements, and store the results in dst.
1408 //
1409 //   FOR j := 0 to 3
1410 //      i := j*16
1411 //      m := j*32
1412 //      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
1413 //   ENDFOR
1414 //
1415 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
1416 FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1417 {
1418     return vreinterpretq_m128_f32(
1419         vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1420 }
1421
1422 // Convert packed 32-bit integers in b to packed single-precision (32-bit)
1423 // floating-point elements, store the results in the lower 2 elements of dst,
1424 // and copy the upper 2 packed elements from a to the upper elements of dst.
1425 //
1426 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1427 //   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1428 //   dst[95:64] := a[95:64]
1429 //   dst[127:96] := a[127:96]
1430 //
1431 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
1432 FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1433 {
1434     return vreinterpretq_m128_f32(
1435         vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1436                      vget_high_f32(vreinterpretq_f32_m128(a))));
1437 }
1438
1439 // Convert packed signed 32-bit integers in a to packed single-precision
1440 // (32-bit) floating-point elements, store the results in the lower 2 elements
1441 // of dst, then covert the packed signed 32-bit integers in b to
1442 // single-precision (32-bit) floating-point element, and store the results in
1443 // the upper 2 elements of dst.
1444 //
1445 //   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
1446 //   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
1447 //   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
1448 //   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
1449 //
1450 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
1451 FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1452 {
1453     return vreinterpretq_m128_f32(vcvtq_f32_s32(
1454         vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1455 }
1456
1457 // Convert the lower packed 8-bit integers in a to packed single-precision
1458 // (32-bit) floating-point elements, and store the results in dst.
1459 //
1460 //   FOR j := 0 to 3
1461 //      i := j*8
1462 //      m := j*32
1463 //      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
1464 //   ENDFOR
1465 //
1466 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
1467 FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1468 {
1469     return vreinterpretq_m128_f32(vcvtq_f32_s32(
1470         vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1471 }
1472
1473 // Convert packed single-precision (32-bit) floating-point elements in a to
1474 // packed 16-bit integers, and store the results in dst. Note: this intrinsic
1475 // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
1476 // 0x7FFFFFFF.
1477 //
1478 //   FOR j := 0 to 3
1479 //     i := 16*j
1480 //     k := 32*j
1481 //     IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
1482 //       dst[i+15:i] := 0x7FFF
1483 //     ELSE
1484 //       dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
1485 //     FI
1486 //   ENDFOR
1487 //
1488 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
1489 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
1490 {
1491     const __m128 i16Min = _mm_set_ps1((float) INT16_MIN);
1492     const __m128 i16Max = _mm_set_ps1((float) INT16_MAX);
1493     const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
1494     const __m128i maxMask = _mm_castps_si128(
1495         _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max)));
1496     const __m128i betweenMask = _mm_castps_si128(
1497         _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max)));
1498     const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1499                                             _mm_setzero_si128());
1500     __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX));
1501     __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN));
1502     __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1503     __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1504     return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32)));
1505 }
1506
// Convert the two lowest packed single-precision (32-bit) floating-point
// elements in a to packed 32-bit integers. Alias for _mm_cvt_ps2pi.
//
//   FOR j := 0 to 1
//       i := 32*j
//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
#define _mm_cvtps_pi32(v) _mm_cvt_ps2pi(v)
1517
1518 // Convert packed single-precision (32-bit) floating-point elements in a to
1519 // packed 8-bit integers, and store the results in lower 4 elements of dst.
1520 // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1521 // between 0x7F and 0x7FFFFFFF.
1522 //
1523 //   FOR j := 0 to 3
1524 //     i := 8*j
1525 //     k := 32*j
1526 //     IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
1527 //       dst[i+7:i] := 0x7F
1528 //     ELSE
1529 //       dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
1530 //     FI
1531 //   ENDFOR
1532 //
1533 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
1534 FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
1535 {
1536     const __m128 i8Min = _mm_set_ps1((float) INT8_MIN);
1537     const __m128 i8Max = _mm_set_ps1((float) INT8_MAX);
1538     const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
1539     const __m128i maxMask = _mm_castps_si128(
1540         _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max)));
1541     const __m128i betweenMask = _mm_castps_si128(
1542         _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max)));
1543     const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1544                                             _mm_setzero_si128());
1545     __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX));
1546     __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN));
1547     __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1548     __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1549     int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32));
1550     int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
1551     static const uint32_t bitMask[2] = {0xFFFFFFFF, 0};
1552     int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
1553
1554     return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0)));
1555 }
1556
1557 // Convert packed unsigned 16-bit integers in a to packed single-precision
1558 // (32-bit) floating-point elements, and store the results in dst.
1559 //
1560 //   FOR j := 0 to 3
1561 //      i := j*16
1562 //      m := j*32
1563 //      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
1564 //   ENDFOR
1565 //
1566 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
1567 FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1568 {
1569     return vreinterpretq_m128_f32(
1570         vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1571 }
1572
1573 // Convert the lower packed unsigned 8-bit integers in a to packed
1574 // single-precision (32-bit) floating-point elements, and store the results in
1575 // dst.
1576 //
1577 //   FOR j := 0 to 3
1578 //      i := j*8
1579 //      m := j*32
1580 //      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
1581 //   ENDFOR
1582 //
1583 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
1584 FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1585 {
1586     return vreinterpretq_m128_f32(vcvtq_f32_u32(
1587         vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1588 }
1589
// Convert the signed 32-bit integer n to single precision, place it in the
// lowest element of dst, and take the remaining three elements from v.
// Alias for _mm_cvt_si2ss.
//
//   dst[31:0] := Convert_Int32_To_FP32(n[31:0])
//   dst[127:32] := v[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
#define _mm_cvtsi32_ss(v, n) _mm_cvt_si2ss(v, n)
1599
1600 // Convert the signed 64-bit integer b to a single-precision (32-bit)
1601 // floating-point element, store the result in the lower element of dst, and
1602 // copy the upper 3 packed elements from a to the upper elements of dst.
1603 //
1604 //   dst[31:0] := Convert_Int64_To_FP32(b[63:0])
1605 //   dst[127:32] := a[127:32]
1606 //
1607 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
1608 FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1609 {
1610     return vreinterpretq_m128_f32(
1611         vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1612 }
1613
1614 // Copy the lower single-precision (32-bit) floating-point element of a to dst.
1615 //
1616 //   dst[31:0] := a[31:0]
1617 //
1618 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
1619 FORCE_INLINE float _mm_cvtss_f32(__m128 a)
1620 {
1621     return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1622 }
1623
// Convert the lowest single-precision (32-bit) floating-point element of v to
// a 32-bit integer. Alias for _mm_cvt_ss2si.
//
//   dst[31:0] := Convert_FP32_To_Int32(v[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
#define _mm_cvtss_si32(v) _mm_cvt_ss2si(v)
1631
1632 // Convert the lower single-precision (32-bit) floating-point element in a to a
1633 // 64-bit integer, and store the result in dst.
1634 //
1635 //   dst[63:0] := Convert_FP32_To_Int64(a[31:0])
1636 //
1637 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
1638 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1639 {
1640 #if defined(__aarch64__)
1641     return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1642 #else
1643     float32_t data = vgetq_lane_f32(
1644         vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1645     return (int64_t) data;
1646 #endif
1647 }
1648
1649 // Convert packed single-precision (32-bit) floating-point elements in a to
1650 // packed 32-bit integers with truncation, and store the results in dst.
1651 //
1652 //   FOR j := 0 to 1
1653 //      i := 32*j
1654 //      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1655 //   ENDFOR
1656 //
1657 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
1658 FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1659 {
1660     return vreinterpret_m64_s32(
1661         vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1662 }
1663
1664 // Convert the lower single-precision (32-bit) floating-point element in a to a
1665 // 32-bit integer with truncation, and store the result in dst.
1666 //
1667 //   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1668 //
1669 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
1670 FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
1671 {
1672     return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1673 }
1674
// Convert the two lowest packed single-precision (32-bit) floating-point
// elements in v to packed 32-bit integers with truncation. Alias for
// _mm_cvtt_ps2pi.
//
//   FOR j := 0 to 1
//      i := 32*j
//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(v[i+31:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
#define _mm_cvttps_pi32(v) _mm_cvtt_ps2pi(v)
1685
// Convert the lowest single-precision (32-bit) floating-point element of v to
// a 32-bit integer with truncation. Alias for _mm_cvtt_ss2si.
//
//   dst[31:0] := Convert_FP32_To_Int32_Truncate(v[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
#define _mm_cvttss_si32(v) _mm_cvtt_ss2si(v)
1693
1694 // Convert the lower single-precision (32-bit) floating-point element in a to a
1695 // 64-bit integer with truncation, and store the result in dst.
1696 //
1697 //   dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
1698 //
1699 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
1700 FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1701 {
1702     return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1703 }
1704
1705 // Divides the four single-precision, floating-point values of a and b.
1706 //
1707 //   r0 := a0 / b0
1708 //   r1 := a1 / b1
1709 //   r2 := a2 / b2
1710 //   r3 := a3 / b3
1711 //
1712 // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
1713 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1714 {
1715 #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
1716     return vreinterpretq_m128_f32(
1717         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1718 #else
1719     float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1720     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1721 #if SSE2NEON_PRECISE_DIV
1722     // Additional Netwon-Raphson iteration for accuracy
1723     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1724 #endif
1725     return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1726 #endif
1727 }
1728
1729 // Divides the scalar single-precision floating point value of a by b.
1730 // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
1731 FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1732 {
1733     float32_t value =
1734         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1735     return vreinterpretq_m128_f32(
1736         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1737 }
1738
// Extract the 16-bit integer selected by imm from a and return it
// zero-extended as an int32_t. The lane is read as unsigned 16-bit, and imm is
// passed to vget_lane_u16 as the lane index. The whole expansion is
// parenthesized so it behaves as a single primary expression at the call site.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
#define _mm_extract_pi16(a, imm) \
    ((int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)))
1744
1745 // Macro: Get the flush zero bits from the MXCSR control and status register.
1746 // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
1747 // _MM_FLUSH_ZERO_OFF
1748 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
1749 FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
1750 {
1751     union {
1752         fpcr_bitfield field;
1753 #if defined(__aarch64__)
1754         uint64_t value;
1755 #else
1756         uint32_t value;
1757 #endif
1758     } r;
1759
1760 #if defined(__aarch64__)
1761     asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
1762 #else
1763     asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1764 #endif
1765
1766     return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
1767 }
1768
1769 // Macro: Get the rounding mode bits from the MXCSR control and status register.
1770 // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
1771 // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
1772 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
1773 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
1774 {
1775     union {
1776         fpcr_bitfield field;
1777 #if defined(__aarch64__)
1778         uint64_t value;
1779 #else
1780         uint32_t value;
1781 #endif
1782     } r;
1783
1784 #if defined(__aarch64__)
1785     asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
1786 #else
1787     asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1788 #endif
1789
1790     if (r.field.bit22) {
1791         return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
1792     } else {
1793         return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
1794     }
1795 }
1796
// Return a copy of v in which the 16-bit lane selected by idx has been
// replaced with the 16-bit integer x.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
#define _mm_insert_pi16(v, x, idx)                               \
    __extension__({                                              \
        vreinterpret_m64_s16(                                    \
            vset_lane_s16((x), vreinterpret_s16_m64(v), (idx))); \
    })
1805
1806 // Loads four single-precision, floating-point values.
1807 // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
1808 FORCE_INLINE __m128 _mm_load_ps(const float *p)
1809 {
1810     return vreinterpretq_m128_f32(vld1q_f32(p));
1811 }
1812
// Load one single-precision (32-bit) floating-point element from memory and
// replicate it into all four elements of dst.
//
//   dst[31:0] := MEM[mem_addr+31:mem_addr]
//   dst[63:32] := MEM[mem_addr+31:mem_addr]
//   dst[95:64] := MEM[mem_addr+31:mem_addr]
//   dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// Object-like alias: every use of _mm_load_ps1 expands to _mm_load1_ps.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
#define _mm_load_ps1 _mm_load1_ps
1823
1824 // Loads an single - precision, floating - point value into the low word and
1825 // clears the upper three words.
1826 // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
1827 FORCE_INLINE __m128 _mm_load_ss(const float *p)
1828 {
1829     return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1830 }
1831
1832 // Loads a single single-precision, floating-point value, copying it into all
1833 // four words
1834 // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
1835 FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1836 {
1837     return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1838 }
1839
1840 // Sets the upper two single-precision, floating-point values with 64
1841 // bits of data loaded from the address p; the lower two values are passed
1842 // through from a.
1843 //
1844 //   r0 := a0
1845 //   r1 := a1
1846 //   r2 := *p0
1847 //   r3 := *p1
1848 //
1849 // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
1850 FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1851 {
1852     return vreinterpretq_m128_f32(
1853         vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1854 }
1855
1856 // Sets the lower two single-precision, floating-point values with 64
1857 // bits of data loaded from the address p; the upper two values are passed
1858 // through from a.
1859 //
1860 // Return Value
1861 //   r0 := *p0
1862 //   r1 := *p1
1863 //   r2 := a2
1864 //   r3 := a3
1865 //
1866 // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
1867 FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1868 {
1869     return vreinterpretq_m128_f32(
1870         vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1871 }
1872
1873 // Load 4 single-precision (32-bit) floating-point elements from memory into dst
1874 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1875 // general-protection exception may be generated.
1876 //
1877 //   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
1878 //   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
1879 //   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
1880 //   dst[127:96] := MEM[mem_addr+31:mem_addr]
1881 //
1882 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
1883 FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1884 {
1885     float32x4_t v = vrev64q_f32(vld1q_f32(p));
1886     return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1887 }
1888
1889 // Loads four single-precision, floating-point values.
1890 // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
1891 FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1892 {
1893     // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
1894     // equivalent for neon
1895     return vreinterpretq_m128_f32(vld1q_f32(p));
1896 }
1897
1898 // Load unaligned 16-bit integer from memory into the first element of dst.
1899 //
1900 //   dst[15:0] := MEM[mem_addr+15:mem_addr]
1901 //   dst[MAX:16] := 0
1902 //
1903 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
1904 FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1905 {
1906     return vreinterpretq_m128i_s16(
1907         vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1908 }
1909
1910 // Load unaligned 64-bit integer from memory into the first element of dst.
1911 //
1912 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
1913 //   dst[MAX:64] := 0
1914 //
1915 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
1916 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1917 {
1918     return vreinterpretq_m128i_s64(
1919         vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1920 }
1921
1922 // Conditionally store 8-bit integer elements from a into memory using mask
1923 // (elements are not stored when the highest bit is not set in the corresponding
1924 // element) and a non-temporal memory hint.
1925 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
1926 FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
1927 {
1928     int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
1929     __m128 b = _mm_load_ps((const float *) mem_addr);
1930     int8x8_t masked =
1931         vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
1932                 vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
1933     vst1_s8((int8_t *) mem_addr, masked);
1934 }
1935
// Conditionally store 8-bit integer elements from v into memory at dst using
// sel (elements are not stored when the highest bit is not set in the
// corresponding element). Alias for _mm_maskmove_si64.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
#define _m_maskmovq(v, sel, dst) _mm_maskmove_si64(v, sel, dst)
1941
1942 // Compare packed signed 16-bit integers in a and b, and store packed maximum
1943 // values in dst.
1944 //
1945 //   FOR j := 0 to 3
1946 //      i := j*16
1947 //      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
1948 //   ENDFOR
1949 //
1950 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
1951 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
1952 {
1953     return vreinterpret_m64_s16(
1954         vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
1955 }
1956
1957 // Computes the maximums of the four single-precision, floating-point values of
1958 // a and b.
1959 // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
1960 FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
1961 {
1962 #if SSE2NEON_PRECISE_MINMAX
1963     float32x4_t _a = vreinterpretq_f32_m128(a);
1964     float32x4_t _b = vreinterpretq_f32_m128(b);
1965     return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
1966 #else
1967     return vreinterpretq_m128_f32(
1968         vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1969 #endif
1970 }
1971
1972 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
1973 // values in dst.
1974 //
1975 //   FOR j := 0 to 7
1976 //      i := j*8
1977 //      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
1978 //   ENDFOR
1979 //
1980 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
1981 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
1982 {
1983     return vreinterpret_m64_u8(
1984         vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1985 }
1986
1987 // Computes the maximum of the two lower scalar single-precision floating point
1988 // values of a and b.
1989 // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
1990 FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
1991 {
1992     float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
1993     return vreinterpretq_m128_f32(
1994         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1995 }
1996
1997 // Compare packed signed 16-bit integers in a and b, and store packed minimum
1998 // values in dst.
1999 //
2000 //   FOR j := 0 to 3
2001 //      i := j*16
2002 //      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
2003 //   ENDFOR
2004 //
2005 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
2006 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
2007 {
2008     return vreinterpret_m64_s16(
2009         vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2010 }
2011
2012 // Computes the minima of the four single-precision, floating-point values of a
2013 // and b.
2014 // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
2015 FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2016 {
2017 #if SSE2NEON_PRECISE_MINMAX
2018     float32x4_t _a = vreinterpretq_f32_m128(a);
2019     float32x4_t _b = vreinterpretq_f32_m128(b);
2020     return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
2021 #else
2022     return vreinterpretq_m128_f32(
2023         vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2024 #endif
2025 }
2026
2027 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2028 // values in dst.
2029 //
2030 //   FOR j := 0 to 7
2031 //      i := j*8
2032 //      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
2033 //   ENDFOR
2034 //
2035 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
2036 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
2037 {
2038     return vreinterpret_m64_u8(
2039         vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2040 }
2041
2042 // Computes the minimum of the two lower scalar single-precision floating point
2043 // values of a and b.
2044 // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
2045 FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2046 {
2047     float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2048     return vreinterpretq_m128_f32(
2049         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2050 }
2051
2052 // Sets the low word to the single-precision, floating-point value of b
2053 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
2054 FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
2055 {
2056     return vreinterpretq_m128_f32(
2057         vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2058                        vreinterpretq_f32_m128(a), 0));
2059 }
2060
2061 // Moves the upper two values of B into the lower two values of A.
2062 //
2063 //   r3 := a3
2064 //   r2 := a2
2065 //   r1 := b3
2066 //   r0 := b2
2067 FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
2068 {
2069     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
2070     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
2071     return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2072 }
2073
2074 // Moves the lower two values of B into the upper two values of A.
2075 //
2076 //   r3 := b1
2077 //   r2 := b0
2078 //   r1 := a1
2079 //   r0 := a0
2080 FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2081 {
2082     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2083     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2084     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2085 }
2086
2087 // Create mask from the most significant bit of each 8-bit element in a, and
2088 // store the result in dst.
2089 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
2090 FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2091 {
2092     uint8x8_t input = vreinterpret_u8_m64(a);
2093 #if defined(__aarch64__)
2094     static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2095     uint8x8_t tmp = vshr_n_u8(input, 7);
2096     return vaddv_u8(vshl_u8(tmp, shift));
2097 #else
2098     // Refer the implementation of `_mm_movemask_epi8`
2099     uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2100     uint32x2_t paired16 =
2101         vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2102     uint8x8_t paired32 =
2103         vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2104     return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2105 #endif
2106 }
2107
2108 // NEON does not provide this method
2109 // Creates a 4-bit mask from the most significant bits of the four
2110 // single-precision, floating-point values.
2111 // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2112 FORCE_INLINE int _mm_movemask_ps(__m128 a)
2113 {
2114     uint32x4_t input = vreinterpretq_u32_m128(a);
2115 #if defined(__aarch64__)
2116     static const int32x4_t shift = {0, 1, 2, 3};
2117     uint32x4_t tmp = vshrq_n_u32(input, 31);
2118     return vaddvq_u32(vshlq_u32(tmp, shift));
2119 #else
2120     // Uses the exact same method as _mm_movemask_epi8, see that for details.
2121     // Shift out everything but the sign bits with a 32-bit unsigned shift
2122     // right.
2123     uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2124     // Merge the two pairs together with a 64-bit unsigned shift right + add.
2125     uint8x16_t paired =
2126         vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2127     // Extract the result.
2128     return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2129 #endif
2130 }
2131
2132 // Multiplies the four single-precision, floating-point values of a and b.
2133 //
2134 //   r0 := a0 * b0
2135 //   r1 := a1 * b1
2136 //   r2 := a2 * b2
2137 //   r3 := a3 * b3
2138 //
2139 // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
2140 FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2141 {
2142     return vreinterpretq_m128_f32(
2143         vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2144 }
2145
2146 // Multiply the lower single-precision (32-bit) floating-point element in a and
2147 // b, store the result in the lower element of dst, and copy the upper 3 packed
2148 // elements from a to the upper elements of dst.
2149 //
2150 //   dst[31:0] := a[31:0] * b[31:0]
2151 //   dst[127:32] := a[127:32]
2152 //
2153 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
2154 FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2155 {
2156     return _mm_move_ss(a, _mm_mul_ps(a, b));
2157 }
2158
2159 // Multiply the packed unsigned 16-bit integers in a and b, producing
2160 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
2161 // integers in dst.
2162 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
2163 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2164 {
2165     return vreinterpret_m64_u16(vshrn_n_u32(
2166         vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2167 }
2168
2169 // Computes the bitwise OR of the four single-precision, floating-point values
2170 // of a and b.
2171 // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
2172 FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2173 {
2174     return vreinterpretq_m128_s32(
2175         vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2176 }
2177
// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
//
//   FOR j := 0 to 7
//     i := j*8
//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
//   ENDFOR
//
// Alias: expands to _mm_avg_pu8(a, b), forwarding both arguments unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2188
// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
//
//   FOR j := 0 to 3
//     i := j*16
//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
//   ENDFOR
//
// Alias: expands to _mm_avg_pu16(a, b), forwarding both arguments unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2199
// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// Alias: expands to _mm_extract_pi16(a, imm), forwarding both arguments
// unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2204
// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// Alias: expands to _mm_insert_pi16(a, i, imm), forwarding all three arguments
// unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2209
// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
// Alias: expands to _mm_max_pi16(a, b), forwarding both arguments unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2214
// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
// Alias: expands to _mm_max_pu8(a, b), forwarding both arguments unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2219
// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
// Alias: expands to _mm_min_pi16(a, b), forwarding both arguments unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2224
// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
// Alias: expands to _mm_min_pu8(a, b), forwarding both arguments unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8(a, b)
2229
// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// Alias: expands to _mm_movemask_pi8(a), forwarding the argument unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2234
// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// Alias: expands to _mm_mulhi_pu16(a, b), forwarding both arguments unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2240
2241 // Loads one cache line of data from address p to a location closer to the
2242 // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
2243 FORCE_INLINE void _mm_prefetch(const void *p, int i)
2244 {
2245     (void) i;
2246     __builtin_prefetch(p);
2247 }
2248
// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// NOTE(review): the wording above is carried over from the Intel guide; with
// 64-bit operands there are only 8 byte differences, i.e. a single sum --
// confirm against _mm_sad_pu8.
// Alias: expands to _mm_sad_pu8(a, b), forwarding both arguments unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2255
// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// Alias: expands to _mm_shuffle_pi16(a, imm), forwarding both arguments
// unchanged.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2260
2261 // Compute the approximate reciprocal of packed single-precision (32-bit)
2262 // floating-point elements in a, and store the results in dst. The maximum
2263 // relative error for this approximation is less than 1.5*2^-12.
2264 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
2265 FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2266 {
2267     float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2268     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2269 #if SSE2NEON_PRECISE_DIV
2270     // Additional Netwon-Raphson iteration for accuracy
2271     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2272 #endif
2273     return vreinterpretq_m128_f32(recip);
2274 }
2275
2276 // Compute the approximate reciprocal of the lower single-precision (32-bit)
2277 // floating-point element in a, store the result in the lower element of dst,
2278 // and copy the upper 3 packed elements from a to the upper elements of dst. The
2279 // maximum relative error for this approximation is less than 1.5*2^-12.
2280 //
2281 //   dst[31:0] := (1.0 / a[31:0])
2282 //   dst[127:32] := a[127:32]
2283 //
2284 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
2285 FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2286 {
2287     return _mm_move_ss(a, _mm_rcp_ps(a));
2288 }
2289
2290 // Computes the approximations of the reciprocal square roots of the four
2291 // single-precision floating point values of in.
2292 // The current precision is 1% error.
2293 // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
2294 FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2295 {
2296     float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2297 #if SSE2NEON_PRECISE_SQRT
2298     // Additional Netwon-Raphson iteration for accuracy
2299     out = vmulq_f32(
2300         out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2301     out = vmulq_f32(
2302         out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2303 #endif
2304     return vreinterpretq_m128_f32(out);
2305 }
2306
2307 // Compute the approximate reciprocal square root of the lower single-precision
2308 // (32-bit) floating-point element in a, store the result in the lower element
2309 // of dst, and copy the upper 3 packed elements from a to the upper elements of
2310 // dst.
2311 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
2312 FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2313 {
2314     return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2315 }
2316
2317 // Compute the absolute differences of packed unsigned 8-bit integers in a and
2318 // b, then horizontally sum each consecutive 8 differences to produce four
2319 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2320 // 16 bits of dst.
2321 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
2322 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2323 {
2324     uint64x1_t t = vpaddl_u32(vpaddl_u16(
2325         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2326     return vreinterpret_m64_u16(
2327         vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2328 }
2329
2330 // Macro: Set the flush zero bits of the MXCSR control and status register to
2331 // the value in unsigned 32-bit integer a. The flush zero may contain any of the
2332 // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
2333 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
2334 FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
2335 {
2336     // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
2337     // regardless of the value of the FZ bit.
2338     union {
2339         fpcr_bitfield field;
2340 #if defined(__aarch64__)
2341         uint64_t value;
2342 #else
2343         uint32_t value;
2344 #endif
2345     } r;
2346
2347 #if defined(__aarch64__)
2348     asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
2349 #else
2350     asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2351 #endif
2352
2353     r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
2354
2355 #if defined(__aarch64__)
2356     asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
2357 #else
2358     asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
2359 #endif
2360 }
2361
2362 // Sets the four single-precision, floating-point values to the four inputs.
2363 // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
2364 FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2365 {
2366     float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2367     return vreinterpretq_m128_f32(vld1q_f32(data));
2368 }
2369
2370 // Sets the four single-precision, floating-point values to w.
2371 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2372 FORCE_INLINE __m128 _mm_set_ps1(float _w)
2373 {
2374     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2375 }
2376
2377 // Macro: Set the rounding mode bits of the MXCSR control and status register to
2378 // the value in unsigned 32-bit integer a. The rounding mode may contain any of
2379 // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
2380 // _MM_ROUND_TOWARD_ZERO
2381 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
2382 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
2383 {
2384     union {
2385         fpcr_bitfield field;
2386 #if defined(__aarch64__)
2387         uint64_t value;
2388 #else
2389         uint32_t value;
2390 #endif
2391     } r;
2392
2393 #if defined(__aarch64__)
2394     asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
2395 #else
2396     asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2397 #endif
2398
2399     switch (rounding) {
2400     case _MM_ROUND_TOWARD_ZERO:
2401         r.field.bit22 = 1;
2402         r.field.bit23 = 1;
2403         break;
2404     case _MM_ROUND_DOWN:
2405         r.field.bit22 = 0;
2406         r.field.bit23 = 1;
2407         break;
2408     case _MM_ROUND_UP:
2409         r.field.bit22 = 1;
2410         r.field.bit23 = 0;
2411         break;
2412     default:  //_MM_ROUND_NEAREST
2413         r.field.bit22 = 0;
2414         r.field.bit23 = 0;
2415     }
2416
2417 #if defined(__aarch64__)
2418     asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
2419 #else
2420     asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
2421 #endif
2422 }
2423
2424 // Copy single-precision (32-bit) floating-point element a to the lower element
2425 // of dst, and zero the upper 3 elements.
2426 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
2427 FORCE_INLINE __m128 _mm_set_ss(float a)
2428 {
2429     float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
2430     return vreinterpretq_m128_f32(vld1q_f32(data));
2431 }
2432
2433 // Sets the four single-precision, floating-point values to w.
2434 //
2435 //   r0 := r1 := r2 := r3 := w
2436 //
2437 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2438 FORCE_INLINE __m128 _mm_set1_ps(float _w)
2439 {
2440     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2441 }
2442
2443 // FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2444 FORCE_INLINE void _mm_setcsr(unsigned int a)
2445 {
2446     _MM_SET_ROUNDING_MODE(a);
2447 }
2448
2449 // FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2450 FORCE_INLINE unsigned int _mm_getcsr()
2451 {
2452     return _MM_GET_ROUNDING_MODE();
2453 }
2454
2455 // Sets the four single-precision, floating-point values to the four inputs in
2456 // reverse order.
2457 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
2458 FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2459 {
2460     float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2461     return vreinterpretq_m128_f32(vld1q_f32(data));
2462 }
2463
2464 // Clears the four single-precision, floating-point values.
2465 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
2466 FORCE_INLINE __m128 _mm_setzero_ps(void)
2467 {
2468     return vreinterpretq_m128_f32(vdupq_n_f32(0));
2469 }
2470
// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst. Result lane k takes the source lane selected by bits [2k+1:2k] of
// imm.
//
// imm is parenthesized at every use in both variants, so an expression
// argument such as (x | y) is decoded as a whole instead of being split by
// operator precedence.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
#if __has_builtin(__builtin_shufflevector)
#define _mm_shuffle_pi16(a, imm)                                       \
    __extension__({                                                    \
        vreinterpret_m64_s16(__builtin_shufflevector(                  \
            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a),          \
            ((imm) & 0x3), (((imm) >> 2) & 0x3), (((imm) >> 4) & 0x3), \
            (((imm) >> 6) & 0x3)));                                    \
    })
#else
#define _mm_shuffle_pi16(a, imm)                                               \
    __extension__({                                                            \
        int16x4_t ret;                                                         \
        ret =                                                                  \
            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
        ret = vset_lane_s16(                                                   \
            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
            1);                                                                \
        ret = vset_lane_s16(                                                   \
            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
            2);                                                                \
        ret = vset_lane_s16(                                                   \
            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
            3);                                                                \
        vreinterpret_m64_s16(ret);                                             \
    })
#endif
2499
// Guarantees that every preceding store is globally visible before any
// subsequent store.
// Implemented with the compiler's __sync_synchronize() builtin.
// NOTE(review): that builtin is documented as a full memory barrier, i.e.
// stronger than the store-only ordering described above -- confirm this is
// intended.
// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
FORCE_INLINE void _mm_sfence(void)
{
    __sync_synchronize();
}
2507
// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
// int imm)
//
// Select four single-precision lanes according to imm: the two low result
// lanes are picked from a (imm bits [1:0] and [3:2]) and the two high result
// lanes from b (imm bits [5:4] and [7:6]).
//
// With __builtin_shufflevector the selection is expressed directly. Otherwise
// imm is dispatched to a dedicated helper for the listed patterns and to
// _mm_shuffle_ps_default for everything else.
#if __has_builtin(__builtin_shufflevector)
#define _mm_shuffle_ps(a, b, imm)                                    \
    __extension__({                                                  \
        float32x4_t _src_lo = vreinterpretq_f32_m128(a);             \
        float32x4_t _src_hi = vreinterpretq_f32_m128(b);             \
        float32x4_t _picked = __builtin_shufflevector(               \
            _src_lo, _src_hi, (imm) & (0x3), ((imm) >> 2) & 0x3,     \
            (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4);     \
        vreinterpretq_m128_f32(_picked);                             \
    })
#else  // generic
#define _mm_shuffle_ps(a, b, imm)                                    \
    __extension__({                                                  \
        __m128 _shuffled;                                            \
        switch (imm) {                                               \
        case _MM_SHUFFLE(1, 0, 3, 2):                                \
            _shuffled = _mm_shuffle_ps_1032((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(2, 3, 0, 1):                                \
            _shuffled = _mm_shuffle_ps_2301((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(0, 3, 2, 1):                                \
            _shuffled = _mm_shuffle_ps_0321((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(2, 1, 0, 3):                                \
            _shuffled = _mm_shuffle_ps_2103((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(1, 0, 1, 0):                                \
            _shuffled = _mm_movelh_ps((a), (b));                     \
            break;                                                   \
        case _MM_SHUFFLE(1, 0, 0, 1):                                \
            _shuffled = _mm_shuffle_ps_1001((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(0, 1, 0, 1):                                \
            _shuffled = _mm_shuffle_ps_0101((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(3, 2, 1, 0):                                \
            _shuffled = _mm_shuffle_ps_3210((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(0, 0, 1, 1):                                \
            _shuffled = _mm_shuffle_ps_0011((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(0, 0, 2, 2):                                \
            _shuffled = _mm_shuffle_ps_0022((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(2, 2, 0, 0):                                \
            _shuffled = _mm_shuffle_ps_2200((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(3, 2, 0, 2):                                \
            _shuffled = _mm_shuffle_ps_3202((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(3, 2, 3, 2):                                \
            _shuffled = _mm_movehl_ps((b), (a));                     \
            break;                                                   \
        case _MM_SHUFFLE(1, 1, 3, 3):                                \
            _shuffled = _mm_shuffle_ps_1133((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(2, 0, 1, 0):                                \
            _shuffled = _mm_shuffle_ps_2010((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(2, 0, 0, 1):                                \
            _shuffled = _mm_shuffle_ps_2001((a), (b));               \
            break;                                                   \
        case _MM_SHUFFLE(2, 0, 3, 2):                                \
            _shuffled = _mm_shuffle_ps_2032((a), (b));               \
            break;                                                   \
        default:                                                     \
            _shuffled = _mm_shuffle_ps_default((a), (b), (imm));     \
            break;                                                   \
        }                                                            \
        _shuffled;                                                   \
    })
#endif
2583
2584 // Computes the approximations of square roots of the four single-precision,
2585 // floating-point values of a. First computes reciprocal square roots and then
2586 // reciprocals of the four values.
2587 //
2588 //   r0 := sqrt(a0)
2589 //   r1 := sqrt(a1)
2590 //   r2 := sqrt(a2)
2591 //   r3 := sqrt(a3)
2592 //
2593 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
2594 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2595 {
2596 #if SSE2NEON_PRECISE_SQRT
2597     float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2598
2599     // Test for vrsqrteq_f32(0) -> positive infinity case.
2600     // Change to zero, so that s * 1/sqrt(s) result is zero too.
2601     const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2602     const uint32x4_t div_by_zero =
2603         vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2604     recip = vreinterpretq_f32_u32(
2605         vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2606
2607     // Additional Netwon-Raphson iteration for accuracy
2608     recip = vmulq_f32(
2609         vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2610         recip);
2611     recip = vmulq_f32(
2612         vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2613         recip);
2614
2615     // sqrt(s) = s * 1/sqrt(s)
2616     return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2617 #elif defined(__aarch64__)
2618     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2619 #else
2620     float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2621     float32x4_t sq = vrecpeq_f32(recipsq);
2622     return vreinterpretq_m128_f32(sq);
2623 #endif
2624 }
2625
2626 // Computes the approximation of the square root of the scalar single-precision
2627 // floating point value of in.
2628 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
2629 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2630 {
2631     float32_t value =
2632         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2633     return vreinterpretq_m128_f32(
2634         vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2635 }
2636
2637 // Stores four single-precision, floating-point values.
2638 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
2639 FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2640 {
2641     vst1q_f32(p, vreinterpretq_f32_m128(a));
2642 }
2643
2644 // Store the lower single-precision (32-bit) floating-point element from a into
2645 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2646 // boundary or a general-protection exception may be generated.
2647 //
2648 //   MEM[mem_addr+31:mem_addr] := a[31:0]
2649 //   MEM[mem_addr+63:mem_addr+32] := a[31:0]
2650 //   MEM[mem_addr+95:mem_addr+64] := a[31:0]
2651 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
2652 //
2653 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
2654 FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2655 {
2656     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2657     vst1q_f32(p, vdupq_n_f32(a0));
2658 }
2659
2660 // Stores the lower single - precision, floating - point value.
2661 // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
2662 FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2663 {
2664     vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2665 }
2666
// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
//
//   MEM[mem_addr+31:mem_addr] := a[31:0]
//   MEM[mem_addr+63:mem_addr+32] := a[31:0]
//   MEM[mem_addr+95:mem_addr+64] := a[31:0]
//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
//
// Alias: _mm_store1_ps is just another name for _mm_store_ps1.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
#define _mm_store1_ps _mm_store_ps1
2678
2679 // Stores the upper two single-precision, floating-point values of a to the
2680 // address p.
2681 //
2682 //   *p0 := a2
2683 //   *p1 := a3
2684 //
2685 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
2686 FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2687 {
2688     *p = vreinterpret_m64_f32(vget_high_f32(a));
2689 }
2690
2691 // Stores the lower two single-precision floating point values of a to the
2692 // address p.
2693 //
2694 //   *p0 := a0
2695 //   *p1 := a1
2696 //
2697 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
2698 FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2699 {
2700     *p = vreinterpret_m64_f32(vget_low_f32(a));
2701 }
2702
2703 // Store 4 single-precision (32-bit) floating-point elements from a into memory
2704 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2705 // general-protection exception may be generated.
2706 //
2707 //   MEM[mem_addr+31:mem_addr] := a[127:96]
2708 //   MEM[mem_addr+63:mem_addr+32] := a[95:64]
2709 //   MEM[mem_addr+95:mem_addr+64] := a[63:32]
2710 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
2711 //
2712 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
2713 FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
2714 {
2715     float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2716     float32x4_t rev = vextq_f32(tmp, tmp, 2);
2717     vst1q_f32(p, rev);
2718 }
2719
2720 // Stores four single-precision, floating-point values.
2721 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
2722 FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2723 {
2724     vst1q_f32(p, vreinterpretq_f32_m128(a));
2725 }
2726
2727 // Stores 16-bits of integer data a at the address p.
2728 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
2729 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2730 {
2731     vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2732 }
2733
2734 // Stores 64-bits of integer data a at the address p.
2735 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
2736 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2737 {
2738     vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2739 }
2740
2741 // Store 64-bits of integer data from a into memory using a non-temporal memory
2742 // hint.
2743 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
2744 FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2745 {
2746     vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2747 }
2748
2749 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2750 // point elements) from a into memory using a non-temporal memory hint.
2751 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
2752 FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
2753 {
2754 #if __has_builtin(__builtin_nontemporal_store)
2755     __builtin_nontemporal_store(a, (float32x4_t *) p);
2756 #else
2757     vst1q_f32(p, vreinterpretq_f32_m128(a));
2758 #endif
2759 }
2760
2761 // Subtracts the four single-precision, floating-point values of a and b.
2762 //
2763 //   r0 := a0 - b0
2764 //   r1 := a1 - b1
2765 //   r2 := a2 - b2
2766 //   r3 := a3 - b3
2767 //
2768 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2769 FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2770 {
2771     return vreinterpretq_m128_f32(
2772         vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2773 }
2774
2775 // Subtract the lower single-precision (32-bit) floating-point element in b from
2776 // the lower single-precision (32-bit) floating-point element in a, store the
2777 // result in the lower element of dst, and copy the upper 3 packed elements from
2778 // a to the upper elements of dst.
2779 //
2780 //   dst[31:0] := a[31:0] - b[31:0]
2781 //   dst[127:32] := a[127:32]
2782 //
2783 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2784 FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2785 {
2786     return _mm_move_ss(a, _mm_sub_ps(a, b));
2787 }
2788
// Macro: transpose, in place, the 4x4 matrix whose rows are the four
// single-precision vectors row0..row3. On exit row0 holds the former column 0,
// row1 the former column 1, and so on. All four arguments must be assignable
// vector variables.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)            \
    do {                                                     \
        float32x4x2_t _trn01 = vtrnq_f32(row0, row1);        \
        float32x4x2_t _trn23 = vtrnq_f32(row2, row3);        \
        row0 = vcombine_f32(vget_low_f32(_trn01.val[0]),     \
                            vget_low_f32(_trn23.val[0]));    \
        row1 = vcombine_f32(vget_low_f32(_trn01.val[1]),     \
                            vget_low_f32(_trn23.val[1]));    \
        row2 = vcombine_f32(vget_high_f32(_trn01.val[0]),    \
                            vget_high_f32(_trn23.val[0]));   \
        row3 = vcombine_f32(vget_high_f32(_trn01.val[1]),    \
                            vget_high_f32(_trn23.val[1]));   \
    } while (0)
2806
// According to the documentation, these intrinsics behave the same as the
// non-'u' versions, so each _mm_ucomi*_ss name is simply aliased to the
// matching _mm_comi*_ss implementation.
// NOTE(review): on x86 the ordered and unordered forms differ in when they
// signal an invalid-operation exception for NaN operands; that distinction is
// not modelled by these aliases -- confirm it is irrelevant for callers.
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomige_ss _mm_comige_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomineq_ss _mm_comineq_ss
2815
2816 // Return vector of type __m128i with undefined elements.
2817 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
2818 FORCE_INLINE __m128i _mm_undefined_si128(void)
2819 {
2820 #if defined(__GNUC__) || defined(__clang__)
2821 #pragma GCC diagnostic push
2822 #pragma GCC diagnostic ignored "-Wuninitialized"
2823 #endif
2824     __m128i a;
2825     return a;
2826 #if defined(__GNUC__) || defined(__clang__)
2827 #pragma GCC diagnostic pop
2828 #endif
2829 }
2830
2831 // Return vector of type __m128 with undefined elements.
2832 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
2833 FORCE_INLINE __m128 _mm_undefined_ps(void)
2834 {
2835 #if defined(__GNUC__) || defined(__clang__)
2836 #pragma GCC diagnostic push
2837 #pragma GCC diagnostic ignored "-Wuninitialized"
2838 #endif
2839     __m128 a;
2840     return a;
2841 #if defined(__GNUC__) || defined(__clang__)
2842 #pragma GCC diagnostic pop
2843 #endif
2844 }
2845
2846 // Selects and interleaves the upper two single-precision, floating-point values
2847 // from a and b.
2848 //
2849 //   r0 := a2
2850 //   r1 := b2
2851 //   r2 := a3
2852 //   r3 := b3
2853 //
2854 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
2855 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
2856 {
2857 #if defined(__aarch64__)
2858     return vreinterpretq_m128_f32(
2859         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2860 #else
2861     float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2862     float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2863     float32x2x2_t result = vzip_f32(a1, b1);
2864     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2865 #endif
2866 }
2867
2868 // Selects and interleaves the lower two single-precision, floating-point values
2869 // from a and b.
2870 //
2871 //   r0 := a0
2872 //   r1 := b0
2873 //   r2 := a1
2874 //   r3 := b1
2875 //
2876 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
2877 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
2878 {
2879 #if defined(__aarch64__)
2880     return vreinterpretq_m128_f32(
2881         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2882 #else
2883     float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2884     float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2885     float32x2x2_t result = vzip_f32(a1, b1);
2886     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2887 #endif
2888 }
2889
2890 // Computes bitwise EXOR (exclusive-or) of the four single-precision,
2891 // floating-point values of a and b.
2892 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
2893 FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2894 {
2895     return vreinterpretq_m128_s32(
2896         veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2897 }
2898
2899 /* SSE2 */
2900
2901 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2902 // unsigned 16-bit integers in b.
2903 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2904 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2905 {
2906     return vreinterpretq_m128i_s16(
2907         vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2908 }
2909
2910 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2911 // unsigned 32-bit integers in b.
2912 //
2913 //   r0 := a0 + b0
2914 //   r1 := a1 + b1
2915 //   r2 := a2 + b2
2916 //   r3 := a3 + b3
2917 //
2918 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2919 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2920 {
2921     return vreinterpretq_m128i_s32(
2922         vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2923 }
2924
2925 // Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
2926 // unsigned 32-bit integers in b.
2927 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2928 FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2929 {
2930     return vreinterpretq_m128i_s64(
2931         vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2932 }
2933
2934 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2935 // unsigned 8-bit integers in b.
2936 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
2937 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2938 {
2939     return vreinterpretq_m128i_s8(
2940         vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2941 }
2942
2943 // Add packed double-precision (64-bit) floating-point elements in a and b, and
2944 // store the results in dst.
2945 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
2946 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2947 {
2948 #if defined(__aarch64__)
2949     return vreinterpretq_m128d_f64(
2950         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2951 #else
2952     double *da = (double *) &a;
2953     double *db = (double *) &b;
2954     double c[2];
2955     c[0] = da[0] + db[0];
2956     c[1] = da[1] + db[1];
2957     return vld1q_f32((float32_t *) c);
2958 #endif
2959 }
2960
2961 // Add the lower double-precision (64-bit) floating-point element in a and b,
2962 // store the result in the lower element of dst, and copy the upper element from
2963 // a to the upper element of dst.
2964 //
2965 //   dst[63:0] := a[63:0] + b[63:0]
2966 //   dst[127:64] := a[127:64]
2967 //
2968 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
2969 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
2970 {
2971 #if defined(__aarch64__)
2972     return _mm_move_sd(a, _mm_add_pd(a, b));
2973 #else
2974     double *da = (double *) &a;
2975     double *db = (double *) &b;
2976     double c[2];
2977     c[0] = da[0] + db[0];
2978     c[1] = da[1];
2979     return vld1q_f32((float32_t *) c);
2980 #endif
2981 }
2982
2983 // Add 64-bit integers a and b, and store the result in dst.
2984 //
2985 //   dst[63:0] := a[63:0] + b[63:0]
2986 //
2987 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
2988 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2989 {
2990     return vreinterpret_m64_s64(
2991         vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2992 }
2993
2994 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
2995 // and saturates.
2996 //
2997 //   r0 := SignedSaturate(a0 + b0)
2998 //   r1 := SignedSaturate(a1 + b1)
2999 //   ...
3000 //   r7 := SignedSaturate(a7 + b7)
3001 //
3002 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
3003 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
3004 {
3005     return vreinterpretq_m128i_s16(
3006         vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3007 }
3008
3009 // Add packed signed 8-bit integers in a and b using saturation, and store the
3010 // results in dst.
3011 //
3012 //   FOR j := 0 to 15
3013 //     i := j*8
3014 //     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
3015 //   ENDFOR
3016 //
3017 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
3018 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3019 {
3020     return vreinterpretq_m128i_s8(
3021         vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3022 }
3023
3024 // Add packed unsigned 16-bit integers in a and b using saturation, and store
3025 // the results in dst.
3026 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
3027 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
3028 {
3029     return vreinterpretq_m128i_u16(
3030         vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3031 }
3032
3033 // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
3034 // b and saturates..
3035 // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
3036 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3037 {
3038     return vreinterpretq_m128i_u8(
3039         vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3040 }
3041
3042 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
3043 // elements in a and b, and store the results in dst.
3044 //
3045 //   FOR j := 0 to 1
3046 //     i := j*64
3047 //     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
3048 //   ENDFOR
3049 //
3050 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
3051 FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
3052 {
3053     return vreinterpretq_m128d_s64(
3054         vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
3055 }
3056
3057 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
3058 // b.
3059 //
3060 //   r := a & b
3061 //
3062 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
3063 FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3064 {
3065     return vreinterpretq_m128i_s32(
3066         vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3067 }
3068
3069 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
3070 // elements in a and then AND with b, and store the results in dst.
3071 //
3072 //   FOR j := 0 to 1
3073 //           i := j*64
3074 //           dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
3075 //   ENDFOR
3076 //
3077 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
3078 FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
3079 {
3080     // *NOTE* argument swap
3081     return vreinterpretq_m128d_s64(
3082         vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
3083 }
3084
3085 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
3086 // 128-bit value in a.
3087 //
3088 //   r := (~a) & b
3089 //
3090 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
3091 FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
3092 {
3093     return vreinterpretq_m128i_s32(
3094         vbicq_s32(vreinterpretq_s32_m128i(b),
3095                   vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
3096 }
3097
3098 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
3099 // unsigned 16-bit integers in b and rounds.
3100 //
3101 //   r0 := (a0 + b0) / 2
3102 //   r1 := (a1 + b1) / 2
3103 //   ...
3104 //   r7 := (a7 + b7) / 2
3105 //
3106 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
3107 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3108 {
3109     return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3110                                  vreinterpretq_u16_m128i(b));
3111 }
3112
3113 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
3114 // unsigned 8-bit integers in b and rounds.
3115 //
3116 //   r0 := (a0 + b0) / 2
3117 //   r1 := (a1 + b1) / 2
3118 //   ...
3119 //   r15 := (a15 + b15) / 2
3120 //
3121 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
3122 FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3123 {
3124     return vreinterpretq_m128i_u8(
3125         vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3126 }
3127
// Byte-wise left shift of a by imm bytes, filling with zeros. This is simply
// another name for _mm_slli_si128.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
3132
// Byte-wise right shift of a by imm bytes, filling with zeros. This is simply
// another name for _mm_srli_si128.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
3137
3138 // Cast vector of type __m128d to type __m128. This intrinsic is only used for
3139 // compilation and does not generate any instructions, thus it has zero latency.
3140 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
3141 FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3142 {
3143     return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3144 }
3145
3146 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3147 // compilation and does not generate any instructions, thus it has zero latency.
3148 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
3149 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3150 {
3151     return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3152 }
3153
3154 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3155 // compilation and does not generate any instructions, thus it has zero latency.
3156 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
3157 FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3158 {
3159     return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3160 }
3161
3162 // Applies a type cast to reinterpret four 32-bit floating point values passed
3163 // in as a 128-bit parameter as packed 32-bit integers.
3164 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
3165 FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3166 {
3167     return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3168 }
3169
3170 // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3171 // compilation and does not generate any instructions, thus it has zero latency.
3172 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
3173 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3174 {
3175 #if defined(__aarch64__)
3176     return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3177 #else
3178     return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3179 #endif
3180 }
3181
3182 // Applies a type cast to reinterpret four 32-bit integers passed in as a
3183 // 128-bit parameter as packed 32-bit floating point values.
3184 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
3185 FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3186 {
3187     return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3188 }
3189
3190 // Cache line containing p is flushed and invalidated from all caches in the
3191 // coherency domain. :
3192 // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
3193 FORCE_INLINE void _mm_clflush(void const *p)
3194 {
3195     (void) p;
3196     // no corollary for Neon?
3197 }
3198
3199 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3200 // unsigned 16-bit integers in b for equality.
3201 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3202 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3203 {
3204     return vreinterpretq_m128i_u16(
3205         vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3206 }
3207
3208 // Compare packed 32-bit integers in a and b for equality, and store the results
3209 // in dst
3210 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3211 {
3212     return vreinterpretq_m128i_u32(
3213         vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3214 }
3215
3216 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3217 // unsigned 8-bit integers in b for equality.
3218 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3219 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3220 {
3221     return vreinterpretq_m128i_u8(
3222         vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3223 }
3224
3225 // Compare packed double-precision (64-bit) floating-point elements in a and b
3226 // for equality, and store the results in dst.
3227 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
3228 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3229 {
3230 #if defined(__aarch64__)
3231     return vreinterpretq_m128d_u64(
3232         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3233 #else
3234     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3235     uint32x4_t cmp =
3236         vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3237     uint32x4_t swapped = vrev64q_u32(cmp);
3238     return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3239 #endif
3240 }
3241
3242 // Compare the lower double-precision (64-bit) floating-point elements in a and
3243 // b for equality, store the result in the lower element of dst, and copy the
3244 // upper element from a to the upper element of dst.
3245 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
3246 FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3247 {
3248     return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3249 }
3250
3251 // Compare packed double-precision (64-bit) floating-point elements in a and b
3252 // for greater-than-or-equal, and store the results in dst.
3253 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
3254 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3255 {
3256 #if defined(__aarch64__)
3257     return vreinterpretq_m128d_u64(
3258         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3259 #else
3260     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3261     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3262     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3263     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3264     uint64_t d[2];
3265     d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3266     d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3267
3268     return vreinterpretq_m128d_u64(vld1q_u64(d));
3269 #endif
3270 }
3271
3272 // Compare the lower double-precision (64-bit) floating-point elements in a and
3273 // b for greater-than-or-equal, store the result in the lower element of dst,
3274 // and copy the upper element from a to the upper element of dst.
3275 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
3276 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3277 {
3278 #if defined(__aarch64__)
3279     return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3280 #else
3281     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3282     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3283     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3284     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3285     uint64_t d[2];
3286     d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3287     d[1] = a1;
3288
3289     return vreinterpretq_m128d_u64(vld1q_u64(d));
3290 #endif
3291 }
3292
3293 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3294 // in b for greater than.
3295 //
3296 //   r0 := (a0 > b0) ? 0xffff : 0x0
3297 //   r1 := (a1 > b1) ? 0xffff : 0x0
3298 //   ...
3299 //   r7 := (a7 > b7) ? 0xffff : 0x0
3300 //
3301 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
3302 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3303 {
3304     return vreinterpretq_m128i_u16(
3305         vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3306 }
3307
3308 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3309 // in b for greater than.
3310 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
3311 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3312 {
3313     return vreinterpretq_m128i_u32(
3314         vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3315 }
3316
3317 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3318 // in b for greater than.
3319 //
3320 //   r0 := (a0 > b0) ? 0xff : 0x0
3321 //   r1 := (a1 > b1) ? 0xff : 0x0
3322 //   ...
3323 //   r15 := (a15 > b15) ? 0xff : 0x0
3324 //
3325 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
3326 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3327 {
3328     return vreinterpretq_m128i_u8(
3329         vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3330 }
3331
3332 // Compare packed double-precision (64-bit) floating-point elements in a and b
3333 // for greater-than, and store the results in dst.
3334 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
3335 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3336 {
3337 #if defined(__aarch64__)
3338     return vreinterpretq_m128d_u64(
3339         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3340 #else
3341     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3342     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3343     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3344     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3345     uint64_t d[2];
3346     d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3347     d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3348
3349     return vreinterpretq_m128d_u64(vld1q_u64(d));
3350 #endif
3351 }
3352
3353 // Compare the lower double-precision (64-bit) floating-point elements in a and
3354 // b for greater-than, store the result in the lower element of dst, and copy
3355 // the upper element from a to the upper element of dst.
3356 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
3357 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3358 {
3359 #if defined(__aarch64__)
3360     return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3361 #else
3362     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3363     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3364     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3365     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3366     uint64_t d[2];
3367     d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3368     d[1] = a1;
3369
3370     return vreinterpretq_m128d_u64(vld1q_u64(d));
3371 #endif
3372 }
3373
3374 // Compare packed double-precision (64-bit) floating-point elements in a and b
3375 // for less-than-or-equal, and store the results in dst.
3376 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
3377 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3378 {
3379 #if defined(__aarch64__)
3380     return vreinterpretq_m128d_u64(
3381         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3382 #else
3383     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3384     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3385     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3386     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3387     uint64_t d[2];
3388     d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3389     d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3390
3391     return vreinterpretq_m128d_u64(vld1q_u64(d));
3392 #endif
3393 }
3394
3395 // Compare the lower double-precision (64-bit) floating-point elements in a and
3396 // b for less-than-or-equal, store the result in the lower element of dst, and
3397 // copy the upper element from a to the upper element of dst.
3398 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
3399 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3400 {
3401 #if defined(__aarch64__)
3402     return _mm_move_sd(a, _mm_cmple_pd(a, b));
3403 #else
3404     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3405     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3406     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3407     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3408     uint64_t d[2];
3409     d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3410     d[1] = a1;
3411
3412     return vreinterpretq_m128d_u64(vld1q_u64(d));
3413 #endif
3414 }
3415
3416 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3417 // in b for less than.
3418 //
3419 //   r0 := (a0 < b0) ? 0xffff : 0x0
3420 //   r1 := (a1 < b1) ? 0xffff : 0x0
3421 //   ...
3422 //   r7 := (a7 < b7) ? 0xffff : 0x0
3423 //
3424 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
3425 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3426 {
3427     return vreinterpretq_m128i_u16(
3428         vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3429 }
3430
3431
3432 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3433 // in b for less than.
3434 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
3435 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3436 {
3437     return vreinterpretq_m128i_u32(
3438         vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3439 }
3440
3441 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3442 // in b for lesser than.
3443 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
3444 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3445 {
3446     return vreinterpretq_m128i_u8(
3447         vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3448 }
3449
3450 // Compare packed double-precision (64-bit) floating-point elements in a and b
3451 // for less-than, and store the results in dst.
3452 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
3453 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3454 {
3455 #if defined(__aarch64__)
3456     return vreinterpretq_m128d_u64(
3457         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3458 #else
3459     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3460     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3461     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3462     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3463     uint64_t d[2];
3464     d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3465     d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3466
3467     return vreinterpretq_m128d_u64(vld1q_u64(d));
3468 #endif
3469 }
3470
3471 // Compare the lower double-precision (64-bit) floating-point elements in a and
3472 // b for less-than, store the result in the lower element of dst, and copy the
3473 // upper element from a to the upper element of dst.
3474 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
3475 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3476 {
3477 #if defined(__aarch64__)
3478     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3479 #else
3480     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3481     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3482     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3483     uint64_t d[2];
3484     d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3485     d[1] = a1;
3486
3487     return vreinterpretq_m128d_u64(vld1q_u64(d));
3488 #endif
3489 }
3490
3491 // Compare packed double-precision (64-bit) floating-point elements in a and b
3492 // for not-equal, and store the results in dst.
3493 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
3494 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3495 {
3496 #if defined(__aarch64__)
3497     return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3498         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3499 #else
3500     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3501     uint32x4_t cmp =
3502         vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3503     uint32x4_t swapped = vrev64q_u32(cmp);
3504     return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3505 #endif
3506 }
3507
3508 // Compare the lower double-precision (64-bit) floating-point elements in a and
3509 // b for not-equal, store the result in the lower element of dst, and copy the
3510 // upper element from a to the upper element of dst.
3511 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
3512 FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3513 {
3514     return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3515 }
3516
3517 // Compare packed double-precision (64-bit) floating-point elements in a and b
3518 // for not-greater-than-or-equal, and store the results in dst.
3519 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
3520 FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
3521 {
3522 #if defined(__aarch64__)
3523     return vreinterpretq_m128d_u64(veorq_u64(
3524         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3525         vdupq_n_u64(UINT64_MAX)));
3526 #else
3527     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3528     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3529     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3530     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3531     uint64_t d[2];
3532     d[0] =
3533         !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3534     d[1] =
3535         !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3536
3537     return vreinterpretq_m128d_u64(vld1q_u64(d));
3538 #endif
3539 }
3540
3541 // Compare the lower double-precision (64-bit) floating-point elements in a and
3542 // b for not-greater-than-or-equal, store the result in the lower element of
3543 // dst, and copy the upper element from a to the upper element of dst.
3544 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
3545 FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
3546 {
3547     return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
3548 }
3549
3550 // Compare packed double-precision (64-bit) floating-point elements in a and b
3551 // for not-greater-than, and store the results in dst.
3552 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd
3553 FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
3554 {
3555 #if defined(__aarch64__)
3556     return vreinterpretq_m128d_u64(veorq_u64(
3557         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3558         vdupq_n_u64(UINT64_MAX)));
3559 #else
3560     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3561     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3562     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3563     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3564     uint64_t d[2];
3565     d[0] =
3566         !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3567     d[1] =
3568         !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3569
3570     return vreinterpretq_m128d_u64(vld1q_u64(d));
3571 #endif
3572 }
3573
3574 // Compare the lower double-precision (64-bit) floating-point elements in a and
3575 // b for not-greater-than, store the result in the lower element of dst, and
3576 // copy the upper element from a to the upper element of dst.
3577 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
3578 FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
3579 {
3580     return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
3581 }
3582
3583 // Compare packed double-precision (64-bit) floating-point elements in a and b
3584 // for not-less-than-or-equal, and store the results in dst.
3585 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
3586 FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
3587 {
3588 #if defined(__aarch64__)
3589     return vreinterpretq_m128d_u64(veorq_u64(
3590         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3591         vdupq_n_u64(UINT64_MAX)));
3592 #else
3593     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3594     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3595     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3596     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3597     uint64_t d[2];
3598     d[0] =
3599         !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3600     d[1] =
3601         !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3602
3603     return vreinterpretq_m128d_u64(vld1q_u64(d));
3604 #endif
3605 }
3606
3607 // Compare the lower double-precision (64-bit) floating-point elements in a and
3608 // b for not-less-than-or-equal, store the result in the lower element of dst,
3609 // and copy the upper element from a to the upper element of dst.
3610 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
3611 FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
3612 {
3613     return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
3614 }
3615
3616 // Compare packed double-precision (64-bit) floating-point elements in a and b
3617 // for not-less-than, and store the results in dst.
3618 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
3619 FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
3620 {
3621 #if defined(__aarch64__)
3622     return vreinterpretq_m128d_u64(veorq_u64(
3623         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3624         vdupq_n_u64(UINT64_MAX)));
3625 #else
3626     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3627     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3628     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3629     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3630     uint64_t d[2];
3631     d[0] =
3632         !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3633     d[1] =
3634         !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3635
3636     return vreinterpretq_m128d_u64(vld1q_u64(d));
3637 #endif
3638 }
3639
3640 // Compare the lower double-precision (64-bit) floating-point elements in a and
3641 // b for not-less-than, store the result in the lower element of dst, and copy
3642 // the upper element from a to the upper element of dst.
3643 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
3644 FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
3645 {
3646     return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
3647 }
3648
3649 // Compare packed double-precision (64-bit) floating-point elements in a and b
3650 // to see if neither is NaN, and store the results in dst.
3651 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
3652 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3653 {
3654 #if defined(__aarch64__)
3655     // Excluding NaNs, any two floating point numbers can be compared.
3656     uint64x2_t not_nan_a =
3657         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3658     uint64x2_t not_nan_b =
3659         vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3660     return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3661 #else
3662     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3663     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3664     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3665     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3666     uint64_t d[2];
3667     d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3668             (*(double *) &b0) == (*(double *) &b0))
3669                ? ~UINT64_C(0)
3670                : UINT64_C(0);
3671     d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3672             (*(double *) &b1) == (*(double *) &b1))
3673                ? ~UINT64_C(0)
3674                : UINT64_C(0);
3675
3676     return vreinterpretq_m128d_u64(vld1q_u64(d));
3677 #endif
3678 }
3679
3680 // Compare the lower double-precision (64-bit) floating-point elements in a and
3681 // b to see if neither is NaN, store the result in the lower element of dst, and
3682 // copy the upper element from a to the upper element of dst.
3683 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
3684 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3685 {
3686 #if defined(__aarch64__)
3687     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3688 #else
3689     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3690     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3691     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3692     uint64_t d[2];
3693     d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3694             (*(double *) &b0) == (*(double *) &b0))
3695                ? ~UINT64_C(0)
3696                : UINT64_C(0);
3697     d[1] = a1;
3698
3699     return vreinterpretq_m128d_u64(vld1q_u64(d));
3700 #endif
3701 }
3702
3703 // Compare packed double-precision (64-bit) floating-point elements in a and b
3704 // to see if either is NaN, and store the results in dst.
3705 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
3706 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3707 {
3708 #if defined(__aarch64__)
3709     // Two NaNs are not equal in comparison operation.
3710     uint64x2_t not_nan_a =
3711         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3712     uint64x2_t not_nan_b =
3713         vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3714     return vreinterpretq_m128d_s32(
3715         vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3716 #else
3717     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3718     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3719     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3720     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3721     uint64_t d[2];
3722     d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3723             (*(double *) &b0) == (*(double *) &b0))
3724                ? UINT64_C(0)
3725                : ~UINT64_C(0);
3726     d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3727             (*(double *) &b1) == (*(double *) &b1))
3728                ? UINT64_C(0)
3729                : ~UINT64_C(0);
3730
3731     return vreinterpretq_m128d_u64(vld1q_u64(d));
3732 #endif
3733 }
3734
3735 // Compare the lower double-precision (64-bit) floating-point elements in a and
3736 // b to see if either is NaN, store the result in the lower element of dst, and
3737 // copy the upper element from a to the upper element of dst.
3738 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
3739 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3740 {
3741 #if defined(__aarch64__)
3742     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3743 #else
3744     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3745     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3746     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3747     uint64_t d[2];
3748     d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3749             (*(double *) &b0) == (*(double *) &b0))
3750                ? UINT64_C(0)
3751                : ~UINT64_C(0);
3752     d[1] = a1;
3753
3754     return vreinterpretq_m128d_u64(vld1q_u64(d));
3755 #endif
3756 }
3757
3758 // Compare the lower double-precision (64-bit) floating-point element in a and b
3759 // for greater-than-or-equal, and return the boolean result (0 or 1).
3760 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
3761 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3762 {
3763 #if defined(__aarch64__)
3764     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3765 #else
3766     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3767     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3768
3769     return (*(double *) &a0 >= *(double *) &b0);
3770 #endif
3771 }
3772
3773 // Compare the lower double-precision (64-bit) floating-point element in a and b
3774 // for greater-than, and return the boolean result (0 or 1).
3775 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
3776 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3777 {
3778 #if defined(__aarch64__)
3779     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3780 #else
3781     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3782     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3783
3784     return (*(double *) &a0 > *(double *) &b0);
3785 #endif
3786 }
3787
3788 // Compare the lower double-precision (64-bit) floating-point element in a and b
3789 // for less-than-or-equal, and return the boolean result (0 or 1).
3790 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
3791 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3792 {
3793 #if defined(__aarch64__)
3794     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3795 #else
3796     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3797     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3798
3799     return (*(double *) &a0 <= *(double *) &b0);
3800 #endif
3801 }
3802
3803 // Compare the lower double-precision (64-bit) floating-point element in a and b
3804 // for less-than, and return the boolean result (0 or 1).
3805 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
3806 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3807 {
3808 #if defined(__aarch64__)
3809     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3810 #else
3811     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3812     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3813
3814     return (*(double *) &a0 < *(double *) &b0);
3815 #endif
3816 }
3817
3818 // Compare the lower double-precision (64-bit) floating-point element in a and b
3819 // for equality, and return the boolean result (0 or 1).
3820 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
3821 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3822 {
3823 #if defined(__aarch64__)
3824     return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3825 #else
3826     uint32x4_t a_not_nan =
3827         vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3828     uint32x4_t b_not_nan =
3829         vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3830     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3831     uint32x4_t a_eq_b =
3832         vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3833     uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3834                                        vreinterpretq_u64_u32(a_eq_b));
3835     return vgetq_lane_u64(and_results, 0) & 0x1;
3836 #endif
3837 }
3838
3839 // Compare the lower double-precision (64-bit) floating-point element in a and b
3840 // for not-equal, and return the boolean result (0 or 1).
3841 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
3842 FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3843 {
3844     return !_mm_comieq_sd(a, b);
3845 }
3846
3847 // Convert packed signed 32-bit integers in a to packed double-precision
3848 // (64-bit) floating-point elements, and store the results in dst.
3849 //
3850 //   FOR j := 0 to 1
3851 //     i := j*32
3852 //     m := j*64
3853 //     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3854 //   ENDFOR
3855 //
3856 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
3857 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3858 {
3859 #if defined(__aarch64__)
3860     return vreinterpretq_m128d_f64(
3861         vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3862 #else
3863     double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3864     double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3865     return _mm_set_pd(a1, a0);
3866 #endif
3867 }
3868
3869 // Converts the four signed 32-bit integer values of a to single-precision,
3870 // floating-point values
3871 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
3872 FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3873 {
3874     return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3875 }
3876
3877 // Convert packed double-precision (64-bit) floating-point elements in a to
3878 // packed 32-bit integers, and store the results in dst.
3879 //
3880 //   FOR j := 0 to 1
3881 //      i := 32*j
3882 //      k := 64*j
3883 //      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3884 //   ENDFOR
3885 //
3886 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
3887 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3888 {
3889     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3890     double d0 = ((double *) &rnd)[0];
3891     double d1 = ((double *) &rnd)[1];
3892     return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3893 }
3894
3895 // Convert packed double-precision (64-bit) floating-point elements in a to
3896 // packed 32-bit integers, and store the results in dst.
3897 //
3898 //   FOR j := 0 to 1
3899 //      i := 32*j
3900 //      k := 64*j
3901 //      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3902 //   ENDFOR
3903 //
3904 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
3905 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3906 {
3907     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3908     double d0 = ((double *) &rnd)[0];
3909     double d1 = ((double *) &rnd)[1];
3910     int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3911     return vreinterpret_m64_s32(vld1_s32(data));
3912 }
3913
3914 // Convert packed double-precision (64-bit) floating-point elements in a to
3915 // packed single-precision (32-bit) floating-point elements, and store the
3916 // results in dst.
3917 //
3918 //   FOR j := 0 to 1
3919 //     i := 32*j
3920 //     k := 64*j
3921 //     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
3922 //   ENDFOR
3923 //   dst[127:64] := 0
3924 //
3925 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
3926 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3927 {
3928 #if defined(__aarch64__)
3929     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3930     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3931 #else
3932     float a0 = (float) ((double *) &a)[0];
3933     float a1 = (float) ((double *) &a)[1];
3934     return _mm_set_ps(0, 0, a1, a0);
3935 #endif
3936 }
3937
3938 // Convert packed signed 32-bit integers in a to packed double-precision
3939 // (64-bit) floating-point elements, and store the results in dst.
3940 //
3941 //   FOR j := 0 to 1
3942 //     i := j*32
3943 //     m := j*64
3944 //     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3945 //   ENDFOR
3946 //
3947 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
3948 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
3949 {
3950 #if defined(__aarch64__)
3951     return vreinterpretq_m128d_f64(
3952         vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
3953 #else
3954     double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
3955     double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
3956     return _mm_set_pd(a1, a0);
3957 #endif
3958 }
3959
// Converts the four single-precision, floating-point values of a to signed
// 32-bit integer values.
//
//   r0 := (int) a0
//   r1 := (int) a1
//   r2 := (int) a2
//   r3 := (int) a3
//
// The conversion honours the rounding mode reported by
// _MM_GET_ROUNDING_MODE(): nearest, down, up, or (the default branch) toward
// zero.
//
// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
// does not support! It is supported on ARMv8-A however.
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
{
#if defined(__aarch64__)
    // Each rounding mode is dispatched to a different NEON convert intrinsic.
    switch (_MM_GET_ROUNDING_MODE()) {
    case _MM_ROUND_NEAREST:
        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
    case _MM_ROUND_DOWN:
        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
    case _MM_ROUND_UP:
        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
    default:  // _MM_ROUND_TOWARD_ZERO
        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
    }
#else
    // Scalar view of the four lanes, used by the floor/ceil/truncate branches.
    float *f = (float *) &a;
    switch (_MM_GET_ROUNDING_MODE()) {
    case _MM_ROUND_NEAREST: {
        // Emulated round-to-nearest-even. Two candidates are computed per
        // lane: r_normal (add +/-0.5, then truncate) and r_even (the even
        // integer next to the truncated value). r_even is selected only for
        // lanes whose fractional part is exactly +/-0.5.
        uint32x4_t signmask = vdupq_n_u32(0x80000000);
        // Sign bit taken from a, all other bits from 0.5.
        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
        int32x4_t r_trunc = vcvtq_s32_f32(
            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
        // Sign bit of -[a]: 1 when [a] is positive, 0 otherwise.
        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
        float32x4_t delta = vsubq_f32(
            vreinterpretq_f32_m128(a),
            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
        uint32x4_t is_delta_half =
            vceqq_f32(delta, half); /* delta == +/- 0.5 */
        // Per lane: exact tie -> r_even, anything else -> r_normal.
        return vreinterpretq_m128i_s32(
            vbslq_s32(is_delta_half, r_even, r_normal));
    }
    case _MM_ROUND_DOWN:
        // floorf() results are converted to int when passed to _mm_set_epi32.
        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
                             floorf(f[0]));
    case _MM_ROUND_UP:
        // ceilf() results are converted to int when passed to _mm_set_epi32.
        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
                             ceilf(f[0]));
    default:  // _MM_ROUND_TOWARD_ZERO
        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
                             (int32_t) f[0]);
    }
#endif
}
4019
4020 // Convert packed single-precision (32-bit) floating-point elements in a to
4021 // packed double-precision (64-bit) floating-point elements, and store the
4022 // results in dst.
4023 //
4024 //   FOR j := 0 to 1
4025 //     i := 64*j
4026 //     k := 32*j
4027 //     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4028 //   ENDFOR
4029 //
4030 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
4031 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4032 {
4033 #if defined(__aarch64__)
4034     return vreinterpretq_m128d_f64(
4035         vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4036 #else
4037     double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4038     double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4039     return _mm_set_pd(a1, a0);
4040 #endif
4041 }
4042
4043 // Copy the lower double-precision (64-bit) floating-point element of a to dst.
4044 //
4045 //   dst[63:0] := a[63:0]
4046 //
4047 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
4048 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4049 {
4050 #if defined(__aarch64__)
4051     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4052 #else
4053     return ((double *) &a)[0];
4054 #endif
4055 }
4056
4057 // Convert the lower double-precision (64-bit) floating-point element in a to a
4058 // 32-bit integer, and store the result in dst.
4059 //
4060 //   dst[31:0] := Convert_FP64_To_Int32(a[63:0])
4061 //
4062 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
4063 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
4064 {
4065 #if defined(__aarch64__)
4066     return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4067 #else
4068     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4069     double ret = ((double *) &rnd)[0];
4070     return (int32_t) ret;
4071 #endif
4072 }
4073
4074 // Convert the lower double-precision (64-bit) floating-point element in a to a
4075 // 64-bit integer, and store the result in dst.
4076 //
4077 //   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4078 //
4079 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
4080 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
4081 {
4082 #if defined(__aarch64__)
4083     return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4084 #else
4085     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4086     double ret = ((double *) &rnd)[0];
4087     return (int64_t) ret;
4088 #endif
4089 }
4090
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
//
//   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
//
// Pure alias: the name expands to _mm_cvtsd_si64, so it is called the same
// way and yields the same result.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
#define _mm_cvtsd_si64x _mm_cvtsd_si64
4098
4099 // Convert the lower double-precision (64-bit) floating-point element in b to a
4100 // single-precision (32-bit) floating-point element, store the result in the
4101 // lower element of dst, and copy the upper 3 packed elements from a to the
4102 // upper elements of dst.
4103 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
4104 FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4105 {
4106 #if defined(__aarch64__)
4107     return vreinterpretq_m128_f32(vsetq_lane_f32(
4108         vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4109         vreinterpretq_f32_m128(a), 0));
4110 #else
4111     return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4112                                                  vreinterpretq_f32_m128(a), 0));
4113 #endif
4114 }
4115
4116 // Copy the lower 32-bit integer in a to dst.
4117 //
4118 //   dst[31:0] := a[31:0]
4119 //
4120 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
4121 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4122 {
4123     return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4124 }
4125
4126 // Copy the lower 64-bit integer in a to dst.
4127 //
4128 //   dst[63:0] := a[63:0]
4129 //
4130 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
4131 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4132 {
4133     return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4134 }
4135
// Copy the lower 64-bit integer in a to dst.
// Pure alias: forwards its argument unchanged to _mm_cvtsi128_si64.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4139
4140 // Convert the signed 32-bit integer b to a double-precision (64-bit)
4141 // floating-point element, store the result in the lower element of dst, and
4142 // copy the upper element from a to the upper element of dst.
4143 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
4144 FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4145 {
4146 #if defined(__aarch64__)
4147     return vreinterpretq_m128d_f64(
4148         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4149 #else
4150     double bf = (double) b;
4151     return vreinterpretq_m128d_s64(
4152         vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4153 #endif
4154 }
4155
// Copy the lower 64-bit integer in a to dst.
//
//   dst[63:0] := a[63:0]
//
// Pure alias: forwards its argument unchanged to _mm_cvtsi128_si64.
// NOTE: this macro is also defined, with the identical replacement text,
// right after _mm_cvtsi128_si64 earlier in this file.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4162
4163 // Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
4164 // zero extending the upper bits.
4165 //
4166 //   r0 := a
4167 //   r1 := 0x0
4168 //   r2 := 0x0
4169 //   r3 := 0x0
4170 //
4171 // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4172 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4173 {
4174     return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4175 }
4176
4177 // Convert the signed 64-bit integer b to a double-precision (64-bit)
4178 // floating-point element, store the result in the lower element of dst, and
4179 // copy the upper element from a to the upper element of dst.
4180 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
4181 FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4182 {
4183 #if defined(__aarch64__)
4184     return vreinterpretq_m128d_f64(
4185         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4186 #else
4187     double bf = (double) b;
4188     return vreinterpretq_m128d_s64(
4189         vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4190 #endif
4191 }
4192
4193 // Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
4194 // zero extending the upper bits.
4195 //
4196 //   r0 := a
4197 //   r1 := 0x0
4198 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4199 {
4200     return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4201 }
4202
// Copy 64-bit integer a to the lower element of dst, and zero the upper
// element.
// Pure alias: forwards its argument unchanged to _mm_cvtsi64_si128.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4207
// Convert the signed 64-bit integer b to a double-precision (64-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// Pure alias: forwards both arguments unchanged to _mm_cvtsi64_sd.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4213
4214 // Convert the lower single-precision (32-bit) floating-point element in b to a
4215 // double-precision (64-bit) floating-point element, store the result in the
4216 // lower element of dst, and copy the upper element from a to the upper element
4217 // of dst.
4218 //
4219 //   dst[63:0] := Convert_FP32_To_FP64(b[31:0])
4220 //   dst[127:64] := a[127:64]
4221 //
4222 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
4223 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4224 {
4225     double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4226 #if defined(__aarch64__)
4227     return vreinterpretq_m128d_f64(
4228         vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4229 #else
4230     return vreinterpretq_m128d_s64(
4231         vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4232 #endif
4233 }
4234
4235 // Convert packed double-precision (64-bit) floating-point elements in a to
4236 // packed 32-bit integers with truncation, and store the results in dst.
4237 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
4238 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4239 {
4240     double a0 = ((double *) &a)[0];
4241     double a1 = ((double *) &a)[1];
4242     return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4243 }
4244
4245 // Convert packed double-precision (64-bit) floating-point elements in a to
4246 // packed 32-bit integers with truncation, and store the results in dst.
4247 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
4248 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4249 {
4250     double a0 = ((double *) &a)[0];
4251     double a1 = ((double *) &a)[1];
4252     int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4253     return vreinterpret_m64_s32(vld1_s32(data));
4254 }
4255
4256 // Converts the four single-precision, floating-point values of a to signed
4257 // 32-bit integer values using truncate.
4258 // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4259 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4260 {
4261     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4262 }
4263
4264 // Convert the lower double-precision (64-bit) floating-point element in a to a
4265 // 32-bit integer with truncation, and store the result in dst.
4266 //
4267 //   dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
4268 //
4269 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
4270 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4271 {
4272     double ret = *((double *) &a);
4273     return (int32_t) ret;
4274 }
4275
4276 // Convert the lower double-precision (64-bit) floating-point element in a to a
4277 // 64-bit integer with truncation, and store the result in dst.
4278 //
4279 //   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4280 //
4281 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
4282 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4283 {
4284 #if defined(__aarch64__)
4285     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4286 #else
4287     double ret = *((double *) &a);
4288     return (int64_t) ret;
4289 #endif
4290 }
4291
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
//
//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
//
// Pure alias: forwards its argument unchanged to _mm_cvttsd_si64.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4299
4300 // Divide packed double-precision (64-bit) floating-point elements in a by
4301 // packed elements in b, and store the results in dst.
4302 //
4303 //  FOR j := 0 to 1
4304 //    i := 64*j
4305 //    dst[i+63:i] := a[i+63:i] / b[i+63:i]
4306 //  ENDFOR
4307 //
4308 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
4309 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4310 {
4311 #if defined(__aarch64__)
4312     return vreinterpretq_m128d_f64(
4313         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4314 #else
4315     double *da = (double *) &a;
4316     double *db = (double *) &b;
4317     double c[2];
4318     c[0] = da[0] / db[0];
4319     c[1] = da[1] / db[1];
4320     return vld1q_f32((float32_t *) c);
4321 #endif
4322 }
4323
4324 // Divide the lower double-precision (64-bit) floating-point element in a by the
4325 // lower double-precision (64-bit) floating-point element in b, store the result
4326 // in the lower element of dst, and copy the upper element from a to the upper
4327 // element of dst.
4328 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
4329 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4330 {
4331 #if defined(__aarch64__)
4332     float64x2_t tmp =
4333         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4334     return vreinterpretq_m128d_f64(
4335         vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4336 #else
4337     return _mm_move_sd(a, _mm_div_pd(a, b));
4338 #endif
4339 }
4340
// Extracts the selected 16-bit integer from a. The lane is read as an
// unsigned value (vgetq_lane_u16), so the result is zero-extended rather than
// sign-extended when it is widened.
// imm is forwarded as the lane index to vgetq_lane_u16.
// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
#define _mm_extract_epi16(a, imm) \
    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4347
// Inserts the least significant 16 bits of b into the selected 16-bit integer
// of a and yields the resulting vector; the other lanes of a are carried over
// by vsetq_lane_s16.
// imm is forwarded as the lane index to vsetq_lane_s16.
// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
//                                       __constrange(0,8) int imm)
#define _mm_insert_epi16(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s16(                                     \
            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
    })
4358
4359 // Loads two double-precision from 16-byte aligned memory, floating-point
4360 // values.
4361 //
4362 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
4363 //
4364 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
4365 FORCE_INLINE __m128d _mm_load_pd(const double *p)
4366 {
4367 #if defined(__aarch64__)
4368     return vreinterpretq_m128d_f64(vld1q_f64(p));
4369 #else
4370     const float *fp = (const float *) p;
4371     float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4372     return vreinterpretq_m128d_f32(vld1q_f32(data));
4373 #endif
4374 }
4375
// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// Pure alias: the name expands to _mm_load1_pd.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
#define _mm_load_pd1 _mm_load1_pd
4384
4385 // Load a double-precision (64-bit) floating-point element from memory into the
4386 // lower of dst, and zero the upper element. mem_addr does not need to be
4387 // aligned on any particular boundary.
4388 //
4389 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
4390 //   dst[127:64] := 0
4391 //
4392 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
4393 FORCE_INLINE __m128d _mm_load_sd(const double *p)
4394 {
4395 #if defined(__aarch64__)
4396     return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4397 #else
4398     const float *fp = (const float *) p;
4399     float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4400     return vreinterpretq_m128d_f32(vld1q_f32(data));
4401 #endif
4402 }
4403
4404 // Loads 128-bit value. :
4405 // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4406 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4407 {
4408     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4409 }
4410
4411 // Load a double-precision (64-bit) floating-point element from memory into both
4412 // elements of dst.
4413 //
4414 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
4415 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4416 //
4417 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
4418 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4419 {
4420 #if defined(__aarch64__)
4421     return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4422 #else
4423     return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4424 #endif
4425 }
4426
4427 // Load a double-precision (64-bit) floating-point element from memory into the
4428 // upper element of dst, and copy the lower element from a to dst. mem_addr does
4429 // not need to be aligned on any particular boundary.
4430 //
4431 //   dst[63:0] := a[63:0]
4432 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4433 //
4434 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
4435 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4436 {
4437 #if defined(__aarch64__)
4438     return vreinterpretq_m128d_f64(
4439         vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4440 #else
4441     return vreinterpretq_m128d_f32(vcombine_f32(
4442         vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4443 #endif
4444 }
4445
4446 // Load 64-bit integer from memory into the first element of dst.
4447 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
4448 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4449 {
4450     /* Load the lower 64 bits of the value pointed to by p into the
4451      * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4452      */
4453     return vreinterpretq_m128i_s32(
4454         vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4455 }
4456
4457 // Load a double-precision (64-bit) floating-point element from memory into the
4458 // lower element of dst, and copy the upper element from a to dst. mem_addr does
4459 // not need to be aligned on any particular boundary.
4460 //
4461 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
4462 //   dst[127:64] := a[127:64]
4463 //
4464 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
4465 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4466 {
4467 #if defined(__aarch64__)
4468     return vreinterpretq_m128d_f64(
4469         vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4470 #else
4471     return vreinterpretq_m128d_f32(
4472         vcombine_f32(vld1_f32((const float *) p),
4473                      vget_high_f32(vreinterpretq_f32_m128d(a))));
4474 #endif
4475 }
4476
4477 // Load 2 double-precision (64-bit) floating-point elements from memory into dst
4478 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4479 // general-protection exception may be generated.
4480 //
4481 //   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
4482 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4483 //
4484 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
4485 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4486 {
4487 #if defined(__aarch64__)
4488     float64x2_t v = vld1q_f64(p);
4489     return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4490 #else
4491     int64x2_t v = vld1q_s64((const int64_t *) p);
4492     return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4493 #endif
4494 }
4495
4496 // Loads two double-precision from unaligned memory, floating-point values.
4497 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
4498 FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4499 {
4500     return _mm_load_pd(p);
4501 }
4502
4503 // Loads 128-bit value. :
4504 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4505 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4506 {
4507     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4508 }
4509
4510 // Load unaligned 32-bit integer from memory into the first element of dst.
4511 //
4512 //   dst[31:0] := MEM[mem_addr+31:mem_addr]
4513 //   dst[MAX:32] := 0
4514 //
4515 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
4516 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4517 {
4518     return vreinterpretq_m128i_s32(
4519         vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4520 }
4521
4522 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4523 // integers from b.
4524 //
4525 //   r0 := (a0 * b0) + (a1 * b1)
4526 //   r1 := (a2 * b2) + (a3 * b3)
4527 //   r2 := (a4 * b4) + (a5 * b5)
4528 //   r3 := (a6 * b6) + (a7 * b7)
4529 // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
4530 FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4531 {
4532     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4533                               vget_low_s16(vreinterpretq_s16_m128i(b)));
4534     int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4535                                vget_high_s16(vreinterpretq_s16_m128i(b)));
4536
4537     int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4538     int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4539
4540     return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4541 }
4542
4543 // Conditionally store 8-bit integer elements from a into memory using mask
4544 // (elements are not stored when the highest bit is not set in the corresponding
4545 // element) and a non-temporal memory hint. mem_addr does not need to be aligned
4546 // on any particular boundary.
4547 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
4548 FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4549 {
4550     int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4551     __m128 b = _mm_load_ps((const float *) mem_addr);
4552     int8x16_t masked =
4553         vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4554                  vreinterpretq_s8_m128(b));
4555     vst1q_s8((int8_t *) mem_addr, masked);
4556 }
4557
4558 // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
4559 // signed 16-bit integers from b.
4560 // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
4561 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4562 {
4563     return vreinterpretq_m128i_s16(
4564         vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4565 }
4566
4567 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
4568 // 16 unsigned 8-bit integers from b.
4569 // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
4570 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4571 {
4572     return vreinterpretq_m128i_u8(
4573         vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4574 }
4575
4576 // Compare packed double-precision (64-bit) floating-point elements in a and b,
4577 // and store packed maximum values in dst.
4578 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
4579 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4580 {
4581 #if defined(__aarch64__)
4582 #if SSE2NEON_PRECISE_MINMAX
4583     float64x2_t _a = vreinterpretq_f64_m128d(a);
4584     float64x2_t _b = vreinterpretq_f64_m128d(b);
4585     return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4586 #else
4587     return vreinterpretq_m128d_f64(
4588         vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4589 #endif
4590 #else
4591     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4592     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4593     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4594     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4595     uint64_t d[2];
4596     d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4597     d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4598
4599     return vreinterpretq_m128d_u64(vld1q_u64(d));
4600 #endif
4601 }
4602
4603 // Compare the lower double-precision (64-bit) floating-point elements in a and
4604 // b, store the maximum value in the lower element of dst, and copy the upper
4605 // element from a to the upper element of dst.
4606 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
4607 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4608 {
4609 #if defined(__aarch64__)
4610     return _mm_move_sd(a, _mm_max_pd(a, b));
4611 #else
4612     double *da = (double *) &a;
4613     double *db = (double *) &b;
4614     double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4615     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4616 #endif
4617 }
4618
4619 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
4620 // signed 16-bit integers from b.
4621 // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
4622 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4623 {
4624     return vreinterpretq_m128i_s16(
4625         vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4626 }
4627
4628 // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
4629 // 16 unsigned 8-bit integers from b.
4630 // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
4631 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4632 {
4633     return vreinterpretq_m128i_u8(
4634         vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4635 }
4636
4637 // Compare packed double-precision (64-bit) floating-point elements in a and b,
4638 // and store packed minimum values in dst.
4639 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
4640 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4641 {
4642 #if defined(__aarch64__)
4643 #if SSE2NEON_PRECISE_MINMAX
4644     float64x2_t _a = vreinterpretq_f64_m128d(a);
4645     float64x2_t _b = vreinterpretq_f64_m128d(b);
4646     return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4647 #else
4648     return vreinterpretq_m128d_f64(
4649         vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4650 #endif
4651 #else
4652     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4653     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4654     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4655     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4656     uint64_t d[2];
4657     d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4658     d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4659     return vreinterpretq_m128d_u64(vld1q_u64(d));
4660 #endif
4661 }
4662
4663 // Compare the lower double-precision (64-bit) floating-point elements in a and
4664 // b, store the minimum value in the lower element of dst, and copy the upper
4665 // element from a to the upper element of dst.
4666 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
4667 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4668 {
4669 #if defined(__aarch64__)
4670     return _mm_move_sd(a, _mm_min_pd(a, b));
4671 #else
4672     double *da = (double *) &a;
4673     double *db = (double *) &b;
4674     double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4675     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4676 #endif
4677 }
4678
4679 // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4680 // upper element.
4681 //
4682 //   dst[63:0] := a[63:0]
4683 //   dst[127:64] := 0
4684 //
4685 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
4686 FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4687 {
4688     return vreinterpretq_m128i_s64(
4689         vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4690 }
4691
4692 // Move the lower double-precision (64-bit) floating-point element from b to the
4693 // lower element of dst, and copy the upper element from a to the upper element
4694 // of dst.
4695 //
4696 //   dst[63:0] := b[63:0]
4697 //   dst[127:64] := a[127:64]
4698 //
4699 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
4700 FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4701 {
4702     return vreinterpretq_m128d_f32(
4703         vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4704                      vget_high_f32(vreinterpretq_f32_m128d(a))));
4705 }
4706
4707 // NEON does not provide a version of this function.
4708 // Creates a 16-bit mask from the most significant bits of the 16 signed or
4709 // unsigned 8-bit integers in a and zero extends the upper bits.
4710 // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
4711 FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4712 {
4713     // Use increasingly wide shifts+adds to collect the sign bits
4714     // together.
4715     // Since the widening shifts would be rather confusing to follow in little
4716     // endian, everything will be illustrated in big endian order instead. This
4717     // has a different result - the bits would actually be reversed on a big
4718     // endian machine.
4719
4720     // Starting input (only half the elements are shown):
4721     // 89 ff 1d c0 00 10 99 33
4722     uint8x16_t input = vreinterpretq_u8_m128i(a);
4723
4724     // Shift out everything but the sign bits with an unsigned shift right.
4725     //
4726     // Bytes of the vector::
4727     // 89 ff 1d c0 00 10 99 33
4728     // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
4729     //  |  |  |  |  |  |  |  |
4730     // 01 01 00 01 00 00 01 00
4731     //
4732     // Bits of first important lane(s):
4733     // 10001001 (89)
4734     // \______
4735     //        |
4736     // 00000001 (01)
4737     uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4738
4739     // Merge the even lanes together with a 16-bit unsigned shift right + add.
4740     // 'xx' represents garbage data which will be ignored in the final result.
4741     // In the important bytes, the add functions like a binary OR.
4742     //
4743     // 01 01 00 01 00 00 01 00
4744     //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
4745     //    \|    \|    \|    \|
4746     // xx 03 xx 01 xx 00 xx 02
4747     //
4748     // 00000001 00000001 (01 01)
4749     //        \_______ |
4750     //                \|
4751     // xxxxxxxx xxxxxx11 (xx 03)
4752     uint32x4_t paired16 =
4753         vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4754
4755     // Repeat with a wider 32-bit shift + add.
4756     // xx 03 xx 01 xx 00 xx 02
4757     //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
4758     //     14))
4759     //          \|          \|
4760     // xx xx xx 0d xx xx xx 02
4761     //
4762     // 00000011 00000001 (03 01)
4763     //        \\_____ ||
4764     //         '----.\||
4765     // xxxxxxxx xxxx1101 (xx 0d)
4766     uint64x2_t paired32 =
4767         vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4768
4769     // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4770     // lanes. xx xx xx 0d xx xx xx 02
4771     //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
4772     //            28))
4773     //                      \|
4774     // xx xx xx xx xx xx xx d2
4775     //
4776     // 00001101 00000010 (0d 02)
4777     //     \   \___ |  |
4778     //      '---.  \|  |
4779     // xxxxxxxx 11010010 (xx d2)
4780     uint8x16_t paired64 =
4781         vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4782
4783     // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4784     // xx xx xx xx xx xx xx d2
4785     //                      ||  return paired64[0]
4786     //                      d2
4787     // Note: Little endian would return the correct value 4b (01001011) instead.
4788     return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4789 }
4790
4791 // Set each bit of mask dst based on the most significant bit of the
4792 // corresponding packed double-precision (64-bit) floating-point element in a.
4793 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
4794 FORCE_INLINE int _mm_movemask_pd(__m128d a)
4795 {
4796     uint64x2_t input = vreinterpretq_u64_m128d(a);
4797     uint64x2_t high_bits = vshrq_n_u64(input, 63);
4798     return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
4799 }
4800
4801 // Copy the lower 64-bit integer in a to dst.
4802 //
4803 //   dst[63:0] := a[63:0]
4804 //
4805 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
4806 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4807 {
4808     return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4809 }
4810
4811 // Copy the 64-bit integer a to the lower element of dst, and zero the upper
4812 // element.
4813 //
4814 //   dst[63:0] := a[63:0]
4815 //   dst[127:64] := 0
4816 //
4817 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
4818 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4819 {
4820     return vreinterpretq_m128i_s64(
4821         vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4822 }
4823
4824 // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4825 // a and b, and store the unsigned 64-bit results in dst.
4826 //
4827 //   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
4828 //   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
4829 FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4830 {
4831     // vmull_u32 upcasts instead of masking, so we downcast.
4832     uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4833     uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4834     return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4835 }
4836
4837 // Multiply packed double-precision (64-bit) floating-point elements in a and b,
4838 // and store the results in dst.
4839 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
4840 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4841 {
4842 #if defined(__aarch64__)
4843     return vreinterpretq_m128d_f64(
4844         vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4845 #else
4846     double *da = (double *) &a;
4847     double *db = (double *) &b;
4848     double c[2];
4849     c[0] = da[0] * db[0];
4850     c[1] = da[1] * db[1];
4851     return vld1q_f32((float32_t *) c);
4852 #endif
4853 }
4854
4855 // Multiply the lower double-precision (64-bit) floating-point element in a and
4856 // b, store the result in the lower element of dst, and copy the upper element
4857 // from a to the upper element of dst.
4858 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
4859 FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4860 {
4861     return _mm_move_sd(a, _mm_mul_pd(a, b));
4862 }
4863
4864 // Multiply the low unsigned 32-bit integers from a and b, and store the
4865 // unsigned 64-bit result in dst.
4866 //
4867 //   dst[63:0] := a[31:0] * b[31:0]
4868 //
4869 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
4870 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4871 {
4872     return vreinterpret_m64_u64(vget_low_u64(
4873         vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4874 }
4875
4876 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4877 // integers from b.
4878 //
4879 //   r0 := (a0 * b0)[31:16]
4880 //   r1 := (a1 * b1)[31:16]
4881 //   ...
4882 //   r7 := (a7 * b7)[31:16]
4883 //
4884 // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
4885 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4886 {
4887     /* FIXME: issue with large values because of result saturation */
4888     // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4889     // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4890     // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4891     int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4892     int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4893     int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4894     int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4895     int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4896     int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4897     uint16x8x2_t r =
4898         vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4899     return vreinterpretq_m128i_u16(r.val[1]);
4900 }
4901
4902 // Multiply the packed unsigned 16-bit integers in a and b, producing
4903 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
4904 // integers in dst.
4905 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
4906 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4907 {
4908     uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4909     uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4910     uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4911 #if defined(__aarch64__)
4912     uint32x4_t ab7654 =
4913         vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4914     uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4915                               vreinterpretq_u16_u32(ab7654));
4916     return vreinterpretq_m128i_u16(r);
4917 #else
4918     uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4919     uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4920     uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4921     uint16x8x2_t r =
4922         vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4923     return vreinterpretq_m128i_u16(r.val[1]);
4924 #endif
4925 }
4926
4927 // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
4928 // unsigned 16-bit integers from b.
4929 //
4930 //   r0 := (a0 * b0)[15:0]
4931 //   r1 := (a1 * b1)[15:0]
4932 //   ...
4933 //   r7 := (a7 * b7)[15:0]
4934 //
4935 // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
4936 FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4937 {
4938     return vreinterpretq_m128i_s16(
4939         vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4940 }
4941
4942 // Compute the bitwise OR of packed double-precision (64-bit) floating-point
4943 // elements in a and b, and store the results in dst.
4944 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
4945 FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4946 {
4947     return vreinterpretq_m128d_s64(
4948         vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4949 }
4950
4951 // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
4952 //
4953 //   r := a | b
4954 //
4955 // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
4956 FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4957 {
4958     return vreinterpretq_m128i_s32(
4959         vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4960 }
4961
4962 // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
4963 // saturates.
4964 // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
4965 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4966 {
4967     return vreinterpretq_m128i_s8(
4968         vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4969                     vqmovn_s16(vreinterpretq_s16_m128i(b))));
4970 }
4971
4972 // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
4973 // and saturates.
4974 //
4975 //   r0 := SignedSaturate(a0)
4976 //   r1 := SignedSaturate(a1)
4977 //   r2 := SignedSaturate(a2)
4978 //   r3 := SignedSaturate(a3)
4979 //   r4 := SignedSaturate(b0)
4980 //   r5 := SignedSaturate(b1)
4981 //   r6 := SignedSaturate(b2)
4982 //   r7 := SignedSaturate(b3)
4983 //
4984 // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
4985 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4986 {
4987     return vreinterpretq_m128i_s16(
4988         vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4989                      vqmovn_s32(vreinterpretq_s32_m128i(b))));
4990 }
4991
4992 // Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
4993 // integers and saturates.
4994 //
4995 //   r0 := UnsignedSaturate(a0)
4996 //   r1 := UnsignedSaturate(a1)
4997 //   ...
4998 //   r7 := UnsignedSaturate(a7)
4999 //   r8 := UnsignedSaturate(b0)
5000 //   r9 := UnsignedSaturate(b1)
5001 //   ...
5002 //   r15 := UnsignedSaturate(b7)
5003 //
5004 // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
5005 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
5006 {
5007     return vreinterpretq_m128i_u8(
5008         vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
5009                     vqmovun_s16(vreinterpretq_s16_m128i(b))));
5010 }
5011
5012 // Pause the processor. This is typically used in spin-wait loops and depending
5013 // on the x86 processor typical values are in the 40-100 cycle range. The
5014 // 'yield' instruction isn't a good fit beacuse it's effectively a nop on most
5015 // Arm cores. Experience with several databases has shown has shown an 'isb' is
5016 // a reasonable approximation.
5017 FORCE_INLINE void _mm_pause()
5018 {
5019     __asm__ __volatile__("isb\n");
5020 }
5021
5022 // Compute the absolute differences of packed unsigned 8-bit integers in a and
5023 // b, then horizontally sum each consecutive 8 differences to produce two
5024 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
5025 // 16 bits of 64-bit elements in dst.
5026 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
5027 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
5028 {
5029     uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
5030     return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
5031 }
5032
5033 // Sets the 8 signed 16-bit integer values.
5034 // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
5035 FORCE_INLINE __m128i _mm_set_epi16(short i7,
5036                                    short i6,
5037                                    short i5,
5038                                    short i4,
5039                                    short i3,
5040                                    short i2,
5041                                    short i1,
5042                                    short i0)
5043 {
5044     int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
5045     return vreinterpretq_m128i_s16(vld1q_s16(data));
5046 }
5047
5048 // Sets the 4 signed 32-bit integer values.
5049 // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
5050 FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
5051 {
5052     int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
5053     return vreinterpretq_m128i_s32(vld1q_s32(data));
5054 }
5055
5056 // Returns the __m128i structure with its two 64-bit integer values
5057 // initialized to the values of the two 64-bit integers passed in.
5058 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
5059 FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
5060 {
5061     return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
5062 }
5063
5064 // Returns the __m128i structure with its two 64-bit integer values
5065 // initialized to the values of the two 64-bit integers passed in.
5066 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
5067 FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
5068 {
5069     return vreinterpretq_m128i_s64(
5070         vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
5071 }
5072
5073 // Sets the 16 signed 8-bit integer values.
5074 // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
5075 FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
5076                                   signed char b14,
5077                                   signed char b13,
5078                                   signed char b12,
5079                                   signed char b11,
5080                                   signed char b10,
5081                                   signed char b9,
5082                                   signed char b8,
5083                                   signed char b7,
5084                                   signed char b6,
5085                                   signed char b5,
5086                                   signed char b4,
5087                                   signed char b3,
5088                                   signed char b2,
5089                                   signed char b1,
5090                                   signed char b0)
5091 {
5092     int8_t ALIGN_STRUCT(16)
5093         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
5094                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
5095                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
5096                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5097     return (__m128i) vld1q_s8(data);
5098 }
5099
5100 // Set packed double-precision (64-bit) floating-point elements in dst with the
5101 // supplied values.
5102 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
5103 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
5104 {
5105     double ALIGN_STRUCT(16) data[2] = {e0, e1};
5106 #if defined(__aarch64__)
5107     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
5108 #else
5109     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
5110 #endif
5111 }
5112
// Alias of _mm_set1_pd: replicate the double-precision (64-bit) value a into
// every element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
#define _mm_set_pd1 _mm_set1_pd
5117
5118 // Copy double-precision (64-bit) floating-point element a to the lower element
5119 // of dst, and zero the upper element.
5120 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
5121 FORCE_INLINE __m128d _mm_set_sd(double a)
5122 {
5123     return _mm_set_pd(0, a);
5124 }
5125
5126 // Sets the 8 signed 16-bit integer values to w.
5127 //
5128 //   r0 := w
5129 //   r1 := w
5130 //   ...
5131 //   r7 := w
5132 //
5133 // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
5134 FORCE_INLINE __m128i _mm_set1_epi16(short w)
5135 {
5136     return vreinterpretq_m128i_s16(vdupq_n_s16(w));
5137 }
5138
5139 // Sets the 4 signed 32-bit integer values to i.
5140 //
5141 //   r0 := i
5142 //   r1 := i
5143 //   r2 := i
5144 //   r3 := I
5145 //
5146 // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
5147 FORCE_INLINE __m128i _mm_set1_epi32(int _i)
5148 {
5149     return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
5150 }
5151
5152 // Sets the 2 signed 64-bit integer values to i.
5153 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
5154 FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
5155 {
5156     return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
5157 }
5158
5159 // Sets the 2 signed 64-bit integer values to i.
5160 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
5161 FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
5162 {
5163     return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
5164 }
5165
5166 // Sets the 16 signed 8-bit integer values to b.
5167 //
5168 //   r0 := b
5169 //   r1 := b
5170 //   ...
5171 //   r15 := b
5172 //
5173 // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
5174 FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
5175 {
5176     return vreinterpretq_m128i_s8(vdupq_n_s8(w));
5177 }
5178
5179 // Broadcast double-precision (64-bit) floating-point value a to all elements of
5180 // dst.
5181 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
5182 FORCE_INLINE __m128d _mm_set1_pd(double d)
5183 {
5184 #if defined(__aarch64__)
5185     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
5186 #else
5187     return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
5188 #endif
5189 }
5190
5191 // Sets the 8 signed 16-bit integer values in reverse order.
5192 //
5193 // Return Value
5194 //   r0 := w0
5195 //   r1 := w1
5196 //   ...
5197 //   r7 := w7
5198 FORCE_INLINE __m128i _mm_setr_epi16(short w0,
5199                                     short w1,
5200                                     short w2,
5201                                     short w3,
5202                                     short w4,
5203                                     short w5,
5204                                     short w6,
5205                                     short w7)
5206 {
5207     int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5208     return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
5209 }
5210
5211 // Sets the 4 signed 32-bit integer values in reverse order
5212 // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
5213 FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
5214 {
5215     int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5216     return vreinterpretq_m128i_s32(vld1q_s32(data));
5217 }
5218
5219 // Set packed 64-bit integers in dst with the supplied values in reverse order.
5220 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
5221 FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5222 {
5223     return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5224 }
5225
5226 // Sets the 16 signed 8-bit integer values in reverse order.
5227 // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
5228 FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
5229                                    signed char b1,
5230                                    signed char b2,
5231                                    signed char b3,
5232                                    signed char b4,
5233                                    signed char b5,
5234                                    signed char b6,
5235                                    signed char b7,
5236                                    signed char b8,
5237                                    signed char b9,
5238                                    signed char b10,
5239                                    signed char b11,
5240                                    signed char b12,
5241                                    signed char b13,
5242                                    signed char b14,
5243                                    signed char b15)
5244 {
5245     int8_t ALIGN_STRUCT(16)
5246         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
5247                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
5248                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
5249                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5250     return (__m128i) vld1q_s8(data);
5251 }
5252
5253 // Set packed double-precision (64-bit) floating-point elements in dst with the
5254 // supplied values in reverse order.
5255 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
5256 FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5257 {
5258     return _mm_set_pd(e0, e1);
5259 }
5260
5261 // Return vector of type __m128d with all elements set to zero.
5262 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
5263 FORCE_INLINE __m128d _mm_setzero_pd(void)
5264 {
5265 #if defined(__aarch64__)
5266     return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5267 #else
5268     return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5269 #endif
5270 }
5271
5272 // Sets the 128-bit value to zero
5273 // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
5274 FORCE_INLINE __m128i _mm_setzero_si128(void)
5275 {
5276     return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5277 }
5278
// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
// Each 2-bit field of imm names the source lane for one destination lane:
// bits [1:0] feed lane 0, [3:2] lane 1, [5:4] lane 2 and [7:6] lane 3.
// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
//                                        __constrange(0,255) int imm)
//
// When __builtin_shufflevector is available, imm is decoded straight into the
// four lane indices handed to the builtin.
#if __has_builtin(__builtin_shufflevector)
#define _mm_shuffle_epi32(a, imm)                              \
    __extension__({                                            \
        int32x4_t _input = vreinterpretq_s32_m128i(a);         \
        int32x4_t _shuf = __builtin_shufflevector(             \
            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
        vreinterpretq_m128i_s32(_shuf);                        \
    })
// Otherwise a fixed set of imm values is dispatched to dedicated helpers
// (_mm_shuffle_epi_XXXX and _mm_shuffle_epi32_splat, defined elsewhere in this
// file); every other value goes through _mm_shuffle_epi32_default.
#else  // generic
#define _mm_shuffle_epi32(a, imm)                        \
    __extension__({                                      \
        __m128i ret;                                     \
        switch (imm) {                                   \
        case _MM_SHUFFLE(1, 0, 3, 2):                    \
            ret = _mm_shuffle_epi_1032((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 3, 0, 1):                    \
            ret = _mm_shuffle_epi_2301((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 3, 2, 1):                    \
            ret = _mm_shuffle_epi_0321((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 1, 0, 3):                    \
            ret = _mm_shuffle_epi_2103((a));             \
            break;                                       \
        case _MM_SHUFFLE(1, 0, 1, 0):                    \
            ret = _mm_shuffle_epi_1010((a));             \
            break;                                       \
        case _MM_SHUFFLE(1, 0, 0, 1):                    \
            ret = _mm_shuffle_epi_1001((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 1, 0, 1):                    \
            ret = _mm_shuffle_epi_0101((a));             \
            break;                                       \
        case _MM_SHUFFLE(2, 2, 1, 1):                    \
            ret = _mm_shuffle_epi_2211((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 1, 2, 2):                    \
            ret = _mm_shuffle_epi_0122((a));             \
            break;                                       \
        case _MM_SHUFFLE(3, 3, 3, 2):                    \
            ret = _mm_shuffle_epi_3332((a));             \
            break;                                       \
        case _MM_SHUFFLE(0, 0, 0, 0):                    \
            ret = _mm_shuffle_epi32_splat((a), 0);       \
            break;                                       \
        case _MM_SHUFFLE(1, 1, 1, 1):                    \
            ret = _mm_shuffle_epi32_splat((a), 1);       \
            break;                                       \
        case _MM_SHUFFLE(2, 2, 2, 2):                    \
            ret = _mm_shuffle_epi32_splat((a), 2);       \
            break;                                       \
        case _MM_SHUFFLE(3, 3, 3, 3):                    \
            ret = _mm_shuffle_epi32_splat((a), 3);       \
            break;                                       \
        default:                                         \
            ret = _mm_shuffle_epi32_default((a), (imm)); \
            break;                                       \
        }                                                \
        ret;                                             \
    })
#endif
5346
// Shuffle double-precision (64-bit) floating-point elements using the control
// in imm8, and store the results in dst.
//
//   dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
//   dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
//
// imm8 is parenthesized at every use so that an expression argument (for
// example `x | y`) is masked as a whole rather than re-associated by operator
// precedence.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
#if __has_builtin(__builtin_shufflevector)
#define _mm_shuffle_pd(a, b, imm8)                              \
    vreinterpretq_m128d_s64(__builtin_shufflevector(            \
        vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
        (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2))
#else
#define _mm_shuffle_pd(a, b, imm8)                                       \
    _mm_castsi128_pd(_mm_set_epi64x(                                     \
        vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \
        vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))
#endif
5365
// Shuffle the four upper 16-bit lanes (4..7) of a as selected by imm, two bits
// per destination lane. In the __builtin_shufflevector path the four lower
// lanes (0..3) are passed through unchanged.
// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
#if __has_builtin(__builtin_shufflevector)
#define _mm_shufflehi_epi16(a, imm)                             \
    __extension__({                                             \
        int16x8_t _input = vreinterpretq_s16_m128i(a);          \
        int16x8_t _shuf = __builtin_shufflevector(              \
            _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
            (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
            (((imm) >> 6) & 0x3) + 4);                          \
        vreinterpretq_m128i_s16(_shuf);                         \
    })
// Without the builtin, defer to _mm_shufflehi_epi16_function (defined
// elsewhere in this file).
#else  // generic
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
#endif
5381
// Shuffle the four lower 16-bit lanes (0..3) of a as selected by imm, two bits
// per destination lane. In the __builtin_shufflevector path the four upper
// lanes (4..7) are passed through unchanged.
// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
#if __has_builtin(__builtin_shufflevector)
#define _mm_shufflelo_epi16(a, imm)                                  \
    __extension__({                                                  \
        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
        int16x8_t _shuf = __builtin_shufflevector(                   \
            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
        vreinterpretq_m128i_s16(_shuf);                              \
    })
// Without the builtin, defer to _mm_shufflelo_epi16_function (defined
// elsewhere in this file).
#else  // generic
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
#endif
5396
5397 // Shift packed 16-bit integers in a left by count while shifting in zeros, and
5398 // store the results in dst.
5399 //
5400 //   FOR j := 0 to 7
5401 //     i := j*16
5402 //     IF count[63:0] > 15
5403 //       dst[i+15:i] := 0
5404 //     ELSE
5405 //       dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])
5406 //     FI
5407 //   ENDFOR
5408 //
5409 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16
5410 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5411 {
5412     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5413     if (_sse2neon_unlikely(c & ~15))
5414         return _mm_setzero_si128();
5415
5416     int16x8_t vc = vdupq_n_s16((int16_t) c);
5417     return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5418 }
5419
5420 // Shift packed 32-bit integers in a left by count while shifting in zeros, and
5421 // store the results in dst.
5422 //
5423 //   FOR j := 0 to 3
5424 //     i := j*32
5425 //     IF count[63:0] > 31
5426 //       dst[i+31:i] := 0
5427 //     ELSE
5428 //       dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])
5429 //     FI
5430 //   ENDFOR
5431 //
5432 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32
5433 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5434 {
5435     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5436     if (_sse2neon_unlikely(c & ~31))
5437         return _mm_setzero_si128();
5438
5439     int32x4_t vc = vdupq_n_s32((int32_t) c);
5440     return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5441 }
5442
5443 // Shift packed 64-bit integers in a left by count while shifting in zeros, and
5444 // store the results in dst.
5445 //
5446 //   FOR j := 0 to 1
5447 //     i := j*64
5448 //     IF count[63:0] > 63
5449 //       dst[i+63:i] := 0
5450 //     ELSE
5451 //       dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])
5452 //     FI
5453 //   ENDFOR
5454 //
5455 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64
5456 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5457 {
5458     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5459     if (_sse2neon_unlikely(c & ~63))
5460         return _mm_setzero_si128();
5461
5462     int64x2_t vc = vdupq_n_s64((int64_t) c);
5463     return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5464 }
5465
5466 // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
5467 // store the results in dst.
5468 //
5469 //   FOR j := 0 to 7
5470 //     i := j*16
5471 //     IF imm8[7:0] > 15
5472 //       dst[i+15:i] := 0
5473 //     ELSE
5474 //       dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])
5475 //     FI
5476 //   ENDFOR
5477 //
5478 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16
5479 FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
5480 {
5481     if (_sse2neon_unlikely(imm & ~15))
5482         return _mm_setzero_si128();
5483     return vreinterpretq_m128i_s16(
5484         vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
5485 }
5486
5487 // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
5488 // store the results in dst.
5489 //
5490 //   FOR j := 0 to 3
5491 //     i := j*32
5492 //     IF imm8[7:0] > 31
5493 //       dst[i+31:i] := 0
5494 //     ELSE
5495 //       dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])
5496 //     FI
5497 //   ENDFOR
5498 //
5499 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32
5500 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5501 {
5502     if (_sse2neon_unlikely(imm & ~31))
5503         return _mm_setzero_si128();
5504     return vreinterpretq_m128i_s32(
5505         vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5506 }
5507
5508 // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5509 // store the results in dst.
5510 //
5511 //   FOR j := 0 to 1
5512 //     i := j*64
5513 //     IF imm8[7:0] > 63
5514 //       dst[i+63:i] := 0
5515 //     ELSE
5516 //       dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])
5517 //     FI
5518 //   ENDFOR
5519 //
5520 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64
5521 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5522 {
5523     if (_sse2neon_unlikely(imm & ~63))
5524         return _mm_setzero_si128();
5525     return vreinterpretq_m128i_s64(
5526         vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5527 }
5528
5529 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
5530 // dst.
5531 //
5532 //   tmp := imm8[7:0]
5533 //   IF tmp > 15
5534 //     tmp := 16
5535 //   FI
5536 //   dst[127:0] := a[127:0] << (tmp*8)
5537 //
5538 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128
5539 FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm)
5540 {
5541     if (_sse2neon_unlikely(imm & ~15))
5542         return _mm_setzero_si128();
5543     uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)};
5544     return vreinterpretq_m128i_u8(
5545         vld1q_u8(((uint8_t const *) tmp) + (16 - imm)));
5546 }
5547
5548 // Compute the square root of packed double-precision (64-bit) floating-point
5549 // elements in a, and store the results in dst.
5550 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
5551 FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5552 {
5553 #if defined(__aarch64__)
5554     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5555 #else
5556     double a0 = sqrt(((double *) &a)[0]);
5557     double a1 = sqrt(((double *) &a)[1]);
5558     return _mm_set_pd(a1, a0);
5559 #endif
5560 }
5561
5562 // Compute the square root of the lower double-precision (64-bit) floating-point
5563 // element in b, store the result in the lower element of dst, and copy the
5564 // upper element from a to the upper element of dst.
5565 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
5566 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5567 {
5568 #if defined(__aarch64__)
5569     return _mm_move_sd(a, _mm_sqrt_pd(b));
5570 #else
5571     return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5572 #endif
5573 }
5574
5575 // Shift packed 16-bit integers in a right by count while shifting in sign bits,
5576 // and store the results in dst.
5577 //
5578 //   FOR j := 0 to 7
5579 //     i := j*16
5580 //     IF count[63:0] > 15
5581 //       dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
5582 //     ELSE
5583 //       dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])
5584 //     FI
5585 //  ENDFOR
5586 //
5587 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16
5588 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5589 {
5590     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5591     if (_sse2neon_unlikely(c & ~15))
5592         return _mm_cmplt_epi16(a, _mm_setzero_si128());
5593     return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5594 }
5595
5596 // Shift packed 32-bit integers in a right by count while shifting in sign bits,
5597 // and store the results in dst.
5598 //
5599 //   FOR j := 0 to 3
5600 //     i := j*32
5601 //     IF count[63:0] > 31
5602 //       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5603 //     ELSE
5604 //       dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])
5605 //     FI
5606 //  ENDFOR
5607 //
5608 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32
5609 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5610 {
5611     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5612     if (_sse2neon_unlikely(c & ~31))
5613         return _mm_cmplt_epi32(a, _mm_setzero_si128());
5614     return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5615 }
5616
5617 // Shift packed 16-bit integers in a right by imm8 while shifting in sign
5618 // bits, and store the results in dst.
5619 //
5620 //   FOR j := 0 to 7
5621 //     i := j*16
5622 //     IF imm8[7:0] > 15
5623 //       dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
5624 //     ELSE
5625 //       dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])
5626 //     FI
5627 //   ENDFOR
5628 //
5629 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
5630 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5631 {
5632     const int count = (imm & ~15) ? 15 : imm;
5633     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5634 }
5635
// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
// and store the results in dst.
//
//   FOR j := 0 to 3
//     i := j*32
//     IF imm8[7:0] > 31
//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
//     ELSE
//       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// Three cases are handled: imm == 0 returns a as is, 0 < imm < 32 shifts via
// vshlq_s32 with the negated count, and anything else shifts right by 31.
// Both macro arguments are parenthesized at every use so that expression
// arguments (e.g. an imm of `n + 1`) are negated or assigned as a whole.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm)                                               \
    __extension__({                                                          \
        __m128i ret;                                                         \
        if (_sse2neon_unlikely((imm) == 0)) {                                \
            ret = (a);                                                       \
        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {              \
            ret = vreinterpretq_m128i_s32(                                   \
                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
        } else {                                                             \
            ret = vreinterpretq_m128i_s32(                                   \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
        }                                                                    \
        ret;                                                                 \
    })
5664
5665 // Shift packed 16-bit integers in a right by count while shifting in zeros, and
5666 // store the results in dst.
5667 //
5668 //   FOR j := 0 to 7
5669 //     i := j*16
5670 //     IF count[63:0] > 15
5671 //       dst[i+15:i] := 0
5672 //     ELSE
5673 //       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])
5674 //     FI
5675 //   ENDFOR
5676 //
5677 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16
5678 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5679 {
5680     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5681     if (_sse2neon_unlikely(c & ~15))
5682         return _mm_setzero_si128();
5683
5684     int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5685     return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5686 }
5687
5688 // Shift packed 32-bit integers in a right by count while shifting in zeros, and
5689 // store the results in dst.
5690 //
5691 //   FOR j := 0 to 3
5692 //     i := j*32
5693 //     IF count[63:0] > 31
5694 //       dst[i+31:i] := 0
5695 //     ELSE
5696 //       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])
5697 //     FI
5698 //   ENDFOR
5699 //
5700 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32
5701 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5702 {
5703     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5704     if (_sse2neon_unlikely(c & ~31))
5705         return _mm_setzero_si128();
5706
5707     int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5708     return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5709 }
5710
5711 // Shift packed 64-bit integers in a right by count while shifting in zeros, and
5712 // store the results in dst.
5713 //
5714 //   FOR j := 0 to 1
5715 //     i := j*64
5716 //     IF count[63:0] > 63
5717 //       dst[i+63:i] := 0
5718 //     ELSE
5719 //       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])
5720 //     FI
5721 //   ENDFOR
5722 //
5723 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64
5724 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5725 {
5726     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5727     if (_sse2neon_unlikely(c & ~63))
5728         return _mm_setzero_si128();
5729
5730     int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5731     return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5732 }
5733
// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 7
//     i := j*16
//     IF imm8[7:0] > 15
//       dst[i+15:i] := 0
//     ELSE
//       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// Implemented as a statement-expression macro: an imm with any bit set outside
// the low four yields an all-zero vector; otherwise the right shift is
// expressed as vshlq_u16 with the negated count. Note that imm is expanded
// more than once.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
#define _mm_srli_epi16(a, imm)                                               \
    __extension__({                                                          \
        __m128i ret;                                                         \
        if (_sse2neon_unlikely((imm) & ~15)) {                               \
            ret = _mm_setzero_si128();                                       \
        } else {                                                             \
            ret = vreinterpretq_m128i_u16(                                   \
                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
        }                                                                    \
        ret;                                                                 \
    })
5758
// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 3
//     i := j*32
//     IF imm8[7:0] > 31
//       dst[i+31:i] := 0
//     ELSE
//       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// Implemented as a statement-expression macro: an imm with any bit set outside
// the low five yields an all-zero vector; otherwise the right shift is
// expressed as vshlq_u32 with the negated count. Note that imm is expanded
// more than once.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi32(a, imm)                                               \
    __extension__({                                                          \
        __m128i ret;                                                         \
        if (_sse2neon_unlikely((imm) & ~31)) {                               \
            ret = _mm_setzero_si128();                                       \
        } else {                                                             \
            ret = vreinterpretq_m128i_u32(                                   \
                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
        }                                                                    \
        ret;                                                                 \
    })
5784
// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 1
//     i := j*64
//     IF imm8[7:0] > 63
//       dst[i+63:i] := 0
//     ELSE
//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// Implemented as a statement-expression macro: an imm with any bit set outside
// the low six yields an all-zero vector; otherwise the right shift is
// expressed as vshlq_u64 with the negated count. Note that imm is expanded
// more than once.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
#define _mm_srli_epi64(a, imm)                                               \
    __extension__({                                                          \
        __m128i ret;                                                         \
        if (_sse2neon_unlikely((imm) & ~63)) {                               \
            ret = _mm_setzero_si128();                                       \
        } else {                                                             \
            ret = vreinterpretq_m128i_u64(                                   \
                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
        }                                                                    \
        ret;                                                                 \
    })
5809
5810 // Shift a right by imm8 bytes while shifting in zeros, and store the results in
5811 // dst.
5812 //
5813 //   tmp := imm8[7:0]
5814 //   IF tmp > 15
5815 //     tmp := 16
5816 //   FI
5817 //   dst[127:0] := a[127:0] >> (tmp*8)
5818 //
5819 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128
5820 FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
5821 {
5822     if (_sse2neon_unlikely(imm & ~15))
5823         return _mm_setzero_si128();
5824     uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)};
5825     return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm));
5826 }
5827
5828 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5829 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5830 // or a general-protection exception may be generated.
5831 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
5832 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5833 {
5834 #if defined(__aarch64__)
5835     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5836 #else
5837     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5838 #endif
5839 }
5840
5841 // Store the lower double-precision (64-bit) floating-point element from a into
5842 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5843 // boundary or a general-protection exception may be generated.
5844 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
5845 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5846 {
5847 #if defined(__aarch64__)
5848     float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5849     vst1q_f64((float64_t *) mem_addr,
5850               vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5851 #else
5852     float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5853     vst1q_f32((float32_t *) mem_addr,
5854               vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5855 #endif
5856 }
5857
5858 // Store the lower double-precision (64-bit) floating-point element from a into
5859 // memory. mem_addr does not need to be aligned on any particular boundary.
5860 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
5861 FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5862 {
5863 #if defined(__aarch64__)
5864     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5865 #else
5866     vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5867 #endif
5868 }
5869
// Stores four 32-bit integer values (as a __m128i value) at the address p.
5871 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
5872 FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5873 {
5874     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5875 }
5876
// Store the lower double-precision (64-bit) floating-point element from a into
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
//
// Implemented as a plain alias for _mm_store_pd1.
#define _mm_store1_pd _mm_store_pd1
5882
5883 // Store the upper double-precision (64-bit) floating-point element from a into
5884 // memory.
5885 //
5886 //   MEM[mem_addr+63:mem_addr] := a[127:64]
5887 //
5888 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
5889 FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5890 {
5891 #if defined(__aarch64__)
5892     vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5893 #else
5894     vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5895 #endif
5896 }
5897
5898 // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
5899 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
5900 FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5901 {
5902     uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
5903     uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
5904     *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
5905 }
5906
5907 // Store the lower double-precision (64-bit) floating-point element from a into
5908 // memory.
5909 //
5910 //   MEM[mem_addr+63:mem_addr] := a[63:0]
5911 //
5912 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
5913 FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5914 {
5915 #if defined(__aarch64__)
5916     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5917 #else
5918     vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5919 #endif
5920 }
5921
5922 // Store 2 double-precision (64-bit) floating-point elements from a into memory
5923 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5924 // general-protection exception may be generated.
5925 //
5926 //   MEM[mem_addr+63:mem_addr] := a[127:64]
5927 //   MEM[mem_addr+127:mem_addr+64] := a[63:0]
5928 //
5929 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
5930 FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5931 {
5932     float32x4_t f = vreinterpretq_f32_m128d(a);
5933     _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5934 }
5935
5936 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5937 // elements) from a into memory. mem_addr does not need to be aligned on any
5938 // particular boundary.
5939 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
5940 FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
5941 {
5942     _mm_store_pd(mem_addr, a);
5943 }
5944
5945 // Stores 128-bits of integer data a at the address p.
5946 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
5947 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5948 {
5949     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5950 }
5951
5952 // Stores 32-bits of integer data a at the address p.
5953 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
5954 FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5955 {
5956     vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5957 }
5958
5959 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5960 // elements) from a into memory using a non-temporal memory hint. mem_addr must
5961 // be aligned on a 16-byte boundary or a general-protection exception may be
5962 // generated.
5963 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
5964 FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5965 {
5966 #if __has_builtin(__builtin_nontemporal_store)
5967     __builtin_nontemporal_store(a, (float32x4_t *) p);
5968 #elif defined(__aarch64__)
5969     vst1q_f64(p, vreinterpretq_f64_m128d(a));
5970 #else
5971     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5972 #endif
5973 }
5974
5975 // Stores the data in a to the address p without polluting the caches.  If the
5976 // cache line containing address p is already in the cache, the cache will be
5977 // updated.
5978 // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
5979 FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5980 {
5981 #if __has_builtin(__builtin_nontemporal_store)
5982     __builtin_nontemporal_store(a, p);
5983 #else
5984     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5985 #endif
5986 }
5987
5988 // Store 32-bit integer a into memory using a non-temporal hint to minimize
5989 // cache pollution. If the cache line containing address mem_addr is already in
5990 // the cache, the cache will be updated.
5991 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
5992 FORCE_INLINE void _mm_stream_si32(int *p, int a)
5993 {
5994     vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5995 }
5996
5997 // Store 64-bit integer a into memory using a non-temporal hint to minimize
5998 // cache pollution. If the cache line containing address mem_addr is already in
5999 // the cache, the cache will be updated.
6000 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64
6001 FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
6002 {
6003     vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
6004 }
6005
6006 // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
6007 // store the results in dst.
6008 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
6009 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
6010 {
6011     return vreinterpretq_m128i_s16(
6012         vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6013 }
6014
6015 // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
6016 // unsigned 32-bit integers of a.
6017 //
6018 //   r0 := a0 - b0
6019 //   r1 := a1 - b1
6020 //   r2 := a2 - b2
6021 //   r3 := a3 - b3
6022 //
6023 // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
6024 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
6025 {
6026     return vreinterpretq_m128i_s32(
6027         vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6028 }
6029
6030 // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
6031 // and store the results in dst.
6032 //    r0 := a0 - b0
6033 //    r1 := a1 - b1
6034 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
6035 {
6036     return vreinterpretq_m128i_s64(
6037         vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
6038 }
6039
6040 // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
6041 // store the results in dst.
6042 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
6043 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
6044 {
6045     return vreinterpretq_m128i_s8(
6046         vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6047 }
6048
6049 // Subtract packed double-precision (64-bit) floating-point elements in b from
6050 // packed double-precision (64-bit) floating-point elements in a, and store the
6051 // results in dst.
6052 //
6053 //   FOR j := 0 to 1
6054 //     i := j*64
6055 //     dst[i+63:i] := a[i+63:i] - b[i+63:i]
6056 //   ENDFOR
6057 //
6058 //  https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
6059 FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
6060 {
6061 #if defined(__aarch64__)
6062     return vreinterpretq_m128d_f64(
6063         vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6064 #else
6065     double *da = (double *) &a;
6066     double *db = (double *) &b;
6067     double c[2];
6068     c[0] = da[0] - db[0];
6069     c[1] = da[1] - db[1];
6070     return vld1q_f32((float32_t *) c);
6071 #endif
6072 }
6073
6074 // Subtract the lower double-precision (64-bit) floating-point element in b from
6075 // the lower double-precision (64-bit) floating-point element in a, store the
6076 // result in the lower element of dst, and copy the upper element from a to the
6077 // upper element of dst.
6078 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
6079 FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
6080 {
6081     return _mm_move_sd(a, _mm_sub_pd(a, b));
6082 }
6083
6084 // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
6085 //
6086 //   dst[63:0] := a[63:0] - b[63:0]
6087 //
6088 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
6089 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
6090 {
6091     return vreinterpret_m64_s64(
6092         vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
6093 }
6094
6095 // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
6096 // of a and saturates.
6097 //
6098 //   r0 := SignedSaturate(a0 - b0)
6099 //   r1 := SignedSaturate(a1 - b1)
6100 //   ...
6101 //   r7 := SignedSaturate(a7 - b7)
6102 //
6103 // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
6104 FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
6105 {
6106     return vreinterpretq_m128i_s16(
6107         vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6108 }
6109
6110 // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
6111 // of a and saturates.
6112 //
6113 //   r0 := SignedSaturate(a0 - b0)
6114 //   r1 := SignedSaturate(a1 - b1)
6115 //   ...
6116 //   r15 := SignedSaturate(a15 - b15)
6117 //
6118 // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
6119 FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
6120 {
6121     return vreinterpretq_m128i_s8(
6122         vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6123 }
6124
// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
// integers of a and saturates.
6127 // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
6128 FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
6129 {
6130     return vreinterpretq_m128i_u16(
6131         vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
6132 }
6133
6134 // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
6135 // integers of a and saturates.
6136 //
6137 //   r0 := UnsignedSaturate(a0 - b0)
6138 //   r1 := UnsignedSaturate(a1 - b1)
6139 //   ...
6140 //   r15 := UnsignedSaturate(a15 - b15)
6141 //
6142 // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
6143 FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
6144 {
6145     return vreinterpretq_m128i_u8(
6146         vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
6147 }
6148
// The _mm_ucomi*_sd comparison names are plain aliases for the corresponding
// _mm_comi*_sd implementations; no separate code is provided for them.
#define _mm_ucomieq_sd _mm_comieq_sd
#define _mm_ucomige_sd _mm_comige_sd
#define _mm_ucomigt_sd _mm_comigt_sd
#define _mm_ucomile_sd _mm_comile_sd
#define _mm_ucomilt_sd _mm_comilt_sd
#define _mm_ucomineq_sd _mm_comineq_sd
6155
6156 // Return vector of type __m128d with undefined elements.
6157 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
FORCE_INLINE __m128d _mm_undefined_pd(void)
{
    // The returned value is intentionally never initialised, so the
    // -Wuninitialized diagnostic is silenced around it on GCC-compatible
    // compilers.
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128d a;
    return a;
    // Restore the diagnostic state saved by the push above.
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
6170
6171 // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
6172 // upper 4 signed or unsigned 16-bit integers in b.
6173 //
6174 //   r0 := a4
6175 //   r1 := b4
6176 //   r2 := a5
6177 //   r3 := b5
6178 //   r4 := a6
6179 //   r5 := b6
6180 //   r6 := a7
6181 //   r7 := b7
6182 //
6183 // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
6184 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
6185 {
6186 #if defined(__aarch64__)
6187     return vreinterpretq_m128i_s16(
6188         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6189 #else
6190     int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
6191     int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
6192     int16x4x2_t result = vzip_s16(a1, b1);
6193     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6194 #endif
6195 }
6196
6197 // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
6198 // upper 2 signed or unsigned 32-bit integers in b.
6199 // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
6200 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
6201 {
6202 #if defined(__aarch64__)
6203     return vreinterpretq_m128i_s32(
6204         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6205 #else
6206     int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
6207     int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
6208     int32x2x2_t result = vzip_s32(a1, b1);
6209     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6210 #endif
6211 }
6212
6213 // Interleaves the upper signed or unsigned 64-bit integer in a with the
6214 // upper signed or unsigned 64-bit integer in b.
6215 //
6216 //   r0 := a1
6217 //   r1 := b1
6218 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
6219 {
6220     int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
6221     int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
6222     return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
6223 }
6224
6225 // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
6226 // 8 signed or unsigned 8-bit integers in b.
6227 //
6228 //   r0 := a8
6229 //   r1 := b8
6230 //   r2 := a9
6231 //   r3 := b9
6232 //   ...
6233 //   r14 := a15
6234 //   r15 := b15
6235 //
6236 // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
6237 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
6238 {
6239 #if defined(__aarch64__)
6240     return vreinterpretq_m128i_s8(
6241         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6242 #else
6243     int8x8_t a1 =
6244         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
6245     int8x8_t b1 =
6246         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
6247     int8x8x2_t result = vzip_s8(a1, b1);
6248     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6249 #endif
6250 }
6251
6252 // Unpack and interleave double-precision (64-bit) floating-point elements from
6253 // the high half of a and b, and store the results in dst.
6254 //
6255 //   DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
6256 //     dst[63:0] := src1[127:64]
6257 //     dst[127:64] := src2[127:64]
6258 //     RETURN dst[127:0]
6259 //   }
6260 //   dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
6261 //
6262 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
6263 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
6264 {
6265 #if defined(__aarch64__)
6266     return vreinterpretq_m128d_f64(
6267         vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6268 #else
6269     return vreinterpretq_m128d_s64(
6270         vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
6271                      vget_high_s64(vreinterpretq_s64_m128d(b))));
6272 #endif
6273 }
6274
6275 // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
6276 // lower 4 signed or unsigned 16-bit integers in b.
6277 //
6278 //   r0 := a0
6279 //   r1 := b0
6280 //   r2 := a1
6281 //   r3 := b1
6282 //   r4 := a2
6283 //   r5 := b2
6284 //   r6 := a3
6285 //   r7 := b3
6286 //
6287 // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
6288 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6289 {
6290 #if defined(__aarch64__)
6291     return vreinterpretq_m128i_s16(
6292         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6293 #else
6294     int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6295     int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6296     int16x4x2_t result = vzip_s16(a1, b1);
6297     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6298 #endif
6299 }
6300
6301 // Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
6302 // lower 2 signed or unsigned 32 - bit integers in b.
6303 //
6304 //   r0 := a0
6305 //   r1 := b0
6306 //   r2 := a1
6307 //   r3 := b1
6308 //
6309 // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
6310 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
6311 {
6312 #if defined(__aarch64__)
6313     return vreinterpretq_m128i_s32(
6314         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6315 #else
6316     int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6317     int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6318     int32x2x2_t result = vzip_s32(a1, b1);
6319     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6320 #endif
6321 }
6322
6323 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
6324 {
6325     int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
6326     int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
6327     return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
6328 }
6329
6330 // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
6331 // 8 signed or unsigned 8-bit integers in b.
6332 //
6333 //   r0 := a0
6334 //   r1 := b0
6335 //   r2 := a1
6336 //   r3 := b1
6337 //   ...
6338 //   r14 := a7
6339 //   r15 := b7
6340 //
6341 // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
6342 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
6343 {
6344 #if defined(__aarch64__)
6345     return vreinterpretq_m128i_s8(
6346         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6347 #else
6348     int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6349     int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6350     int8x8x2_t result = vzip_s8(a1, b1);
6351     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6352 #endif
6353 }
6354
6355 // Unpack and interleave double-precision (64-bit) floating-point elements from
6356 // the low half of a and b, and store the results in dst.
6357 //
6358 //   DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
6359 //     dst[63:0] := src1[63:0]
6360 //     dst[127:64] := src2[63:0]
6361 //     RETURN dst[127:0]
6362 //   }
6363 //   dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
6364 //
6365 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
6366 FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
6367 {
6368 #if defined(__aarch64__)
6369     return vreinterpretq_m128d_f64(
6370         vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6371 #else
6372     return vreinterpretq_m128d_s64(
6373         vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
6374                      vget_low_s64(vreinterpretq_s64_m128d(b))));
6375 #endif
6376 }
6377
6378 // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
6379 // elements in a and b, and store the results in dst.
6380 //
6381 //   FOR j := 0 to 1
6382 //      i := j*64
6383 //      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
6384 //   ENDFOR
6385 //
6386 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
6387 FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
6388 {
6389     return vreinterpretq_m128d_s64(
6390         veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
6391 }
6392
6393 // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
6394 // b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
6395 FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
6396 {
6397     return vreinterpretq_m128i_s32(
6398         veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6399 }
6400
6401 /* SSE3 */
6402
6403 // Alternatively add and subtract packed double-precision (64-bit)
6404 // floating-point elements in a to/from packed elements in b, and store the
6405 // results in dst.
6406 //
6407 // FOR j := 0 to 1
6408 //   i := j*64
6409 //   IF ((j & 1) == 0)
6410 //     dst[i+63:i] := a[i+63:i] - b[i+63:i]
6411 //   ELSE
6412 //     dst[i+63:i] := a[i+63:i] + b[i+63:i]
6413 //   FI
6414 // ENDFOR
6415 //
6416 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
6417 FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
6418 {
6419     static const __m128d mask = _mm_set_pd(1.0f, -1.0f);
6420 #if defined(__aarch64__)
6421     return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6422                                              vreinterpretq_f64_m128d(b),
6423                                              vreinterpretq_f64_m128d(mask)));
6424 #else
6425     return _mm_add_pd(_mm_mul_pd(b, mask), a);
6426 #endif
6427 }
6428
6429 // Alternatively add and subtract packed single-precision (32-bit)
6430 // floating-point elements in a to/from packed elements in b, and store the
6431 // results in dst.
6432 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
6433 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
6434 {
6435     static const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
6436 #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
6437     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
6438                                             vreinterpretq_f32_m128(mask),
6439                                             vreinterpretq_f32_m128(b)));
6440 #else
6441     return _mm_add_ps(_mm_mul_ps(b, mask), a);
6442 #endif
6443 }
6444
6445 // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
6446 // elements in a and b, and pack the results in dst.
6447 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
6448 FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
6449 {
6450 #if defined(__aarch64__)
6451     return vreinterpretq_m128d_f64(
6452         vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6453 #else
6454     double *da = (double *) &a;
6455     double *db = (double *) &b;
6456     double c[] = {da[0] + da[1], db[0] + db[1]};
6457     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6458 #endif
6459 }
6460
6461 // Computes pairwise add of each argument as single-precision, floating-point
6462 // values a and b.
6463 // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
6464 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
6465 {
6466 #if defined(__aarch64__)
6467     return vreinterpretq_m128_f32(
6468         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6469 #else
6470     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
6471     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
6472     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
6473     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
6474     return vreinterpretq_m128_f32(
6475         vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6476 #endif
6477 }
6478
6479 // Horizontally subtract adjacent pairs of double-precision (64-bit)
6480 // floating-point elements in a and b, and pack the results in dst.
6481 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
6482 FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
6483 {
6484 #if defined(__aarch64__)
6485     float64x2_t a = vreinterpretq_f64_m128d(_a);
6486     float64x2_t b = vreinterpretq_f64_m128d(_b);
6487     return vreinterpretq_m128d_f64(
6488         vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
6489 #else
6490     double *da = (double *) &_a;
6491     double *db = (double *) &_b;
6492     double c[] = {da[0] - da[1], db[0] - db[1]};
6493     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6494 #endif
6495 }
6496
// Horizontally subtract adjacent pairs of single-precision (32-bit)
6498 // floating-point elements in a and b, and pack the results in dst.
6499 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
6500 FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
6501 {
6502     float32x4_t a = vreinterpretq_f32_m128(_a);
6503     float32x4_t b = vreinterpretq_f32_m128(_b);
6504 #if defined(__aarch64__)
6505     return vreinterpretq_m128_f32(
6506         vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
6507 #else
6508     float32x4x2_t c = vuzpq_f32(a, b);
6509     return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
6510 #endif
6511 }
6512
// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
// may perform better than _mm_loadu_si128 when the data crosses a cache line
// boundary.
//
//   dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
//
// Implemented here as a plain alias for _mm_loadu_si128.
#define _mm_lddqu_si128 _mm_loadu_si128
6521
// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
//
// Implemented here as a plain alias for _mm_load1_pd.
#define _mm_loaddup_pd _mm_load1_pd
6530
6531 // Duplicate the low double-precision (64-bit) floating-point element from a,
6532 // and store the results in dst.
6533 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
6534 FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6535 {
6536 #if defined(__aarch64__)
6537     return vreinterpretq_m128d_f64(
6538         vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6539 #else
6540     return vreinterpretq_m128d_u64(
6541         vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6542 #endif
6543 }
6544
6545 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
6546 // from a, and store the results in dst.
6547 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
6548 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6549 {
6550 #if __has_builtin(__builtin_shufflevector)
6551     return vreinterpretq_m128_f32(__builtin_shufflevector(
6552         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6553 #else
6554     float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6555     float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6556     float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6557     return vreinterpretq_m128_f32(vld1q_f32(data));
6558 #endif
6559 }
6560
6561 // Duplicate even-indexed single-precision (32-bit) floating-point elements
6562 // from a, and store the results in dst.
6563 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
6564 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6565 {
6566 #if __has_builtin(__builtin_shufflevector)
6567     return vreinterpretq_m128_f32(__builtin_shufflevector(
6568         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6569 #else
6570     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6571     float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6572     float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6573     return vreinterpretq_m128_f32(vld1q_f32(data));
6574 #endif
6575 }
6576
6577 /* SSSE3 */
6578
6579 // Compute the absolute value of packed signed 16-bit integers in a, and store
6580 // the unsigned results in dst.
6581 //
6582 //   FOR j := 0 to 7
6583 //     i := j*16
6584 //     dst[i+15:i] := ABS(a[i+15:i])
6585 //   ENDFOR
6586 //
6587 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
6588 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6589 {
6590     return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6591 }
6592
6593 // Compute the absolute value of packed signed 32-bit integers in a, and store
6594 // the unsigned results in dst.
6595 //
6596 //   FOR j := 0 to 3
6597 //     i := j*32
6598 //     dst[i+31:i] := ABS(a[i+31:i])
6599 //   ENDFOR
6600 //
6601 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
6602 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6603 {
6604     return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6605 }
6606
6607 // Compute the absolute value of packed signed 8-bit integers in a, and store
6608 // the unsigned results in dst.
6609 //
6610 //   FOR j := 0 to 15
6611 //     i := j*8
6612 //     dst[i+7:i] := ABS(a[i+7:i])
6613 //   ENDFOR
6614 //
6615 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
6616 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6617 {
6618     return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6619 }
6620
6621 // Compute the absolute value of packed signed 16-bit integers in a, and store
6622 // the unsigned results in dst.
6623 //
6624 //   FOR j := 0 to 3
6625 //     i := j*16
6626 //     dst[i+15:i] := ABS(a[i+15:i])
6627 //   ENDFOR
6628 //
6629 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
6630 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6631 {
6632     return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6633 }
6634
6635 // Compute the absolute value of packed signed 32-bit integers in a, and store
6636 // the unsigned results in dst.
6637 //
6638 //   FOR j := 0 to 1
6639 //     i := j*32
6640 //     dst[i+31:i] := ABS(a[i+31:i])
6641 //   ENDFOR
6642 //
6643 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
6644 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6645 {
6646     return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6647 }
6648
6649 // Compute the absolute value of packed signed 8-bit integers in a, and store
6650 // the unsigned results in dst.
6651 //
6652 //   FOR j := 0 to 7
6653 //     i := j*8
6654 //     dst[i+7:i] := ABS(a[i+7:i])
6655 //   ENDFOR
6656 //
6657 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
6658 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6659 {
6660     return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6661 }
6662
6663 // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
6664 // the result right by imm8 bytes, and store the low 16 bytes in dst.
6665 //
6666 //   tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
6667 //   dst[127:0] := tmp[127:0]
6668 //
6669 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
6670 FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm)
6671 {
6672     if (_sse2neon_unlikely(imm & ~31))
6673         return _mm_setzero_si128();
6674     int idx;
6675     uint8x16_t tmp[2];
6676     if (imm >= 16) {
6677         idx = imm - 16;
6678         tmp[0] = vreinterpretq_u8_m128i(a);
6679         tmp[1] = vdupq_n_u8(0);
6680     } else {
6681         idx = imm;
6682         tmp[0] = vreinterpretq_u8_m128i(b);
6683         tmp[1] = vreinterpretq_u8_m128i(a);
6684     }
6685     return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx));
6686 }
6687
// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
// the result right by imm8 bytes, and store the low 8 bytes in dst.
//
//   tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
//   dst[63:0] := tmp[63:0]
//
// imm must be a compile-time constant because it is forwarded to vext_u8.
// The macro's locals carry a _sse2neon_ prefix so that caller expressions
// spelled `ret`, `idx`, `tmp_low`, ... are not captured by the expansion.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
#define _mm_alignr_pi8(a, b, imm)                                         \
    __extension__({                                                       \
        __m64 _sse2neon_ret;                                              \
        if (_sse2neon_unlikely((imm) >= 16)) {                            \
            /* every byte of a:b has been shifted out */                  \
            _sse2neon_ret = vreinterpret_m64_s8(vdup_n_s8(0));            \
        } else {                                                          \
            uint8x8_t _sse2neon_lo, _sse2neon_hi;                         \
            if ((imm) >= 8) {                                             \
                /* b is gone; a is followed by zeros */                   \
                const int _sse2neon_idx = (imm) - 8;                      \
                _sse2neon_lo = vreinterpret_u8_m64(a);                    \
                _sse2neon_hi = vdup_n_u8(0);                              \
                _sse2neon_ret = vreinterpret_m64_u8(                      \
                    vext_u8(_sse2neon_lo, _sse2neon_hi, _sse2neon_idx));  \
            } else {                                                      \
                const int _sse2neon_idx = (imm);                          \
                _sse2neon_lo = vreinterpret_u8_m64(b);                    \
                _sse2neon_hi = vreinterpret_u8_m64(a);                    \
                _sse2neon_ret = vreinterpret_m64_u8(                      \
                    vext_u8(_sse2neon_lo, _sse2neon_hi, _sse2neon_idx));  \
            }                                                             \
        }                                                                 \
        _sse2neon_ret;                                                    \
    })
6716
6717 // Computes pairwise add of each argument as a 16-bit signed or unsigned integer
6718 // values a and b.
6719 FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6720 {
6721     int16x8_t a = vreinterpretq_s16_m128i(_a);
6722     int16x8_t b = vreinterpretq_s16_m128i(_b);
6723 #if defined(__aarch64__)
6724     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6725 #else
6726     return vreinterpretq_m128i_s16(
6727         vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6728                      vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6729 #endif
6730 }
6731
6732 // Computes pairwise add of each argument as a 32-bit signed or unsigned integer
6733 // values a and b.
6734 FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6735 {
6736     int32x4_t a = vreinterpretq_s32_m128i(_a);
6737     int32x4_t b = vreinterpretq_s32_m128i(_b);
6738     return vreinterpretq_m128i_s32(
6739         vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6740                      vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6741 }
6742
6743 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6744 // signed 16-bit results in dst.
6745 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
6746 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6747 {
6748     return vreinterpret_m64_s16(
6749         vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6750 }
6751
6752 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6753 // signed 32-bit results in dst.
6754 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
6755 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6756 {
6757     return vreinterpret_m64_s32(
6758         vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6759 }
6760
6761 // Computes saturated pairwise sub of each argument as a 16-bit signed
6762 // integer values a and b.
6763 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6764 {
6765 #if defined(__aarch64__)
6766     int16x8_t a = vreinterpretq_s16_m128i(_a);
6767     int16x8_t b = vreinterpretq_s16_m128i(_b);
6768     return vreinterpretq_s64_s16(
6769         vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6770 #else
6771     int32x4_t a = vreinterpretq_s32_m128i(_a);
6772     int32x4_t b = vreinterpretq_s32_m128i(_b);
6773     // Interleave using vshrn/vmovn
6774     // [a0|a2|a4|a6|b0|b2|b4|b6]
6775     // [a1|a3|a5|a7|b1|b3|b5|b7]
6776     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6777     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6778     // Saturated add
6779     return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6780 #endif
6781 }
6782
6783 // Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6784 // saturation, and pack the signed 16-bit results in dst.
6785 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
6786 FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6787 {
6788     int16x4_t a = vreinterpret_s16_m64(_a);
6789     int16x4_t b = vreinterpret_s16_m64(_b);
6790 #if defined(__aarch64__)
6791     return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6792 #else
6793     int16x4x2_t res = vuzp_s16(a, b);
6794     return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6795 #endif
6796 }
6797
6798 // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6799 // the signed 16-bit results in dst.
6800 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16
6801 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6802 {
6803     int16x8_t a = vreinterpretq_s16_m128i(_a);
6804     int16x8_t b = vreinterpretq_s16_m128i(_b);
6805 #if defined(__aarch64__)
6806     return vreinterpretq_m128i_s16(
6807         vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6808 #else
6809     int16x8x2_t c = vuzpq_s16(a, b);
6810     return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
6811 #endif
6812 }
6813
6814 // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6815 // the signed 32-bit results in dst.
6816 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32
6817 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6818 {
6819     int32x4_t a = vreinterpretq_s32_m128i(_a);
6820     int32x4_t b = vreinterpretq_s32_m128i(_b);
6821 #if defined(__aarch64__)
6822     return vreinterpretq_m128i_s32(
6823         vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6824 #else
6825     int32x4x2_t c = vuzpq_s32(a, b);
6826     return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
6827 #endif
6828 }
6829
6830 // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6831 // the signed 16-bit results in dst.
6832 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
6833 FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6834 {
6835     int16x4_t a = vreinterpret_s16_m64(_a);
6836     int16x4_t b = vreinterpret_s16_m64(_b);
6837 #if defined(__aarch64__)
6838     return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6839 #else
6840     int16x4x2_t c = vuzp_s16(a, b);
6841     return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
6842 #endif
6843 }
6844
6845 // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6846 // the signed 32-bit results in dst.
6847 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
6848 FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6849 {
6850     int32x2_t a = vreinterpret_s32_m64(_a);
6851     int32x2_t b = vreinterpret_s32_m64(_b);
6852 #if defined(__aarch64__)
6853     return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
6854 #else
6855     int32x2x2_t c = vuzp_s32(a, b);
6856     return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
6857 #endif
6858 }
6859
6860 // Computes saturated pairwise difference of each argument as a 16-bit signed
6861 // integer values a and b.
6862 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
6863 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6864 {
6865     int16x8_t a = vreinterpretq_s16_m128i(_a);
6866     int16x8_t b = vreinterpretq_s16_m128i(_b);
6867 #if defined(__aarch64__)
6868     return vreinterpretq_m128i_s16(
6869         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6870 #else
6871     int16x8x2_t c = vuzpq_s16(a, b);
6872     return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
6873 #endif
6874 }
6875
6876 // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6877 // using saturation, and pack the signed 16-bit results in dst.
6878 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
6879 FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6880 {
6881     int16x4_t a = vreinterpret_s16_m64(_a);
6882     int16x4_t b = vreinterpret_s16_m64(_b);
6883 #if defined(__aarch64__)
6884     return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6885 #else
6886     int16x4x2_t c = vuzp_s16(a, b);
6887     return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
6888 #endif
6889 }
6890
6891 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
6892 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6893 // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
6894 // and pack the saturated results in dst.
6895 //
6896 //   FOR j := 0 to 7
6897 //      i := j*16
6898 //      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
6899 //      a[i+7:i]*b[i+7:i] )
6900 //   ENDFOR
6901 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
6902 {
6903 #if defined(__aarch64__)
6904     uint8x16_t a = vreinterpretq_u8_m128i(_a);
6905     int8x16_t b = vreinterpretq_s8_m128i(_b);
6906     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6907                              vmovl_s8(vget_low_s8(b)));
6908     int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6909                              vmovl_s8(vget_high_s8(b)));
6910     return vreinterpretq_m128i_s16(
6911         vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6912 #else
6913     // This would be much simpler if x86 would choose to zero extend OR sign
6914     // extend, not both. This could probably be optimized better.
6915     uint16x8_t a = vreinterpretq_u16_m128i(_a);
6916     int16x8_t b = vreinterpretq_s16_m128i(_b);
6917
6918     // Zero extend a
6919     int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6920     int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6921
6922     // Sign extend by shifting left then shifting right.
6923     int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6924     int16x8_t b_odd = vshrq_n_s16(b, 8);
6925
6926     // multiply
6927     int16x8_t prod1 = vmulq_s16(a_even, b_even);
6928     int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6929
6930     // saturated add
6931     return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
6932 #endif
6933 }
6934
6935 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
6936 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6937 // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
6938 // pack the saturated results in dst.
6939 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
6940 FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6941 {
6942     uint16x4_t a = vreinterpret_u16_m64(_a);
6943     int16x4_t b = vreinterpret_s16_m64(_b);
6944
6945     // Zero extend a
6946     int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6947     int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6948
6949     // Sign extend by shifting left then shifting right.
6950     int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6951     int16x4_t b_odd = vshr_n_s16(b, 8);
6952
6953     // multiply
6954     int16x4_t prod1 = vmul_s16(a_even, b_even);
6955     int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6956
6957     // saturated add
6958     return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
6959 }
6960
6961 // Multiply packed signed 16-bit integers in a and b, producing intermediate
6962 // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
6963 // the packed 16-bit integers in dst.
6964 //
6965 //   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
6966 //   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
6967 //   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
6968 //   ...
6969 //   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
6970 FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
6971 {
6972     // Has issues due to saturation
6973     // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
6974
6975     // Multiply
6976     int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
6977                                  vget_low_s16(vreinterpretq_s16_m128i(b)));
6978     int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
6979                                  vget_high_s16(vreinterpretq_s16_m128i(b)));
6980
6981     // Rounding narrowing shift right
6982     // narrow = (int16_t)((mul + 16384) >> 15);
6983     int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6984     int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6985
6986     // Join together
6987     return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
6988 }
6989
6990 // Multiply packed signed 16-bit integers in a and b, producing intermediate
6991 // signed 32-bit integers. Truncate each intermediate integer to the 18 most
6992 // significant bits, round by adding 1, and store bits [16:1] to dst.
6993 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
6994 FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6995 {
6996     int32x4_t mul_extend =
6997         vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6998
6999     // Rounding narrowing shift right
7000     return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
7001 }
7002
7003 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
7004 // corresponding 8-bit element of b, and store the results in dst.
7005 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
7006 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
7007 {
7008     int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
7009     uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
7010     uint8x16_t idx_masked =
7011         vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
7012 #if defined(__aarch64__)
7013     return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
7014 #elif defined(__GNUC__)
7015     int8x16_t ret;
7016     // %e and %f represent the even and odd D registers
7017     // respectively.
7018     __asm__ __volatile__(
7019         "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
7020         "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
7021         : [ret] "=&w"(ret)
7022         : [tbl] "w"(tbl), [idx] "w"(idx_masked));
7023     return vreinterpretq_m128i_s8(ret);
7024 #else
7025     // use this line if testing on aarch64
7026     int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
7027     return vreinterpretq_m128i_s8(
7028         vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
7029                     vtbl2_s8(a_split, vget_high_u8(idx_masked))));
7030 #endif
7031 }
7032
7033 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
7034 // corresponding 8-bit element of b, and store the results in dst.
7035 //
7036 //   FOR j := 0 to 7
7037 //     i := j*8
7038 //     IF b[i+7] == 1
7039 //       dst[i+7:i] := 0
7040 //     ELSE
7041 //       index[2:0] := b[i+2:i]
7042 //       dst[i+7:i] := a[index*8+7:index*8]
7043 //     FI
7044 //   ENDFOR
7045 //
7046 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
7047 FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
7048 {
7049     const int8x8_t controlMask =
7050         vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t)(0x1 << 7 | 0x07)));
7051     int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
7052     return vreinterpret_m64_s8(res);
7053 }
7054
7055 // Negate packed 16-bit integers in a when the corresponding signed
7056 // 16-bit integer in b is negative, and store the results in dst.
7057 // Element in dst are zeroed out when the corresponding element
7058 // in b is zero.
7059 //
7060 //   for i in 0..7
7061 //     if b[i] < 0
7062 //       r[i] := -a[i]
7063 //     else if b[i] == 0
7064 //       r[i] := 0
7065 //     else
7066 //       r[i] := a[i]
7067 //     fi
7068 //   done
7069 FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
7070 {
7071     int16x8_t a = vreinterpretq_s16_m128i(_a);
7072     int16x8_t b = vreinterpretq_s16_m128i(_b);
7073
7074     // signed shift right: faster than vclt
7075     // (b < 0) ? 0xFFFF : 0
7076     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
7077     // (b == 0) ? 0xFFFF : 0
7078 #if defined(__aarch64__)
7079     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
7080 #else
7081     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
7082 #endif
7083
7084     // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
7085     // 'a') based on ltMask
7086     int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
7087     // res = masked & (~zeroMask)
7088     int16x8_t res = vbicq_s16(masked, zeroMask);
7089     return vreinterpretq_m128i_s16(res);
7090 }
7091
7092 // Negate packed 32-bit integers in a when the corresponding signed
7093 // 32-bit integer in b is negative, and store the results in dst.
7094 // Element in dst are zeroed out when the corresponding element
7095 // in b is zero.
7096 //
7097 //   for i in 0..3
7098 //     if b[i] < 0
7099 //       r[i] := -a[i]
7100 //     else if b[i] == 0
7101 //       r[i] := 0
7102 //     else
7103 //       r[i] := a[i]
7104 //     fi
7105 //   done
7106 FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
7107 {
7108     int32x4_t a = vreinterpretq_s32_m128i(_a);
7109     int32x4_t b = vreinterpretq_s32_m128i(_b);
7110
7111     // signed shift right: faster than vclt
7112     // (b < 0) ? 0xFFFFFFFF : 0
7113     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
7114
7115     // (b == 0) ? 0xFFFFFFFF : 0
7116 #if defined(__aarch64__)
7117     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
7118 #else
7119     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
7120 #endif
7121
7122     // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
7123     // 'a') based on ltMask
7124     int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
7125     // res = masked & (~zeroMask)
7126     int32x4_t res = vbicq_s32(masked, zeroMask);
7127     return vreinterpretq_m128i_s32(res);
7128 }
7129
7130 // Negate packed 8-bit integers in a when the corresponding signed
7131 // 8-bit integer in b is negative, and store the results in dst.
7132 // Element in dst are zeroed out when the corresponding element
7133 // in b is zero.
7134 //
7135 //   for i in 0..15
7136 //     if b[i] < 0
7137 //       r[i] := -a[i]
7138 //     else if b[i] == 0
7139 //       r[i] := 0
7140 //     else
7141 //       r[i] := a[i]
7142 //     fi
7143 //   done
7144 FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
7145 {
7146     int8x16_t a = vreinterpretq_s8_m128i(_a);
7147     int8x16_t b = vreinterpretq_s8_m128i(_b);
7148
7149     // signed shift right: faster than vclt
7150     // (b < 0) ? 0xFF : 0
7151     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
7152
7153     // (b == 0) ? 0xFF : 0
7154 #if defined(__aarch64__)
7155     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
7156 #else
7157     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
7158 #endif
7159
7160     // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a')
7161     // based on ltMask
7162     int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
7163     // res = masked & (~zeroMask)
7164     int8x16_t res = vbicq_s8(masked, zeroMask);
7165
7166     return vreinterpretq_m128i_s8(res);
7167 }
7168
7169 // Negate packed 16-bit integers in a when the corresponding signed 16-bit
7170 // integer in b is negative, and store the results in dst. Element in dst are
7171 // zeroed out when the corresponding element in b is zero.
7172 //
7173 //   FOR j := 0 to 3
7174 //      i := j*16
7175 //      IF b[i+15:i] < 0
7176 //        dst[i+15:i] := -(a[i+15:i])
7177 //      ELSE IF b[i+15:i] == 0
7178 //        dst[i+15:i] := 0
7179 //      ELSE
7180 //        dst[i+15:i] := a[i+15:i]
7181 //      FI
7182 //   ENDFOR
7183 //
7184 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
7185 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
7186 {
7187     int16x4_t a = vreinterpret_s16_m64(_a);
7188     int16x4_t b = vreinterpret_s16_m64(_b);
7189
7190     // signed shift right: faster than vclt
7191     // (b < 0) ? 0xFFFF : 0
7192     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
7193
7194     // (b == 0) ? 0xFFFF : 0
7195 #if defined(__aarch64__)
7196     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
7197 #else
7198     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
7199 #endif
7200
7201     // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a')
7202     // based on ltMask
7203     int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
7204     // res = masked & (~zeroMask)
7205     int16x4_t res = vbic_s16(masked, zeroMask);
7206
7207     return vreinterpret_m64_s16(res);
7208 }
7209
7210 // Negate packed 32-bit integers in a when the corresponding signed 32-bit
7211 // integer in b is negative, and store the results in dst. Element in dst are
7212 // zeroed out when the corresponding element in b is zero.
7213 //
7214 //   FOR j := 0 to 1
7215 //      i := j*32
7216 //      IF b[i+31:i] < 0
7217 //        dst[i+31:i] := -(a[i+31:i])
7218 //      ELSE IF b[i+31:i] == 0
7219 //        dst[i+31:i] := 0
7220 //      ELSE
7221 //        dst[i+31:i] := a[i+31:i]
7222 //      FI
7223 //   ENDFOR
7224 //
7225 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
7226 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
7227 {
7228     int32x2_t a = vreinterpret_s32_m64(_a);
7229     int32x2_t b = vreinterpret_s32_m64(_b);
7230
7231     // signed shift right: faster than vclt
7232     // (b < 0) ? 0xFFFFFFFF : 0
7233     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
7234
7235     // (b == 0) ? 0xFFFFFFFF : 0
7236 #if defined(__aarch64__)
7237     int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
7238 #else
7239     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
7240 #endif
7241
7242     // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a')
7243     // based on ltMask
7244     int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
7245     // res = masked & (~zeroMask)
7246     int32x2_t res = vbic_s32(masked, zeroMask);
7247
7248     return vreinterpret_m64_s32(res);
7249 }
7250
7251 // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
7252 // in b is negative, and store the results in dst. Element in dst are zeroed out
7253 // when the corresponding element in b is zero.
7254 //
7255 //   FOR j := 0 to 7
7256 //      i := j*8
7257 //      IF b[i+7:i] < 0
7258 //        dst[i+7:i] := -(a[i+7:i])
7259 //      ELSE IF b[i+7:i] == 0
7260 //        dst[i+7:i] := 0
7261 //      ELSE
7262 //        dst[i+7:i] := a[i+7:i]
7263 //      FI
7264 //   ENDFOR
7265 //
7266 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
7267 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
7268 {
7269     int8x8_t a = vreinterpret_s8_m64(_a);
7270     int8x8_t b = vreinterpret_s8_m64(_b);
7271
7272     // signed shift right: faster than vclt
7273     // (b < 0) ? 0xFF : 0
7274     uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
7275
7276     // (b == 0) ? 0xFF : 0
7277 #if defined(__aarch64__)
7278     int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
7279 #else
7280     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
7281 #endif
7282
7283     // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a')
7284     // based on ltMask
7285     int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
7286     // res = masked & (~zeroMask)
7287     int8x8_t res = vbic_s8(masked, zeroMask);
7288
7289     return vreinterpret_m64_s8(res);
7290 }
7291
7292 /* SSE4.1 */
7293
// Blend packed 16-bit integers from a and b using control mask imm8, and store
// the results in dst.
//
//   FOR j := 0 to 7
//       i := j*16
//       IF imm8[j]
//           dst[i+15:i] := b[i+15:i]
//       ELSE
//           dst[i+15:i] := a[i+15:i]
//       FI
//   ENDFOR
//
// The macro's locals carry a _sse2neon_ prefix so that caller expressions
// spelled `_a`, `_b` or `_mask` are not captured by the expansion.
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
//                                      __constrange(0,255) int imm)
#define _mm_blend_epi16(a, b, imm)                                        \
    __extension__({                                                       \
        /* one all-ones or all-zeros 16-bit word per bit of imm */        \
        const uint16_t _sse2neon_mask[8] = {                              \
            ((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,                     \
            ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,                     \
            ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,                     \
            ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,                     \
            ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,                     \
            ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,                     \
            ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,                     \
            ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0};                    \
        uint16x8_t _sse2neon_mask_vec = vld1q_u16(_sse2neon_mask);        \
        uint16x8_t _sse2neon_a = vreinterpretq_u16_m128i(a);              \
        uint16x8_t _sse2neon_b = vreinterpretq_u16_m128i(b);              \
        /* mask bit set -> take b, clear -> take a */                     \
        vreinterpretq_m128i_u16(                                          \
            vbslq_u16(_sse2neon_mask_vec, _sse2neon_b, _sse2neon_a));     \
    })
7322
// Blend packed double-precision (64-bit) floating-point elements from a and b
// using control mask imm8, and store the results in dst. Bit j of imm8 selects
// element j from b when set and from a when clear.
//
// The macro's locals carry a _sse2neon_ prefix so that caller expressions
// spelled `_a`, `_b` or `_mask` are not captured by the expansion.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
#define _mm_blend_pd(a, b, imm)                                           \
    __extension__({                                                       \
        /* one all-ones or all-zeros 64-bit word per bit of imm */        \
        const uint64_t _sse2neon_mask[2] = {                              \
            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),              \
            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};             \
        uint64x2_t _sse2neon_mask_vec = vld1q_u64(_sse2neon_mask);        \
        uint64x2_t _sse2neon_a = vreinterpretq_u64_m128d(a);              \
        uint64x2_t _sse2neon_b = vreinterpretq_u64_m128d(b);              \
        vreinterpretq_m128d_u64(                                          \
            vbslq_u64(_sse2neon_mask_vec, _sse2neon_b, _sse2neon_a));     \
    })
7336
7337 // Blend packed single-precision (32-bit) floating-point elements from a and b
7338 // using mask, and store the results in dst.
7339 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
7340 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
7341 {
7342     const uint32_t ALIGN_STRUCT(16)
7343         data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7344                    ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7345                    ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7346                    ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7347     uint32x4_t mask = vld1q_u32(data);
7348     float32x4_t a = vreinterpretq_f32_m128(_a);
7349     float32x4_t b = vreinterpretq_f32_m128(_b);
7350     return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7351 }
7352
7353 // Blend packed 8-bit integers from a and b using mask, and store the results in
7354 // dst.
7355 //
7356 //   FOR j := 0 to 15
7357 //       i := j*8
7358 //       IF mask[i+7]
7359 //           dst[i+7:i] := b[i+7:i]
7360 //       ELSE
7361 //           dst[i+7:i] := a[i+7:i]
7362 //       FI
7363 //   ENDFOR
7364 FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
7365 {
7366     // Use a signed shift right to create a mask with the sign bit
7367     uint8x16_t mask =
7368         vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
7369     uint8x16_t a = vreinterpretq_u8_m128i(_a);
7370     uint8x16_t b = vreinterpretq_u8_m128i(_b);
7371     return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
7372 }
7373
7374 // Blend packed double-precision (64-bit) floating-point elements from a and b
7375 // using mask, and store the results in dst.
7376 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
7377 FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
7378 {
7379     uint64x2_t mask =
7380         vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
7381 #if defined(__aarch64__)
7382     float64x2_t a = vreinterpretq_f64_m128d(_a);
7383     float64x2_t b = vreinterpretq_f64_m128d(_b);
7384     return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7385 #else
7386     uint64x2_t a = vreinterpretq_u64_m128d(_a);
7387     uint64x2_t b = vreinterpretq_u64_m128d(_b);
7388     return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
7389 #endif
7390 }
7391
7392 // Blend packed single-precision (32-bit) floating-point elements from a and b
7393 // using mask, and store the results in dst.
7394 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
7395 FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
7396 {
7397     // Use a signed shift right to create a mask with the sign bit
7398     uint32x4_t mask =
7399         vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
7400     float32x4_t a = vreinterpretq_f32_m128(_a);
7401     float32x4_t b = vreinterpretq_f32_m128(_b);
7402     return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7403 }
7404
7405 // Round the packed double-precision (64-bit) floating-point elements in a up
7406 // to an integer value, and store the results as packed double-precision
7407 // floating-point elements in dst.
7408 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
7409 FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
7410 {
7411 #if defined(__aarch64__)
7412     return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7413 #else
7414     double *f = (double *) &a;
7415     return _mm_set_pd(ceil(f[1]), ceil(f[0]));
7416 #endif
7417 }
7418
7419 // Round the packed single-precision (32-bit) floating-point elements in a up to
7420 // an integer value, and store the results as packed single-precision
7421 // floating-point elements in dst.
7422 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
7423 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
7424 {
7425 #if defined(__aarch64__)
7426     return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
7427 #else
7428     float *f = (float *) &a;
7429     return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
7430 #endif
7431 }
7432
7433 // Round the lower double-precision (64-bit) floating-point element in b up to
7434 // an integer value, store the result as a double-precision floating-point
7435 // element in the lower element of dst, and copy the upper element from a to the
7436 // upper element of dst.
7437 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
7438 FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
7439 {
7440     return _mm_move_sd(a, _mm_ceil_pd(b));
7441 }
7442
7443 // Round the lower single-precision (32-bit) floating-point element in b up to
7444 // an integer value, store the result as a single-precision floating-point
7445 // element in the lower element of dst, and copy the upper 3 packed elements
7446 // from a to the upper elements of dst.
7447 //
7448 //   dst[31:0] := CEIL(b[31:0])
7449 //   dst[127:32] := a[127:32]
7450 //
7451 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
7452 FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
7453 {
7454     return _mm_move_ss(a, _mm_ceil_ps(b));
7455 }
7456
7457 // Compare packed 64-bit integers in a and b for equality, and store the results
7458 // in dst
7459 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
7460 {
7461 #if defined(__aarch64__)
7462     return vreinterpretq_m128i_u64(
7463         vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
7464 #else
7465     // ARMv7 lacks vceqq_u64
7466     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
7467     uint32x4_t cmp =
7468         vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
7469     uint32x4_t swapped = vrev64q_u32(cmp);
7470     return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
7471 #endif
7472 }
7473
7474 // Converts the four signed 16-bit integers in the lower 64 bits to four signed
7475 // 32-bit integers.
7476 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
7477 {
7478     return vreinterpretq_m128i_s32(
7479         vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
7480 }
7481
7482 // Converts the two signed 16-bit integers in the lower 32 bits two signed
7483 // 32-bit integers.
7484 FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
7485 {
7486     int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
7487     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7488     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7489     return vreinterpretq_m128i_s64(s64x2);
7490 }
7491
7492 // Converts the two signed 32-bit integers in the lower 64 bits to two signed
7493 // 64-bit integers.
7494 FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
7495 {
7496     return vreinterpretq_m128i_s64(
7497         vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
7498 }
7499
7500 // Converts the four unsigned 8-bit integers in the lower 16 bits to four
7501 // unsigned 32-bit integers.
7502 FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
7503 {
7504     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
7505     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7506     return vreinterpretq_m128i_s16(s16x8);
7507 }
7508
7509 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
7510 // unsigned 32-bit integers.
7511 FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
7512 {
7513     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
7514     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
7515     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
7516     return vreinterpretq_m128i_s32(s32x4);
7517 }
7518
7519 // Converts the two signed 8-bit integers in the lower 32 bits to four
7520 // signed 64-bit integers.
7521 FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
7522 {
7523     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
7524     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
7525     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7526     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7527     return vreinterpretq_m128i_s64(s64x2);
7528 }
7529
7530 // Converts the four unsigned 16-bit integers in the lower 64 bits to four
7531 // unsigned 32-bit integers.
7532 FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
7533 {
7534     return vreinterpretq_m128i_u32(
7535         vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
7536 }
7537
7538 // Converts the two unsigned 16-bit integers in the lower 32 bits to two
7539 // unsigned 64-bit integers.
7540 FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
7541 {
7542     uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
7543     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7544     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7545     return vreinterpretq_m128i_u64(u64x2);
7546 }
7547
7548 // Converts the two unsigned 32-bit integers in the lower 64 bits to two
7549 // unsigned 64-bit integers.
7550 FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
7551 {
7552     return vreinterpretq_m128i_u64(
7553         vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
7554 }
7555
7556 // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
7557 // and store the results in dst.
7558 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
7559 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
7560 {
7561     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
7562     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
7563     return vreinterpretq_m128i_u16(u16x8);
7564 }
7565
7566 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
7567 // unsigned 32-bit integers.
7568 // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
7569 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
7570 {
7571     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
7572     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
7573     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
7574     return vreinterpretq_m128i_u32(u32x4);
7575 }
7576
7577 // Converts the two unsigned 8-bit integers in the lower 16 bits to two
7578 // unsigned 64-bit integers.
7579 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
7580 {
7581     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
7582     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
7583     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7584     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7585     return vreinterpretq_m128i_u64(u64x2);
7586 }
7587
7588 // Conditionally multiply the packed double-precision (64-bit) floating-point
7589 // elements in a and b using the high 4 bits in imm8, sum the four products, and
7590 // conditionally store the sum in dst using the low 4 bits of imm8.
7591 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd
7592 FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
7593 {
7594     // Generate mask value from constant immediate bit value
7595     const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
7596     const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
7597 #if !SSE2NEON_PRECISE_DP
7598     const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
7599     const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
7600 #endif
7601     // Conditional multiplication
7602 #if !SSE2NEON_PRECISE_DP
7603     __m128d mul = _mm_mul_pd(a, b);
7604     const __m128d mulMask =
7605         _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
7606     __m128d tmp = _mm_and_pd(mul, mulMask);
7607 #else
7608 #if defined(__aarch64__)
7609     double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
7610                                    vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
7611                              : 0;
7612     double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
7613                                    vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
7614                              : 0;
7615 #else
7616     double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
7617     double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
7618 #endif
7619     __m128d tmp = _mm_set_pd(d1, d0);
7620 #endif
7621     // Sum the products
7622 #if defined(__aarch64__)
7623     double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
7624 #else
7625     double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
7626 #endif
7627     // Conditionally store the sum
7628     const __m128d sumMask =
7629         _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
7630     __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
7631     return res;
7632 }
7633
7634 // Conditionally multiply the packed single-precision (32-bit) floating-point
7635 // elements in a and b using the high 4 bits in imm8, sum the four products,
7636 // and conditionally store the sum in dst using the low 4 bits of imm.
7637 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
7638 FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7639 {
7640 #if defined(__aarch64__)
7641     /* shortcuts */
7642     if (imm == 0xFF) {
7643         return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
7644     }
7645     if (imm == 0x7F) {
7646         float32x4_t m = _mm_mul_ps(a, b);
7647         m[3] = 0;
7648         return _mm_set1_ps(vaddvq_f32(m));
7649     }
7650 #endif
7651
7652     float s = 0, c = 0;
7653     float32x4_t f32a = vreinterpretq_f32_m128(a);
7654     float32x4_t f32b = vreinterpretq_f32_m128(b);
7655
7656     /* To improve the accuracy of floating-point summation, Kahan algorithm
7657      * is used for each operation.
7658      */
7659     if (imm & (1 << 4))
7660         _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
7661     if (imm & (1 << 5))
7662         _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
7663     if (imm & (1 << 6))
7664         _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
7665     if (imm & (1 << 7))
7666         _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
7667     s += c;
7668
7669     float32x4_t res = {
7670         (imm & 0x1) ? s : 0,
7671         (imm & 0x2) ? s : 0,
7672         (imm & 0x4) ? s : 0,
7673         (imm & 0x8) ? s : 0,
7674     };
7675     return vreinterpretq_m128_f32(res);
7676 }
7677
// Extracts the 32-bit integer held in the lane of a selected by imm.
// The lane is read with vgetq_lane_s32, so the value is returned as a signed
// 32-bit integer.
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
#define _mm_extract_epi32(a, imm) \
    (vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))
7683
// Extracts the 64-bit integer held in the lane of a selected by imm.
// The lane is read with vgetq_lane_s64, so the value is returned as a signed
// 64-bit integer.
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
#define _mm_extract_epi64(a, imm) \
    (vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)))
7689
// Extracts the 8-bit integer held in the lane of a selected by imm.
// The lane is read as an unsigned byte (vgetq_lane_u8), so widening the
// result to int zero-extends it.
// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
#define _mm_extract_epi8(a, imm) \
    (vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)))
7695
// Extracts the single-precision (32-bit) floating-point element of a selected
// by imm and returns its raw bit pattern as a 32-bit integer (the vector is
// reinterpreted as integers before the lane is read).
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
#define _mm_extract_ps(a, imm) \
    (vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)))
7699
7700 // Round the packed double-precision (64-bit) floating-point elements in a down
7701 // to an integer value, and store the results as packed double-precision
7702 // floating-point elements in dst.
7703 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
7704 FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7705 {
7706 #if defined(__aarch64__)
7707     return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7708 #else
7709     double *f = (double *) &a;
7710     return _mm_set_pd(floor(f[1]), floor(f[0]));
7711 #endif
7712 }
7713
7714 // Round the packed single-precision (32-bit) floating-point elements in a down
7715 // to an integer value, and store the results as packed single-precision
7716 // floating-point elements in dst.
7717 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
7718 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7719 {
7720 #if defined(__aarch64__)
7721     return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7722 #else
7723     float *f = (float *) &a;
7724     return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7725 #endif
7726 }
7727
7728 // Round the lower double-precision (64-bit) floating-point element in b down to
7729 // an integer value, store the result as a double-precision floating-point
7730 // element in the lower element of dst, and copy the upper element from a to the
7731 // upper element of dst.
7732 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
7733 FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7734 {
7735     return _mm_move_sd(a, _mm_floor_pd(b));
7736 }
7737
7738 // Round the lower single-precision (32-bit) floating-point element in b down to
7739 // an integer value, store the result as a single-precision floating-point
7740 // element in the lower element of dst, and copy the upper 3 packed elements
7741 // from a to the upper elements of dst.
7742 //
7743 //   dst[31:0] := FLOOR(b[31:0])
7744 //   dst[127:32] := a[127:32]
7745 //
7746 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
7747 FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7748 {
7749     return _mm_move_ss(a, _mm_floor_ps(b));
7750 }
7751
// Returns a copy of a in which the 32-bit integer lane selected by imm has
// been replaced by the least significant 32 bits of b.
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
//                                       __constrange(0,4) int imm)
#define _mm_insert_epi32(a, b, imm) \
    (vreinterpretq_m128i_s32(       \
        vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))))
7761
// Returns a copy of a in which the 64-bit integer lane selected by imm has
// been replaced by the least significant 64 bits of b.
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
//                                       __constrange(0,2) int imm)
#define _mm_insert_epi64(a, b, imm) \
    (vreinterpretq_m128i_s64(       \
        vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))))
7771
// Returns a copy of a in which the 8-bit integer lane selected by imm has
// been replaced by the least significant 8 bits of b.
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
//                                      __constrange(0,16) int imm)
#define _mm_insert_epi8(a, b, imm) \
    (vreinterpretq_m128i_s8(       \
        vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))))
7781
// Copy a to tmp, then insert a single-precision (32-bit) floating-point
// element from b into tmp using the control in imm8. Store tmp to dst using
// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
//
//   imm8[7:6] : lane of b to read
//   imm8[5:4] : lane of tmp (the copy of a) to overwrite
//   imm8[3:0] : per-lane zero mask applied to the result
//
// Every use of imm8 is parenthesized so that an expression argument such as
// `x | y` is shifted and masked as a whole, a and b are each expanded exactly
// once, and the internal temporaries carry a _sse2neon_ prefix so they cannot
// capture identifiers used in the caller's arguments.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
#define _mm_insert_ps(a, b, imm8)                                            \
    __extension__({                                                          \
        float32x4_t _sse2neon_ins_tmp = vsetq_lane_f32(                      \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm8) >> 6) & 0x3),  \
            vreinterpretq_f32_m128(a), ((imm8) >> 4) & 0x3);                 \
        const uint32_t _sse2neon_ins_zero[4] = {                             \
            ((imm8) & (1 << 0)) ? UINT32_MAX : 0,                            \
            ((imm8) & (1 << 1)) ? UINT32_MAX : 0,                            \
            ((imm8) & (1 << 2)) ? UINT32_MAX : 0,                            \
            ((imm8) & (1 << 3)) ? UINT32_MAX : 0};                           \
        vreinterpretq_m128_f32(vbslq_f32(vld1q_u32(_sse2neon_ins_zero),      \
                                         vdupq_n_f32(0),                     \
                                         _sse2neon_ins_tmp));                \
    })
7804
7805 // epi versions of min/max
7806 // Computes the pariwise maximums of the four signed 32-bit integer values of a
7807 // and b.
7808 //
7809 // A 128-bit parameter that can be defined with the following equations:
7810 //   r0 := (a0 > b0) ? a0 : b0
7811 //   r1 := (a1 > b1) ? a1 : b1
7812 //   r2 := (a2 > b2) ? a2 : b2
7813 //   r3 := (a3 > b3) ? a3 : b3
7814 //
7815 // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
7816 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7817 {
7818     return vreinterpretq_m128i_s32(
7819         vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7820 }
7821
7822 // Compare packed signed 8-bit integers in a and b, and store packed maximum
7823 // values in dst.
7824 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
7825 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7826 {
7827     return vreinterpretq_m128i_s8(
7828         vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7829 }
7830
7831 // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7832 // values in dst.
7833 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
7834 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7835 {
7836     return vreinterpretq_m128i_u16(
7837         vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7838 }
7839
7840 // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7841 // values in dst.
7842 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
7843 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7844 {
7845     return vreinterpretq_m128i_u32(
7846         vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7847 }
7848
7849 // Computes the pariwise minima of the four signed 32-bit integer values of a
7850 // and b.
7851 //
7852 // A 128-bit parameter that can be defined with the following equations:
7853 //   r0 := (a0 < b0) ? a0 : b0
7854 //   r1 := (a1 < b1) ? a1 : b1
7855 //   r2 := (a2 < b2) ? a2 : b2
7856 //   r3 := (a3 < b3) ? a3 : b3
7857 //
7858 // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
7859 FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7860 {
7861     return vreinterpretq_m128i_s32(
7862         vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7863 }
7864
7865 // Compare packed signed 8-bit integers in a and b, and store packed minimum
7866 // values in dst.
7867 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
7868 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7869 {
7870     return vreinterpretq_m128i_s8(
7871         vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7872 }
7873
7874 // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7875 // values in dst.
7876 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
7877 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7878 {
7879     return vreinterpretq_m128i_u16(
7880         vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7881 }
7882
7883 // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7884 // values in dst.
7885 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
7886 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7887 {
7888     return vreinterpretq_m128i_u32(
7889         vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7890 }
7891
7892 // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7893 // in a, store the minimum and index in dst, and zero the remaining bits in dst.
7894 //
7895 //   index[2:0] := 0
7896 //   min[15:0] := a[15:0]
7897 //   FOR j := 0 to 7
7898 //       i := j*16
7899 //       IF a[i+15:i] < min[15:0]
7900 //           index[2:0] := j
7901 //           min[15:0] := a[i+15:i]
7902 //       FI
7903 //   ENDFOR
7904 //   dst[15:0] := min[15:0]
7905 //   dst[18:16] := index[2:0]
7906 //   dst[127:19] := 0
7907 //
7908 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
7909 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7910 {
7911     __m128i dst;
7912     uint16_t min, idx = 0;
7913     // Find the minimum value
7914 #if defined(__aarch64__)
7915     min = vminvq_u16(vreinterpretq_u16_m128i(a));
7916 #else
7917     __m64 tmp;
7918     tmp = vreinterpret_m64_u16(
7919         vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7920                  vget_high_u16(vreinterpretq_u16_m128i(a))));
7921     tmp = vreinterpret_m64_u16(
7922         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7923     tmp = vreinterpret_m64_u16(
7924         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7925     min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7926 #endif
7927     // Get the index of the minimum value
7928     int i;
7929     for (i = 0; i < 8; i++) {
7930         if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7931             idx = (uint16_t) i;
7932             break;
7933         }
7934         a = _mm_srli_si128(a, 2);
7935     }
7936     // Generate result
7937     dst = _mm_setzero_si128();
7938     dst = vreinterpretq_m128i_u16(
7939         vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
7940     dst = vreinterpretq_m128i_u16(
7941         vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
7942     return dst;
7943 }
7944
7945 // Compute the sum of absolute differences (SADs) of quadruplets of unsigned
7946 // 8-bit integers in a compared to those in b, and store the 16-bit results in
7947 // dst. Eight SADs are performed using one quadruplet from b and eight
7948 // quadruplets from a. One quadruplet is selected from b starting at on the
7949 // offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
7950 // integers selected from a starting at the offset specified in imm8.
7951 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8
7952 FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
7953 {
7954     uint8x16_t _a, _b;
7955
7956     switch (imm & 0x4) {
7957     case 0:
7958         // do nothing
7959         _a = vreinterpretq_u8_m128i(a);
7960         break;
7961     case 4:
7962         _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
7963                                             vreinterpretq_u32_m128i(a), 1));
7964         break;
7965     default:
7966 #if defined(__GNUC__) || defined(__clang__)
7967         __builtin_unreachable();
7968 #endif
7969         break;
7970     }
7971
7972     switch (imm & 0x3) {
7973     case 0:
7974         _b = vreinterpretq_u8_u32(
7975             vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
7976         break;
7977     case 1:
7978         _b = vreinterpretq_u8_u32(
7979             vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
7980         break;
7981     case 2:
7982         _b = vreinterpretq_u8_u32(
7983             vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
7984         break;
7985     case 3:
7986         _b = vreinterpretq_u8_u32(
7987             vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
7988         break;
7989     default:
7990 #if defined(__GNUC__) || defined(__clang__)
7991         __builtin_unreachable();
7992 #endif
7993         break;
7994     }
7995
7996     int16x8_t c04, c15, c26, c37;
7997     uint8x8_t low_b = vget_low_u8(_b);
7998     c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
7999     _a = vextq_u8(_a, _a, 1);
8000     c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8001     _a = vextq_u8(_a, _a, 1);
8002     c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8003     _a = vextq_u8(_a, _a, 1);
8004     c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
8005 #if defined(__aarch64__)
8006     // |0|4|2|6|
8007     c04 = vpaddq_s16(c04, c26);
8008     // |1|5|3|7|
8009     c15 = vpaddq_s16(c15, c37);
8010
8011     int32x4_t trn1_c =
8012         vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8013     int32x4_t trn2_c =
8014         vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
8015     return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
8016                                               vreinterpretq_s16_s32(trn2_c)));
8017 #else
8018     int16x4_t c01, c23, c45, c67;
8019     c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
8020     c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
8021     c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
8022     c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
8023
8024     return vreinterpretq_m128i_s16(
8025         vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
8026 #endif
8027 }
8028
8029 // Multiply the low signed 32-bit integers from each packed 64-bit element in
8030 // a and b, and store the signed 64-bit results in dst.
8031 //
8032 //   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
8033 //   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
8034 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
8035 {
8036     // vmull_s32 upcasts instead of masking, so we downcast.
8037     int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
8038     int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
8039     return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
8040 }
8041
8042 // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
8043 // unsigned 32-bit integers from b.
8044 // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
8045 FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
8046 {
8047     return vreinterpretq_m128i_s32(
8048         vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
8049 }
8050
8051 // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
8052 // integers and saturates.
8053 //
8054 //   r0 := UnsignedSaturate(a0)
8055 //   r1 := UnsignedSaturate(a1)
8056 //   r2 := UnsignedSaturate(a2)
8057 //   r3 := UnsignedSaturate(a3)
8058 //   r4 := UnsignedSaturate(b0)
8059 //   r5 := UnsignedSaturate(b1)
8060 //   r6 := UnsignedSaturate(b2)
8061 //   r7 := UnsignedSaturate(b3)
8062 FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
8063 {
8064     return vreinterpretq_m128i_u16(
8065         vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
8066                      vqmovun_s32(vreinterpretq_s32_m128i(b))));
8067 }
8068
8069 // Round the packed double-precision (64-bit) floating-point elements in a using
8070 // the rounding parameter, and store the results as packed double-precision
8071 // floating-point elements in dst.
8072 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
8073 FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
8074 {
8075 #if defined(__aarch64__)
8076     switch (rounding) {
8077     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
8078         return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
8079     case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
8080         return _mm_floor_pd(a);
8081     case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
8082         return _mm_ceil_pd(a);
8083     case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
8084         return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
8085     default:  //_MM_FROUND_CUR_DIRECTION
8086         return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
8087     }
8088 #else
8089     double *v_double = (double *) &a;
8090
8091     if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
8092         (rounding == _MM_FROUND_CUR_DIRECTION &&
8093          _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
8094         double res[2], tmp;
8095         for (int i = 0; i < 2; i++) {
8096             tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
8097             double roundDown = floor(tmp);  // Round down value
8098             double roundUp = ceil(tmp);     // Round up value
8099             double diffDown = tmp - roundDown;
8100             double diffUp = roundUp - tmp;
8101             if (diffDown < diffUp) {
8102                 /* If it's closer to the round down value, then use it */
8103                 res[i] = roundDown;
8104             } else if (diffDown > diffUp) {
8105                 /* If it's closer to the round up value, then use it */
8106                 res[i] = roundUp;
8107             } else {
8108                 /* If it's equidistant between round up and round down value,
8109                  * pick the one which is an even number */
8110                 double half = roundDown / 2;
8111                 if (half != floor(half)) {
8112                     /* If the round down value is odd, return the round up value
8113                      */
8114                     res[i] = roundUp;
8115                 } else {
8116                     /* If the round up value is odd, return the round down value
8117                      */
8118                     res[i] = roundDown;
8119                 }
8120             }
8121             res[i] = (v_double[i] < 0) ? -res[i] : res[i];
8122         }
8123         return _mm_set_pd(res[1], res[0]);
8124     } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
8125                (rounding == _MM_FROUND_CUR_DIRECTION &&
8126                 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
8127         return _mm_floor_pd(a);
8128     } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
8129                (rounding == _MM_FROUND_CUR_DIRECTION &&
8130                 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
8131         return _mm_ceil_pd(a);
8132     }
8133     return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
8134                       v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
8135 #endif
8136 }
8137
8138 // Round the packed single-precision (32-bit) floating-point elements in a using
8139 // the rounding parameter, and store the results as packed single-precision
8140 // floating-point elements in dst.
8141 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
8142 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
8143 {
8144 #if defined(__aarch64__)
8145     switch (rounding) {
8146     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
8147         return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
8148     case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
8149         return _mm_floor_ps(a);
8150     case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
8151         return _mm_ceil_ps(a);
8152     case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
8153         return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
8154     default:  //_MM_FROUND_CUR_DIRECTION
8155         return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
8156     }
8157 #else
8158     float *v_float = (float *) &a;
8159
8160     if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
8161         (rounding == _MM_FROUND_CUR_DIRECTION &&
8162          _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
8163         uint32x4_t signmask = vdupq_n_u32(0x80000000);
8164         float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
8165                                      vdupq_n_f32(0.5f)); /* +/- 0.5 */
8166         int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
8167             vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
8168         int32x4_t r_trunc = vcvtq_s32_f32(
8169             vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
8170         int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
8171             vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
8172         int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
8173                                      vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
8174         float32x4_t delta = vsubq_f32(
8175             vreinterpretq_f32_m128(a),
8176             vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
8177         uint32x4_t is_delta_half =
8178             vceqq_f32(delta, half); /* delta == +/- 0.5 */
8179         return vreinterpretq_m128_f32(
8180             vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
8181     } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
8182                (rounding == _MM_FROUND_CUR_DIRECTION &&
8183                 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
8184         return _mm_floor_ps(a);
8185     } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
8186                (rounding == _MM_FROUND_CUR_DIRECTION &&
8187                 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
8188         return _mm_ceil_ps(a);
8189     }
8190     return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
8191                       v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
8192                       v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
8193                       v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
8194 #endif
8195 }
8196
8197 // Round the lower double-precision (64-bit) floating-point element in b using
8198 // the rounding parameter, store the result as a double-precision floating-point
8199 // element in the lower element of dst, and copy the upper element from a to the
8200 // upper element of dst.
8201 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
8202 FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
8203 {
8204     return _mm_move_sd(a, _mm_round_pd(b, rounding));
8205 }
8206
8207 // Round the lower single-precision (32-bit) floating-point element in b using
8208 // the rounding parameter, store the result as a single-precision floating-point
8209 // element in the lower element of dst, and copy the upper 3 packed elements
8210 // from a to the upper elements of dst. Rounding is done according to the
8211 // rounding[3:0] parameter, which can be one of:
//     (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
//         round to nearest, and suppress exceptions
//     (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
//         round down, and suppress exceptions
//     (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)
//         round up, and suppress exceptions
//     (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
//         truncate, and suppress exceptions
//     _MM_FROUND_CUR_DIRECTION
//         use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8221 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
8222 FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
8223 {
8224     return _mm_move_ss(a, _mm_round_ps(b, rounding));
8225 }
8226
8227 // Load 128-bits of integer data from memory into dst using a non-temporal
8228 // memory hint. mem_addr must be aligned on a 16-byte boundary or a
8229 // general-protection exception may be generated.
8230 //
8231 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
8232 //
8233 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
8234 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
8235 {
8236 #if __has_builtin(__builtin_nontemporal_store)
8237     return __builtin_nontemporal_load(p);
8238 #else
8239     return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
8240 #endif
8241 }
8242
8243 // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
8244 // all 1's, and return 1 if the result is zero, otherwise return 0.
8245 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
8246 FORCE_INLINE int _mm_test_all_ones(__m128i a)
8247 {
8248     return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
8249            ~(uint64_t) 0;
8250 }
8251
8252 // Compute the bitwise AND of 128 bits (representing integer data) in a and
8253 // mask, and return 1 if the result is zero, otherwise return 0.
8254 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
8255 FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
8256 {
8257     int64x2_t a_and_mask =
8258         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
8259     return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
8260 }
8261
8262 // Compute the bitwise AND of 128 bits (representing integer data) in a and
8263 // mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
8264 // the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
8265 // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
8266 // otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros
8268 FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
8269 {
8270     uint64x2_t zf =
8271         vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
8272     uint64x2_t cf =
8273         vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
8274     uint64x2_t result = vandq_u64(zf, cf);
8275     return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
8276 }
8277
8278 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
8279 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
8280 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
8281 // otherwise set CF to 0. Return the CF value.
8282 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
8283 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
8284 {
8285     int64x2_t s64 =
8286         vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
8287                   vreinterpretq_s64_m128i(b));
8288     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
8289 }
8290
8291 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
8292 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
8293 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
8294 // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
8295 // otherwise return 0.
8296 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros((a), (b))
8298
8299 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
8300 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
8301 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
8302 // otherwise set CF to 0. Return the ZF value.
8303 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
8304 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
8305 {
8306     int64x2_t s64 =
8307         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
8308     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
8309 }
8310
8311 /* SSE4.2 */
8312
8313 // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
8314 // in b for greater than.
8315 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
8316 {
8317 #if defined(__aarch64__)
8318     return vreinterpretq_m128i_u64(
8319         vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
8320 #else
8321     return vreinterpretq_m128i_s64(vshrq_n_s64(
8322         vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
8323         63));
8324 #endif
8325 }
8326
8327 // Starting with the initial value in crc, accumulates a CRC32 value for
8328 // unsigned 16-bit integer v.
8329 // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
8330 FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
8331 {
8332 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8333     __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8334                          : [c] "+r"(crc)
8335                          : [v] "r"(v));
8336 #else
8337     crc = _mm_crc32_u8(crc, v & 0xff);
8338     crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
8339 #endif
8340     return crc;
8341 }
8342
8343 // Starting with the initial value in crc, accumulates a CRC32 value for
8344 // unsigned 32-bit integer v.
8345 // https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
8346 FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
8347 {
8348 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8349     __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8350                          : [c] "+r"(crc)
8351                          : [v] "r"(v));
8352 #else
8353     crc = _mm_crc32_u16(crc, v & 0xffff);
8354     crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
8355 #endif
8356     return crc;
8357 }
8358
8359 // Starting with the initial value in crc, accumulates a CRC32 value for
8360 // unsigned 64-bit integer v.
8361 // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
8362 FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
8363 {
8364 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8365     __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8366                          : [c] "+r"(crc)
8367                          : [v] "r"(v));
8368 #else
8369     crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
8370     crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
8371 #endif
8372     return crc;
8373 }
8374
8375 // Starting with the initial value in crc, accumulates a CRC32 value for
8376 // unsigned 8-bit integer v.
8377 // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
8378 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
8379 {
8380 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8381     __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8382                          : [c] "+r"(crc)
8383                          : [v] "r"(v));
8384 #else
8385     crc ^= v;
8386     for (int bit = 0; bit < 8; bit++) {
8387         if (crc & 1)
8388             crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
8389         else
8390             crc = (crc >> 1);
8391     }
8392 #endif
8393     return crc;
8394 }
8395
8396 /* AES */
8397
8398 #if !defined(__ARM_FEATURE_CRYPTO)
8399 /* clang-format off */
/* The 256 AES substitution-box bytes, in index order, each wrapped in the
 * caller-supplied macro w so the same data can be expanded into differently
 * typed tables. Laid out eight entries per row. */
#define SSE2NEON_AES_DATA(w)                                                    \
    {                                                                           \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5), \
        w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76), \
        w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0), \
        w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0), \
        w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), \
        w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), \
        w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75), \
        w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0), \
        w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84), \
        w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b), \
        w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), \
        w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), \
        w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5), \
        w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2), \
        w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17), \
        w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73), \
        w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), \
        w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), \
        w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79), \
        w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9), \
        w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08), \
        w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6), \
        w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), \
        w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), \
        w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94), \
        w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf), \
        w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68), \
        w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16)  \
    }
8440 /* clang-format on */
8441
8442 /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
8443 #define SSE2NEON_AES_H0(x) (x)
8444 static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
8445 #undef SSE2NEON_AES_H0
8446
8447 // In the absence of crypto extensions, implement aesenc using regular neon
8448 // intrinsics instead. See:
8449 // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
8450 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
8451 // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information. Reproduced with permission of the author.
8453 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
8454 {
8455 #if defined(__aarch64__)
8456     static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
8457                                          0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
8458                                          0xc, 0x1, 0x6, 0xb};
8459     static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8460                                        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
8461
8462     uint8x16_t v;
8463     uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
8464
8465     // shift rows
8466     w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8467
8468     // sub bytes
8469     v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
8470     v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
8471     v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
8472     v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
8473
8474     // mix columns
8475     w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
8476     w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8477     w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8478
8479     //  add round key
8480     return vreinterpretq_m128i_u8(w) ^ RoundKey;
8481
8482 #else /* ARMv7-A NEON implementation */
8483 #define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
8484     (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
8485      (b0))
8486 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
8487 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8488 #define SSE2NEON_AES_U0(p) \
8489     SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8490 #define SSE2NEON_AES_U1(p) \
8491     SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8492 #define SSE2NEON_AES_U2(p) \
8493     SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8494 #define SSE2NEON_AES_U3(p) \
8495     SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8496     static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8497         SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
8498         SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
8499         SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
8500         SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
8501     };
8502 #undef SSE2NEON_AES_B2W
8503 #undef SSE2NEON_AES_F2
8504 #undef SSE2NEON_AES_F3
8505 #undef SSE2NEON_AES_U0
8506 #undef SSE2NEON_AES_U1
8507 #undef SSE2NEON_AES_U2
8508 #undef SSE2NEON_AES_U3
8509
8510     uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
8511     uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
8512     uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
8513     uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
8514
8515     __m128i out = _mm_set_epi32(
8516         (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8517          aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8518         (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8519          aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8520         (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8521          aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8522         (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8523          aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8524
8525     return _mm_xor_si128(out, RoundKey);
8526 #endif
8527 }
8528
8529 // Perform the last round of an AES encryption flow on data (state) in a using
8530 // the round key in RoundKey, and store the result in dst.
8531 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
8532 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8533 {
8534     /* FIXME: optimized for NEON */
8535     uint8_t v[4][4] = {
8536         {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
8537          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
8538          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
8539          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
8540         {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
8541          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
8542          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
8543          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
8544         {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
8545          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
8546          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
8547          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
8548         {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
8549          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
8550          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
8551          SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
8552     };
8553     for (int i = 0; i < 16; i++)
8554         vreinterpretq_nth_u8_m128i(a, i) =
8555             v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
8556     return a;
8557 }
8558
8559 // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
8560 // This instruction generates a round key for AES encryption. See
8561 // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
8562 // for details.
8563 //
8564 // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
8565 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
8566 {
8567     uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
8568     uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
8569     for (int i = 0; i < 4; ++i) {
8570         ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
8571         ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
8572     }
8573     return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
8574                          ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8575 }
8576 #undef SSE2NEON_AES_DATA
8577
8578 #else /* __ARM_FEATURE_CRYPTO */
8579 // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
8580 // AESMC and then manually applying the real key as an xor operation. This
8581 // unfortunately means an additional xor op; the compiler should be able to
8582 // optimize this away for repeated calls however. See
8583 // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
8584 // for more details.
8585 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
8586 {
8587     return vreinterpretq_m128i_u8(
8588         vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
8589         vreinterpretq_u8_m128i(b));
8590 }
8591
8592 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
8593 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8594 {
8595     return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
8596                              vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8597                          RoundKey);
8598 }
8599
8600 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
8601 {
8602     // AESE does ShiftRows and SubBytes on A
8603     uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
8604
8605     uint8x16_t dest = {
8606         // Undo ShiftRows step from AESE and extract X1 and X3
8607         u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
8608         u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
8609         u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
8610         u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
8611     };
8612     uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
8613     return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
8614 }
8615 #endif
8616
8617 /* Others */
8618
8619 // Perform a carry-less multiplication of two 64-bit integers, selected from a
8620 // and b according to imm8, and store the results in dst.
8621 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
8622 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
8623 {
8624     uint64x2_t a = vreinterpretq_u64_m128i(_a);
8625     uint64x2_t b = vreinterpretq_u64_m128i(_b);
8626     switch (imm & 0x11) {
8627     case 0x00:
8628         return vreinterpretq_m128i_u64(
8629             _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
8630     case 0x01:
8631         return vreinterpretq_m128i_u64(
8632             _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
8633     case 0x10:
8634         return vreinterpretq_m128i_u64(
8635             _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
8636     case 0x11:
8637         return vreinterpretq_m128i_u64(
8638             _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
8639     default:
8640         abort();
8641     }
8642 }
8643
8644 FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
8645 {
8646     union {
8647         fpcr_bitfield field;
8648 #if defined(__aarch64__)
8649         uint64_t value;
8650 #else
8651         uint32_t value;
8652 #endif
8653     } r;
8654
8655 #if defined(__aarch64__)
8656     asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
8657 #else
8658     asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
8659 #endif
8660
8661     return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
8662 }
8663
8664 // Count the number of bits set to 1 in unsigned 32-bit integer a, and
8665 // return that count in dst.
8666 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
8667 FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
8668 {
8669 #if defined(__aarch64__)
8670 #if __has_builtin(__builtin_popcount)
8671     return __builtin_popcount(a);
8672 #else
8673     return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
8674 #endif
8675 #else
8676     uint32_t count = 0;
8677     uint8x8_t input_val, count8x8_val;
8678     uint16x4_t count16x4_val;
8679     uint32x2_t count32x2_val;
8680
8681     input_val = vld1_u8((uint8_t *) &a);
8682     count8x8_val = vcnt_u8(input_val);
8683     count16x4_val = vpaddl_u8(count8x8_val);
8684     count32x2_val = vpaddl_u16(count16x4_val);
8685
8686     vst1_u32(&count, count32x2_val);
8687     return count;
8688 #endif
8689 }
8690
8691 // Count the number of bits set to 1 in unsigned 64-bit integer a, and
8692 // return that count in dst.
8693 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
8694 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
8695 {
8696 #if defined(__aarch64__)
8697 #if __has_builtin(__builtin_popcountll)
8698     return __builtin_popcountll(a);
8699 #else
8700     return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
8701 #endif
8702 #else
8703     uint64_t count = 0;
8704     uint8x8_t input_val, count8x8_val;
8705     uint16x4_t count16x4_val;
8706     uint32x2_t count32x2_val;
8707     uint64x1_t count64x1_val;
8708
8709     input_val = vld1_u8((uint8_t *) &a);
8710     count8x8_val = vcnt_u8(input_val);
8711     count16x4_val = vpaddl_u8(count8x8_val);
8712     count32x2_val = vpaddl_u16(count16x4_val);
8713     count64x1_val = vpaddl_u32(count32x2_val);
8714     vst1_u64(&count, count64x1_val);
8715     return count;
8716 #endif
8717 }
8718
8719 FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
8720 {
8721     // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
8722     // regardless of the value of the FZ bit.
8723     union {
8724         fpcr_bitfield field;
8725 #if defined(__aarch64__)
8726         uint64_t value;
8727 #else
8728         uint32_t value;
8729 #endif
8730     } r;
8731
8732 #if defined(__aarch64__)
8733     asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
8734 #else
8735     asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
8736 #endif
8737
8738     r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
8739
8740 #if defined(__aarch64__)
8741     asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
8742 #else
8743     asm volatile("vmsr FPSCR, %0" ::"r"(r));        /* write */
8744 #endif
8745 }
8746
8747 #if defined(__GNUC__) || defined(__clang__)
8748 #pragma pop_macro("ALIGN_STRUCT")
8749 #pragma pop_macro("FORCE_INLINE")
8750 #endif
8751
8752 #if defined(__GNUC__) && !defined(__clang__)
8753 #pragma GCC pop_options
8754 #endif
8755
8756 #endif