1 /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __SMMINTRIN_H 11 #define __SMMINTRIN_H 12 13 #if !defined(__i386__) && !defined(__x86_64__) 14 #error "This header is only meant to be used on x86 and x64 architecture" 15 #endif 16 17 #include <tmmintrin.h> 18 19 /* Define the default attributes for the functions in this file. */ 20 #if defined(__EVEX512__) && !defined(__AVX10_1_512__) 21 #define __DEFAULT_FN_ATTRS \ 22 __attribute__((__always_inline__, __nodebug__, \ 23 __target__("sse4.1,no-evex512"), __min_vector_width__(128))) 24 #else 25 #define __DEFAULT_FN_ATTRS \ 26 __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \ 27 __min_vector_width__(128))) 28 #endif 29 30 /* SSE4 Rounding macros. */ 31 #define _MM_FROUND_TO_NEAREST_INT 0x00 32 #define _MM_FROUND_TO_NEG_INF 0x01 33 #define _MM_FROUND_TO_POS_INF 0x02 34 #define _MM_FROUND_TO_ZERO 0x03 35 #define _MM_FROUND_CUR_DIRECTION 0x04 36 37 #define _MM_FROUND_RAISE_EXC 0x00 38 #define _MM_FROUND_NO_EXC 0x08 39 40 #define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) 41 #define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) 42 #define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) 43 #define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) 44 #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) 45 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) 46 47 /// Rounds up each element of the 128-bit vector of [4 x float] to an 48 /// integer and returns the rounded values in a 128-bit vector of 49 /// [4 x float]. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)

/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)

/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M)                                                     \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M)                                                  \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
                                  (M)))

/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M)                                                     \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M)                                                  \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
                                   (M)))

/* SSE4 Packed Blending Intrinsics.  */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M)                                                \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
                                   (__v2df)(__m128d)(V2), (int)(M)))

/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M)                                                \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
                                  (int)(M)))

/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
///
/// \param __V1
///    A 128-bit vector of [2 x double].
/// \param __V2
///    A 128-bit vector of [2 x double].
/// \param __M
///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    most significant bit of a copied value. When a mask bit is 0, the
///    corresponding 64-bit element in operand \a __V1 is copied to the same
///    position in the result. When a mask bit is 1, the corresponding 64-bit
///    element in operand \a __V2 is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
442 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1, 443 __m128d __V2, 444 __m128d __M) { 445 return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2, 446 (__v2df)__M); 447 } 448 449 /// Returns a 128-bit vector of [4 x float] where the values are 450 /// selected from either the first or second operand as specified by the 451 /// third operand, the control mask. 452 /// 453 /// \headerfile <x86intrin.h> 454 /// 455 /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. 456 /// 457 /// \param __V1 458 /// A 128-bit vector of [4 x float]. 459 /// \param __V2 460 /// A 128-bit vector of [4 x float]. 461 /// \param __M 462 /// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying 463 /// how the values are to be copied. The position of the mask bit corresponds 464 /// to the most significant bit of a copied value. When a mask bit is 0, the 465 /// corresponding 32-bit element in operand \a __V1 is copied to the same 466 /// position in the result. When a mask bit is 1, the corresponding 32-bit 467 /// element in operand \a __V2 is copied to the same position in the result. 468 /// \returns A 128-bit vector of [4 x float] containing the copied values. 469 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1, 470 __m128 __V2, 471 __m128 __M) { 472 return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2, 473 (__v4sf)__M); 474 } 475 476 /// Returns a 128-bit vector of [16 x i8] where the values are selected 477 /// from either of the first or second operand as specified by the third 478 /// operand, the control mask. 479 /// 480 /// \headerfile <x86intrin.h> 481 /// 482 /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. 483 /// 484 /// \param __V1 485 /// A 128-bit vector of [16 x i8]. 486 /// \param __V2 487 /// A 128-bit vector of [16 x i8]. 
488 /// \param __M 489 /// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying 490 /// how the values are to be copied. The position of the mask bit corresponds 491 /// to the most significant bit of a copied value. When a mask bit is 0, the 492 /// corresponding 8-bit element in operand \a __V1 is copied to the same 493 /// position in the result. When a mask bit is 1, the corresponding 8-bit 494 /// element in operand \a __V2 is copied to the same position in the result. 495 /// \returns A 128-bit vector of [16 x i8] containing the copied values. 496 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1, 497 __m128i __V2, 498 __m128i __M) { 499 return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2, 500 (__v16qi)__M); 501 } 502 503 /// Returns a 128-bit vector of [8 x i16] where the values are selected 504 /// from either of the first or second operand as specified by the third 505 /// operand, the control mask. 506 /// 507 /// \headerfile <x86intrin.h> 508 /// 509 /// \code 510 /// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); 511 /// \endcode 512 /// 513 /// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction. 514 /// 515 /// \param V1 516 /// A 128-bit vector of [8 x i16]. 517 /// \param V2 518 /// A 128-bit vector of [8 x i16]. 519 /// \param M 520 /// An immediate integer operand, with mask bits [7:0] specifying how the 521 /// values are to be copied. The position of the mask bit corresponds to the 522 /// index of a copied value. When a mask bit is 0, the corresponding 16-bit 523 /// element in operand \a V1 is copied to the same position in the result. 524 /// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 525 /// is copied to the same position in the result. 526 /// \returns A 128-bit vector of [8 x i16] containing the copied values. 
527 #define _mm_blend_epi16(V1, V2, M) \ 528 ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \ 529 (__v8hi)(__m128i)(V2), (int)(M))) 530 531 /* SSE4 Dword Multiply Instructions. */ 532 /// Multiples corresponding elements of two 128-bit vectors of [4 x i32] 533 /// and returns the lower 32 bits of the each product in a 128-bit vector of 534 /// [4 x i32]. 535 /// 536 /// \headerfile <x86intrin.h> 537 /// 538 /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. 539 /// 540 /// \param __V1 541 /// A 128-bit integer vector. 542 /// \param __V2 543 /// A 128-bit integer vector. 544 /// \returns A 128-bit integer vector containing the products of both operands. 545 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1, 546 __m128i __V2) { 547 return (__m128i)((__v4su)__V1 * (__v4su)__V2); 548 } 549 550 /// Multiplies corresponding even-indexed elements of two 128-bit 551 /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] 552 /// containing the products. 553 /// 554 /// \headerfile <x86intrin.h> 555 /// 556 /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. 557 /// 558 /// \param __V1 559 /// A 128-bit vector of [4 x i32]. 560 /// \param __V2 561 /// A 128-bit vector of [4 x i32]. 562 /// \returns A 128-bit vector of [2 x i64] containing the products of both 563 /// operands. 564 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1, 565 __m128i __V2) { 566 return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2); 567 } 568 569 /* SSE4 Floating Point Dot Product Instructions. */ 570 /// Computes the dot product of the two 128-bit vectors of [4 x float] 571 /// and returns it in the elements of the 128-bit result vector of 572 /// [4 x float]. 573 /// 574 /// The immediate integer operand controls which input elements 575 /// will contribute to the dot product, and where the final results are 576 /// returned. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param Y
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand. Mask bits [7:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [3:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [3] corresponding to the highest element of
///    each [4 x float] subvector. If a bit is set, the dot product is returned
///    in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M)                                                     \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))

/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
623 /// \param Y 624 /// A 128-bit vector of [2 x double]. 625 /// \param M 626 /// An immediate integer operand. Mask bits [5:4] determine which elements 627 /// of the input vectors are used, with bit [4] corresponding to the lowest 628 /// element and bit [5] corresponding to the highest element of each of [2 x 629 /// double] vector. If a bit is set, the corresponding elements from the two 630 /// input vectors are used as an input for dot product; otherwise that input 631 /// is treated as zero. Bits [1:0] determine which elements of the result 632 /// will receive a copy of the final dot product, with bit [0] corresponding 633 /// to the lowest element and bit [1] corresponding to the highest element of 634 /// each [2 x double] vector. If a bit is set, the dot product is returned in 635 /// the corresponding element; otherwise that element is set to zero. 636 #define _mm_dp_pd(X, Y, M) \ 637 ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ 638 (M))) 639 640 /* SSE4 Streaming Load Hint Instruction. */ 641 /// Loads integer values from a 128-bit aligned memory location to a 642 /// 128-bit integer vector. 643 /// 644 /// \headerfile <x86intrin.h> 645 /// 646 /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. 647 /// 648 /// \param __V 649 /// A pointer to a 128-bit aligned memory location that contains the integer 650 /// values. 651 /// \returns A 128-bit integer vector containing the data stored at the 652 /// specified memory location. 653 static __inline__ __m128i __DEFAULT_FN_ATTRS 654 _mm_stream_load_si128(const void *__V) { 655 return (__m128i)__builtin_nontemporal_load((const __v2di *)__V); 656 } 657 658 /* SSE4 Packed Integer Min/Max Instructions. */ 659 /// Compares the corresponding elements of two 128-bit vectors of 660 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser 661 /// of the two values. 
662 /// 663 /// \headerfile <x86intrin.h> 664 /// 665 /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. 666 /// 667 /// \param __V1 668 /// A 128-bit vector of [16 x i8]. 669 /// \param __V2 670 /// A 128-bit vector of [16 x i8] 671 /// \returns A 128-bit vector of [16 x i8] containing the lesser values. 672 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1, 673 __m128i __V2) { 674 return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2); 675 } 676 677 /// Compares the corresponding elements of two 128-bit vectors of 678 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the 679 /// greater value of the two. 680 /// 681 /// \headerfile <x86intrin.h> 682 /// 683 /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. 684 /// 685 /// \param __V1 686 /// A 128-bit vector of [16 x i8]. 687 /// \param __V2 688 /// A 128-bit vector of [16 x i8]. 689 /// \returns A 128-bit vector of [16 x i8] containing the greater values. 690 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1, 691 __m128i __V2) { 692 return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2); 693 } 694 695 /// Compares the corresponding elements of two 128-bit vectors of 696 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser 697 /// value of the two. 698 /// 699 /// \headerfile <x86intrin.h> 700 /// 701 /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. 702 /// 703 /// \param __V1 704 /// A 128-bit vector of [8 x u16]. 705 /// \param __V2 706 /// A 128-bit vector of [8 x u16]. 707 /// \returns A 128-bit vector of [8 x u16] containing the lesser values. 
708 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1, 709 __m128i __V2) { 710 return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2); 711 } 712 713 /// Compares the corresponding elements of two 128-bit vectors of 714 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the 715 /// greater value of the two. 716 /// 717 /// \headerfile <x86intrin.h> 718 /// 719 /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. 720 /// 721 /// \param __V1 722 /// A 128-bit vector of [8 x u16]. 723 /// \param __V2 724 /// A 128-bit vector of [8 x u16]. 725 /// \returns A 128-bit vector of [8 x u16] containing the greater values. 726 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1, 727 __m128i __V2) { 728 return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2); 729 } 730 731 /// Compares the corresponding elements of two 128-bit vectors of 732 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser 733 /// value of the two. 734 /// 735 /// \headerfile <x86intrin.h> 736 /// 737 /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. 738 /// 739 /// \param __V1 740 /// A 128-bit vector of [4 x i32]. 741 /// \param __V2 742 /// A 128-bit vector of [4 x i32]. 743 /// \returns A 128-bit vector of [4 x i32] containing the lesser values. 744 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1, 745 __m128i __V2) { 746 return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2); 747 } 748 749 /// Compares the corresponding elements of two 128-bit vectors of 750 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the 751 /// greater value of the two. 752 /// 753 /// \headerfile <x86intrin.h> 754 /// 755 /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. 756 /// 757 /// \param __V1 758 /// A 128-bit vector of [4 x i32]. 
759 /// \param __V2 760 /// A 128-bit vector of [4 x i32]. 761 /// \returns A 128-bit vector of [4 x i32] containing the greater values. 762 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1, 763 __m128i __V2) { 764 return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2); 765 } 766 767 /// Compares the corresponding elements of two 128-bit vectors of 768 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser 769 /// value of the two. 770 /// 771 /// \headerfile <x86intrin.h> 772 /// 773 /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. 774 /// 775 /// \param __V1 776 /// A 128-bit vector of [4 x u32]. 777 /// \param __V2 778 /// A 128-bit vector of [4 x u32]. 779 /// \returns A 128-bit vector of [4 x u32] containing the lesser values. 780 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1, 781 __m128i __V2) { 782 return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2); 783 } 784 785 /// Compares the corresponding elements of two 128-bit vectors of 786 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the 787 /// greater value of the two. 788 /// 789 /// \headerfile <x86intrin.h> 790 /// 791 /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. 792 /// 793 /// \param __V1 794 /// A 128-bit vector of [4 x u32]. 795 /// \param __V2 796 /// A 128-bit vector of [4 x u32]. 797 /// \returns A 128-bit vector of [4 x u32] containing the greater values. 798 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1, 799 __m128i __V2) { 800 return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2); 801 } 802 803 /* SSE4 Insertion and Extraction from XMM Register Instructions. */ 804 /// Takes the first argument \a X and inserts an element from the second 805 /// argument \a Y as selected by the third argument \a N. 
That result then 806 /// has elements zeroed out also as selected by the third argument \a N. The 807 /// resulting 128-bit vector of [4 x float] is then returned. 808 /// 809 /// \headerfile <x86intrin.h> 810 /// 811 /// \code 812 /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); 813 /// \endcode 814 /// 815 /// This intrinsic corresponds to the <c> VINSERTPS </c> instruction. 816 /// 817 /// \param X 818 /// A 128-bit vector source operand of [4 x float]. With the exception of 819 /// those bits in the result copied from parameter \a Y and zeroed by bits 820 /// [3:0] of \a N, all bits from this parameter are copied to the result. 821 /// \param Y 822 /// A 128-bit vector source operand of [4 x float]. One single-precision 823 /// floating-point element from this source, as determined by the immediate 824 /// parameter, is copied to the result. 825 /// \param N 826 /// Specifies which bits from operand \a Y will be copied, which bits in the 827 /// result they will be copied to, and which bits in the result will be 828 /// cleared. The following assignments are made: \n 829 /// Bits [7:6] specify the bits to copy from operand \a Y: \n 830 /// 00: Selects bits [31:0] from operand \a Y. \n 831 /// 01: Selects bits [63:32] from operand \a Y. \n 832 /// 10: Selects bits [95:64] from operand \a Y. \n 833 /// 11: Selects bits [127:96] from operand \a Y. \n 834 /// Bits [5:4] specify the bits in the result to which the selected bits 835 /// from operand \a Y are copied: \n 836 /// 00: Copies the selected bits from \a Y to result bits [31:0]. \n 837 /// 01: Copies the selected bits from \a Y to result bits [63:32]. \n 838 /// 10: Copies the selected bits from \a Y to result bits [95:64]. \n 839 /// 11: Copies the selected bits from \a Y to result bits [127:96]. \n 840 /// Bits[3:0]: If any of these bits are set, the corresponding result 841 /// element is cleared. 
/// \returns A 128-bit vector of [4 x float] containing the copied
///    single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))

/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
///    returns it, using the immediate value parameter \a N as a selector.
///    The selected element is bit-cast to \c int, so its bit pattern is
///    returned unchanged rather than being converted to an integer value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] determine which bits from the argument
///    \a X are extracted and returned: \n
///    00: Bits [31:0] of parameter \a X are returned. \n
///    01: Bits [63:32] of parameter \a X are returned. \n
///    10: Bits [95:64] of parameter \a X are returned. \n
///    11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N)                                                   \
  __builtin_bit_cast(                                                          \
      int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))

/* Miscellaneous insert and extract macros.  */
/* Extract a single-precision float from X at index N into D. Unlike
   _mm_extract_ps, the element is assigned to D as a float value (no bit-cast
   to int). D must be an assignable lvalue. */
#define _MM_EXTRACT_FLOAT(D, X, N)                                             \
  do {                                                                         \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N));          \
  } while (0)

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps. X is shifted into bits [7:6] (the
   element selected from the source operand), Y into bits [5:4] (the
   destination element), and Z supplies bits [3:0] (the zeroing mask). */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return. The
   element is inserted into element 0 of a zero vector, and the zeroing mask
   0x0e clears result elements 1 through 3. */
#define _MM_PICK_OUT_PS(X, N)                                                  \
  _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

/* Insert int into packed integer array at index.  */
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
///    of an integer parameter \a I into an offset specified by the immediate
///    value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
///    result and then one of the sixteen elements in the result vector is
///    replaced by the lower 8 bits of \a I.
/// \param I
///    An integer. The lower 8 bits of this operand are written to the result
///    beginning at the offset specified by \a N.
/// \param N
///    An immediate value. Bits [3:0] specify the bit offset in the result at
///    which the lower 8 bits of \a I are written. \n
///    0000: Bits [7:0] of the result are used for insertion. \n
///    0001: Bits [15:8] of the result are used for insertion. \n
///    0010: Bits [23:16] of the result are used for insertion. \n
///    0011: Bits [31:24] of the result are used for insertion. \n
///    0100: Bits [39:32] of the result are used for insertion. \n
///    0101: Bits [47:40] of the result are used for insertion. \n
///    0110: Bits [55:48] of the result are used for insertion. \n
///    0111: Bits [63:56] of the result are used for insertion. \n
///    1000: Bits [71:64] of the result are used for insertion. \n
///    1001: Bits [79:72] of the result are used for insertion. \n
///    1010: Bits [87:80] of the result are used for insertion. \n
///    1011: Bits [95:88] of the result are used for insertion. \n
///    1100: Bits [103:96] of the result are used for insertion. \n
///    1101: Bits [111:104] of the result are used for insertion. \n
///    1110: Bits [119:112] of the result are used for insertion. \n
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N)                                               \
  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I),      \
                                         (int)(N)))

/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 32-bit
///    integer parameter \a I at the offset specified by the immediate value
///    parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
///    result and then one of the four elements in the result vector is
///    replaced by \a I.
/// \param I
///    A 32-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bits [1:0] specify the bit offset in the result at
///    which the integer \a I is written. \n
///    00: Bits [31:0] of the result are used for insertion. \n
///    01: Bits [63:32] of the result are used for insertion. \n
///    10: Bits [95:64] of the result are used for insertion. \n
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N)                                              \
  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I),        \
                                        (int)(N)))

/* The 64-bit element insertion below is only defined when compiling for
   x86-64. */
#ifdef __x86_64__
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 64-bit
///    integer parameter \a I, using the immediate value parameter \a N as an
///    insertion location selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
///    result and then one of the two elements in the result vector is replaced
///    by \a I.
/// \param I
///    A 64-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bit [0] specifies the bit offset in the result at
///    which the integer \a I is written. \n
///    0: Bits [63:0] of the result are used for insertion. \n
///    1: Bits [127:64] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N)                                              \
  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I),  \
                                        (int)(N)))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index.  This returns the element
 *  as a zero extended value, so it is unsigned.
 */
/// Extracts an 8-bit element from the 128-bit integer vector of
///    [16 x i8], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] specify which 8-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    0000: Bits [7:0] of the parameter \a X are extracted. \n
///    0001: Bits [15:8] of the parameter \a X are extracted. \n
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
///    1100: Bits [103:96] of the parameter \a X are extracted. \n
///    1101: Bits [111:104] of the parameter \a X are extracted. \n
///    1110: Bits [119:112] of the parameter \a X are extracted. \n
///    1111: Bits [127:120] of the parameter \a X are extracted.
/// \returns An unsigned integer, whose lower 8 bits are selected from the
///    128-bit integer vector parameter and the remaining bits are assigned
///    zeros.
1034 #define _mm_extract_epi8(X, N) \ 1035 ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ 1036 (int)(N))) 1037 1038 /// Extracts a 32-bit element from the 128-bit integer vector of 1039 /// [4 x i32], using the immediate value parameter \a N as a selector. 1040 /// 1041 /// \headerfile <x86intrin.h> 1042 /// 1043 /// \code 1044 /// int _mm_extract_epi32(__m128i X, const int N); 1045 /// \endcode 1046 /// 1047 /// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction. 1048 /// 1049 /// \param X 1050 /// A 128-bit integer vector. 1051 /// \param N 1052 /// An immediate value. Bits [1:0] specify which 32-bit vector element from 1053 /// the argument \a X to extract and copy to the result. \n 1054 /// 00: Bits [31:0] of the parameter \a X are extracted. \n 1055 /// 01: Bits [63:32] of the parameter \a X are extracted. \n 1056 /// 10: Bits [95:64] of the parameter \a X are extracted. \n 1057 /// 11: Bits [127:96] of the parameter \a X are exracted. 1058 /// \returns An integer, whose lower 32 bits are selected from the 128-bit 1059 /// integer vector parameter and the remaining bits are assigned zeros. 1060 #define _mm_extract_epi32(X, N) \ 1061 ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))) 1062 1063 /// Extracts a 64-bit element from the 128-bit integer vector of 1064 /// [2 x i64], using the immediate value parameter \a N as a selector. 1065 /// 1066 /// \headerfile <x86intrin.h> 1067 /// 1068 /// \code 1069 /// long long _mm_extract_epi64(__m128i X, const int N); 1070 /// \endcode 1071 /// 1072 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction 1073 /// in 64-bit mode. 1074 /// 1075 /// \param X 1076 /// A 128-bit integer vector. 1077 /// \param N 1078 /// An immediate value. Bit [0] specifies which 64-bit vector element from 1079 /// the argument \a X to return. \n 1080 /// 0: Bits [63:0] are returned. \n 1081 /// 1: Bits [127:64] are returned. 
\n 1082 /// \returns A 64-bit integer. 1083 #define _mm_extract_epi64(X, N) \ 1084 ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))) 1085 1086 /* SSE4 128-bit Packed Integer Comparisons. */ 1087 /// Tests whether the specified bits in a 128-bit integer vector are all 1088 /// zeros. 1089 /// 1090 /// \headerfile <x86intrin.h> 1091 /// 1092 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1093 /// 1094 /// \param __M 1095 /// A 128-bit integer vector containing the bits to be tested. 1096 /// \param __V 1097 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1098 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1099 static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M, 1100 __m128i __V) { 1101 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 1102 } 1103 1104 /// Tests whether the specified bits in a 128-bit integer vector are all 1105 /// ones. 1106 /// 1107 /// \headerfile <x86intrin.h> 1108 /// 1109 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1110 /// 1111 /// \param __M 1112 /// A 128-bit integer vector containing the bits to be tested. 1113 /// \param __V 1114 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1115 /// \returns TRUE if the specified bits are all ones; FALSE otherwise. 1116 static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M, 1117 __m128i __V) { 1118 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 1119 } 1120 1121 /// Tests whether the specified bits in a 128-bit integer vector are 1122 /// neither all zeros nor all ones. 1123 /// 1124 /// \headerfile <x86intrin.h> 1125 /// 1126 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1127 /// 1128 /// \param __M 1129 /// A 128-bit integer vector containing the bits to be tested. 
1130 /// \param __V 1131 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1132 /// \returns TRUE if the specified bits are neither all zeros nor all ones; 1133 /// FALSE otherwise. 1134 static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, 1135 __m128i __V) { 1136 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 1137 } 1138 1139 /// Tests whether the specified bits in a 128-bit integer vector are all 1140 /// ones. 1141 /// 1142 /// \headerfile <x86intrin.h> 1143 /// 1144 /// \code 1145 /// int _mm_test_all_ones(__m128i V); 1146 /// \endcode 1147 /// 1148 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1149 /// 1150 /// \param V 1151 /// A 128-bit integer vector containing the bits to be tested. 1152 /// \returns TRUE if the bits specified in the operand are all set to 1; FALSE 1153 /// otherwise. 1154 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1)) 1155 1156 /// Tests whether the specified bits in a 128-bit integer vector are 1157 /// neither all zeros nor all ones. 1158 /// 1159 /// \headerfile <x86intrin.h> 1160 /// 1161 /// \code 1162 /// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); 1163 /// \endcode 1164 /// 1165 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1166 /// 1167 /// \param M 1168 /// A 128-bit integer vector containing the bits to be tested. 1169 /// \param V 1170 /// A 128-bit integer vector selecting which bits to test in operand \a M. 1171 /// \returns TRUE if the specified bits are neither all zeros nor all ones; 1172 /// FALSE otherwise. 1173 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) 1174 1175 /// Tests whether the specified bits in a 128-bit integer vector are all 1176 /// zeros. 
1177 /// 1178 /// \headerfile <x86intrin.h> 1179 /// 1180 /// \code 1181 /// int _mm_test_all_zeros(__m128i M, __m128i V); 1182 /// \endcode 1183 /// 1184 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1185 /// 1186 /// \param M 1187 /// A 128-bit integer vector containing the bits to be tested. 1188 /// \param V 1189 /// A 128-bit integer vector selecting which bits to test in operand \a M. 1190 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1191 #define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V)) 1192 1193 /* SSE4 64-bit Packed Integer Comparisons. */ 1194 /// Compares each of the corresponding 64-bit values of the 128-bit 1195 /// integer vectors for equality. 1196 /// 1197 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 1198 /// 1199 /// \headerfile <x86intrin.h> 1200 /// 1201 /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction. 1202 /// 1203 /// \param __V1 1204 /// A 128-bit integer vector. 1205 /// \param __V2 1206 /// A 128-bit integer vector. 1207 /// \returns A 128-bit integer vector containing the comparison results. 1208 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1, 1209 __m128i __V2) { 1210 return (__m128i)((__v2di)__V1 == (__v2di)__V2); 1211 } 1212 1213 /* SSE4 Packed Integer Sign-Extension. */ 1214 /// Sign-extends each of the lower eight 8-bit integer elements of a 1215 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1216 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1217 /// are unused. 1218 /// 1219 /// \headerfile <x86intrin.h> 1220 /// 1221 /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. 1222 /// 1223 /// \param __V 1224 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 1225 /// sign-extended to 16-bit values. 1226 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. 
1227 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) { 1228 /* This function always performs a signed extension, but __v16qi is a char 1229 which may be signed or unsigned, so use __v16qs. */ 1230 return (__m128i) __builtin_convertvector( 1231 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 1232 7), 1233 __v8hi); 1234 } 1235 1236 /// Sign-extends each of the lower four 8-bit integer elements of a 1237 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1238 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1239 /// vector are unused. 1240 /// 1241 /// \headerfile <x86intrin.h> 1242 /// 1243 /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. 1244 /// 1245 /// \param __V 1246 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 1247 /// sign-extended to 32-bit values. 1248 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1249 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) { 1250 /* This function always performs a signed extension, but __v16qi is a char 1251 which may be signed or unsigned, so use __v16qs. */ 1252 return (__m128i) __builtin_convertvector( 1253 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); 1254 } 1255 1256 /// Sign-extends each of the lower two 8-bit integer elements of a 1257 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1258 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1259 /// vector are unused. 1260 /// 1261 /// \headerfile <x86intrin.h> 1262 /// 1263 /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. 1264 /// 1265 /// \param __V 1266 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1267 /// sign-extended to 64-bit values. 1268 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 
1269 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) { 1270 /* This function always performs a signed extension, but __v16qi is a char 1271 which may be signed or unsigned, so use __v16qs. */ 1272 return (__m128i) __builtin_convertvector( 1273 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); 1274 } 1275 1276 /// Sign-extends each of the lower four 16-bit integer elements of a 1277 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1278 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1279 /// vector are unused. 1280 /// 1281 /// \headerfile <x86intrin.h> 1282 /// 1283 /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. 1284 /// 1285 /// \param __V 1286 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1287 /// sign-extended to 32-bit values. 1288 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1289 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) { 1290 return (__m128i) __builtin_convertvector( 1291 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); 1292 } 1293 1294 /// Sign-extends each of the lower two 16-bit integer elements of a 1295 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1296 /// a 128-bit vector of [2 x i64]. The upper six elements of the input 1297 /// vector are unused. 1298 /// 1299 /// \headerfile <x86intrin.h> 1300 /// 1301 /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. 1302 /// 1303 /// \param __V 1304 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1305 /// sign-extended to 64-bit values. 1306 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 
1307 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) { 1308 return (__m128i) __builtin_convertvector( 1309 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); 1310 } 1311 1312 /// Sign-extends each of the lower two 32-bit integer elements of a 1313 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1314 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1315 /// are unused. 1316 /// 1317 /// \headerfile <x86intrin.h> 1318 /// 1319 /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. 1320 /// 1321 /// \param __V 1322 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1323 /// sign-extended to 64-bit values. 1324 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1325 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) { 1326 return (__m128i) __builtin_convertvector( 1327 __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); 1328 } 1329 1330 /* SSE4 Packed Integer Zero-Extension. */ 1331 /// Zero-extends each of the lower eight 8-bit integer elements of a 1332 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1333 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1334 /// are unused. 1335 /// 1336 /// \headerfile <x86intrin.h> 1337 /// 1338 /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. 1339 /// 1340 /// \param __V 1341 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 1342 /// zero-extended to 16-bit values. 1343 /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. 
1344 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) { 1345 return (__m128i) __builtin_convertvector( 1346 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 1347 7), 1348 __v8hi); 1349 } 1350 1351 /// Zero-extends each of the lower four 8-bit integer elements of a 1352 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1353 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1354 /// vector are unused. 1355 /// 1356 /// \headerfile <x86intrin.h> 1357 /// 1358 /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. 1359 /// 1360 /// \param __V 1361 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are 1362 /// zero-extended to 32-bit values. 1363 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1364 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) { 1365 return (__m128i) __builtin_convertvector( 1366 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); 1367 } 1368 1369 /// Zero-extends each of the lower two 8-bit integer elements of a 1370 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1371 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1372 /// vector are unused. 1373 /// 1374 /// \headerfile <x86intrin.h> 1375 /// 1376 /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. 1377 /// 1378 /// \param __V 1379 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are 1380 /// zero-extended to 64-bit values. 1381 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
1382 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) { 1383 return (__m128i) __builtin_convertvector( 1384 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); 1385 } 1386 1387 /// Zero-extends each of the lower four 16-bit integer elements of a 1388 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1389 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1390 /// vector are unused. 1391 /// 1392 /// \headerfile <x86intrin.h> 1393 /// 1394 /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. 1395 /// 1396 /// \param __V 1397 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are 1398 /// zero-extended to 32-bit values. 1399 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1400 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) { 1401 return (__m128i) __builtin_convertvector( 1402 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); 1403 } 1404 1405 /// Zero-extends each of the lower two 16-bit integer elements of a 1406 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1407 /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector 1408 /// are unused. 1409 /// 1410 /// \headerfile <x86intrin.h> 1411 /// 1412 /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. 1413 /// 1414 /// \param __V 1415 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are 1416 /// zero-extended to 64-bit values. 1417 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
1418 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) { 1419 return (__m128i) __builtin_convertvector( 1420 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); 1421 } 1422 1423 /// Zero-extends each of the lower two 32-bit integer elements of a 1424 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1425 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1426 /// are unused. 1427 /// 1428 /// \headerfile <x86intrin.h> 1429 /// 1430 /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. 1431 /// 1432 /// \param __V 1433 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are 1434 /// zero-extended to 64-bit values. 1435 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1436 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) { 1437 return (__m128i) __builtin_convertvector( 1438 __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); 1439 } 1440 1441 /* SSE4 Pack with Unsigned Saturation. */ 1442 /// Converts, with saturation, 32-bit signed integers from both 128-bit integer 1443 /// vector operands into 16-bit unsigned integers, and returns the packed 1444 /// result. 1445 /// 1446 /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than 1447 /// 0x0000 are saturated to 0x0000. 1448 /// 1449 /// \headerfile <x86intrin.h> 1450 /// 1451 /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. 1452 /// 1453 /// \param __V1 1454 /// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are 1455 /// written to the lower 64 bits of the result. 1456 /// \param __V2 1457 /// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are 1458 /// written to the higher 64 bits of the result. 1459 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 
1460 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, 1461 __m128i __V2) { 1462 return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 1463 } 1464 1465 /* SSE4 Multiple Packed Sums of Absolute Difference. */ 1466 /// Subtracts 8-bit unsigned integer values and computes the absolute 1467 /// values of the differences to the corresponding bits in the destination. 1468 /// Then sums of the absolute differences are returned according to the bit 1469 /// fields in the immediate operand. 1470 /// 1471 /// \headerfile <x86intrin.h> 1472 /// 1473 /// \code 1474 /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); 1475 /// \endcode 1476 /// 1477 /// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction. 1478 /// 1479 /// \param X 1480 /// A 128-bit vector of [16 x i8]. 1481 /// \param Y 1482 /// A 128-bit vector of [16 x i8]. 1483 /// \param M 1484 /// An 8-bit immediate operand specifying how the absolute differences are to 1485 /// be calculated, according to the following algorithm: 1486 /// \code 1487 /// // M2 represents bit 2 of the immediate operand 1488 /// // M10 represents bits [1:0] of the immediate operand 1489 /// i = M2 * 4; 1490 /// j = M10 * 4; 1491 /// for (k = 0; k < 8; k = k + 1) { 1492 /// d0 = abs(X[i + k + 0] - Y[j + 0]); 1493 /// d1 = abs(X[i + k + 1] - Y[j + 1]); 1494 /// d2 = abs(X[i + k + 2] - Y[j + 2]); 1495 /// d3 = abs(X[i + k + 3] - Y[j + 3]); 1496 /// r[k] = d0 + d1 + d2 + d3; 1497 /// } 1498 /// \endcode 1499 /// \returns A 128-bit integer vector containing the sums of the sets of 1500 /// absolute differences between both operands. 1501 #define _mm_mpsadbw_epu8(X, Y, M) \ 1502 ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ 1503 (__v16qi)(__m128i)(Y), (M))) 1504 1505 /// Finds the minimum unsigned 16-bit element in the input 128-bit 1506 /// vector of [8 x u16] and returns it and along with its index. 
1507 /// 1508 /// \headerfile <x86intrin.h> 1509 /// 1510 /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c> 1511 /// instruction. 1512 /// 1513 /// \param __V 1514 /// A 128-bit vector of [8 x u16]. 1515 /// \returns A 128-bit value where bits [15:0] contain the minimum value found 1516 /// in parameter \a __V, bits [18:16] contain the index of the minimum value 1517 /// and the remaining bits are set to 0. 1518 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) { 1519 return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V); 1520 } 1521 1522 /* Handle the sse4.2 definitions here. */ 1523 1524 /* These definitions are normally in nmmintrin.h, but gcc puts them in here 1525 so we'll do the same. */ 1526 1527 #undef __DEFAULT_FN_ATTRS 1528 #define __DEFAULT_FN_ATTRS \ 1529 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) 1530 1531 /* These specify the type of data that we're comparing. */ 1532 #define _SIDD_UBYTE_OPS 0x00 1533 #define _SIDD_UWORD_OPS 0x01 1534 #define _SIDD_SBYTE_OPS 0x02 1535 #define _SIDD_SWORD_OPS 0x03 1536 1537 /* These specify the type of comparison operation. */ 1538 #define _SIDD_CMP_EQUAL_ANY 0x00 1539 #define _SIDD_CMP_RANGES 0x04 1540 #define _SIDD_CMP_EQUAL_EACH 0x08 1541 #define _SIDD_CMP_EQUAL_ORDERED 0x0c 1542 1543 /* These macros specify the polarity of the operation. */ 1544 #define _SIDD_POSITIVE_POLARITY 0x00 1545 #define _SIDD_NEGATIVE_POLARITY 0x10 1546 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20 1547 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 1548 1549 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1550 #define _SIDD_LEAST_SIGNIFICANT 0x00 1551 #define _SIDD_MOST_SIGNIFICANT 0x40 1552 1553 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1554 #define _SIDD_BIT_MASK 0x00 1555 #define _SIDD_UNIT_MASK 0x40 1556 1557 /* SSE4.2 Packed Comparison Intrinsics. 
*/ 1558 /// Uses the immediate operand \a M to perform a comparison of string 1559 /// data with implicitly defined lengths that is contained in source operands 1560 /// \a A and \a B. Returns a 128-bit integer vector representing the result 1561 /// mask of the comparison. 1562 /// 1563 /// \headerfile <x86intrin.h> 1564 /// 1565 /// \code 1566 /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); 1567 /// \endcode 1568 /// 1569 /// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c> 1570 /// instruction. 1571 /// 1572 /// \param A 1573 /// A 128-bit integer vector containing one of the source operands to be 1574 /// compared. 1575 /// \param B 1576 /// A 128-bit integer vector containing one of the source operands to be 1577 /// compared. 1578 /// \param M 1579 /// An 8-bit immediate operand specifying whether the characters are bytes or 1580 /// words, the type of comparison to perform, and the format of the return 1581 /// value. \n 1582 /// Bits [1:0]: Determine source data format. \n 1583 /// 00: 16 unsigned bytes \n 1584 /// 01: 8 unsigned words \n 1585 /// 10: 16 signed bytes \n 1586 /// 11: 8 signed words \n 1587 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1588 /// 00: Subset: Each character in \a B is compared for equality with all 1589 /// the characters in \a A. \n 1590 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1591 /// basis is greater than or equal for even-indexed elements in \a A, 1592 /// and less than or equal for odd-indexed elements in \a A. \n 1593 /// 10: Match: Compare each pair of corresponding characters in \a A and 1594 /// \a B for equality. \n 1595 /// 11: Substring: Search \a B for substring matches of \a A. \n 1596 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1597 /// mask of the comparison results. \n 1598 /// 00: No effect. \n 1599 /// 01: Negate the bit mask. \n 1600 /// 10: No effect. 
\n 1601 /// 11: Negate the bit mask only for bits with an index less than or equal 1602 /// to the size of \a A or \a B. \n 1603 /// Bit [6]: Determines whether the result is zero-extended or expanded to 16 1604 /// bytes. \n 1605 /// 0: The result is zero-extended to 16 bytes. \n 1606 /// 1: The result is expanded to 16 bytes (this expansion is performed by 1607 /// repeating each bit 8 or 16 times). 1608 /// \returns Returns a 128-bit integer vector representing the result mask of 1609 /// the comparison. 1610 #define _mm_cmpistrm(A, B, M) \ 1611 ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ 1612 (__v16qi)(__m128i)(B), (int)(M))) 1613 1614 /// Uses the immediate operand \a M to perform a comparison of string 1615 /// data with implicitly defined lengths that is contained in source operands 1616 /// \a A and \a B. Returns an integer representing the result index of the 1617 /// comparison. 1618 /// 1619 /// \headerfile <x86intrin.h> 1620 /// 1621 /// \code 1622 /// int _mm_cmpistri(__m128i A, __m128i B, const int M); 1623 /// \endcode 1624 /// 1625 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 1626 /// instruction. 1627 /// 1628 /// \param A 1629 /// A 128-bit integer vector containing one of the source operands to be 1630 /// compared. 1631 /// \param B 1632 /// A 128-bit integer vector containing one of the source operands to be 1633 /// compared. 1634 /// \param M 1635 /// An 8-bit immediate operand specifying whether the characters are bytes or 1636 /// words, the type of comparison to perform, and the format of the return 1637 /// value. \n 1638 /// Bits [1:0]: Determine source data format. \n 1639 /// 00: 16 unsigned bytes \n 1640 /// 01: 8 unsigned words \n 1641 /// 10: 16 signed bytes \n 1642 /// 11: 8 signed words \n 1643 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1644 /// 00: Subset: Each character in \a B is compared for equality with all 1645 /// the characters in \a A. 
\n 1646 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1647 /// basis is greater than or equal for even-indexed elements in \a A, 1648 /// and less than or equal for odd-indexed elements in \a A. \n 1649 /// 10: Match: Compare each pair of corresponding characters in \a A and 1650 /// \a B for equality. \n 1651 /// 11: Substring: Search B for substring matches of \a A. \n 1652 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1653 /// mask of the comparison results. \n 1654 /// 00: No effect. \n 1655 /// 01: Negate the bit mask. \n 1656 /// 10: No effect. \n 1657 /// 11: Negate the bit mask only for bits with an index less than or equal 1658 /// to the size of \a A or \a B. \n 1659 /// Bit [6]: Determines whether the index of the lowest set bit or the 1660 /// highest set bit is returned. \n 1661 /// 0: The index of the least significant set bit. \n 1662 /// 1: The index of the most significant set bit. \n 1663 /// \returns Returns an integer representing the result index of the comparison. 1664 #define _mm_cmpistri(A, B, M) \ 1665 ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ 1666 (__v16qi)(__m128i)(B), (int)(M))) 1667 1668 /// Uses the immediate operand \a M to perform a comparison of string 1669 /// data with explicitly defined lengths that is contained in source operands 1670 /// \a A and \a B. Returns a 128-bit integer vector representing the result 1671 /// mask of the comparison. 1672 /// 1673 /// \headerfile <x86intrin.h> 1674 /// 1675 /// \code 1676 /// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M); 1677 /// \endcode 1678 /// 1679 /// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c> 1680 /// instruction. 1681 /// 1682 /// \param A 1683 /// A 128-bit integer vector containing one of the source operands to be 1684 /// compared. 1685 /// \param LA 1686 /// An integer that specifies the length of the string in \a A. 
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M)                                          \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA),      \
                                        (__v16qi)(__m128i)(B), (int)(LB),      \
                                        (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit.
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA),          \
                                    (__v16qi)(__m128i)(B), (int)(LB),          \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA),         \
                                     (__v16qi)(__m128i)(B), (int)(LB),         \
                                     (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
2302 #define _mm_cmpestrz(A, LA, B, LB, M) \ 2303 ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ 2304 (__v16qi)(__m128i)(B), (int)(LB), \ 2305 (int)(M))) 2306 2307 /* SSE4.2 Compare Packed Data -- Greater Than. */ 2308 /// Compares each of the corresponding 64-bit values of the 128-bit 2309 /// integer vectors to determine if the values in the first operand are 2310 /// greater than those in the second operand. 2311 /// 2312 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 2313 /// 2314 /// \headerfile <x86intrin.h> 2315 /// 2316 /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction. 2317 /// 2318 /// \param __V1 2319 /// A 128-bit integer vector. 2320 /// \param __V2 2321 /// A 128-bit integer vector. 2322 /// \returns A 128-bit integer vector containing the comparison results. 2323 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1, 2324 __m128i __V2) { 2325 return (__m128i)((__v2di)__V1 > (__v2di)__V2); 2326 } 2327 2328 #undef __DEFAULT_FN_ATTRS 2329 2330 #include <popcntintrin.h> 2331 2332 #include <crc32intrin.h> 2333 2334 #endif /* __SMMINTRIN_H */ 2335