xref: /llvm-project/clang/lib/Headers/xmmintrin.h (revision 8306114ed2313a7febdb0d0d0c31df357ed53fdd)
1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <mmintrin.h>
18 
/* Internal element-wise views of a 128-bit vector: 4 x i32 and 4 x f32. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
/* Public [4 x float] vector type; naturally 16-byte aligned. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Byte-aligned variant of __m128 for unaligned load/store intrinsics. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
27 
28 /* This header should only be included in a hosted environment as it depends on
29  * a standard library to provide allocation routines. */
30 #if __STDC_HOSTED__
31 #include <mm_malloc.h>
32 #endif
33 
/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
/* __EVEX512__ without AVX10.1-512: append "no-evex512" to the target string
 * so these 128-bit operations are never compiled with 512-bit EVEX forms. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#endif

/* In C++11 and later, intrinsics implemented with plain vector arithmetic
 * additionally get `constexpr`; in C the *_CONSTEXPR names are identical to
 * the plain attribute sets. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
#endif
58 
/* Shuffle helpers for converting between 64-bit (__m64) and 128-bit vectors. */
/* __trunc64: keep only the low 64 bits of a 128-bit vector, as an __m64. */
#define __trunc64(x)                                                           \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
/* __zext128: widen a 64-bit vector to 128 bits, zeroing the upper 64 bits
 * (indices 2,3 select elements of the zero vector). */
#define __zext128(x)                                                           \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, 2, 3)
/* __anyext128: widen a 64-bit vector to 128 bits; the upper 64 bits are
 * undefined (-1 shuffle indices), letting the compiler pick the cheapest form. */
#define __anyext128(x)                                                         \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, -1, -1)
/* __zeroupper64: clear the upper 64 bits of a 128-bit vector in place
 * (indices 4,5 select elements of the zero vector). */
#define __zeroupper64(x)                                                       \
  (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,   \
                                    1, 4, 5)
70 
71 /// Adds the 32-bit float values in the low-order bits of the operands.
72 ///
73 /// \headerfile <x86intrin.h>
74 ///
75 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
76 ///
77 /// \param __a
78 ///    A 128-bit vector of [4 x float] containing one of the source operands.
79 ///    The lower 32 bits of this operand are used in the calculation.
80 /// \param __b
81 ///    A 128-bit vector of [4 x float] containing one of the source operands.
82 ///    The lower 32 bits of this operand are used in the calculation.
83 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
84 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
85 ///    the upper 96 bits of the first source operand.
86 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
87 _mm_add_ss(__m128 __a, __m128 __b) {
88   __a[0] += __b[0];
89   return __a;
90 }
91 
92 /// Adds two 128-bit vectors of [4 x float], and returns the results of
93 ///    the addition.
94 ///
95 /// \headerfile <x86intrin.h>
96 ///
97 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
98 ///
99 /// \param __a
100 ///    A 128-bit vector of [4 x float] containing one of the source operands.
101 /// \param __b
102 ///    A 128-bit vector of [4 x float] containing one of the source operands.
103 /// \returns A 128-bit vector of [4 x float] containing the sums of both
104 ///    operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_ps(__m128 __a, __m128 __b) {
  /* Element-wise single-precision add across all four lanes. */
  return (__m128)((__v4sf)__a + (__v4sf)__b);
}
109 
110 /// Subtracts the 32-bit float value in the low-order bits of the second
111 ///    operand from the corresponding value in the first operand.
112 ///
113 /// \headerfile <x86intrin.h>
114 ///
115 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
116 ///
117 /// \param __a
118 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
119 ///    of this operand are used in the calculation.
120 /// \param __b
121 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
122 ///    bits of this operand are used in the calculation.
123 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
124 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
125 ///    copied from the upper 96 bits of the first source operand.
126 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
127 _mm_sub_ss(__m128 __a, __m128 __b) {
128   __a[0] -= __b[0];
129   return __a;
130 }
131 
132 /// Subtracts each of the values of the second operand from the first
133 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
134 ///    the results of the subtraction.
135 ///
136 /// \headerfile <x86intrin.h>
137 ///
138 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
139 ///
140 /// \param __a
141 ///    A 128-bit vector of [4 x float] containing the minuend.
142 /// \param __b
143 ///    A 128-bit vector of [4 x float] containing the subtrahend.
144 /// \returns A 128-bit vector of [4 x float] containing the differences between
145 ///    both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_ps(__m128 __a, __m128 __b) {
  /* Element-wise single-precision subtract across all four lanes. */
  return (__m128)((__v4sf)__a - (__v4sf)__b);
}
150 
151 /// Multiplies two 32-bit float values in the low-order bits of the
152 ///    operands.
153 ///
154 /// \headerfile <x86intrin.h>
155 ///
156 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
157 ///
158 /// \param __a
159 ///    A 128-bit vector of [4 x float] containing one of the source operands.
160 ///    The lower 32 bits of this operand are used in the calculation.
161 /// \param __b
162 ///    A 128-bit vector of [4 x float] containing one of the source operands.
163 ///    The lower 32 bits of this operand are used in the calculation.
164 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
165 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
166 ///    bits of the first source operand.
167 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
168 _mm_mul_ss(__m128 __a, __m128 __b) {
169   __a[0] *= __b[0];
170   return __a;
171 }
172 
173 /// Multiplies two 128-bit vectors of [4 x float] and returns the
174 ///    results of the multiplication.
175 ///
176 /// \headerfile <x86intrin.h>
177 ///
178 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
179 ///
180 /// \param __a
181 ///    A 128-bit vector of [4 x float] containing one of the source operands.
182 /// \param __b
183 ///    A 128-bit vector of [4 x float] containing one of the source operands.
184 /// \returns A 128-bit vector of [4 x float] containing the products of both
185 ///    operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_mul_ps(__m128 __a, __m128 __b) {
  /* Element-wise single-precision multiply across all four lanes. */
  return (__m128)((__v4sf)__a * (__v4sf)__b);
}
190 
191 /// Divides the value in the low-order 32 bits of the first operand by
192 ///    the corresponding value in the second operand.
193 ///
194 /// \headerfile <x86intrin.h>
195 ///
196 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
197 ///
198 /// \param __a
199 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
200 ///    bits of this operand are used in the calculation.
201 /// \param __b
202 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
203 ///    of this operand are used in the calculation.
204 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
205 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
206 ///    upper 96 bits of the first source operand.
207 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
208 _mm_div_ss(__m128 __a, __m128 __b) {
209   __a[0] /= __b[0];
210   return __a;
211 }
212 
213 /// Divides two 128-bit vectors of [4 x float].
214 ///
215 /// \headerfile <x86intrin.h>
216 ///
217 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
218 ///
219 /// \param __a
220 ///    A 128-bit vector of [4 x float] containing the dividend.
221 /// \param __b
222 ///    A 128-bit vector of [4 x float] containing the divisor.
223 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
224 ///    operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_div_ps(__m128 __a, __m128 __b) {
  /* Element-wise single-precision divide across all four lanes. */
  return (__m128)((__v4sf)__a / (__v4sf)__b);
}
229 
230 /// Calculates the square root of the value stored in the low-order bits
231 ///    of a 128-bit vector of [4 x float].
232 ///
233 /// \headerfile <x86intrin.h>
234 ///
235 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
236 ///
237 /// \param __a
238 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
239 ///    used in the calculation.
240 /// \returns A 128-bit vector of [4 x float] containing the square root of the
241 ///    value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ss(__m128 __a)
{
  /* Square root of lane 0; lanes 1-3 are passed through from __a. */
  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
}
247 
248 /// Calculates the square roots of the values stored in a 128-bit vector
249 ///    of [4 x float].
250 ///
251 /// \headerfile <x86intrin.h>
252 ///
253 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
254 ///
255 /// \param __a
256 ///    A 128-bit vector of [4 x float].
257 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
258 ///    values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ps(__m128 __a)
{
  /* Element-wise square root across all four lanes. */
  return __builtin_ia32_sqrtps((__v4sf)__a);
}
264 
265 /// Calculates the approximate reciprocal of the value stored in the
266 ///    low-order bits of a 128-bit vector of [4 x float].
267 ///
268 /// \headerfile <x86intrin.h>
269 ///
270 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
271 ///
272 /// \param __a
273 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
274 ///    used in the calculation.
275 /// \returns A 128-bit vector of [4 x float] containing the approximate
276 ///    reciprocal of the value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ss(__m128 __a)
{
  /* Approximate (not IEEE-exact) reciprocal of lane 0; lanes 1-3 pass
     through from __a. */
  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
}
282 
283 /// Calculates the approximate reciprocals of the values stored in a
284 ///    128-bit vector of [4 x float].
285 ///
286 /// \headerfile <x86intrin.h>
287 ///
288 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
289 ///
290 /// \param __a
291 ///    A 128-bit vector of [4 x float].
292 /// \returns A 128-bit vector of [4 x float] containing the approximate
293 ///    reciprocals of the values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ps(__m128 __a)
{
  /* Approximate (not IEEE-exact) reciprocal across all four lanes. */
  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
}
299 
300 /// Calculates the approximate reciprocal of the square root of the value
301 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
302 ///
303 /// \headerfile <x86intrin.h>
304 ///
305 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
306 ///
307 /// \param __a
308 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
309 ///    used in the calculation.
310 /// \returns A 128-bit vector of [4 x float] containing the approximate
311 ///    reciprocal of the square root of the value in the low-order bits of the
312 ///    operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ss(__m128 __a)
{
  /* Approximate reciprocal square root of lane 0; lanes 1-3 pass through
     from __a. */
  return __builtin_ia32_rsqrtss((__v4sf)__a);
}
318 
319 /// Calculates the approximate reciprocals of the square roots of the
320 ///    values stored in a 128-bit vector of [4 x float].
321 ///
322 /// \headerfile <x86intrin.h>
323 ///
324 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
325 ///
326 /// \param __a
327 ///    A 128-bit vector of [4 x float].
328 /// \returns A 128-bit vector of [4 x float] containing the approximate
329 ///    reciprocals of the square roots of the values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ps(__m128 __a)
{
  /* Approximate reciprocal square root across all four lanes. */
  return __builtin_ia32_rsqrtps((__v4sf)__a);
}
335 
336 /// Compares two 32-bit float values in the low-order bits of both
337 ///    operands and returns the lesser value in the low-order bits of the
338 ///    vector of [4 x float].
339 ///
340 ///    If either value in a comparison is NaN, returns the value from \a __b.
341 ///
342 /// \headerfile <x86intrin.h>
343 ///
344 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
345 ///
346 /// \param __a
347 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
348 ///    32 bits of this operand are used in the comparison.
349 /// \param __b
350 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
351 ///    32 bits of this operand are used in the comparison.
352 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
353 ///    minimum value between both operands. The upper 96 bits are copied from
354 ///    the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ss(__m128 __a, __m128 __b)
{
  /* Lane-0 minimum; if either input is NaN the result is __b's lane 0.
     Lanes 1-3 pass through from __a. */
  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
}
360 
361 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
362 ///    of each pair of values.
363 ///
364 ///    If either value in a comparison is NaN, returns the value from \a __b.
365 ///
366 /// \headerfile <x86intrin.h>
367 ///
368 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
369 ///
370 /// \param __a
371 ///    A 128-bit vector of [4 x float] containing one of the operands.
372 /// \param __b
373 ///    A 128-bit vector of [4 x float] containing one of the operands.
374 /// \returns A 128-bit vector of [4 x float] containing the minimum values
375 ///    between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ps(__m128 __a, __m128 __b)
{
  /* Element-wise minimum; NaN in either lane yields the lane from __b. */
  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
}
381 
382 /// Compares two 32-bit float values in the low-order bits of both
383 ///    operands and returns the greater value in the low-order bits of a 128-bit
384 ///    vector of [4 x float].
385 ///
386 ///    If either value in a comparison is NaN, returns the value from \a __b.
387 ///
388 /// \headerfile <x86intrin.h>
389 ///
390 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
391 ///
392 /// \param __a
393 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
394 ///    32 bits of this operand are used in the comparison.
395 /// \param __b
396 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
397 ///    32 bits of this operand are used in the comparison.
398 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
399 ///    maximum value between both operands. The upper 96 bits are copied from
400 ///    the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ss(__m128 __a, __m128 __b)
{
  /* Lane-0 maximum; if either input is NaN the result is __b's lane 0.
     Lanes 1-3 pass through from __a. */
  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
}
406 
407 /// Compares two 128-bit vectors of [4 x float] and returns the greater
408 ///    of each pair of values.
409 ///
410 ///    If either value in a comparison is NaN, returns the value from \a __b.
411 ///
412 /// \headerfile <x86intrin.h>
413 ///
414 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
415 ///
416 /// \param __a
417 ///    A 128-bit vector of [4 x float] containing one of the operands.
418 /// \param __b
419 ///    A 128-bit vector of [4 x float] containing one of the operands.
420 /// \returns A 128-bit vector of [4 x float] containing the maximum values
421 ///    between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ps(__m128 __a, __m128 __b)
{
  /* Element-wise maximum; NaN in either lane yields the lane from __b. */
  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
}
427 
428 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
429 ///
430 /// \headerfile <x86intrin.h>
431 ///
432 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
433 ///
434 /// \param __a
435 ///    A 128-bit vector containing one of the source operands.
436 /// \param __b
437 ///    A 128-bit vector containing one of the source operands.
438 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
439 ///    values between both operands.
440 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
441 _mm_and_ps(__m128 __a, __m128 __b) {
442   return (__m128)((__v4su)__a & (__v4su)__b);
443 }
444 
445 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
446 ///    the one's complement of the values contained in the first source
447 ///    operand.
448 ///
449 /// \headerfile <x86intrin.h>
450 ///
451 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
452 ///
453 /// \param __a
454 ///    A 128-bit vector of [4 x float] containing the first source operand. The
455 ///    one's complement of this value is used in the bitwise AND.
456 /// \param __b
457 ///    A 128-bit vector of [4 x float] containing the second source operand.
458 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
459 ///    one's complement of the first operand and the values in the second
460 ///    operand.
461 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
462 _mm_andnot_ps(__m128 __a, __m128 __b) {
463   return (__m128)(~(__v4su)__a & (__v4su)__b);
464 }
465 
466 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
467 ///
468 /// \headerfile <x86intrin.h>
469 ///
470 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
471 ///
472 /// \param __a
473 ///    A 128-bit vector of [4 x float] containing one of the source operands.
474 /// \param __b
475 ///    A 128-bit vector of [4 x float] containing one of the source operands.
476 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
477 ///    values between both operands.
478 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
479 _mm_or_ps(__m128 __a, __m128 __b) {
480   return (__m128)((__v4su)__a | (__v4su)__b);
481 }
482 
483 /// Performs a bitwise exclusive OR of two 128-bit vectors of
484 ///    [4 x float].
485 ///
486 /// \headerfile <x86intrin.h>
487 ///
488 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
489 ///
490 /// \param __a
491 ///    A 128-bit vector of [4 x float] containing one of the source operands.
492 /// \param __b
493 ///    A 128-bit vector of [4 x float] containing one of the source operands.
494 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
495 ///    of the values between both operands.
496 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
497 _mm_xor_ps(__m128 __a, __m128 __b) {
498   return (__m128)((__v4su)__a ^ (__v4su)__b);
499 }
500 
501 /// Compares two 32-bit float values in the low-order bits of both
502 ///    operands for equality.
503 ///
504 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
505 ///    low-order bits of a vector [4 x float].
506 ///    If either value in a comparison is NaN, returns false.
507 ///
508 /// \headerfile <x86intrin.h>
509 ///
510 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
511 ///
512 /// \param __a
513 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
514 ///    32 bits of this operand are used in the comparison.
515 /// \param __b
516 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
517 ///    32 bits of this operand are used in the comparison.
518 /// \returns A 128-bit vector of [4 x float] containing the comparison results
519 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ss(__m128 __a, __m128 __b)
{
  /* Lane-0 equality compare producing a 0x0 / 0xFFFFFFFF mask in lane 0;
     lanes 1-3 pass through from __a. */
  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
}
525 
526 /// Compares each of the corresponding 32-bit float values of the
527 ///    128-bit vectors of [4 x float] for equality.
528 ///
529 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
530 ///    If either value in a comparison is NaN, returns false.
531 ///
532 /// \headerfile <x86intrin.h>
533 ///
534 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
535 ///
536 /// \param __a
537 ///    A 128-bit vector of [4 x float].
538 /// \param __b
539 ///    A 128-bit vector of [4 x float].
540 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ps(__m128 __a, __m128 __b)
{
  /* Element-wise equality compare; each lane becomes 0x0 or 0xFFFFFFFF. */
  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
}
546 
547 /// Compares two 32-bit float values in the low-order bits of both
548 ///    operands to determine if the value in the first operand is less than the
549 ///    corresponding value in the second operand.
550 ///
551 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
552 ///    low-order bits of a vector of [4 x float].
553 ///    If either value in a comparison is NaN, returns false.
554 ///
555 /// \headerfile <x86intrin.h>
556 ///
557 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
558 ///
559 /// \param __a
560 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
561 ///    32 bits of this operand are used in the comparison.
562 /// \param __b
563 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
564 ///    32 bits of this operand are used in the comparison.
565 /// \returns A 128-bit vector of [4 x float] containing the comparison results
566 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ss(__m128 __a, __m128 __b)
{
  /* Lane-0 "less than" compare producing a 0x0 / 0xFFFFFFFF mask in lane 0;
     lanes 1-3 pass through from __a. */
  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
}
572 
573 /// Compares each of the corresponding 32-bit float values of the
574 ///    128-bit vectors of [4 x float] to determine if the values in the first
575 ///    operand are less than those in the second operand.
576 ///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
578 ///    If either value in a comparison is NaN, returns false.
579 ///
580 /// \headerfile <x86intrin.h>
581 ///
582 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
583 ///
584 /// \param __a
585 ///    A 128-bit vector of [4 x float].
586 /// \param __b
587 ///    A 128-bit vector of [4 x float].
588 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ps(__m128 __a, __m128 __b)
{
  /* Element-wise "less than" compare; each lane becomes 0x0 or 0xFFFFFFFF. */
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
}
594 
595 /// Compares two 32-bit float values in the low-order bits of both
596 ///    operands to determine if the value in the first operand is less than or
597 ///    equal to the corresponding value in the second operand.
598 ///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in
600 ///    the low-order bits of a vector of [4 x float].
601 ///    If either value in a comparison is NaN, returns false.
602 ///
603 /// \headerfile <x86intrin.h>
604 ///
605 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
606 ///
607 /// \param __a
608 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
609 ///    32 bits of this operand are used in the comparison.
610 /// \param __b
611 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
612 ///    32 bits of this operand are used in the comparison.
613 /// \returns A 128-bit vector of [4 x float] containing the comparison results
614 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ss(__m128 __a, __m128 __b)
{
  /* Lane-0 "less than or equal" compare producing a 0x0 / 0xFFFFFFFF mask in
     lane 0; lanes 1-3 pass through from __a. */
  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
}
620 
621 /// Compares each of the corresponding 32-bit float values of the
622 ///    128-bit vectors of [4 x float] to determine if the values in the first
623 ///    operand are less than or equal to those in the second operand.
624 ///
625 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
626 ///    If either value in a comparison is NaN, returns false.
627 ///
628 /// \headerfile <x86intrin.h>
629 ///
630 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
631 ///
632 /// \param __a
633 ///    A 128-bit vector of [4 x float].
634 /// \param __b
635 ///    A 128-bit vector of [4 x float].
636 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ps(__m128 __a, __m128 __b)
{
  /* Element-wise "less than or equal" compare; each lane becomes 0x0 or
     0xFFFFFFFF. */
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
}
642 
643 /// Compares two 32-bit float values in the low-order bits of both
644 ///    operands to determine if the value in the first operand is greater than
645 ///    the corresponding value in the second operand.
646 ///
647 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
648 ///    low-order bits of a vector of [4 x float].
649 ///    If either value in a comparison is NaN, returns false.
650 ///
651 /// \headerfile <x86intrin.h>
652 ///
653 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
654 ///
655 /// \param __a
656 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
657 ///    32 bits of this operand are used in the comparison.
658 /// \param __b
659 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
660 ///    32 bits of this operand are used in the comparison.
661 /// \returns A 128-bit vector of [4 x float] containing the comparison results
662 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  /* No "greater than" builtin exists, so compute __b < __a, then merge:
     shuffle index 4 selects lane 0 of the compare result while indices
     1,2,3 keep __a's upper lanes (preserving scalar-op semantics). */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
670 
671 /// Compares each of the corresponding 32-bit float values of the
672 ///    128-bit vectors of [4 x float] to determine if the values in the first
673 ///    operand are greater than those in the second operand.
674 ///
675 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
676 ///    If either value in a comparison is NaN, returns false.
677 ///
678 /// \headerfile <x86intrin.h>
679 ///
680 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
681 ///
682 /// \param __a
683 ///    A 128-bit vector of [4 x float].
684 /// \param __b
685 ///    A 128-bit vector of [4 x float].
686 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ps(__m128 __a, __m128 __b)
{
  /* a > b is implemented as b < a with swapped operands. */
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
}
692 
693 /// Compares two 32-bit float values in the low-order bits of both
694 ///    operands to determine if the value in the first operand is greater than
695 ///    or equal to the corresponding value in the second operand.
696 ///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
698 ///    low-order bits of a vector of [4 x float].
699 ///    If either value in a comparison is NaN, returns false.
700 ///
701 /// \headerfile <x86intrin.h>
702 ///
703 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
704 ///
705 /// \param __a
706 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
707 ///    32 bits of this operand are used in the comparison.
708 /// \param __b
709 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
710 ///    32 bits of this operand are used in the comparison.
711 /// \returns A 128-bit vector of [4 x float] containing the comparison results
712 ///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  /* No "greater than or equal" builtin exists, so compute __b <= __a, then
     merge: shuffle index 4 selects lane 0 of the compare result while
     indices 1,2,3 keep __a's upper lanes (preserving scalar-op semantics). */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
720 
721 /// Compares each of the corresponding 32-bit float values of the
722 ///    128-bit vectors of [4 x float] to determine if the values in the first
723 ///    operand are greater than or equal to those in the second operand.
724 ///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
726 ///    If either value in a comparison is NaN, returns false.
727 ///
728 /// \headerfile <x86intrin.h>
729 ///
730 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
731 ///
732 /// \param __a
733 ///    A 128-bit vector of [4 x float].
734 /// \param __b
735 ///    A 128-bit vector of [4 x float].
736 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ps(__m128 __a, __m128 __b)
{
  /* a >= b is implemented as b <= a with swapped operands. */
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
}
742 
743 /// Compares two 32-bit float values in the low-order bits of both operands
744 ///    for inequality.
745 ///
746 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
747 ///    low-order bits of a vector of [4 x float].
748 ///    If either value in a comparison is NaN, returns true.
749 ///
750 /// \headerfile <x86intrin.h>
751 ///
752 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
753 ///   instructions.
754 ///
755 /// \param __a
756 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
757 ///    32 bits of this operand are used in the comparison.
758 /// \param __b
759 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
760 ///    32 bits of this operand are used in the comparison.
761 /// \returns A 128-bit vector of [4 x float] containing the comparison results
762 ///    in the low-order bits.
763 static __inline__ __m128 __DEFAULT_FN_ATTRS
764 _mm_cmpneq_ss(__m128 __a, __m128 __b)
765 {
766   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
767 }
768 
769 /// Compares each of the corresponding 32-bit float values of the
770 ///    128-bit vectors of [4 x float] for inequality.
771 ///
772 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
773 ///    If either value in a comparison is NaN, returns true.
774 ///
775 /// \headerfile <x86intrin.h>
776 ///
777 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
778 ///   instructions.
779 ///
780 /// \param __a
781 ///    A 128-bit vector of [4 x float].
782 /// \param __b
783 ///    A 128-bit vector of [4 x float].
784 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
785 static __inline__ __m128 __DEFAULT_FN_ATTRS
786 _mm_cmpneq_ps(__m128 __a, __m128 __b)
787 {
788   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
789 }
790 
791 /// Compares two 32-bit float values in the low-order bits of both
792 ///    operands to determine if the value in the first operand is not less than
793 ///    the corresponding value in the second operand.
794 ///
795 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
796 ///    low-order bits of a vector of [4 x float].
797 ///    If either value in a comparison is NaN, returns true.
798 ///
799 /// \headerfile <x86intrin.h>
800 ///
801 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
802 ///   instructions.
803 ///
804 /// \param __a
805 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
806 ///    32 bits of this operand are used in the comparison.
807 /// \param __b
808 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
809 ///    32 bits of this operand are used in the comparison.
810 /// \returns A 128-bit vector of [4 x float] containing the comparison results
811 ///    in the low-order bits.
812 static __inline__ __m128 __DEFAULT_FN_ATTRS
813 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
814 {
815   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
816 }
817 
818 /// Compares each of the corresponding 32-bit float values of the
819 ///    128-bit vectors of [4 x float] to determine if the values in the first
820 ///    operand are not less than those in the second operand.
821 ///
822 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
823 ///    If either value in a comparison is NaN, returns true.
824 ///
825 /// \headerfile <x86intrin.h>
826 ///
827 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
828 ///   instructions.
829 ///
830 /// \param __a
831 ///    A 128-bit vector of [4 x float].
832 /// \param __b
833 ///    A 128-bit vector of [4 x float].
834 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
835 static __inline__ __m128 __DEFAULT_FN_ATTRS
836 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
837 {
838   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
839 }
840 
841 /// Compares two 32-bit float values in the low-order bits of both
842 ///    operands to determine if the value in the first operand is not less than
843 ///    or equal to the corresponding value in the second operand.
844 ///
845 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
846 ///    low-order bits of a vector of [4 x float].
847 ///    If either value in a comparison is NaN, returns true.
848 ///
849 /// \headerfile <x86intrin.h>
850 ///
851 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
852 ///   instructions.
853 ///
854 /// \param __a
855 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
856 ///    32 bits of this operand are used in the comparison.
857 /// \param __b
858 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
859 ///    32 bits of this operand are used in the comparison.
860 /// \returns A 128-bit vector of [4 x float] containing the comparison results
861 ///    in the low-order bits.
862 static __inline__ __m128 __DEFAULT_FN_ATTRS
863 _mm_cmpnle_ss(__m128 __a, __m128 __b)
864 {
865   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
866 }
867 
868 /// Compares each of the corresponding 32-bit float values of the
869 ///    128-bit vectors of [4 x float] to determine if the values in the first
870 ///    operand are not less than or equal to those in the second operand.
871 ///
872 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
873 ///    If either value in a comparison is NaN, returns true.
874 ///
875 /// \headerfile <x86intrin.h>
876 ///
877 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
878 ///   instructions.
879 ///
880 /// \param __a
881 ///    A 128-bit vector of [4 x float].
882 /// \param __b
883 ///    A 128-bit vector of [4 x float].
884 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
885 static __inline__ __m128 __DEFAULT_FN_ATTRS
886 _mm_cmpnle_ps(__m128 __a, __m128 __b)
887 {
888   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
889 }
890 
891 /// Compares two 32-bit float values in the low-order bits of both
892 ///    operands to determine if the value in the first operand is not greater
893 ///    than the corresponding value in the second operand.
894 ///
895 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
896 ///    low-order bits of a vector of [4 x float].
897 ///    If either value in a comparison is NaN, returns true.
898 ///
899 /// \headerfile <x86intrin.h>
900 ///
901 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
902 ///   instructions.
903 ///
904 /// \param __a
905 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
906 ///    32 bits of this operand are used in the comparison.
907 /// \param __b
908 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
909 ///    32 bits of this operand are used in the comparison.
910 /// \returns A 128-bit vector of [4 x float] containing the comparison results
911 ///    in the low-order bits.
912 static __inline__ __m128 __DEFAULT_FN_ATTRS
913 _mm_cmpngt_ss(__m128 __a, __m128 __b)
914 {
915   return (__m128)__builtin_shufflevector((__v4sf)__a,
916                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
917                                          4, 1, 2, 3);
918 }
919 
920 /// Compares each of the corresponding 32-bit float values of the
921 ///    128-bit vectors of [4 x float] to determine if the values in the first
922 ///    operand are not greater than those in the second operand.
923 ///
924 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
925 ///    If either value in a comparison is NaN, returns true.
926 ///
927 /// \headerfile <x86intrin.h>
928 ///
929 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
930 ///   instructions.
931 ///
932 /// \param __a
933 ///    A 128-bit vector of [4 x float].
934 /// \param __b
935 ///    A 128-bit vector of [4 x float].
936 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
937 static __inline__ __m128 __DEFAULT_FN_ATTRS
938 _mm_cmpngt_ps(__m128 __a, __m128 __b)
939 {
940   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
941 }
942 
943 /// Compares two 32-bit float values in the low-order bits of both
944 ///    operands to determine if the value in the first operand is not greater
945 ///    than or equal to the corresponding value in the second operand.
946 ///
947 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
948 ///    low-order bits of a vector of [4 x float].
949 ///    If either value in a comparison is NaN, returns true.
950 ///
951 /// \headerfile <x86intrin.h>
952 ///
953 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
954 ///   instructions.
955 ///
956 /// \param __a
957 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
958 ///    32 bits of this operand are used in the comparison.
959 /// \param __b
960 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
961 ///    32 bits of this operand are used in the comparison.
962 /// \returns A 128-bit vector of [4 x float] containing the comparison results
963 ///    in the low-order bits.
964 static __inline__ __m128 __DEFAULT_FN_ATTRS
965 _mm_cmpnge_ss(__m128 __a, __m128 __b)
966 {
967   return (__m128)__builtin_shufflevector((__v4sf)__a,
968                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
969                                          4, 1, 2, 3);
970 }
971 
972 /// Compares each of the corresponding 32-bit float values of the
973 ///    128-bit vectors of [4 x float] to determine if the values in the first
974 ///    operand are not greater than or equal to those in the second operand.
975 ///
976 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
977 ///    If either value in a comparison is NaN, returns true.
978 ///
979 /// \headerfile <x86intrin.h>
980 ///
981 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
982 ///   instructions.
983 ///
984 /// \param __a
985 ///    A 128-bit vector of [4 x float].
986 /// \param __b
987 ///    A 128-bit vector of [4 x float].
988 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
989 static __inline__ __m128 __DEFAULT_FN_ATTRS
990 _mm_cmpnge_ps(__m128 __a, __m128 __b)
991 {
992   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
993 }
994 
995 /// Compares two 32-bit float values in the low-order bits of both
996 ///    operands to determine if the value in the first operand is ordered with
997 ///    respect to the corresponding value in the second operand.
998 ///
999 ///    A pair of floating-point values are ordered with respect to each
1000 ///    other if neither value is a NaN. Each comparison returns 0x0 for false,
1001 ///    0xFFFFFFFF for true.
1002 ///
1003 /// \headerfile <x86intrin.h>
1004 ///
1005 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
1006 ///   instructions.
1007 ///
1008 /// \param __a
1009 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1010 ///    32 bits of this operand are used in the comparison.
1011 /// \param __b
1012 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1013 ///    32 bits of this operand are used in the comparison.
1014 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1015 ///    in the low-order bits.
1016 static __inline__ __m128 __DEFAULT_FN_ATTRS
1017 _mm_cmpord_ss(__m128 __a, __m128 __b)
1018 {
1019   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1020 }
1021 
1022 /// Compares each of the corresponding 32-bit float values of the
1023 ///    128-bit vectors of [4 x float] to determine if the values in the first
1024 ///    operand are ordered with respect to those in the second operand.
1025 ///
1026 ///    A pair of floating-point values are ordered with respect to each
1027 ///    other if neither value is a NaN. Each comparison returns 0x0 for false,
1028 ///    0xFFFFFFFF for true.
1029 ///
1030 /// \headerfile <x86intrin.h>
1031 ///
1032 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1033 ///   instructions.
1034 ///
1035 /// \param __a
1036 ///    A 128-bit vector of [4 x float].
1037 /// \param __b
1038 ///    A 128-bit vector of [4 x float].
1039 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1040 static __inline__ __m128 __DEFAULT_FN_ATTRS
1041 _mm_cmpord_ps(__m128 __a, __m128 __b)
1042 {
1043   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1044 }
1045 
/// Compares two 32-bit float values in the low-order bits of both
///    operands to determine if the value in the first operand is unordered
///    with respect to the corresponding value in the second operand.
///
///    A pair of single-precision values are unordered with respect to each
///    other if one or both values are NaN. Each comparison returns 0x0 for
///    false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
///    32 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [4 x float] containing the comparison results
///    in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ss(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
}
1072 
/// Compares each of the corresponding 32-bit float values of the
///    128-bit vectors of [4 x float] to determine if the values in the first
///    operand are unordered with respect to those in the second operand.
///
///    A pair of single-precision values are unordered with respect to each
///    other if one or both values are NaN. Each comparison returns 0x0 for
///    false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \param __b
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ps(__m128 __a, __m128 __b)
{
  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
}
1096 
1097 /// Compares two 32-bit float values in the low-order bits of both
1098 ///    operands for equality.
1099 ///
1100 ///    The comparison returns 0 for false, 1 for true. If either value in a
1101 ///    comparison is NaN, returns 0.
1102 ///
1103 /// \headerfile <x86intrin.h>
1104 ///
1105 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1106 ///   instructions.
1107 ///
1108 /// \param __a
1109 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1110 ///    used in the comparison.
1111 /// \param __b
1112 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1113 ///    used in the comparison.
1114 /// \returns An integer containing the comparison results.
1115 static __inline__ int __DEFAULT_FN_ATTRS
1116 _mm_comieq_ss(__m128 __a, __m128 __b)
1117 {
1118   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1119 }
1120 
1121 /// Compares two 32-bit float values in the low-order bits of both
1122 ///    operands to determine if the first operand is less than the second
1123 ///    operand.
1124 ///
1125 ///    The comparison returns 0 for false, 1 for true. If either value in a
1126 ///    comparison is NaN, returns 0.
1127 ///
1128 /// \headerfile <x86intrin.h>
1129 ///
1130 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1131 ///   instructions.
1132 ///
1133 /// \param __a
1134 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1135 ///    used in the comparison.
1136 /// \param __b
1137 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1138 ///    used in the comparison.
1139 /// \returns An integer containing the comparison results.
1140 static __inline__ int __DEFAULT_FN_ATTRS
1141 _mm_comilt_ss(__m128 __a, __m128 __b)
1142 {
1143   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1144 }
1145 
1146 /// Compares two 32-bit float values in the low-order bits of both
1147 ///    operands to determine if the first operand is less than or equal to the
1148 ///    second operand.
1149 ///
1150 ///    The comparison returns 0 for false, 1 for true. If either value in a
1151 ///    comparison is NaN, returns 0.
1152 ///
1153 /// \headerfile <x86intrin.h>
1154 ///
1155 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1156 ///
1157 /// \param __a
1158 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1159 ///    used in the comparison.
1160 /// \param __b
1161 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1162 ///    used in the comparison.
1163 /// \returns An integer containing the comparison results.
1164 static __inline__ int __DEFAULT_FN_ATTRS
1165 _mm_comile_ss(__m128 __a, __m128 __b)
1166 {
1167   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1168 }
1169 
1170 /// Compares two 32-bit float values in the low-order bits of both
1171 ///    operands to determine if the first operand is greater than the second
1172 ///    operand.
1173 ///
1174 ///    The comparison returns 0 for false, 1 for true. If either value in a
1175 ///    comparison is NaN, returns 0.
1176 ///
1177 /// \headerfile <x86intrin.h>
1178 ///
1179 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1180 ///
1181 /// \param __a
1182 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1183 ///    used in the comparison.
1184 /// \param __b
1185 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1186 ///    used in the comparison.
1187 /// \returns An integer containing the comparison results.
1188 static __inline__ int __DEFAULT_FN_ATTRS
1189 _mm_comigt_ss(__m128 __a, __m128 __b)
1190 {
1191   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1192 }
1193 
1194 /// Compares two 32-bit float values in the low-order bits of both
1195 ///    operands to determine if the first operand is greater than or equal to
1196 ///    the second operand.
1197 ///
1198 ///    The comparison returns 0 for false, 1 for true. If either value in a
1199 ///    comparison is NaN, returns 0.
1200 ///
1201 /// \headerfile <x86intrin.h>
1202 ///
1203 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1204 ///
1205 /// \param __a
1206 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1207 ///    used in the comparison.
1208 /// \param __b
1209 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1210 ///    used in the comparison.
1211 /// \returns An integer containing the comparison results.
1212 static __inline__ int __DEFAULT_FN_ATTRS
1213 _mm_comige_ss(__m128 __a, __m128 __b)
1214 {
1215   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1216 }
1217 
1218 /// Compares two 32-bit float values in the low-order bits of both
1219 ///    operands to determine if the first operand is not equal to the second
1220 ///    operand.
1221 ///
1222 ///    The comparison returns 0 for false, 1 for true. If either value in a
1223 ///    comparison is NaN, returns 1.
1224 ///
1225 /// \headerfile <x86intrin.h>
1226 ///
1227 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1228 ///
1229 /// \param __a
1230 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1231 ///    used in the comparison.
1232 /// \param __b
1233 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1234 ///    used in the comparison.
1235 /// \returns An integer containing the comparison results.
1236 static __inline__ int __DEFAULT_FN_ATTRS
1237 _mm_comineq_ss(__m128 __a, __m128 __b)
1238 {
1239   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1240 }
1241 
1242 /// Performs an unordered comparison of two 32-bit float values using
1243 ///    the low-order bits of both operands to determine equality.
1244 ///
1245 ///    The comparison returns 0 for false, 1 for true. If either value in a
1246 ///    comparison is NaN, returns 0.
1247 ///
1248 /// \headerfile <x86intrin.h>
1249 ///
1250 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1251 ///
1252 /// \param __a
1253 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1254 ///    used in the comparison.
1255 /// \param __b
1256 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1257 ///    used in the comparison.
1258 /// \returns An integer containing the comparison results.
1259 static __inline__ int __DEFAULT_FN_ATTRS
1260 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1261 {
1262   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1263 }
1264 
1265 /// Performs an unordered comparison of two 32-bit float values using
1266 ///    the low-order bits of both operands to determine if the first operand is
1267 ///    less than the second operand.
1268 ///
1269 ///    The comparison returns 0 for false, 1 for true. If either value in a
1270 ///    comparison is NaN, returns 0.
1271 ///
1272 /// \headerfile <x86intrin.h>
1273 ///
1274 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1275 ///
1276 /// \param __a
1277 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1278 ///    used in the comparison.
1279 /// \param __b
1280 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1281 ///    used in the comparison.
1282 /// \returns An integer containing the comparison results.
1283 static __inline__ int __DEFAULT_FN_ATTRS
1284 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1285 {
1286   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1287 }
1288 
1289 /// Performs an unordered comparison of two 32-bit float values using
1290 ///    the low-order bits of both operands to determine if the first operand is
1291 ///    less than or equal to the second operand.
1292 ///
1293 ///    The comparison returns 0 for false, 1 for true. If either value in a
1294 ///    comparison is NaN, returns 0.
1295 ///
1296 /// \headerfile <x86intrin.h>
1297 ///
1298 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1299 ///
1300 /// \param __a
1301 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1302 ///    used in the comparison.
1303 /// \param __b
1304 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1305 ///    used in the comparison.
1306 /// \returns An integer containing the comparison results.
1307 static __inline__ int __DEFAULT_FN_ATTRS
1308 _mm_ucomile_ss(__m128 __a, __m128 __b)
1309 {
1310   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1311 }
1312 
1313 /// Performs an unordered comparison of two 32-bit float values using
1314 ///    the low-order bits of both operands to determine if the first operand is
1315 ///    greater than the second operand.
1316 ///
1317 ///    The comparison returns 0 for false, 1 for true. If either value in a
1318 ///    comparison is NaN, returns 0.
1319 ///
1320 /// \headerfile <x86intrin.h>
1321 ///
1322 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1323 ///
1324 /// \param __a
1325 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1326 ///    used in the comparison.
1327 /// \param __b
1328 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1329 ///    used in the comparison.
1330 /// \returns An integer containing the comparison results.
1331 static __inline__ int __DEFAULT_FN_ATTRS
1332 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1333 {
1334   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1335 }
1336 
1337 /// Performs an unordered comparison of two 32-bit float values using
1338 ///    the low-order bits of both operands to determine if the first operand is
1339 ///    greater than or equal to the second operand.
1340 ///
1341 ///    The comparison returns 0 for false, 1 for true. If either value in a
1342 ///    comparison is NaN, returns 0.
1343 ///
1344 /// \headerfile <x86intrin.h>
1345 ///
1346 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1347 ///
1348 /// \param __a
1349 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1350 ///    used in the comparison.
1351 /// \param __b
1352 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1353 ///    used in the comparison.
1354 /// \returns An integer containing the comparison results.
1355 static __inline__ int __DEFAULT_FN_ATTRS
1356 _mm_ucomige_ss(__m128 __a, __m128 __b)
1357 {
1358   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1359 }
1360 
/// Performs an unordered comparison of two 32-bit float values using
///    the low-order bits of both operands to determine inequality.
///
///    The comparison returns 0 for false, 1 for true. If either value in a
///    comparison is NaN, returns 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param __b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_ss(__m128 __a, __m128 __b)
{
  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
}
1383 
1384 /// Converts a float value contained in the lower 32 bits of a vector of
1385 ///    [4 x float] into a 32-bit integer.
1386 ///
1387 ///    If the converted value does not fit in a 32-bit integer, raises a
1388 ///    floating-point invalid exception. If the exception is masked, returns
1389 ///    the most negative integer.
1390 ///
1391 /// \headerfile <x86intrin.h>
1392 ///
1393 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1394 ///   instructions.
1395 ///
1396 /// \param __a
1397 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1398 ///    used in the conversion.
1399 /// \returns A 32-bit integer containing the converted value.
1400 static __inline__ int __DEFAULT_FN_ATTRS
1401 _mm_cvtss_si32(__m128 __a)
1402 {
1403   return __builtin_ia32_cvtss2si((__v4sf)__a);
1404 }
1405 
1406 /// Converts a float value contained in the lower 32 bits of a vector of
1407 ///    [4 x float] into a 32-bit integer.
1408 ///
1409 ///    If the converted value does not fit in a 32-bit integer, raises a
1410 ///    floating-point invalid exception. If the exception is masked, returns
1411 ///    the most negative integer.
1412 ///
1413 /// \headerfile <x86intrin.h>
1414 ///
1415 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1416 ///   instructions.
1417 ///
1418 /// \param __a
1419 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1420 ///    used in the conversion.
1421 /// \returns A 32-bit integer containing the converted value.
1422 static __inline__ int __DEFAULT_FN_ATTRS
1423 _mm_cvt_ss2si(__m128 __a)
1424 {
1425   return _mm_cvtss_si32(__a);
1426 }
1427 
1428 #ifdef __x86_64__
1429 
/// Converts a float value contained in the lower 32 bits of a vector of
///    [4 x float] into a 64-bit integer.
///
///    If the converted value does not fit in a 64-bit integer, raises a
///    floating-point invalid exception. If the exception is masked, returns
///    the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
///   instructions.
///
/// \param __a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the conversion.
/// \returns A 64-bit integer containing the converted value.
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtss_si64(__m128 __a)
{
  return __builtin_ia32_cvtss2si64((__v4sf)__a);
}
1451 
1452 #endif
1453 
1454 /// Converts two low-order float values in a 128-bit vector of
1455 ///    [4 x float] into a 64-bit vector of [2 x i32].
1456 ///
1457 ///    If a converted value does not fit in a 32-bit integer, raises a
1458 ///    floating-point invalid exception. If the exception is masked, returns
1459 ///    the most negative integer.
1460 ///
1461 /// \headerfile <x86intrin.h>
1462 ///
1463 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1464 ///
1465 /// \param __a
1466 ///    A 128-bit vector of [4 x float].
1467 /// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi32(__m128 __a)
{
  /* Converts all four lanes with CVTPS2DQ, then __trunc64 keeps only the low
     two result lanes as a 64-bit MMX vector.  NOTE(review): __zeroupper64
     presumably clears the upper two float lanes so they cannot raise spurious
     FP exceptions -- both macros are defined earlier in this file; confirm
     their exact semantics there. */
  return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
}
1473 
1474 /// Converts two low-order float values in a 128-bit vector of
1475 ///    [4 x float] into a 64-bit vector of [2 x i32].
1476 ///
1477 ///    If a converted value does not fit in a 32-bit integer, raises a
1478 ///    floating-point invalid exception. If the exception is masked, returns
1479 ///    the most negative integer.
1480 ///
1481 /// \headerfile <x86intrin.h>
1482 ///
1483 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1484 ///
1485 /// \param __a
1486 ///    A 128-bit vector of [4 x float].
1487 /// \returns A 64-bit integer vector containing the converted values.
1488 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1489 _mm_cvt_ps2pi(__m128 __a)
1490 {
1491   return _mm_cvtps_pi32(__a);
1492 }
1493 
1494 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1495 ///    truncated (rounded toward zero) 32-bit integer.
1496 ///
1497 ///    If the converted value does not fit in a 32-bit integer, raises a
1498 ///    floating-point invalid exception. If the exception is masked, returns
1499 ///    the most negative integer.
1500 ///
1501 /// \headerfile <x86intrin.h>
1502 ///
1503 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1504 ///   instructions.
1505 ///
1506 /// \param __a
1507 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1508 ///    used in the conversion.
1509 /// \returns A 32-bit integer containing the converted value.
1510 static __inline__ int __DEFAULT_FN_ATTRS
1511 _mm_cvttss_si32(__m128 __a)
1512 {
1513   return __builtin_ia32_cvttss2si((__v4sf)__a);
1514 }
1515 
1516 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1517 ///    truncated (rounded toward zero) 32-bit integer.
1518 ///
1519 ///    If the converted value does not fit in a 32-bit integer, raises a
1520 ///    floating-point invalid exception. If the exception is masked, returns
1521 ///    the most negative integer.
1522 ///
1523 /// \headerfile <x86intrin.h>
1524 ///
1525 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1526 ///   instructions.
1527 ///
1528 /// \param __a
1529 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1530 ///    used in the conversion.
1531 /// \returns A 32-bit integer containing the converted value.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtt_ss2si(__m128 __a)
{
  /* Legacy Intel name; identical to _mm_cvttss_si32. */
  return _mm_cvttss_si32(__a);
}
1537 
1538 #ifdef __x86_64__
1539 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1540 ///    truncated (rounded toward zero) 64-bit integer.
1541 ///
1542 ///    If the converted value does not fit in a 64-bit integer, raises a
1543 ///    floating-point invalid exception. If the exception is masked, returns
1544 ///    the most negative integer.
1545 ///
1546 /// \headerfile <x86intrin.h>
1547 ///
1548 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1549 ///   instructions.
1550 ///
1551 /// \param __a
1552 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1553 ///    used in the conversion.
1554 /// \returns A 64-bit integer containing the converted value.
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttss_si64(__m128 __a)
{
  /* 64-bit truncating scalar convert (x86-64 only, per the enclosing
     #ifdef __x86_64__). */
  return __builtin_ia32_cvttss2si64((__v4sf)__a);
}
1560 #endif
1561 
1562 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1563 ///    into two signed truncated (rounded toward zero) 32-bit integers,
1564 ///    returned in a 64-bit vector of [2 x i32].
1565 ///
1566 ///    If a converted value does not fit in a 32-bit integer, raises a
1567 ///    floating-point invalid exception. If the exception is masked, returns
1568 ///    the most negative integer.
1569 ///
1570 /// \headerfile <x86intrin.h>
1571 ///
/// This intrinsic corresponds to the <c> VCVTTPS2PI / CVTTPS2PI </c>
///   instructions.
1574 ///
1575 /// \param __a
1576 ///    A 128-bit vector of [4 x float].
1577 /// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvttps_pi32(__m128 __a)
{
  /* Zero the upper two float lanes, run a full 128-bit truncating convert,
     then keep only the low 64 bits (the two converted i32 elements). */
  return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
}
1583 
1584 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
///    into two signed truncated (rounded toward zero) 32-bit integers,
///    returned in a 64-bit vector of [2 x i32].
1587 ///
1588 ///    If a converted value does not fit in a 32-bit integer, raises a
1589 ///    floating-point invalid exception. If the exception is masked, returns
1590 ///    the most negative integer.
1591 ///
1592 /// \headerfile <x86intrin.h>
1593 ///
1594 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1595 ///
1596 /// \param __a
1597 ///    A 128-bit vector of [4 x float].
1598 /// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtt_ps2pi(__m128 __a)
{
  /* Legacy Intel name; identical to _mm_cvttps_pi32. */
  return _mm_cvttps_pi32(__a);
}
1604 
1605 /// Converts a 32-bit signed integer value into a floating point value
1606 ///    and writes it to the lower 32 bits of the destination. The remaining
1607 ///    higher order elements of the destination vector are copied from the
1608 ///    corresponding elements in the first operand.
1609 ///
1610 /// \headerfile <x86intrin.h>
1611 ///
1612 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1613 ///
1614 /// \param __a
1615 ///    A 128-bit vector of [4 x float].
1616 /// \param __b
1617 ///    A 32-bit signed integer operand containing the value to be converted.
1618 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1619 ///    converted value of the second operand. The upper 96 bits are copied from
1620 ///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a,
                                                                     int __b) {
  /* Plain C assignment performs the int -> float conversion; this form also
     keeps the function usable in constant expressions. */
  __a[0] = __b;
  return __a;
}
1626 
1627 /// Converts a 32-bit signed integer value into a floating point value
1628 ///    and writes it to the lower 32 bits of the destination. The remaining
1629 ///    higher order elements of the destination are copied from the
1630 ///    corresponding elements in the first operand.
1631 ///
1632 /// \headerfile <x86intrin.h>
1633 ///
1634 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1635 ///
1636 /// \param __a
1637 ///    A 128-bit vector of [4 x float].
1638 /// \param __b
1639 ///    A 32-bit signed integer operand containing the value to be converted.
1640 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1641 ///    converted value of the second operand. The upper 96 bits are copied from
1642 ///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a,
                                                                    int __b) {
  /* Legacy Intel name; identical to _mm_cvtsi32_ss. */
  return _mm_cvtsi32_ss(__a, __b);
}
1647 
1648 #ifdef __x86_64__
1649 
1650 /// Converts a 64-bit signed integer value into a floating point value
1651 ///    and writes it to the lower 32 bits of the destination. The remaining
1652 ///    higher order elements of the destination are copied from the
1653 ///    corresponding elements in the first operand.
1654 ///
1655 /// \headerfile <x86intrin.h>
1656 ///
1657 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1658 ///
1659 /// \param __a
1660 ///    A 128-bit vector of [4 x float].
1661 /// \param __b
1662 ///    A 64-bit signed integer operand containing the value to be converted.
1663 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1664 ///    converted value of the second operand. The upper 96 bits are copied from
1665 ///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtsi64_ss(__m128 __a, long long __b) {
  /* Plain C assignment performs the long long -> float conversion
     (x86-64 only, per the enclosing #ifdef __x86_64__). */
  __a[0] = __b;
  return __a;
}
1671 
1672 #endif
1673 
1674 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1675 ///    floating point values and writes them to the lower 64-bits of the
1676 ///    destination. The remaining higher order elements of the destination are
1677 ///    copied from the corresponding elements in the first operand.
1678 ///
1679 /// \headerfile <x86intrin.h>
1680 ///
1681 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1682 ///
1683 /// \param __a
1684 ///    A 128-bit vector of [4 x float].
1685 /// \param __b
1686 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1687 ///    and written to the corresponding low-order elements in the destination.
1688 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1689 ///    converted value of the second operand. The upper 64 bits are copied from
1690 ///    the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  /* Zero-extend __b to 128 bits and convert its i32 lanes to float, then
     splice: shuffle indices 4,5 select the two converted lanes for the low
     half of the result, 2,3 keep the high half of __a. */
  return (__m128)__builtin_shufflevector(
      (__v4sf)__a,
      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
      4, 5, 2, 3);
}
1699 
1700 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1701 ///    floating point values and writes them to the lower 64-bits of the
1702 ///    destination. The remaining higher order elements of the destination are
1703 ///    copied from the corresponding elements in the first operand.
1704 ///
1705 /// \headerfile <x86intrin.h>
1706 ///
1707 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1708 ///
1709 /// \param __a
1710 ///    A 128-bit vector of [4 x float].
1711 /// \param __b
1712 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1713 ///    and written to the corresponding low-order elements in the destination.
1714 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1715 ///    converted value from the second operand. The upper 64 bits are copied
1716 ///    from the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  /* Legacy Intel name; identical to _mm_cvtpi32_ps. */
  return _mm_cvtpi32_ps(__a, __b);
}
1722 
1723 /// Extracts a float value contained in the lower 32 bits of a vector of
1724 ///    [4 x float].
1725 ///
1726 /// \headerfile <x86intrin.h>
1727 ///
1728 /// This intrinsic has no corresponding instruction.
1729 ///
1730 /// \param __a
1731 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1732 ///    used in the extraction.
1733 /// \returns A 32-bit float containing the extracted value.
static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtss_f32(__m128 __a) {
  /* Element 0 holds bits [31:0] of the vector. */
  return __a[0];
}
1738 
1739 /// Loads two packed float values from the address \a __p into the
1740 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1741 ///     are copied from the low-order bits of the first operand.
1742 ///
1743 /// \headerfile <x86intrin.h>
1744 ///
1745 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1746 ///
1747 /// \param __a
1748 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1749 ///    of the destination.
1750 /// \param __p
1751 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1752 ///    [127:64] of the destination.
1753 /// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadh_pi(__m128 __a, const __m64 *__p)
{
  /* The packed, may_alias wrapper struct makes this 64-bit load legal for
     unaligned addresses and for memory of any effective type. */
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
  /* Widen the loaded pair to 128 bits, then place it in the high lanes
     (indices 4,5 of the second operand) while keeping __a's low lanes. */
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}
1765 
1766 /// Loads two packed float values from the address \a __p into the
1767 ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1768 ///    are copied from the high-order bits of the first operand.
1769 ///
1770 /// \headerfile <x86intrin.h>
1771 ///
1772 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1773 ///
1774 /// \param __a
1775 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1776 ///    [127:64] of the destination.
1777 /// \param __p
1778 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1779 ///    [63:0] of the destination.
1780 /// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadl_pi(__m128 __a, const __m64 *__p)
{
  /* The packed, may_alias wrapper struct makes this 64-bit load legal for
     unaligned addresses and for memory of any effective type. */
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
  /* Widen the loaded pair to 128 bits, then place it in the low lanes
     (indices 4,5 of the second operand) while keeping __a's high lanes. */
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
}
1792 
1793 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1794 ///    32 bits of the vector are initialized with the single-precision
1795 ///    floating-point value loaded from a specified memory location. The upper
1796 ///    96 bits are set to zero.
1797 ///
1798 /// \headerfile <x86intrin.h>
1799 ///
1800 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1801 ///
1802 /// \param __p
1803 ///    A pointer to a 32-bit memory location containing a single-precision
1804 ///    floating-point value.
1805 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1806 ///    lower 32 bits contain the value loaded from the memory location. The
1807 ///    upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ss(const float *__p)
{
  /* Packed, may_alias wrapper: the 32-bit load is unaligned-safe. */
  struct __mm_load_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
  /* Loaded value in element 0; the upper three elements are zeroed. */
  return __extension__ (__m128){ __u, 0, 0, 0 };
}
1817 
1818 /// Loads a 32-bit float value and duplicates it to all four vector
1819 ///    elements of a 128-bit vector of [4 x float].
1820 ///
1821 /// \headerfile <x86intrin.h>
1822 ///
1823 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1824 ///    instruction.
1825 ///
1826 /// \param __p
1827 ///    A pointer to a float value to be loaded and duplicated.
1828 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1829 ///    duplicated values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load1_ps(const float *__p)
{
  /* Packed, may_alias wrapper: the 32-bit load is unaligned-safe. */
  struct __mm_load1_ps_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
  /* Splat the loaded value into all four elements. */
  return __extension__ (__m128){ __u, __u, __u, __u };
}
1839 
/* Alternate (legacy) name for _mm_load1_ps. */
#define        _mm_load_ps1(p) _mm_load1_ps(p)
1841 
1842 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1843 ///    memory location.
1844 ///
1845 /// \headerfile <x86intrin.h>
1846 ///
1847 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1848 ///
1849 /// \param __p
1850 ///    A pointer to a 128-bit memory location. The address of the memory
1851 ///    location has to be 128-bit aligned.
1852 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ps(const float *__p)
{
  /* Direct dereference through __m128: __p must be 16-byte aligned. */
  return *(const __m128*)__p;
}
1858 
1859 /// Loads a 128-bit floating-point vector of [4 x float] from an
1860 ///    unaligned memory location.
1861 ///
1862 /// \headerfile <x86intrin.h>
1863 ///
1864 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1865 ///
1866 /// \param __p
1867 ///    A pointer to a 128-bit memory location. The address of the memory
1868 ///    location does not have to be aligned.
1869 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadu_ps(const float *__p)
{
  /* __m128_u is declared with 1-byte alignment; the packed, may_alias
     wrapper makes the unaligned 128-bit load well-defined. */
  struct __loadu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}
1878 
1879 /// Loads four packed float values, in reverse order, from an aligned
1880 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1881 ///
1882 /// \headerfile <x86intrin.h>
1883 ///
1884 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1885 ///    instruction.
1886 ///
1887 /// \param __p
1888 ///    A pointer to a 128-bit memory location. The address of the memory
1889 ///    location has to be 128-bit aligned.
1890 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1891 ///    in reverse order.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadr_ps(const float *__p)
{
  /* Aligned 128-bit load, then reverse the four elements. */
  __m128 __a = _mm_load_ps(__p);
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
}
1898 
1899 /// Create a 128-bit vector of [4 x float] with undefined values.
1900 ///
1901 /// \headerfile <x86intrin.h>
1902 ///
1903 /// This intrinsic has no corresponding instruction.
1904 ///
1905 /// \returns A 128-bit vector of [4 x float] containing undefined values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)
{
  /* Produces a compiler-level undefined value; no instruction is implied. */
  return (__m128)__builtin_ia32_undef128();
}
1911 
1912 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1913 ///    32 bits of the vector are initialized with the specified single-precision
1914 ///    floating-point value. The upper 96 bits are set to zero.
1915 ///
1916 /// \headerfile <x86intrin.h>
1917 ///
1918 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1919 ///
1920 /// \param __w
1921 ///    A single-precision floating-point value used to initialize the lower 32
1922 ///    bits of the result.
1923 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1924 ///    lower 32 bits contain the value provided in the source operand. The
1925 ///    upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ss(float __w) {
  /* __w becomes element 0 (bits [31:0]); the rest are zeroed. */
  return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}
1930 
1931 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1932 ///    of the four single-precision floating-point vector elements set to the
1933 ///    specified single-precision floating-point value.
1934 ///
1935 /// \headerfile <x86intrin.h>
1936 ///
1937 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1938 ///
1939 /// \param __w
1940 ///    A single-precision floating-point value used to initialize each vector
1941 ///    element of the result.
1942 /// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_ps(float __w) {
  /* Splat __w into all four elements. */
  return __extension__ (__m128){ __w, __w, __w, __w };
}
1947 
1948 /* Microsoft specific. */
1949 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1950 ///    of the four single-precision floating-point vector elements set to the
1951 ///    specified single-precision floating-point value.
1952 ///
1953 /// \headerfile <x86intrin.h>
1954 ///
1955 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1956 ///
1957 /// \param __w
1958 ///    A single-precision floating-point value used to initialize each vector
1959 ///    element of the result.
1960 /// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps1(float __w) {
    /* Microsoft-specific alias for _mm_set1_ps. */
    return _mm_set1_ps(__w);
}
1965 
1966 /// Constructs a 128-bit floating-point vector of [4 x float]
1967 ///    initialized with the specified single-precision floating-point values.
1968 ///
1969 /// \headerfile <x86intrin.h>
1970 ///
1971 /// This intrinsic is a utility function and does not correspond to a specific
1972 ///    instruction.
1973 ///
1974 /// \param __z
1975 ///    A single-precision floating-point value used to initialize bits [127:96]
1976 ///    of the result.
1977 /// \param __y
1978 ///    A single-precision floating-point value used to initialize bits [95:64]
1979 ///    of the result.
1980 /// \param __x
1981 ///    A single-precision floating-point value used to initialize bits [63:32]
1982 ///    of the result.
1983 /// \param __w
1984 ///    A single-precision floating-point value used to initialize bits [31:0]
1985 ///    of the result.
1986 /// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps(float __z, float __y, float __x, float __w) {
  /* Initializer lists element 0 first, so the arguments appear reversed:
     __w is element 0 (bits [31:0]) and __z is element 3 (bits [127:96]). */
  return __extension__ (__m128){ __w, __x, __y, __z };
}
1991 
1992 /// Constructs a 128-bit floating-point vector of [4 x float],
1993 ///    initialized in reverse order with the specified 32-bit single-precision
1994 ///    float-point values.
1995 ///
1996 /// \headerfile <x86intrin.h>
1997 ///
1998 /// This intrinsic is a utility function and does not correspond to a specific
1999 ///    instruction.
2000 ///
2001 /// \param __z
2002 ///    A single-precision floating-point value used to initialize bits [31:0]
2003 ///    of the result.
2004 /// \param __y
2005 ///    A single-precision floating-point value used to initialize bits [63:32]
2006 ///    of the result.
2007 /// \param __x
2008 ///    A single-precision floating-point value used to initialize bits [95:64]
2009 ///    of the result.
2010 /// \param __w
2011 ///    A single-precision floating-point value used to initialize bits [127:96]
2012 ///    of the result.
2013 /// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_ps(float __z, float __y, float __x, float __w) {
  /* Arguments are in element order: __z is element 0 (bits [31:0]). */
  return __extension__ (__m128){ __z, __y, __x, __w };
}
2018 
2019 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
2020 ///    to zero.
2021 ///
2022 /// \headerfile <x86intrin.h>
2023 ///
2024 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
2025 ///
2026 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
2027 ///    all elements set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setzero_ps(void) {
  /* All four elements set to +0.0f. */
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
2032 
2033 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
2034 ///    memory location.
2035 ///
2036 /// \headerfile <x86intrin.h>
2037 ///
2038 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
2039 ///
2040 /// \param __p
2041 ///    A pointer to a 64-bit memory location.
2042 /// \param __a
2043 ///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  /* Extract elements 2 and 3 (bits [127:64]); the packed, may_alias
     wrapper struct makes the 64-bit store unaligned-safe. */
  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_storeh_pi_struct {
    __mm_storeh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
}
2053 
2054 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2055 ///     memory location.
2056 ///
2057 /// \headerfile <x86intrin.h>
2058 ///
2059 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2060 ///
2061 /// \param __p
2062 ///    A pointer to a memory location that will receive the float values.
2063 /// \param __a
2064 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2065 static __inline__ void __DEFAULT_FN_ATTRS
2066 _mm_storel_pi(__m64 *__p, __m128 __a)
2067 {
2068   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2069   struct __mm_storeh_pi_struct {
2070     __mm_storeh_pi_v2f32 __u;
2071   } __attribute__((__packed__, __may_alias__));
2072   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
2073 }
2074 
2075 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
2076 ///     memory location.
2077 ///
2078 /// \headerfile <x86intrin.h>
2079 ///
2080 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2081 ///
2082 /// \param __p
2083 ///    A pointer to a 32-bit memory location.
2084 /// \param __a
2085 ///    A 128-bit vector of [4 x float] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ss(float *__p, __m128 __a)
{
  /* Packed, may_alias wrapper: the 32-bit store is unaligned-safe. */
  struct __mm_store_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}
2094 
2095 /// Stores a 128-bit vector of [4 x float] to an unaligned memory
2096 ///    location.
2097 ///
2098 /// \headerfile <x86intrin.h>
2099 ///
2100 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
2101 ///
2102 /// \param __p
2103 ///    A pointer to a 128-bit memory location. The address of the memory
2104 ///    location does not have to be aligned.
2105 /// \param __a
2106 ///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_ps(float *__p, __m128 __a)
{
  /* __m128_u is declared with 1-byte alignment; the packed, may_alias
     wrapper makes the unaligned 128-bit store well-defined. */
  struct __storeu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__p)->__v = __a;
}
2115 
2116 /// Stores a 128-bit vector of [4 x float] into an aligned memory
2117 ///    location.
2118 ///
2119 /// \headerfile <x86intrin.h>
2120 ///
2121 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2122 ///
2123 /// \param __p
2124 ///    A pointer to a 128-bit memory location. The address of the memory
2125 ///    location has to be 16-byte aligned.
2126 /// \param __a
2127 ///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps(float *__p, __m128 __a)
{
  /* Direct store through __m128: __p must be 16-byte aligned. */
  *(__m128*)__p = __a;
}
2133 
2134 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2135 ///    four contiguous elements in an aligned memory location.
2136 ///
2137 /// \headerfile <x86intrin.h>
2138 ///
2139 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2140 ///    instruction.
2141 ///
2142 /// \param __p
2143 ///    A pointer to a 128-bit memory location.
2144 /// \param __a
2145 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2146 ///    of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float *__p, __m128 __a)
{
  /* Splat element 0 across all lanes, then do an aligned 128-bit store. */
  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
  _mm_store_ps(__p, __a);
}
2153 
2154 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2155 ///    four contiguous elements in an aligned memory location.
2156 ///
2157 /// \headerfile <x86intrin.h>
2158 ///
2159 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2160 ///    instruction.
2161 ///
2162 /// \param __p
2163 ///    A pointer to a 128-bit memory location.
2164 /// \param __a
2165 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2166 ///    of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float *__p, __m128 __a)
{
  /* Legacy Intel name; identical to _mm_store1_ps. */
  _mm_store1_ps(__p, __a);
}
2172 
2173 /// Stores float values from a 128-bit vector of [4 x float] to an
2174 ///    aligned memory location in reverse order.
2175 ///
2176 /// \headerfile <x86intrin.h>
2177 ///
2178 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2179 ///    instruction.
2180 ///
2181 /// \param __p
2182 ///    A pointer to a 128-bit memory location. The address of the memory
2183 ///    location has to be 128-bit aligned.
2184 /// \param __a
2185 ///    A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_ps(float *__p, __m128 __a)
{
  /* Reverse the four elements, then do an aligned 128-bit store. */
  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
  _mm_store_ps(__p, __a);
}
2192 
/* Locality hints for _mm_prefetch: bit 2 selects the write (ET) variants,
   bits [1:0] encode the temporal-locality level (see the _mm_prefetch
   macro below, which splits sel exactly this way). */
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0
2199 
2200 #ifndef _MSC_VER
2201 /* FIXME: We have to #define this because "sel" must be a constant integer, and
2202    Sema doesn't do any form of constant propagation yet. */
2203 
2204 /// Loads one cache line of data from the specified address to a location
2205 ///    closer to the processor.
2206 ///
2207 /// \headerfile <x86intrin.h>
2208 ///
2209 /// \code
2210 /// void _mm_prefetch(const void *a, const int sel);
2211 /// \endcode
2212 ///
2213 /// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
2214 ///
2215 /// \param a
2216 ///    A pointer to a memory location containing a cache line of data.
2217 /// \param sel
2218 ///    A predefined integer constant specifying the type of prefetch
2219 ///    operation: \n
2220 ///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
2221 ///    PREFETCHNTA instruction will be generated. \n
2222 ///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
2223 ///    be generated. \n
2224 ///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
2225 ///    be generated. \n
2226 ///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
2227 ///    be generated.
/* Bit 2 of sel is the read/write hint and bits [1:0] are the locality
   level, forwarded as the second and third arguments of
   __builtin_prefetch. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
2230 #endif
2231 
2232 /// Stores a 64-bit integer in the specified aligned memory location. To
2233 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
2234 ///    used again soon).
2235 ///
2236 /// \headerfile <x86intrin.h>
2237 ///
2238 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2239 ///
2240 /// \param __p
2241 ///    A pointer to an aligned memory location used to store the register value.
2242 /// \param __a
2243 ///    A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pi(void *__p, __m64 __a)
{
  /* Non-temporal 64-bit store (minimizes cache pollution). */
  __builtin_nontemporal_store(__a, (__m64 *)__p);
}
2249 
2250 /// Moves packed float values from a 128-bit vector of [4 x float] to a
2251 ///    128-bit aligned memory location. To minimize caching, the data is flagged
2252 ///    as non-temporal (unlikely to be used again soon).
2253 ///
2254 /// \headerfile <x86intrin.h>
2255 ///
2256 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2257 ///
2258 /// \param __p
2259 ///    A pointer to a 128-bit aligned memory location that will receive the
2260 ///    single-precision floating-point values.
2261 /// \param __a
2262 ///    A 128-bit vector of [4 x float] containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(void *__p, __m128 __a)
{
  /* Non-temporal 128-bit store; __p must be 16-byte aligned per the
     documented contract above. */
  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
2268 
2269 #if defined(__cplusplus)
2270 extern "C" {
2271 #endif
2272 
2273 /// Forces strong memory ordering (serialization) between store
2274 ///    instructions preceding this instruction and store instructions following
2275 ///    this instruction, ensuring the system completes all previous stores
2276 ///    before executing subsequent stores.
2277 ///
2278 /// \headerfile <x86intrin.h>
2279 ///
2280 /// This intrinsic corresponds to the <c> SFENCE </c> instruction.
2281 ///
2282 void _mm_sfence(void);
2283 
2284 #if defined(__cplusplus)
2285 } // extern "C"
2286 #endif
2287 
2288 /// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
2289 ///    returns it, as specified by the immediate integer operand.
2290 ///
2291 /// \headerfile <x86intrin.h>
2292 ///
2293 /// \code
2294 /// int _mm_extract_pi16(__m64 a, int n);
2295 /// \endcode
2296 ///
2297 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
2298 ///
2299 /// \param a
2300 ///    A 64-bit vector of [4 x i16].
2301 /// \param n
2302 ///    An immediate integer operand that determines which bits are extracted: \n
2303 ///    0: Bits [15:0] are copied to the destination. \n
2304 ///    1: Bits [31:16] are copied to the destination. \n
2305 ///    2: Bits [47:32] are copied to the destination. \n
2306 ///    3: Bits [63:48] are copied to the destination.
2307 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Macro (not a function) so that n reaches the vec_ext builtin as a
   compile-time constant; the result is zero-extended from 16 bits. */
#define _mm_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
2310 
2311 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
2312 ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
2313 ///    specified by the immediate operand \a n.
2314 ///
2315 /// \headerfile <x86intrin.h>
2316 ///
2317 /// \code
2318 /// __m64 _mm_insert_pi16(__m64 a, int d, int n);
2319 /// \endcode
2320 ///
2321 /// This intrinsic corresponds to the <c> PINSRW </c> instruction.
2322 ///
2323 /// \param a
2324 ///    A 64-bit vector of [4 x i16].
2325 /// \param d
2326 ///    An integer. The lower 16-bit value from this operand is written to the
2327 ///    destination at the offset specified by operand \a n.
2328 /// \param n
///    An immediate integer operand that determines which bits are to be used
///    in the destination. \n
2331 ///    0: Bits [15:0] are copied to the destination. \n
2332 ///    1: Bits [31:16] are copied to the destination. \n
2333 ///    2: Bits [47:32] are copied to the destination. \n
2334 ///    3: Bits [63:48] are copied to the destination.  \n
2335 ///    The remaining bits in the destination are copied from the corresponding
2336 ///    bits in operand \a a.
2337 /// \returns A 64-bit integer vector containing the copied packed data from the
2338 ///    operands.
// Replaces element n of the [4 x i16] vector a with the low 16 bits of d;
// the other three elements are passed through unchanged.
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
2341 
2342 /// Compares each of the corresponding packed 16-bit integer values of
2343 ///    the 64-bit integer vectors, and writes the greater value to the
2344 ///    corresponding bits in the destination.
2345 ///
2346 /// \headerfile <x86intrin.h>
2347 ///
2348 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2349 ///
2350 /// \param __a
2351 ///    A 64-bit integer vector containing one of the source operands.
2352 /// \param __b
2353 ///    A 64-bit integer vector containing one of the source operands.
2354 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pi16(__m64 __a, __m64 __b)
{
  // Lanewise signed 16-bit maximum (__v4hi is signed); lowers to PMAXSW.
  return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
}
2360 
2361 /// Compares each of the corresponding packed 8-bit unsigned integer
2362 ///    values of the 64-bit integer vectors, and writes the greater value to the
2363 ///    corresponding bits in the destination.
2364 ///
2365 /// \headerfile <x86intrin.h>
2366 ///
2367 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2368 ///
2369 /// \param __a
2370 ///    A 64-bit integer vector containing one of the source operands.
2371 /// \param __b
2372 ///    A 64-bit integer vector containing one of the source operands.
2373 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pu8(__m64 __a, __m64 __b)
{
  // Lanewise unsigned 8-bit maximum (__v8qu is unsigned); lowers to PMAXUB.
  return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
}
2379 
2380 /// Compares each of the corresponding packed 16-bit integer values of
2381 ///    the 64-bit integer vectors, and writes the lesser value to the
2382 ///    corresponding bits in the destination.
2383 ///
2384 /// \headerfile <x86intrin.h>
2385 ///
2386 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2387 ///
2388 /// \param __a
2389 ///    A 64-bit integer vector containing one of the source operands.
2390 /// \param __b
2391 ///    A 64-bit integer vector containing one of the source operands.
2392 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pi16(__m64 __a, __m64 __b)
{
  // Lanewise signed 16-bit minimum (__v4hi is signed); lowers to PMINSW.
  return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
}
2398 
2399 /// Compares each of the corresponding packed 8-bit unsigned integer
2400 ///    values of the 64-bit integer vectors, and writes the lesser value to the
2401 ///    corresponding bits in the destination.
2402 ///
2403 /// \headerfile <x86intrin.h>
2404 ///
2405 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2406 ///
2407 /// \param __a
2408 ///    A 64-bit integer vector containing one of the source operands.
2409 /// \param __b
2410 ///    A 64-bit integer vector containing one of the source operands.
2411 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pu8(__m64 __a, __m64 __b)
{
  // Lanewise unsigned 8-bit minimum (__v8qu is unsigned); lowers to PMINUB.
  return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
}
2417 
2418 /// Takes the most significant bit from each 8-bit element in a 64-bit
2419 ///    integer vector to create an 8-bit mask value. Zero-extends the value to
2420 ///    32-bit integer and writes it to the destination.
2421 ///
2422 /// \headerfile <x86intrin.h>
2423 ///
2424 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2425 ///
2426 /// \param __a
2427 ///    A 64-bit integer vector containing the values with bits to be extracted.
2428 /// \returns The most significant bit from each 8-bit element in \a __a,
2429 ///    written to bits [7:0].
static __inline__ int __DEFAULT_FN_ATTRS_SSE2
_mm_movemask_pi8(__m64 __a)
{
  // Implemented with the 128-bit PMOVMSKB. __zext128 zero-extends __a, so
  // the upper eight mask bits (bits [15:8] of the result) are guaranteed 0,
  // leaving only bits [7:0] set as the intrinsic's contract requires.
  return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
}
2435 
2436 /// Multiplies packed 16-bit unsigned integer values and writes the
2437 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
2438 ///    the destination.
2439 ///
2440 /// \headerfile <x86intrin.h>
2441 ///
2442 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2443 ///
2444 /// \param __a
2445 ///    A 64-bit integer vector containing one of the source operands.
2446 /// \param __b
2447 ///    A 64-bit integer vector containing one of the source operands.
2448 /// \returns A 64-bit integer vector containing the products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  // Widen to 128 bits, do an 8-lane PMULHUW, and keep the low 64 bits.
  // __anyext128 suffices (no zero-extension needed) because __trunc64
  // discards the upper four result lanes anyway.
  return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
                                             (__v8hi)__anyext128(__b)));
}
2455 
2456 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
2457 ///    destination, as specified by the immediate value operand.
2458 ///
2459 /// \headerfile <x86intrin.h>
2460 ///
2461 /// \code
2462 /// __m64 _mm_shuffle_pi16(__m64 a, const int n);
2463 /// \endcode
2464 ///
2465 /// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
2466 ///
2467 /// \param a
2468 ///    A 64-bit integer vector containing the values to be shuffled.
2469 /// \param n
2470 ///    An immediate value containing an 8-bit value specifying which elements to
2471 ///    copy from \a a. The destinations within the 64-bit destination are
2472 ///    assigned values as follows: \n
2473 ///    Bits [1:0] are used to assign values to bits [15:0] in the
2474 ///    destination. \n
2475 ///    Bits [3:2] are used to assign values to bits [31:16] in the
2476 ///    destination. \n
2477 ///    Bits [5:4] are used to assign values to bits [47:32] in the
2478 ///    destination. \n
2479 ///    Bits [7:6] are used to assign values to bits [63:48] in the
2480 ///    destination. \n
2481 ///    Bit value assignments: \n
2482 ///    00: assigned from bits [15:0] of \a a. \n
2483 ///    01: assigned from bits [31:16] of \a a. \n
2484 ///    10: assigned from bits [47:32] of \a a. \n
2485 ///    11: assigned from bits [63:48] of \a a. \n
2486 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2487 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2488 ///    <c>[b6, b4, b2, b0]</c>.
2489 /// \returns A 64-bit integer vector containing the shuffled values.
// Implemented as a generic vector shuffle (second operand is an all-zero
// vector that is never selected): each 2-bit field of n picks one of the
// four 16-bit lanes of a for the corresponding destination lane.
#define _mm_shuffle_pi16(a, n)                                                 \
  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
                                  (n) & 0x3, ((n) >> 2) & 0x3,                 \
                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
2494 
2495 /// Conditionally copies the values from each 8-bit element in the first
2496 ///    64-bit integer vector operand to the specified memory location, as
2497 ///    specified by the most significant bit in the corresponding element in the
2498 ///    second 64-bit integer vector operand.
2499 ///
2500 ///    To minimize caching, the data is flagged as non-temporal
2501 ///    (unlikely to be used again soon).
2502 ///
2503 /// \headerfile <x86intrin.h>
2504 ///
2505 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2506 ///
2507 /// \param __d
2508 ///    A 64-bit integer vector containing the values with elements to be copied.
2509 /// \param __n
2510 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2511 ///    element determines whether the corresponding element in operand \a __d
2512 ///    is copied. If the most significant bit of a given element is 1, the
2513 ///    corresponding element in operand \a __d is copied.
2514 /// \param __p
2515 ///    A pointer to a 64-bit memory location that will receive the conditionally
2516 ///    copied integer values. The address of the memory location does not have
2517 ///    to be aligned.
static __inline__ void __DEFAULT_FN_ATTRS_SSE2
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
  // This is complex, because we need to support the case where __p is pointing
  // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
  // write might cause a trap where a 64-bit maskmovq would not. (Memory
  // locations not selected by the mask bits might still cause traps.)
  //
  // The mask is zero-extended (__zext128) so the upper eight mask bytes have
  // their MSBs clear and MASKMOVDQU never stores to the upper half; the data
  // only needs __anyext128 since unselected bytes are never written.
  __m128i __d128  = __anyext128(__d);
  __m128i __n128  = __zext128(__n);
  // NOTE: assumes 4 KiB pages (the 0xfff mask / 4096 constants below), i.e.
  // page offsets 4081..4088 are the range where a 16-byte store could cross
  // into an unmapped page while an 8-byte store would not.
  if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
      ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
    // If there's a risk of spurious trap due to a 128-bit write, back up the
    // pointer by 8 bytes and shift values in registers to match.
    __p -= 8;
    __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
    __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
  }

  __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
}
2538 
2539 /// Computes the rounded averages of the packed unsigned 8-bit integer
2540 ///    values and writes the averages to the corresponding bits in the
2541 ///    destination.
2542 ///
2543 /// \headerfile <x86intrin.h>
2544 ///
2545 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2546 ///
2547 /// \param __a
2548 ///    A 64-bit integer vector containing one of the source operands.
2549 /// \param __b
2550 ///    A 64-bit integer vector containing one of the source operands.
2551 /// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu8(__m64 __a, __m64 __b)
{
  // Widen to 128 bits, run the 16-lane PAVGB, keep the low 64 bits.
  // __anyext128 is fine: averaging is lanewise, so garbage in the upper
  // lanes never affects the lanes __trunc64 keeps.
  return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
                                           (__v16qi)__anyext128(__b)));
}
2558 
2559 /// Computes the rounded averages of the packed unsigned 16-bit integer
2560 ///    values and writes the averages to the corresponding bits in the
2561 ///    destination.
2562 ///
2563 /// \headerfile <x86intrin.h>
2564 ///
2565 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2566 ///
2567 /// \param __a
2568 ///    A 64-bit integer vector containing one of the source operands.
2569 /// \param __b
2570 ///    A 64-bit integer vector containing one of the source operands.
2571 /// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu16(__m64 __a, __m64 __b)
{
  // Widen to 128 bits, run the 8-lane PAVGW, keep the low 64 bits.
  // __anyext128 is fine: averaging is lanewise, so garbage in the upper
  // lanes never affects the lanes __trunc64 keeps.
  return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
                                           (__v8hi)__anyext128(__b)));
}
2578 
2579 /// Subtracts the corresponding 8-bit unsigned integer values of the two
///    64-bit vector operands and computes the absolute value of each
///    difference. Then the sum of the 8 absolute differences is written to the
2582 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2583 ///
2584 /// \headerfile <x86intrin.h>
2585 ///
2586 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2587 ///
2588 /// \param __a
2589 ///    A 64-bit integer vector containing one of the source operands.
2590 /// \param __b
2591 ///    A 64-bit integer vector containing one of the source operands.
2592 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2593 ///    sets of absolute differences between both operands. The upper bits are
2594 ///    cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sad_pu8(__m64 __a, __m64 __b)
{
  // 128-bit PSADBW computes one sum per 64-bit half; the low half sums the
  // absolute differences of bytes 0..7 (our operands). Inputs are
  // zero-extended so both halves of the wide operation are well defined,
  // and __trunc64 returns the low-half result.
  return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
                                            (__v16qi)__zext128(__b)));
}
2601 
2602 #if defined(__cplusplus)
2603 extern "C" {
2604 #endif
2605 
2606 /// Returns the contents of the MXCSR register as a 32-bit unsigned
2607 ///    integer value.
2608 ///
2609 ///    There are several groups of macros associated with this
2610 ///    intrinsic, including:
2611 ///    <ul>
2612 ///    <li>
2613 ///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2614 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2615 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2616 ///      _MM_GET_EXCEPTION_STATE().
2617 ///    </li>
2618 ///    <li>
2619 ///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2620 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2621 ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2622 ///    </li>
2623 ///    <li>
2624 ///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2625 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2626 ///      _MM_GET_ROUNDING_MODE().
2627 ///    </li>
2628 ///    <li>
2629 ///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2630 ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2631 ///    </li>
2632 ///    <li>
2633 ///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2634 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2635 ///      _MM_GET_DENORMALS_ZERO_MODE().
2636 ///    </li>
2637 ///    </ul>
2638 ///
2639 ///    For example, the following expression checks if an overflow exception has
2640 ///    occurred:
2641 ///    \code
2642 ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2643 ///    \endcode
2644 ///
2645 ///    The following expression gets the current rounding mode:
2646 ///    \code
2647 ///      _MM_GET_ROUNDING_MODE()
2648 ///    \endcode
2649 ///
2650 /// \headerfile <x86intrin.h>
2651 ///
2652 /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2653 ///
2654 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2655 ///    register.
2656 unsigned int _mm_getcsr(void);
2657 
2658 /// Sets the MXCSR register with the 32-bit unsigned integer value.
2659 ///
2660 ///    There are several groups of macros associated with this intrinsic,
2661 ///    including:
2662 ///    <ul>
2663 ///    <li>
2664 ///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2665 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2666 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2667 ///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2668 ///    </li>
2669 ///    <li>
2670 ///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2671 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2672 ///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2673 ///      of these macros.
2674 ///    </li>
2675 ///    <li>
2676 ///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2677 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2678 ///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2679 ///    </li>
2680 ///    <li>
2681 ///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2682 ///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2683 ///      one of these macros.
2684 ///    </li>
2685 ///    <li>
2686 ///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2687 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2688 ///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2689 ///    </li>
2690 ///    </ul>
2691 ///
2692 ///    For example, the following expression causes subsequent floating-point
2693 ///    operations to round up:
2694 ///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
2695 ///
2696 ///    The following example sets the DAZ and FTZ flags:
2697 ///    \code
2698 ///    void setFlags() {
2699 ///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2700 ///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2701 ///    }
2702 ///    \endcode
2703 ///
2704 /// \headerfile <x86intrin.h>
2705 ///
2706 /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2707 ///
2708 /// \param __i
2709 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
2710 void _mm_setcsr(unsigned int __i);
2711 
2712 #if defined(__cplusplus)
2713 } // extern "C"
2714 #endif
2715 
2716 /// Selects 4 float values from the 128-bit operands of [4 x float], as
2717 ///    specified by the immediate value operand.
2718 ///
2719 /// \headerfile <x86intrin.h>
2720 ///
2721 /// \code
2722 /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2723 /// \endcode
2724 ///
2725 /// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
2726 ///
2727 /// \param a
2728 ///    A 128-bit vector of [4 x float].
2729 /// \param b
2730 ///    A 128-bit vector of [4 x float].
2731 /// \param mask
2732 ///    An immediate value containing an 8-bit value specifying which elements to
2733 ///    copy from \a a and \a b. \n
2734 ///    Bits [3:0] specify the values copied from operand \a a. \n
2735 ///    Bits [7:4] specify the values copied from operand \a b. \n
2736 ///    The destinations within the 128-bit destination are assigned values as
2737 ///    follows: \n
2738 ///    Bits [1:0] are used to assign values to bits [31:0] in the
2739 ///    destination. \n
2740 ///    Bits [3:2] are used to assign values to bits [63:32] in the
2741 ///    destination. \n
2742 ///    Bits [5:4] are used to assign values to bits [95:64] in the
2743 ///    destination. \n
2744 ///    Bits [7:6] are used to assign values to bits [127:96] in the
2745 ///    destination. \n
2746 ///    Bit value assignments: \n
2747 ///    00: Bits [31:0] copied from the specified operand. \n
2748 ///    01: Bits [63:32] copied from the specified operand. \n
2749 ///    10: Bits [95:64] copied from the specified operand. \n
2750 ///    11: Bits [127:96] copied from the specified operand. \n
2751 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2752 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2753 ///    <c>[b6, b4, b2, b0]</c>.
2754 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
// Must remain a macro: mask has to be an immediate for the SHUFPS encoding,
// and the builtin enforces that at compile time.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2758 
2759 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2760 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2761 ///
2762 /// \headerfile <x86intrin.h>
2763 ///
2764 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2765 ///
2766 /// \param __a
2767 ///    A 128-bit vector of [4 x float]. \n
2768 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
2769 ///    Bits [127:96] are written to bits [95:64] of the destination.
2770 /// \param __b
2771 ///    A 128-bit vector of [4 x float].
2772 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
2773 ///    Bits [127:96] are written to bits [127:96] of the destination.
2774 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpackhi_ps(__m128 __a, __m128 __b) {
  // Interleave the high halves: result = { a2, b2, a3, b3 }
  // (shuffle indices 4..7 refer to __b's lanes 0..3).
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
2779 
2780 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2781 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2782 ///
2783 /// \headerfile <x86intrin.h>
2784 ///
2785 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2786 ///
2787 /// \param __a
2788 ///    A 128-bit vector of [4 x float]. \n
2789 ///    Bits [31:0] are written to bits [31:0] of the destination.  \n
2790 ///    Bits [63:32] are written to bits [95:64] of the destination.
2791 /// \param __b
2792 ///    A 128-bit vector of [4 x float]. \n
2793 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
2794 ///    Bits [63:32] are written to bits [127:96] of the destination.
2795 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpacklo_ps(__m128 __a, __m128 __b) {
  // Interleave the low halves: result = { a0, b0, a1, b1 }
  // (shuffle indices 4..7 refer to __b's lanes 0..3).
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}
2800 
2801 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2802 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
2803 ///    96 bits are set to the upper 96 bits of the first parameter.
2804 ///
2805 /// \headerfile <x86intrin.h>
2806 ///
2807 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2808 ///    instruction.
2809 ///
2810 /// \param __a
2811 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2812 ///    written to the upper 96 bits of the result.
2813 /// \param __b
2814 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2815 ///    written to the lower 32 bits of the result.
2816 /// \returns A 128-bit floating-point vector of [4 x float].
2817 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2818 _mm_move_ss(__m128 __a, __m128 __b) {
2819   __a[0] = __b[0];
2820   return __a;
2821 }
2822 
2823 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2824 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
2825 ///    64 bits are set to the upper 64 bits of the first parameter.
2826 ///
2827 /// \headerfile <x86intrin.h>
2828 ///
2829 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2830 ///
2831 /// \param __a
2832 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2833 ///    written to the upper 64 bits of the result.
2834 /// \param __b
2835 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2836 ///    written to the lower 64 bits of the result.
2837 /// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movehl_ps(__m128 __a, __m128 __b) {
  // result = { b2, b3, a2, a3 }: __b's high half to the low half of the
  // result, __a's high half kept in place (indices 4..7 select from __b).
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
}
2842 
2843 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2844 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
2845 ///    64 bits are set to the lower 64 bits of the second parameter.
2846 ///
2847 /// \headerfile <x86intrin.h>
2848 ///
2849 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2850 ///
2851 /// \param __a
2852 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2853 ///    written to the lower 64 bits of the result.
2854 /// \param __b
2855 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2856 ///    written to the upper 64 bits of the result.
2857 /// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movelh_ps(__m128 __a, __m128 __b) {
  // result = { a0, a1, b0, b1 }: __a's low half kept in place, __b's low
  // half moved to the high half (indices 4..7 select from __b).
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
}
2862 
2863 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2864 ///    float].
2865 ///
2866 /// \headerfile <x86intrin.h>
2867 ///
2868 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2869 ///
2870 /// \param __a
2871 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2872 ///    from the corresponding elements in this operand.
2873 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2874 ///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi16_ps(__m64 __a)
{
  // Lanewise signed i16 -> float conversion (exact: every i16 value is
  // representable in a float).
  return __builtin_convertvector((__v4hi)__a, __v4sf);
}
2880 
2881 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2882 ///    128-bit vector of [4 x float].
2883 ///
2884 /// \headerfile <x86intrin.h>
2885 ///
2886 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2887 ///
2888 /// \param __a
2889 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2890 ///    destination are copied from the corresponding elements in this operand.
2891 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2892 ///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu16_ps(__m64 __a)
{
  // Lanewise unsigned u16 -> float conversion (__v4hu is the unsigned
  // element type, so values 0x8000..0xFFFF convert as positive numbers).
  return __builtin_convertvector((__v4hu)__a, __v4sf);
}
2898 
2899 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2900 ///    into a 128-bit vector of [4 x float].
2901 ///
2902 /// \headerfile <x86intrin.h>
2903 ///
2904 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2905 ///
2906 /// \param __a
2907 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2908 ///    from the corresponding lower 4 elements in this operand.
2909 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2910 ///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi8_ps(__m64 __a)
{
  // Select the low four signed 8-bit elements (__v8qs is signed char), then
  // convert them lanewise to float. The zero second shuffle operand is
  // never selected; it only satisfies __builtin_shufflevector's arity.
  return __builtin_convertvector(
      __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
                              0, 1, 2, 3), __v4sf);
}
2918 
2919 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2920 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2921 ///
2922 /// \headerfile <x86intrin.h>
2923 ///
2924 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2925 ///
2926 /// \param __a
2927 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2928 ///    destination are copied from the corresponding lower 4 elements in this
2929 ///    operand.
2930 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2931 ///    values from the source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu8_ps(__m64 __a)
{
  // Select the low four unsigned 8-bit elements (__v8qu), then convert them
  // lanewise to float. The zero second shuffle operand is never selected;
  // it only satisfies __builtin_shufflevector's arity.
  return __builtin_convertvector(
      __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
                              0, 1, 2, 3), __v4sf);
}
2939 
2940 /// Converts the two 32-bit signed integer values from each 64-bit vector
2941 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2942 ///
2943 /// \headerfile <x86intrin.h>
2944 ///
2945 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2946 ///
2947 /// \param __a
2948 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2949 ///    copied from the elements in this operand.
2950 /// \param __b
2951 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2952 ///    copied from the elements in this operand.
2953 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2954 ///    copied and converted values from the first operand. The upper 64 bits
2955 ///    contain the copied and converted values from the second operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
  // Concatenate the two [2 x i32] vectors ({ a0, a1, b0, b1 }) and convert
  // all four lanes to float in one step.
  return __builtin_convertvector(
      __builtin_shufflevector((__v2si)__a, (__v2si)__b,
                              0, 1, 2, 3), __v4sf);
}
2963 
2964 /// Converts each single-precision floating-point element of a 128-bit
2965 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2966 ///    packs the results into a 64-bit integer vector of [4 x i16].
2967 ///
2968 ///    If the floating-point element is NaN or infinity, or if the
2969 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2970 ///    it is converted to 0x8000. Otherwise if the floating-point element is
2971 ///    greater than 0x7FFF, it is converted to 0x7FFF.
2972 ///
2973 /// \headerfile <x86intrin.h>
2974 ///
2975 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2976 ///
2977 /// \param __a
2978 ///    A 128-bit floating-point vector of [4 x float].
2979 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2980 ///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi16(__m128 __a)
{
  // CVTPS2DQ converts the four floats to i32 (using the current rounding
  // mode), then PACKSSDW saturates them to i16; the zero second pack
  // operand fills lanes that __trunc64 discards.
  return __trunc64(__builtin_ia32_packssdw128(
      (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
}
2987 
2988 /// Converts each single-precision floating-point element of a 128-bit
2989 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
2990 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
2991 ///    [8 x i8]. The upper 32 bits of the vector are set to 0.
2992 ///
2993 ///    If the floating-point element is NaN or infinity, or if the
2994 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2995 ///    is converted to 0x80. Otherwise if the floating-point element is greater
2996 ///    than 0x7F, it is converted to 0x7F.
2997 ///
2998 /// \headerfile <x86intrin.h>
2999 ///
3000 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
3001 ///
3002 /// \param __a
3003 ///    128-bit floating-point vector of [4 x float].
3004 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
///    converted values and the upper 32 bits are set to zero.
3006 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
3007 _mm_cvtps_pi8(__m128 __a)
3008 {
3009   __m64 __b, __c;
3010 
3011   __b = _mm_cvtps_pi16(__a);
3012   __c = _mm_setzero_si64();
3013 
3014   return _mm_packs_pi16(__b, __c);
3015 }
3016 
3017 /// Extracts the sign bits from each single-precision floating-point
3018 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
3019 ///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
3020 ///    to zero.
3021 ///
3022 /// \headerfile <x86intrin.h>
3023 ///
3024 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3025 ///
3026 /// \param __a
3027 ///    A 128-bit floating-point vector of [4 x float].
3028 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3029 ///    single-precision floating-point element of the parameter. Bits [31:4] are
3030 ///    set to zero.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_ps(__m128 __a)
{
  // MOVMSKPS gathers the sign bit of each float lane into bits [3:0];
  // the builtin's result already has bits [31:4] clear.
  return __builtin_ia32_movmskps((__v4sf)__a);
}
3036 
/* Compare: predicate immediates for the _mm_cmp_* intrinsics' 5-bit
   comparison-operation operand (values 0x00-0x07 are the SSE subset). */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
3046 
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    The _CMP_* macros defined earlier in this header provide named values
///    for these predicates.
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3081 
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
///    NOTE(review): only the lowest elements (bits [31:0]) are compared; per
///    CMPSS semantics the upper 96 bits of the result appear to be copied
///    from operand \a a -- confirm against the Intel SDM.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    The _CMP_* macros defined earlier in this header provide named values
///    for these predicates.
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3116 
/* Attribute shorthand for declaring 16-byte-aligned objects. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Builds the 8-bit immediate used by the shuffle intrinsics: each argument
   is a 2-bit element selector, with z choosing the highest result element
   down to w choosing the lowest. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* MXCSR exception status flags (sticky bits, cleared by software). */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* MXCSR exception mask bits; a set bit suppresses the corresponding
   exception. */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* MXCSR rounding-control field values. */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* MXCSR flush-to-zero control. */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)

/* Accessors for the MXCSR fields above: each GET extracts one field from
   _mm_getcsr(), and each SET does a read-modify-write of that field only. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
3156 
/* Transposes, in place, the 4x4 matrix of floats held in row0..row3 (each a
   __m128 lvalue; the arguments are both inputs and outputs).  The first two
   unpack stages interleave matching halves of the row pairs, and the
   movelh/movehl stage recombines 64-bit halves into the transposed rows. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
3169 
/* Aliases for compatibility. */
/* Legacy MMX-era _m_* spellings of the _mm_* intrinsics defined earlier in
   this header. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Catch-all: rewrites any remaining _m_<name> spelling to _mm_<name>. */
#define _m_ _mm_
3185 
/* Tear down helper macros that are private to this header so they do not
   leak into user code. */
#undef __trunc64
#undef __zext128
#undef __anyext128
#undef __zeroupper64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS_SSE2
#undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
#include <emmintrin.h>
#endif
3199 
3200 #endif /* __XMMINTRIN_H */
3201