xref: /llvm-project/clang/lib/Headers/mmintrin.h (revision c9552283c0bf277eba490cde9fd913510f4111c0)
1 /*===---- mmintrin.h - MMX intrinsics --------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
#ifndef __MMINTRIN_H
#define __MMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

/* The MMX register type: 64 bits, 8-byte aligned. */
typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));

/* Signed 64-bit lane layouts: 1 x i64, 2 x i32, 4 x i16, 8 x i8. */
typedef long long __v1di __attribute__((__vector_size__(8)));
typedef int __v2si __attribute__((__vector_size__(8)));
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));

/* Unsigned types */
typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8)));
typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
typedef unsigned short __v4hu __attribute__((__vector_size__(8)));
typedef unsigned char __v8qu __attribute__((__vector_size__(8)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v8qs __attribute__((__vector_size__(8)));

/* SSE/SSE2 types (the MMX operations below are implemented on 128-bit
 * vectors and truncated back to 64 bits). */
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#endif

/* constexpr variant is only available under C++11 or later. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
#else
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
#endif

/* __trunc64: keep only the low 64 bits of a 128-bit vector.
 * __anyext128: place a 64-bit vector in the low half of a 128-bit vector;
 * the upper lanes are unspecified (-1 shuffle indices = "don't care"). */
#define __trunc64(x)                                                           \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __anyext128(x)                                                         \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, -1, -1)
63 
/// Clears the MMX state by setting the state of the x87 stack registers
///    to empty.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EMMS </c> instruction.
///
/// NOTE: unlike the other functions in this header (which only require
/// SSE2), this one carries its own attribute list with the "mmx" target
/// feature, because it emits the actual EMMS instruction.
static __inline__ void __attribute__((__always_inline__, __nodebug__,
                                      __target__("mmx,no-evex512")))
_mm_empty(void) {
  __builtin_ia32_emms();
}
76 
77 /// Constructs a 64-bit integer vector, setting the lower 32 bits to the
78 ///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
79 ///
80 /// \headerfile <x86intrin.h>
81 ///
82 /// This intrinsic corresponds to the <c> MOVD </c> instruction.
83 ///
84 /// \param __i
85 ///    A 32-bit integer value.
86 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
87 ///    parameter. The upper 32 bits are set to 0.
88 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
89 _mm_cvtsi32_si64(int __i)
90 {
91     return __extension__ (__m64)(__v2si){__i, 0};
92 }
93 
94 /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
95 ///    signed integer.
96 ///
97 /// \headerfile <x86intrin.h>
98 ///
99 /// This intrinsic corresponds to the <c> MOVD </c> instruction.
100 ///
101 /// \param __m
102 ///    A 64-bit integer vector.
103 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
104 ///    parameter.
105 static __inline__ int __DEFAULT_FN_ATTRS_SSE2
106 _mm_cvtsi64_si32(__m64 __m)
107 {
108     return ((__v2si)__m)[0];
109 }
110 
111 /// Casts a 64-bit signed integer value into a 64-bit integer vector.
112 ///
113 /// \headerfile <x86intrin.h>
114 ///
115 /// This intrinsic corresponds to the <c> MOVQ </c> instruction.
116 ///
117 /// \param __i
118 ///    A 64-bit signed integer.
119 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
120 ///    parameter.
121 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
122 _mm_cvtsi64_m64(long long __i)
123 {
124     return (__m64)__i;
125 }
126 
127 /// Casts a 64-bit integer vector into a 64-bit signed integer value.
128 ///
129 /// \headerfile <x86intrin.h>
130 ///
131 /// This intrinsic corresponds to the <c> MOVQ </c> instruction.
132 ///
133 /// \param __m
134 ///    A 64-bit integer vector.
135 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
136 ///    parameter.
137 static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
138 _mm_cvtm64_si64(__m64 __m)
139 {
140     return (long long)__m;
141 }
142 
/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
///    vector parameters of [4 x i16] into 8-bit signed integer values, and
///    constructs a 64-bit integer vector of [8 x i8] as the result.
///
///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
///    less than 0x80 are saturated to 0x80.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
///
/// \param __m1
///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
///    written to the lower 32 bits of the result.
/// \param __m2
///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
///    written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_packs_pi16(__m64 __m1, __m64 __m2)
{
    // Concatenate __m1/__m2 into one 128-bit vector, pack with the SSE2
    // builtin (second operand is don't-care zeros), then truncate the
    // result back to the low 64 bits.
    return __trunc64(__builtin_ia32_packsswb128(
        (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
}
168 
/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
///    vector parameters of [2 x i32] into 16-bit signed integer values, and
///    constructs a 64-bit integer vector of [4 x i16] as the result.
///
///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
///    values less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
///
/// \param __m1
///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
///    written to the lower 32 bits of the result.
/// \param __m2
///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
///    written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_packs_pi32(__m64 __m1, __m64 __m2)
{
    // Concatenate __m1/__m2 into a 128-bit vector, pack via SSE2, and keep
    // only the low 64 bits of the result.
    return __trunc64(__builtin_ia32_packssdw128(
        (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
}
194 
/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
///    vector parameters of [4 x i16] into 8-bit unsigned integer values, and
///    constructs a 64-bit integer vector of [8 x i8] as the result.
///
///    Values greater than 0xFF are saturated to 0xFF. Values less than 0 are
///    saturated to 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
///
/// \param __m1
///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
///    written to the lower 32 bits of the result.
/// \param __m2
///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
///    written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_packs_pu16(__m64 __m1, __m64 __m2)
{
    // Concatenate __m1/__m2 into a 128-bit vector, pack via SSE2, and keep
    // only the low 64 bits of the result.
    return __trunc64(__builtin_ia32_packuswb128(
        (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
}
220 
221 /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
222 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
223 ///
224 /// \headerfile <x86intrin.h>
225 ///
226 /// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
227 ///
228 /// \param __m1
229 ///    A 64-bit integer vector of [8 x i8]. \n
230 ///    Bits [39:32] are written to bits [7:0] of the result. \n
231 ///    Bits [47:40] are written to bits [23:16] of the result. \n
232 ///    Bits [55:48] are written to bits [39:32] of the result. \n
233 ///    Bits [63:56] are written to bits [55:48] of the result.
234 /// \param __m2
235 ///    A 64-bit integer vector of [8 x i8].
236 ///    Bits [39:32] are written to bits [15:8] of the result. \n
237 ///    Bits [47:40] are written to bits [31:24] of the result. \n
238 ///    Bits [55:48] are written to bits [47:40] of the result. \n
239 ///    Bits [63:56] are written to bits [63:56] of the result.
240 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
241 ///    values.
242 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
243 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
244 {
245     return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
246                                           4, 12, 5, 13, 6, 14, 7, 15);
247 }
248 
249 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
250 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
251 ///
252 /// \headerfile <x86intrin.h>
253 ///
254 /// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
255 ///
256 /// \param __m1
257 ///    A 64-bit integer vector of [4 x i16].
258 ///    Bits [47:32] are written to bits [15:0] of the result. \n
259 ///    Bits [63:48] are written to bits [47:32] of the result.
260 /// \param __m2
261 ///    A 64-bit integer vector of [4 x i16].
262 ///    Bits [47:32] are written to bits [31:16] of the result. \n
263 ///    Bits [63:48] are written to bits [63:48] of the result.
264 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
265 ///    values.
266 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
267 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
268 {
269     return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
270                                           2, 6, 3, 7);
271 }
272 
273 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
274 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
275 ///
276 /// \headerfile <x86intrin.h>
277 ///
278 /// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
279 ///
280 /// \param __m1
281 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
282 ///    the lower 32 bits of the result.
283 /// \param __m2
284 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
285 ///    the upper 32 bits of the result.
286 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
287 ///    values.
288 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
289 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
290 {
291     return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
292 }
293 
294 /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
295 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
296 ///
297 /// \headerfile <x86intrin.h>
298 ///
299 /// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
300 ///
301 /// \param __m1
302 ///    A 64-bit integer vector of [8 x i8].
303 ///    Bits [7:0] are written to bits [7:0] of the result. \n
304 ///    Bits [15:8] are written to bits [23:16] of the result. \n
305 ///    Bits [23:16] are written to bits [39:32] of the result. \n
306 ///    Bits [31:24] are written to bits [55:48] of the result.
307 /// \param __m2
308 ///    A 64-bit integer vector of [8 x i8].
309 ///    Bits [7:0] are written to bits [15:8] of the result. \n
310 ///    Bits [15:8] are written to bits [31:24] of the result. \n
311 ///    Bits [23:16] are written to bits [47:40] of the result. \n
312 ///    Bits [31:24] are written to bits [63:56] of the result.
313 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
314 ///    values.
315 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
316 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
317 {
318     return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
319                                           0, 8, 1, 9, 2, 10, 3, 11);
320 }
321 
322 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
323 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
324 ///
325 /// \headerfile <x86intrin.h>
326 ///
327 /// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
328 ///
329 /// \param __m1
330 ///    A 64-bit integer vector of [4 x i16].
331 ///    Bits [15:0] are written to bits [15:0] of the result. \n
332 ///    Bits [31:16] are written to bits [47:32] of the result.
333 /// \param __m2
334 ///    A 64-bit integer vector of [4 x i16].
335 ///    Bits [15:0] are written to bits [31:16] of the result. \n
336 ///    Bits [31:16] are written to bits [63:48] of the result.
337 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
338 ///    values.
339 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
340 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
341 {
342     return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
343                                           0, 4, 1, 5);
344 }
345 
346 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
347 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
348 ///
349 /// \headerfile <x86intrin.h>
350 ///
351 /// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
352 ///
353 /// \param __m1
354 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
355 ///    the lower 32 bits of the result.
356 /// \param __m2
357 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
358 ///    the upper 32 bits of the result.
359 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
360 ///    values.
361 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
362 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
363 {
364     return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
365 }
366 
367 /// Adds each 8-bit integer element of the first 64-bit integer vector
368 ///    of [8 x i8] to the corresponding 8-bit integer element of the second
369 ///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
370 ///    packed into a 64-bit integer vector of [8 x i8].
371 ///
372 /// \headerfile <x86intrin.h>
373 ///
374 /// This intrinsic corresponds to the <c> PADDB </c> instruction.
375 ///
376 /// \param __m1
377 ///    A 64-bit integer vector of [8 x i8].
378 /// \param __m2
379 ///    A 64-bit integer vector of [8 x i8].
380 /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
381 ///    parameters.
382 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
383 _mm_add_pi8(__m64 __m1, __m64 __m2)
384 {
385     return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
386 }
387 
388 /// Adds each 16-bit integer element of the first 64-bit integer vector
389 ///    of [4 x i16] to the corresponding 16-bit integer element of the second
390 ///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
391 ///    packed into a 64-bit integer vector of [4 x i16].
392 ///
393 /// \headerfile <x86intrin.h>
394 ///
395 /// This intrinsic corresponds to the <c> PADDW </c> instruction.
396 ///
397 /// \param __m1
398 ///    A 64-bit integer vector of [4 x i16].
399 /// \param __m2
400 ///    A 64-bit integer vector of [4 x i16].
401 /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
402 ///    parameters.
403 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
404 _mm_add_pi16(__m64 __m1, __m64 __m2)
405 {
406     return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
407 }
408 
409 /// Adds each 32-bit integer element of the first 64-bit integer vector
410 ///    of [2 x i32] to the corresponding 32-bit integer element of the second
411 ///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
412 ///    packed into a 64-bit integer vector of [2 x i32].
413 ///
414 /// \headerfile <x86intrin.h>
415 ///
416 /// This intrinsic corresponds to the <c> PADDD </c> instruction.
417 ///
418 /// \param __m1
419 ///    A 64-bit integer vector of [2 x i32].
420 /// \param __m2
421 ///    A 64-bit integer vector of [2 x i32].
422 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
423 ///    parameters.
424 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
425 _mm_add_pi32(__m64 __m1, __m64 __m2)
426 {
427     return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
428 }
429 
430 /// Adds, with saturation, each 8-bit signed integer element of the first
431 ///    64-bit integer vector of [8 x i8] to the corresponding 8-bit signed
432 ///    integer element of the second 64-bit integer vector of [8 x i8].
433 ///
434 ///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
435 ///    less than 0x80 are saturated to 0x80. The results are packed into a
436 ///    64-bit integer vector of [8 x i8].
437 ///
438 /// \headerfile <x86intrin.h>
439 ///
440 /// This intrinsic corresponds to the <c> PADDSB </c> instruction.
441 ///
442 /// \param __m1
443 ///    A 64-bit integer vector of [8 x i8].
444 /// \param __m2
445 ///    A 64-bit integer vector of [8 x i8].
446 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
447 ///    of both parameters.
448 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
449 _mm_adds_pi8(__m64 __m1, __m64 __m2)
450 {
451     return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2);
452 }
453 
454 /// Adds, with saturation, each 16-bit signed integer element of the first
455 ///    64-bit integer vector of [4 x i16] to the corresponding 16-bit signed
456 ///    integer element of the second 64-bit integer vector of [4 x i16].
457 ///
458 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
459 ///    less than 0x8000 are saturated to 0x8000. The results are packed into a
460 ///    64-bit integer vector of [4 x i16].
461 ///
462 /// \headerfile <x86intrin.h>
463 ///
464 /// This intrinsic corresponds to the <c> PADDSW </c> instruction.
465 ///
466 /// \param __m1
467 ///    A 64-bit integer vector of [4 x i16].
468 /// \param __m2
469 ///    A 64-bit integer vector of [4 x i16].
470 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
471 ///    of both parameters.
472 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
473 _mm_adds_pi16(__m64 __m1, __m64 __m2)
474 {
475     return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2);
476 }
477 
478 /// Adds, with saturation, each 8-bit unsigned integer element of the first
479 ///    64-bit integer vector of [8 x i8] to the corresponding 8-bit unsigned
480 ///    integer element of the second 64-bit integer vector of [8 x i8].
481 ///
482 ///    Sums greater than 0xFF are saturated to 0xFF. The results are packed
483 ///    into a 64-bit integer vector of [8 x i8].
484 ///
485 /// \headerfile <x86intrin.h>
486 ///
487 /// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
488 ///
489 /// \param __m1
490 ///    A 64-bit integer vector of [8 x i8].
491 /// \param __m2
492 ///    A 64-bit integer vector of [8 x i8].
493 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
494 ///    unsigned sums of both parameters.
495 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
496 _mm_adds_pu8(__m64 __m1, __m64 __m2)
497 {
498     return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2);
499 }
500 
501 /// Adds, with saturation, each 16-bit unsigned integer element of the first
502 ///    64-bit integer vector of [4 x i16] to the corresponding 16-bit unsigned
503 ///    integer element of the second 64-bit integer vector of [4 x i16].
504 ///
505 ///    Sums greater than 0xFFFF are saturated to 0xFFFF. The results are packed
506 ///    into a 64-bit integer vector of [4 x i16].
507 ///
508 /// \headerfile <x86intrin.h>
509 ///
510 /// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
511 ///
512 /// \param __m1
513 ///    A 64-bit integer vector of [4 x i16].
514 /// \param __m2
515 ///    A 64-bit integer vector of [4 x i16].
516 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
517 ///    unsigned sums of both parameters.
518 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
519 _mm_adds_pu16(__m64 __m1, __m64 __m2)
520 {
521     return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2);
522 }
523 
524 /// Subtracts each 8-bit integer element of the second 64-bit integer
525 ///    vector of [8 x i8] from the corresponding 8-bit integer element of the
526 ///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
527 ///    are packed into a 64-bit integer vector of [8 x i8].
528 ///
529 /// \headerfile <x86intrin.h>
530 ///
531 /// This intrinsic corresponds to the <c> PSUBB </c> instruction.
532 ///
533 /// \param __m1
534 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
535 /// \param __m2
536 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
537 /// \returns A 64-bit integer vector of [8 x i8] containing the differences of
538 ///    both parameters.
539 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
540 _mm_sub_pi8(__m64 __m1, __m64 __m2)
541 {
542     return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
543 }
544 
545 /// Subtracts each 16-bit integer element of the second 64-bit integer
546 ///    vector of [4 x i16] from the corresponding 16-bit integer element of the
547 ///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
548 ///    results are packed into a 64-bit integer vector of [4 x i16].
549 ///
550 /// \headerfile <x86intrin.h>
551 ///
552 /// This intrinsic corresponds to the <c> PSUBW </c> instruction.
553 ///
554 /// \param __m1
555 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
556 /// \param __m2
557 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
558 /// \returns A 64-bit integer vector of [4 x i16] containing the differences of
559 ///    both parameters.
560 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
561 _mm_sub_pi16(__m64 __m1, __m64 __m2)
562 {
563     return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
564 }
565 
566 /// Subtracts each 32-bit integer element of the second 64-bit integer
567 ///    vector of [2 x i32] from the corresponding 32-bit integer element of the
568 ///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
569 ///    results are packed into a 64-bit integer vector of [2 x i32].
570 ///
571 /// \headerfile <x86intrin.h>
572 ///
573 /// This intrinsic corresponds to the <c> PSUBD </c> instruction.
574 ///
575 /// \param __m1
576 ///    A 64-bit integer vector of [2 x i32] containing the minuends.
577 /// \param __m2
578 ///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
579 /// \returns A 64-bit integer vector of [2 x i32] containing the differences of
580 ///    both parameters.
581 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
582 _mm_sub_pi32(__m64 __m1, __m64 __m2)
583 {
584     return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
585 }
586 
587 /// Subtracts, with saturation, each 8-bit signed integer element of the second
588 ///    64-bit integer vector of [8 x i8] from the corresponding 8-bit signed
589 ///    integer element of the first 64-bit integer vector of [8 x i8].
590 ///
591 ///    Positive results greater than 0x7F are saturated to 0x7F. Negative
592 ///    results less than 0x80 are saturated to 0x80. The results are packed
593 ///    into a 64-bit integer vector of [8 x i8].
594 ///
595 /// \headerfile <x86intrin.h>
596 ///
597 /// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
598 ///
599 /// \param __m1
600 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
601 /// \param __m2
602 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
603 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
604 ///    differences of both parameters.
605 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
606 _mm_subs_pi8(__m64 __m1, __m64 __m2)
607 {
608     return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2);
609 }
610 
611 /// Subtracts, with saturation, each 16-bit signed integer element of the
612 ///    second 64-bit integer vector of [4 x i16] from the corresponding 16-bit
613 ///    signed integer element of the first 64-bit integer vector of [4 x i16].
614 ///
615 ///    Positive results greater than 0x7FFF are saturated to 0x7FFF. Negative
616 ///    results less than 0x8000 are saturated to 0x8000. The results are packed
617 ///    into a 64-bit integer vector of [4 x i16].
618 ///
619 /// \headerfile <x86intrin.h>
620 ///
621 /// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
622 ///
623 /// \param __m1
624 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
625 /// \param __m2
626 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
627 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
628 ///    differences of both parameters.
629 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
630 _mm_subs_pi16(__m64 __m1, __m64 __m2)
631 {
632     return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2);
633 }
634 
635 /// Subtracts each 8-bit unsigned integer element of the second 64-bit
636 ///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
637 ///    element of the first 64-bit integer vector of [8 x i8].
638 ///
639 ///    If an element of the first vector is less than the corresponding element
640 ///    of the second vector, the result is saturated to 0. The results are
641 ///    packed into a 64-bit integer vector of [8 x i8].
642 ///
643 /// \headerfile <x86intrin.h>
644 ///
645 /// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
646 ///
647 /// \param __m1
648 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
649 /// \param __m2
650 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
651 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
652 ///    differences of both parameters.
653 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
654 _mm_subs_pu8(__m64 __m1, __m64 __m2)
655 {
656     return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2);
657 }
658 
659 /// Subtracts each 16-bit unsigned integer element of the second 64-bit
660 ///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
661 ///    integer element of the first 64-bit integer vector of [4 x i16].
662 ///
663 ///    If an element of the first vector is less than the corresponding element
664 ///    of the second vector, the result is saturated to 0. The results are
665 ///    packed into a 64-bit integer vector of [4 x i16].
666 ///
667 /// \headerfile <x86intrin.h>
668 ///
669 /// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
670 ///
671 /// \param __m1
672 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
673 /// \param __m2
674 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
675 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
676 ///    differences of both parameters.
677 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
678 _mm_subs_pu16(__m64 __m1, __m64 __m2)
679 {
680     return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2);
681 }
682 
/// Multiplies each 16-bit signed integer element of the first 64-bit
///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
///    element of the second 64-bit integer vector of [4 x i16] and get four
///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
///    The lower 32 bits of these two sums are packed into a 64-bit integer
///    vector of [2 x i32].
///
///    For example, bits [15:0] of both parameters are multiplied, bits [31:16]
///    of both parameters are multiplied, and the sum of both results is written
///    to bits [31:0] of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
///
/// \param __m1
///    A 64-bit integer vector of [4 x i16].
/// \param __m2
///    A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
///    products of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
    // Widen each operand to 128 bits (upper lanes unspecified), run the SSE2
    // builtin, then truncate back to the low 64 bits, which depend only on
    // the low 64 bits of the inputs.
    return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
                                               (__v8hi)__anyext128(__m2)));
}
710 
/// Multiplies each 16-bit signed integer element of the first 64-bit
///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
///
/// \param __m1
///    A 64-bit integer vector of [4 x i16].
/// \param __m2
///    A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
///    of the products of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{
    // Widen to 128 bits, multiply-high via SSE2, then keep the low 64 bits,
    // which depend only on the low 64 bits of the inputs.
    return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
                                              (__v8hi)__anyext128(__m2)));
}
732 
733 /// Multiplies each 16-bit signed integer element of the first 64-bit
734 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
735 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
736 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
737 ///
738 /// \headerfile <x86intrin.h>
739 ///
740 /// This intrinsic corresponds to the <c> PMULLW </c> instruction.
741 ///
742 /// \param __m1
743 ///    A 64-bit integer vector of [4 x i16].
744 /// \param __m2
745 ///    A 64-bit integer vector of [4 x i16].
746 /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
747 ///    of the products of both parameters.
748 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
749 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
750 {
751     return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
752 }
753 
754 /// Left-shifts each 16-bit signed integer element of the first
755 ///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
756 ///    of bits specified by the second parameter, which is a 64-bit integer. The
757 ///    lower 16 bits of the results are packed into a 64-bit integer vector of
758 ///    [4 x i16].
759 ///
760 /// \headerfile <x86intrin.h>
761 ///
762 /// This intrinsic corresponds to the <c> PSLLW </c> instruction.
763 ///
764 /// \param __m
765 ///    A 64-bit integer vector of [4 x i16].
766 /// \param __count
767 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
768 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
769 ///    values. If \a __count is greater or equal to 16, the result is set to all
770 ///    0.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_pi16(__m64 __m, __m64 __count)
{
    /* 128-bit PSLLW on the widened operands; keep only the low 64 bits. */
    return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
                                             (__v8hi)__anyext128(__count)));
}
777 
778 /// Left-shifts each 16-bit signed integer element of a 64-bit integer
779 ///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
780 ///    The lower 16 bits of the results are packed into a 64-bit integer vector
781 ///    of [4 x i16].
782 ///
783 /// \headerfile <x86intrin.h>
784 ///
785 /// This intrinsic corresponds to the <c> PSLLW </c> instruction.
786 ///
787 /// \param __m
788 ///    A 64-bit integer vector of [4 x i16].
789 /// \param __count
790 ///    A 32-bit integer value.
791 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
792 ///    values. If \a __count is greater or equal to 16, the result is set to all
793 ///    0.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_pi16(__m64 __m, int __count)
{
    /* Immediate-count form: widen, shift with 128-bit PSLLW, truncate. */
    return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m),
                                              __count));
}
800 
801 /// Left-shifts each 32-bit signed integer element of the first
802 ///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
803 ///    of bits specified by the second parameter, which is a 64-bit integer. The
804 ///    lower 32 bits of the results are packed into a 64-bit integer vector of
805 ///    [2 x i32].
806 ///
807 /// \headerfile <x86intrin.h>
808 ///
809 /// This intrinsic corresponds to the <c> PSLLD </c> instruction.
810 ///
811 /// \param __m
812 ///    A 64-bit integer vector of [2 x i32].
813 /// \param __count
814 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
815 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
816 ///    values. If \a __count is greater or equal to 32, the result is set to all
817 ///    0.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_pi32(__m64 __m, __m64 __count)
{
    /* 128-bit PSLLD on the widened operands; keep only the low 64 bits. */
    return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
                                             (__v4si)__anyext128(__count)));
}
824 
825 /// Left-shifts each 32-bit signed integer element of a 64-bit integer
826 ///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
827 ///    The lower 32 bits of the results are packed into a 64-bit integer vector
828 ///    of [2 x i32].
829 ///
830 /// \headerfile <x86intrin.h>
831 ///
832 /// This intrinsic corresponds to the <c> PSLLD </c> instruction.
833 ///
834 /// \param __m
835 ///    A 64-bit integer vector of [2 x i32].
836 /// \param __count
837 ///    A 32-bit integer value.
838 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
839 ///    values. If \a __count is greater or equal to 32, the result is set to all
840 ///    0.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_pi32(__m64 __m, int __count)
{
    /* Immediate-count form: widen, shift with 128-bit PSLLD, truncate. */
    return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m),
                                              __count));
}
847 
848 /// Left-shifts the first 64-bit integer parameter by the number of bits
849 ///    specified by the second 64-bit integer parameter. The lower 64 bits of
850 ///    result are returned.
851 ///
852 /// \headerfile <x86intrin.h>
853 ///
854 /// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
855 ///
856 /// \param __m
857 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
858 /// \param __count
859 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
860 /// \returns A 64-bit integer vector containing the left-shifted value. If
861 ///     \a __count is greater or equal to 64, the result is set to 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_si64(__m64 __m, __m64 __count)
{
    /* 64-bit left shift via the 128-bit PSLLQ on the widened value. */
    return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m),
                                             (__v2di)__anyext128(__count)));
}
868 
869 /// Left-shifts the first parameter, which is a 64-bit integer, by the
870 ///    number of bits specified by the second parameter, which is a 32-bit
871 ///    integer. The lower 64 bits of result are returned.
872 ///
873 /// \headerfile <x86intrin.h>
874 ///
875 /// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
876 ///
877 /// \param __m
878 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
879 /// \param __count
880 ///    A 32-bit integer value.
881 /// \returns A 64-bit integer vector containing the left-shifted value. If
882 ///     \a __count is greater or equal to 64, the result is set to 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_si64(__m64 __m, int __count)
{
    /* Immediate-count form: widen, shift with 128-bit PSLLQ, truncate. */
    return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
                                              __count));
}
889 
890 /// Right-shifts each 16-bit integer element of the first parameter,
891 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
892 ///    specified by the second parameter, which is a 64-bit integer.
893 ///
894 ///    High-order bits are filled with the sign bit of the initial value of each
895 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
896 ///    vector of [4 x i16].
897 ///
898 /// \headerfile <x86intrin.h>
899 ///
900 /// This intrinsic corresponds to the <c> PSRAW </c> instruction.
901 ///
902 /// \param __m
903 ///    A 64-bit integer vector of [4 x i16].
904 /// \param __count
905 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
906 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
907 ///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sra_pi16(__m64 __m, __m64 __count)
{
    /* Arithmetic shift: 128-bit PSRAW on the widened operands, then keep
       the low 64 bits. */
    return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
                                             (__v8hi)__anyext128(__count)));
}
914 
915 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
916 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
917 ///
918 ///    High-order bits are filled with the sign bit of the initial value of each
919 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
920 ///    vector of [4 x i16].
921 ///
922 /// \headerfile <x86intrin.h>
923 ///
924 /// This intrinsic corresponds to the <c> PSRAW </c> instruction.
925 ///
926 /// \param __m
927 ///    A 64-bit integer vector of [4 x i16].
928 /// \param __count
929 ///    A 32-bit integer value.
930 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
931 ///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srai_pi16(__m64 __m, int __count)
{
    /* Immediate-count form: widen, shift with 128-bit PSRAW, truncate. */
    return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m),
                                              __count));
}
938 
939 /// Right-shifts each 32-bit integer element of the first parameter,
940 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
941 ///    specified by the second parameter, which is a 64-bit integer.
942 ///
943 ///    High-order bits are filled with the sign bit of the initial value of each
944 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
945 ///    vector of [2 x i32].
946 ///
947 /// \headerfile <x86intrin.h>
948 ///
949 /// This intrinsic corresponds to the <c> PSRAD </c> instruction.
950 ///
951 /// \param __m
952 ///    A 64-bit integer vector of [2 x i32].
953 /// \param __count
954 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
955 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
956 ///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sra_pi32(__m64 __m, __m64 __count)
{
    /* Arithmetic shift: 128-bit PSRAD on the widened operands, then keep
       the low 64 bits. */
    return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
                                             (__v4si)__anyext128(__count)));
}
963 
964 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
965 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
966 ///
967 ///    High-order bits are filled with the sign bit of the initial value of each
968 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
969 ///    vector of [2 x i32].
970 ///
971 /// \headerfile <x86intrin.h>
972 ///
973 /// This intrinsic corresponds to the <c> PSRAD </c> instruction.
974 ///
975 /// \param __m
976 ///    A 64-bit integer vector of [2 x i32].
977 /// \param __count
978 ///    A 32-bit integer value.
979 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
980 ///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srai_pi32(__m64 __m, int __count)
{
    /* Immediate-count form: widen, shift with 128-bit PSRAD, truncate. */
    return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m),
                                              __count));
}
987 
988 /// Right-shifts each 16-bit integer element of the first parameter,
989 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
990 ///    specified by the second parameter, which is a 64-bit integer.
991 ///
992 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
993 ///    integer vector of [4 x i16].
994 ///
995 /// \headerfile <x86intrin.h>
996 ///
997 /// This intrinsic corresponds to the <c> PSRLW </c> instruction.
998 ///
999 /// \param __m
1000 ///    A 64-bit integer vector of [4 x i16].
1001 /// \param __count
1002 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1003 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
1004 ///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_pi16(__m64 __m, __m64 __count)
{
    /* Logical shift: 128-bit PSRLW on the widened operands, low 64 kept. */
    return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
                                             (__v8hi)__anyext128(__count)));
}
1011 
1012 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
1013 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
1014 ///
1015 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
1016 ///    integer vector of [4 x i16].
1017 ///
1018 /// \headerfile <x86intrin.h>
1019 ///
1020 /// This intrinsic corresponds to the <c> PSRLW </c> instruction.
1021 ///
1022 /// \param __m
1023 ///    A 64-bit integer vector of [4 x i16].
1024 /// \param __count
1025 ///    A 32-bit integer value.
1026 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
1027 ///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_pi16(__m64 __m, int __count)
{
    /* Immediate-count form: widen, shift with 128-bit PSRLW, truncate. */
    return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m),
                                              __count));
}
1034 
1035 /// Right-shifts each 32-bit integer element of the first parameter,
1036 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
1037 ///    specified by the second parameter, which is a 64-bit integer.
1038 ///
1039 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
1040 ///    integer vector of [2 x i32].
1041 ///
1042 /// \headerfile <x86intrin.h>
1043 ///
1044 /// This intrinsic corresponds to the <c> PSRLD </c> instruction.
1045 ///
1046 /// \param __m
1047 ///    A 64-bit integer vector of [2 x i32].
1048 /// \param __count
1049 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1050 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1051 ///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_pi32(__m64 __m, __m64 __count)
{
    /* Logical shift: 128-bit PSRLD on the widened operands, low 64 kept. */
    return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
                                             (__v4si)__anyext128(__count)));
}
1058 
1059 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
1060 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
1061 ///
1062 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
1063 ///    integer vector of [2 x i32].
1064 ///
1065 /// \headerfile <x86intrin.h>
1066 ///
1067 /// This intrinsic corresponds to the <c> PSRLD </c> instruction.
1068 ///
1069 /// \param __m
1070 ///    A 64-bit integer vector of [2 x i32].
1071 /// \param __count
1072 ///    A 32-bit integer value.
1073 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1074 ///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_pi32(__m64 __m, int __count)
{
    /* Immediate-count form: widen, shift with 128-bit PSRLD, truncate. */
    return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m),
                                              __count));
}
1081 
1082 /// Right-shifts the first 64-bit integer parameter by the number of bits
1083 ///    specified by the second 64-bit integer parameter.
1084 ///
1085 ///    High-order bits are cleared.
1086 ///
1087 /// \headerfile <x86intrin.h>
1088 ///
1089 /// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1090 ///
1091 /// \param __m
1092 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1093 /// \param __count
1094 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1095 /// \returns A 64-bit integer vector containing the right-shifted value.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_si64(__m64 __m, __m64 __count)
{
    /* 64-bit logical right shift via the 128-bit PSRLQ on the widened
       value. */
    return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
                                             (__v2di)__anyext128(__count)));
}
1102 
1103 /// Right-shifts the first parameter, which is a 64-bit integer, by the
1104 ///    number of bits specified by the second parameter, which is a 32-bit
1105 ///    integer.
1106 ///
1107 ///    High-order bits are cleared.
1108 ///
1109 /// \headerfile <x86intrin.h>
1110 ///
1111 /// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1112 ///
1113 /// \param __m
1114 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1115 /// \param __count
1116 ///    A 32-bit integer value.
1117 /// \returns A 64-bit integer vector containing the right-shifted value.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_si64(__m64 __m, int __count)
{
    /* Immediate-count form: widen, shift with 128-bit PSRLQ, truncate. */
    return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
                                              __count));
}
1124 
1125 /// Performs a bitwise AND of two 64-bit integer vectors.
1126 ///
1127 /// \headerfile <x86intrin.h>
1128 ///
1129 /// This intrinsic corresponds to the <c> PAND </c> instruction.
1130 ///
1131 /// \param __m1
1132 ///    A 64-bit integer vector.
1133 /// \param __m2
1134 ///    A 64-bit integer vector.
1135 /// \returns A 64-bit integer vector containing the bitwise AND of both
1136 ///    parameters.
1137 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1138 _mm_and_si64(__m64 __m1, __m64 __m2)
1139 {
1140     return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
1141 }
1142 
1143 /// Performs a bitwise NOT of the first 64-bit integer vector, and then
1144 ///    performs a bitwise AND of the intermediate result and the second 64-bit
1145 ///    integer vector.
1146 ///
1147 /// \headerfile <x86intrin.h>
1148 ///
1149 /// This intrinsic corresponds to the <c> PANDN </c> instruction.
1150 ///
1151 /// \param __m1
1152 ///    A 64-bit integer vector. The one's complement of this parameter is used
1153 ///    in the bitwise AND.
1154 /// \param __m2
1155 ///    A 64-bit integer vector.
1156 /// \returns A 64-bit integer vector containing the bitwise AND of the second
1157 ///    parameter and the one's complement of the first parameter.
1158 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1159 _mm_andnot_si64(__m64 __m1, __m64 __m2)
1160 {
1161     return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
1162 }
1163 
1164 /// Performs a bitwise OR of two 64-bit integer vectors.
1165 ///
1166 /// \headerfile <x86intrin.h>
1167 ///
1168 /// This intrinsic corresponds to the <c> POR </c> instruction.
1169 ///
1170 /// \param __m1
1171 ///    A 64-bit integer vector.
1172 /// \param __m2
1173 ///    A 64-bit integer vector.
1174 /// \returns A 64-bit integer vector containing the bitwise OR of both
1175 ///    parameters.
1176 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1177 _mm_or_si64(__m64 __m1, __m64 __m2)
1178 {
1179     return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
1180 }
1181 
1182 /// Performs a bitwise exclusive OR of two 64-bit integer vectors.
1183 ///
1184 /// \headerfile <x86intrin.h>
1185 ///
1186 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
1187 ///
1188 /// \param __m1
1189 ///    A 64-bit integer vector.
1190 /// \param __m2
1191 ///    A 64-bit integer vector.
1192 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
1193 ///    parameters.
1194 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1195 _mm_xor_si64(__m64 __m1, __m64 __m2)
1196 {
1197     return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
1198 }
1199 
1200 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
1201 ///    [8 x i8] to determine if the element of the first vector is equal to the
1202 ///    corresponding element of the second vector.
1203 ///
1204 ///    Each comparison returns 0 for false, 0xFF for true.
1205 ///
1206 /// \headerfile <x86intrin.h>
1207 ///
1208 /// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
1209 ///
1210 /// \param __m1
1211 ///    A 64-bit integer vector of [8 x i8].
1212 /// \param __m2
1213 ///    A 64-bit integer vector of [8 x i8].
1214 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1215 ///    results.
1216 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1217 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
1218 {
1219     return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
1220 }
1221 
1222 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
1223 ///    [4 x i16] to determine if the element of the first vector is equal to the
1224 ///    corresponding element of the second vector.
1225 ///
1226 ///    Each comparison returns 0 for false, 0xFFFF for true.
1227 ///
1228 /// \headerfile <x86intrin.h>
1229 ///
1230 /// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
1231 ///
1232 /// \param __m1
1233 ///    A 64-bit integer vector of [4 x i16].
1234 /// \param __m2
1235 ///    A 64-bit integer vector of [4 x i16].
1236 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1237 ///    results.
1238 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1239 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
1240 {
1241     return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
1242 }
1243 
1244 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
1245 ///    [2 x i32] to determine if the element of the first vector is equal to the
1246 ///    corresponding element of the second vector.
1247 ///
1248 ///    Each comparison returns 0 for false, 0xFFFFFFFF for true.
1249 ///
1250 /// \headerfile <x86intrin.h>
1251 ///
1252 /// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
1253 ///
1254 /// \param __m1
1255 ///    A 64-bit integer vector of [2 x i32].
1256 /// \param __m2
1257 ///    A 64-bit integer vector of [2 x i32].
1258 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1259 ///    results.
1260 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1261 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
1262 {
1263     return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
1264 }
1265 
1266 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
1267 ///    [8 x i8] to determine if the element of the first vector is greater than
1268 ///    the corresponding element of the second vector.
1269 ///
1270 ///    Each comparison returns 0 for false, 0xFF for true.
1271 ///
1272 /// \headerfile <x86intrin.h>
1273 ///
1274 /// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
1275 ///
1276 /// \param __m1
1277 ///    A 64-bit integer vector of [8 x i8].
1278 /// \param __m2
1279 ///    A 64-bit integer vector of [8 x i8].
1280 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1281 ///    results.
1282 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1283 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
1284 {
1285   /* This function always performs a signed comparison, but __v8qi is a char
1286      which may be signed or unsigned, so use __v8qs. */
1287     return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
1288 }
1289 
1290 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
1291 ///    [4 x i16] to determine if the element of the first vector is greater than
1292 ///    the corresponding element of the second vector.
1293 ///
1294 ///    Each comparison returns 0 for false, 0xFFFF for true.
1295 ///
1296 /// \headerfile <x86intrin.h>
1297 ///
1298 /// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
1299 ///
1300 /// \param __m1
1301 ///    A 64-bit integer vector of [4 x i16].
1302 /// \param __m2
1303 ///    A 64-bit integer vector of [4 x i16].
1304 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1305 ///    results.
1306 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1307 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
1308 {
1309     return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
1310 }
1311 
1312 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
1313 ///    [2 x i32] to determine if the element of the first vector is greater than
1314 ///    the corresponding element of the second vector.
1315 ///
1316 ///    Each comparison returns 0 for false, 0xFFFFFFFF for true.
1317 ///
1318 /// \headerfile <x86intrin.h>
1319 ///
1320 /// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
1321 ///
1322 /// \param __m1
1323 ///    A 64-bit integer vector of [2 x i32].
1324 /// \param __m2
1325 ///    A 64-bit integer vector of [2 x i32].
1326 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1327 ///    results.
1328 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1329 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
1330 {
1331     return (__m64)((__v2si)__m1 > (__v2si)__m2);
1332 }
1333 
1334 /// Constructs a 64-bit integer vector initialized to zero.
1335 ///
1336 /// \headerfile <x86intrin.h>
1337 ///
1338 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
1339 ///
1340 /// \returns An initialized 64-bit integer vector with all elements set to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setzero_si64(void) {
  /* A single zero element fills the whole 64-bit vector. */
  return __extension__(__m64){0LL};
}
1345 
1346 /// Constructs a 64-bit integer vector initialized with the specified
1347 ///    32-bit integer values.
1348 ///
1349 /// \headerfile <x86intrin.h>
1350 ///
1351 /// This intrinsic is a utility function and does not correspond to a specific
1352 ///    instruction.
1353 ///
1354 /// \param __i1
1355 ///    A 32-bit integer value used to initialize the upper 32 bits of the
1356 ///    result.
1357 /// \param __i0
1358 ///    A 32-bit integer value used to initialize the lower 32 bits of the
1359 ///    result.
1360 /// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set_pi32(int __i1, int __i0) {
  /* Vector literals list elements low to high, so __i0 fills bits [31:0]. */
  return __extension__(__m64)(__v2si){__i0, __i1};
}
1365 
1366 /// Constructs a 64-bit integer vector initialized with the specified
1367 ///    16-bit integer values.
1368 ///
1369 /// \headerfile <x86intrin.h>
1370 ///
1371 /// This intrinsic is a utility function and does not correspond to a specific
1372 ///    instruction.
1373 ///
1374 /// \param __s3
1375 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
1376 /// \param __s2
1377 ///    A 16-bit integer value used to initialize bits [47:32] of the result.
1378 /// \param __s1
1379 ///    A 16-bit integer value used to initialize bits [31:16] of the result.
1380 /// \param __s0
1381 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
1382 /// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) {
  /* Vector literals list elements low to high, so __s0 fills bits [15:0]. */
  return __extension__(__m64)(__v4hi){__s0, __s1, __s2, __s3};
}
1387 
1388 /// Constructs a 64-bit integer vector initialized with the specified
1389 ///    8-bit integer values.
1390 ///
1391 /// \headerfile <x86intrin.h>
1392 ///
1393 /// This intrinsic is a utility function and does not correspond to a specific
1394 ///    instruction.
1395 ///
1396 /// \param __b7
1397 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
1398 /// \param __b6
1399 ///    An 8-bit integer value used to initialize bits [55:48] of the result.
1400 /// \param __b5
1401 ///    An 8-bit integer value used to initialize bits [47:40] of the result.
1402 /// \param __b4
1403 ///    An 8-bit integer value used to initialize bits [39:32] of the result.
1404 /// \param __b3
1405 ///    An 8-bit integer value used to initialize bits [31:24] of the result.
1406 /// \param __b2
1407 ///    An 8-bit integer value used to initialize bits [23:16] of the result.
1408 /// \param __b1
1409 ///    An 8-bit integer value used to initialize bits [15:8] of the result.
1410 /// \param __b0
1411 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
1412 /// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
            char __b1, char __b0) {
  /* Vector literals list elements low to high, so __b0 fills bits [7:0]. */
  return __extension__(__m64)(__v8qi){__b0, __b1, __b2, __b3,
                                      __b4, __b5, __b6, __b7};
}
1419 
1420 /// Constructs a 64-bit integer vector of [2 x i32], with each of the
1421 ///    32-bit integer vector elements set to the specified 32-bit integer
1422 ///    value.
1423 ///
1424 /// \headerfile <x86intrin.h>
1425 ///
1426 /// This intrinsic is a utility function and does not correspond to a specific
1427 ///    instruction.
1428 ///
1429 /// \param __i
1430 ///    A 32-bit integer value used to initialize each vector element of the
1431 ///    result.
1432 /// \returns An initialized 64-bit integer vector of [2 x i32].
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set1_pi32(int __i) {
  /* Broadcast: delegate to _mm_set_pi32 with the value replicated. */
  return _mm_set_pi32(__i, __i);
}
1437 
1438 /// Constructs a 64-bit integer vector of [4 x i16], with each of the
1439 ///    16-bit integer vector elements set to the specified 16-bit integer
1440 ///    value.
1441 ///
1442 /// \headerfile <x86intrin.h>
1443 ///
1444 /// This intrinsic is a utility function and does not correspond to a specific
1445 ///    instruction.
1446 ///
1447 /// \param __w
1448 ///    A 16-bit integer value used to initialize each vector element of the
1449 ///    result.
1450 /// \returns An initialized 64-bit integer vector of [4 x i16].
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set1_pi16(short __w) {
  /* Broadcast: delegate to _mm_set_pi16 with the value replicated. */
  return _mm_set_pi16(__w, __w, __w, __w);
}
1455 
1456 /// Constructs a 64-bit integer vector of [8 x i8], with each of the
1457 ///    8-bit integer vector elements set to the specified 8-bit integer value.
1458 ///
1459 /// \headerfile <x86intrin.h>
1460 ///
1461 /// This intrinsic is a utility function and does not correspond to a specific
1462 ///    instruction.
1463 ///
1464 /// \param __b
1465 ///    An 8-bit integer value used to initialize each vector element of the
1466 ///    result.
1467 /// \returns An initialized 64-bit integer vector of [8 x i8].
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set1_pi8(char __b) {
  /* Broadcast: delegate to _mm_set_pi8 with the value replicated. */
  return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
}
1472 
1473 /// Constructs a 64-bit integer vector, initialized in reverse order with
1474 ///    the specified 32-bit integer values.
1475 ///
1476 /// \headerfile <x86intrin.h>
1477 ///
1478 /// This intrinsic is a utility function and does not correspond to a specific
1479 ///    instruction.
1480 ///
1481 /// \param __i0
1482 ///    A 32-bit integer value used to initialize the lower 32 bits of the
1483 ///    result.
1484 /// \param __i1
1485 ///    A 32-bit integer value used to initialize the upper 32 bits of the
1486 ///    result.
1487 /// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setr_pi32(int __i0, int __i1) {
  /* Reverse-order variant: same as _mm_set_pi32 with arguments swapped. */
  return _mm_set_pi32(__i1, __i0);
}
1492 
1493 /// Constructs a 64-bit integer vector, initialized in reverse order with
1494 ///    the specified 16-bit integer values.
1495 ///
1496 /// \headerfile <x86intrin.h>
1497 ///
1498 /// This intrinsic is a utility function and does not correspond to a specific
1499 ///    instruction.
1500 ///
1501 /// \param __w0
1502 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
1503 /// \param __w1
1504 ///    A 16-bit integer value used to initialize bits [31:16] of the result.
1505 /// \param __w2
1506 ///    A 16-bit integer value used to initialize bits [47:32] of the result.
1507 /// \param __w3
1508 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
1509 /// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  /* Reverse-order variant: same as _mm_set_pi16 with arguments reversed. */
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}
1514 
1515 /// Constructs a 64-bit integer vector, initialized in reverse order with
1516 ///    the specified 8-bit integer values.
1517 ///
1518 /// \headerfile <x86intrin.h>
1519 ///
1520 /// This intrinsic is a utility function and does not correspond to a specific
1521 ///    instruction.
1522 ///
1523 /// \param __b0
1524 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
1525 /// \param __b1
1526 ///    An 8-bit integer value used to initialize bits [15:8] of the result.
1527 /// \param __b2
1528 ///    An 8-bit integer value used to initialize bits [23:16] of the result.
1529 /// \param __b3
1530 ///    An 8-bit integer value used to initialize bits [31:24] of the result.
1531 /// \param __b4
1532 ///    An 8-bit integer value used to initialize bits [39:32] of the result.
1533 /// \param __b5
1534 ///    An 8-bit integer value used to initialize bits [47:40] of the result.
1535 /// \param __b6
1536 ///    An 8-bit integer value used to initialize bits [55:48] of the result.
1537 /// \param __b7
1538 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
1539 /// \returns An initialized 64-bit integer vector.
1540 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
1541 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
1542              char __b6, char __b7) {
1543   return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1544 }
1545 
/* The helper macros below are implementation details of this header; undefine
 * them so they do not leak into user code. __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 * is used by the intrinsics above (so it is defined earlier in this header)
 * and is undefined here as well for consistency -- leaking it would collide
 * with the sibling SSE headers, which define a macro of the same name with a
 * different expansion. */
#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS_SSE2
#undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
1549 
/* Aliases for compatibility: the _m_* spellings are the original Intel MMX
 * intrinsic names; each expands to the corresponding _mm_* intrinsic defined
 * above. */
/* State management and scalar <-> __m64 conversions. */
#define _m_empty _mm_empty
#define _m_from_int _mm_cvtsi32_si64
#define _m_from_int64 _mm_cvtsi64_m64
#define _m_to_int _mm_cvtsi64_si32
#define _m_to_int64 _mm_cvtm64_si64
/* Pack with saturation. */
#define _m_packsswb _mm_packs_pi16
#define _m_packssdw _mm_packs_pi32
#define _m_packuswb _mm_packs_pu16
/* Unpack (interleave) high/low halves. */
#define _m_punpckhbw _mm_unpackhi_pi8
#define _m_punpckhwd _mm_unpackhi_pi16
#define _m_punpckhdq _mm_unpackhi_pi32
#define _m_punpcklbw _mm_unpacklo_pi8
#define _m_punpcklwd _mm_unpacklo_pi16
#define _m_punpckldq _mm_unpacklo_pi32
/* Addition: wrapping, signed-saturating, unsigned-saturating. */
#define _m_paddb _mm_add_pi8
#define _m_paddw _mm_add_pi16
#define _m_paddd _mm_add_pi32
#define _m_paddsb _mm_adds_pi8
#define _m_paddsw _mm_adds_pi16
#define _m_paddusb _mm_adds_pu8
#define _m_paddusw _mm_adds_pu16
/* Subtraction: wrapping, signed-saturating, unsigned-saturating. */
#define _m_psubb _mm_sub_pi8
#define _m_psubw _mm_sub_pi16
#define _m_psubd _mm_sub_pi32
#define _m_psubsb _mm_subs_pi8
#define _m_psubsw _mm_subs_pi16
#define _m_psubusb _mm_subs_pu8
#define _m_psubusw _mm_subs_pu16
/* Multiplication. */
#define _m_pmaddwd _mm_madd_pi16
#define _m_pmulhw _mm_mulhi_pi16
#define _m_pmullw _mm_mullo_pi16
/* Shifts: left logical, right arithmetic, right logical (register / imm). */
#define _m_psllw _mm_sll_pi16
#define _m_psllwi _mm_slli_pi16
#define _m_pslld _mm_sll_pi32
#define _m_pslldi _mm_slli_pi32
#define _m_psllq _mm_sll_si64
#define _m_psllqi _mm_slli_si64
#define _m_psraw _mm_sra_pi16
#define _m_psrawi _mm_srai_pi16
#define _m_psrad _mm_sra_pi32
#define _m_psradi _mm_srai_pi32
#define _m_psrlw _mm_srl_pi16
#define _m_psrlwi _mm_srli_pi16
#define _m_psrld _mm_srl_pi32
#define _m_psrldi _mm_srli_pi32
#define _m_psrlq _mm_srl_si64
#define _m_psrlqi _mm_srli_si64
/* Bitwise logic. */
#define _m_pand _mm_and_si64
#define _m_pandn _mm_andnot_si64
#define _m_por _mm_or_si64
#define _m_pxor _mm_xor_si64
/* Element-wise comparisons. */
#define _m_pcmpeqb _mm_cmpeq_pi8
#define _m_pcmpeqw _mm_cmpeq_pi16
#define _m_pcmpeqd _mm_cmpeq_pi32
#define _m_pcmpgtb _mm_cmpgt_pi8
#define _m_pcmpgtw _mm_cmpgt_pi16
#define _m_pcmpgtd _mm_cmpgt_pi32
1607 #define _m_pcmpgtd _mm_cmpgt_pi32
1608 
1609 #endif /* __MMINTRIN_H */
1610 
1611