xref: /llvm-project/clang/lib/Headers/tmmintrin.h (revision 3f25f23a2b8aaff300e751d4724a3ddba4d694eb)
1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __TMMINTRIN_H
11 #define __TMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <pmmintrin.h>
18 
19 /* Define the default attributes for the functions in this file: each
   intrinsic is always inlined, carries no debug info, is compiled for the
   "ssse3" target and declares a minimum vector width of 128 bits. When
   __EVEX512__ is defined and __AVX10_1_512__ is not, "no-evex512" is added
   to the target string as well. */
20 #if defined(__EVEX512__) && !defined(__AVX10_1_512__)
21 #define __DEFAULT_FN_ATTRS                                                     \
22   __attribute__((__always_inline__, __nodebug__,                               \
23                  __target__("ssse3,no-evex512"), __min_vector_width__(128)))
24 #else
25 #define __DEFAULT_FN_ATTRS                                                     \
26   __attribute__((__always_inline__, __nodebug__, __target__("ssse3"),          \
27                  __min_vector_width__(128)))
28 #endif
29 
/* __trunc64(x): reinterpret the 128-bit value x as [2 x i64] and yield its
   element 0 (the low 64 bits) as an __m64. */
30 #define __trunc64(x)                                                           \
31   (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
/* __anyext128(x): reinterpret the 64-bit value x as [2 x i32] and widen it to
   an __m128i whose two low 32-bit elements are x. The two high elements use
   shuffle index -1, which Clang documents as "don't care", so callers must
   not rely on their contents. */
32 #define __anyext128(x)                                                         \
33   (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
34                                     1, -1, -1)
35 
36 /// Computes the absolute value of each of the packed 8-bit signed
37 ///    integers in the source operand and stores the 8-bit unsigned integer
38 ///    results in the destination.
39 ///
40 /// \headerfile <x86intrin.h>
41 ///
42 /// This intrinsic corresponds to the \c PABSB instruction.
43 ///
44 /// \param __a
45 ///    A 64-bit vector of [8 x i8].
46 /// \returns A 64-bit integer vector containing the absolute values of the
47 ///    elements in the operand.
48 static __inline__ __m64 __DEFAULT_FN_ATTRS
49 _mm_abs_pi8(__m64 __a)
50 {
51   return (__m64)__builtin_elementwise_abs((__v8qs)__a);
52 }
53 
54 /// Computes the absolute value of each of the packed 8-bit signed
55 ///    integers in the source operand and stores the 8-bit unsigned integer
56 ///    results in the destination.
57 ///
58 /// \headerfile <x86intrin.h>
59 ///
60 /// This intrinsic corresponds to the \c VPABSB instruction.
61 ///
62 /// \param __a
63 ///    A 128-bit vector of [16 x i8].
64 /// \returns A 128-bit integer vector containing the absolute values of the
65 ///    elements in the operand.
66 static __inline__ __m128i __DEFAULT_FN_ATTRS
67 _mm_abs_epi8(__m128i __a)
68 {
69     return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
70 }
71 
72 /// Computes the absolute value of each of the packed 16-bit signed
73 ///    integers in the source operand and stores the 16-bit unsigned integer
74 ///    results in the destination.
75 ///
76 /// \headerfile <x86intrin.h>
77 ///
78 /// This intrinsic corresponds to the \c PABSW instruction.
79 ///
80 /// \param __a
81 ///    A 64-bit vector of [4 x i16].
82 /// \returns A 64-bit integer vector containing the absolute values of the
83 ///    elements in the operand.
84 static __inline__ __m64 __DEFAULT_FN_ATTRS
85 _mm_abs_pi16(__m64 __a)
86 {
87     return (__m64)__builtin_elementwise_abs((__v4hi)__a);
88 }
89 
90 /// Computes the absolute value of each of the packed 16-bit signed
91 ///    integers in the source operand and stores the 16-bit unsigned integer
92 ///    results in the destination.
93 ///
94 /// \headerfile <x86intrin.h>
95 ///
96 /// This intrinsic corresponds to the \c VPABSW instruction.
97 ///
98 /// \param __a
99 ///    A 128-bit vector of [8 x i16].
100 /// \returns A 128-bit integer vector containing the absolute values of the
101 ///    elements in the operand.
102 static __inline__ __m128i __DEFAULT_FN_ATTRS
103 _mm_abs_epi16(__m128i __a)
104 {
105     return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
106 }
107 
108 /// Computes the absolute value of each of the packed 32-bit signed
109 ///    integers in the source operand and stores the 32-bit unsigned integer
110 ///    results in the destination.
111 ///
112 /// \headerfile <x86intrin.h>
113 ///
114 /// This intrinsic corresponds to the \c PABSD instruction.
115 ///
116 /// \param __a
117 ///    A 64-bit vector of [2 x i32].
118 /// \returns A 64-bit integer vector containing the absolute values of the
119 ///    elements in the operand.
120 static __inline__ __m64 __DEFAULT_FN_ATTRS
121 _mm_abs_pi32(__m64 __a)
122 {
123     return (__m64)__builtin_elementwise_abs((__v2si)__a);
124 }
125 
126 /// Computes the absolute value of each of the packed 32-bit signed
127 ///    integers in the source operand and stores the 32-bit unsigned integer
128 ///    results in the destination.
129 ///
130 /// \headerfile <x86intrin.h>
131 ///
132 /// This intrinsic corresponds to the \c VPABSD instruction.
133 ///
134 /// \param __a
135 ///    A 128-bit vector of [4 x i32].
136 /// \returns A 128-bit integer vector containing the absolute values of the
137 ///    elements in the operand.
138 static __inline__ __m128i __DEFAULT_FN_ATTRS
139 _mm_abs_epi32(__m128i __a)
140 {
141     return (__m128i)__builtin_elementwise_abs((__v4si)__a);
142 }
143 
144 /// Concatenates the two 128-bit integer vector operands, and
145 ///    right-shifts the result by the number of bytes specified in the immediate
146 ///    operand.
147 ///
148 /// \headerfile <x86intrin.h>
149 ///
150 /// \code
151 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
152 /// \endcode
153 ///
154 /// This intrinsic corresponds to the \c PALIGNR instruction.
155 ///
156 /// \param a
157 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
158 /// \param b
159 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
160 /// \param n
161 ///    An immediate operand specifying how many bytes to right-shift the result.
162 /// \returns A 128-bit integer vector containing the concatenated right-shifted
163 ///    value.
///
/// This is a macro rather than a function: \a a and \a b are converted to
/// \c __m128i and reinterpreted as [16 x i8], and \a n is forwarded unchanged
/// to \c __builtin_ia32_palignr128. Because \a n is documented as an
/// immediate, pass an integer constant expression.
164 #define _mm_alignr_epi8(a, b, n) \
165   ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
166                                       (__v16qi)(__m128i)(b), (n)))
167 
168 /// Concatenates the two 64-bit integer vector operands, and right-shifts
169 ///    the result by the number of bytes specified in the immediate operand.
170 ///
171 /// \headerfile <x86intrin.h>
172 ///
173 /// \code
174 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
175 /// \endcode
176 ///
177 /// This intrinsic corresponds to the \c PALIGNR instruction.
178 ///
179 /// \param a
180 ///    A 64-bit vector of [8 x i8] that forms the upper 64 bits of the
///    concatenated value.
181 /// \param b
182 ///    A 64-bit vector of [8 x i8] that forms the lower 64 bits of the
///    concatenated value.
183 /// \param n
184 ///    An immediate operand specifying how many bytes to right-shift the result.
185 /// \returns A 64-bit integer vector containing the concatenated right-shifted
186 ///    value.
///
/// The expansion builds a 128-bit [2 x i64] vector with \a b in element 0 and
/// \a a in element 1, shifts it right by \a n bytes with
/// \c __builtin_ia32_psrldqi128_byteshift, and returns element 0 of the
/// shifted vector as an \c __m64.
187 #define _mm_alignr_pi8(a, b, n) \
188   ((__m64)__builtin_shufflevector(                                       \
189        __builtin_ia32_psrldqi128_byteshift(                              \
190            __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0),      \
191            (n)), __extension__ (__v2di){}, 0))
192 
193 /// Horizontally adds the adjacent pairs of values contained in 2 packed
194 ///    128-bit vectors of [8 x i16].
195 ///
196 /// \headerfile <x86intrin.h>
197 ///
198 /// This intrinsic corresponds to the \c VPHADDW instruction.
199 ///
200 /// \param __a
201 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
202 ///    horizontal sums of the values are stored in the lower bits of the
203 ///    destination.
204 /// \param __b
205 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
206 ///    horizontal sums of the values are stored in the upper bits of the
207 ///    destination.
208 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
209 ///    both operands.
210 static __inline__ __m128i __DEFAULT_FN_ATTRS
211 _mm_hadd_epi16(__m128i __a, __m128i __b)
212 {
213     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
214 }
215 
216 /// Horizontally adds the adjacent pairs of values contained in 2 packed
217 ///    128-bit vectors of [4 x i32].
218 ///
219 /// \headerfile <x86intrin.h>
220 ///
221 /// This intrinsic corresponds to the \c VPHADDD instruction.
222 ///
223 /// \param __a
224 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
225 ///    horizontal sums of the values are stored in the lower bits of the
226 ///    destination.
227 /// \param __b
228 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
229 ///    horizontal sums of the values are stored in the upper bits of the
230 ///    destination.
231 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
232 ///    both operands.
233 static __inline__ __m128i __DEFAULT_FN_ATTRS
234 _mm_hadd_epi32(__m128i __a, __m128i __b)
235 {
236     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
237 }
238 
239 /// Horizontally adds the adjacent pairs of values contained in 2 packed
240 ///    64-bit vectors of [4 x i16].
241 ///
242 /// \headerfile <x86intrin.h>
243 ///
244 /// This intrinsic corresponds to the \c PHADDW instruction.
245 ///
246 /// \param __a
247 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
248 ///    horizontal sums of the values are stored in the lower bits of the
249 ///    destination.
250 /// \param __b
251 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
252 ///    horizontal sums of the values are stored in the upper bits of the
253 ///    destination.
254 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
255 ///    operands.
256 static __inline__ __m64 __DEFAULT_FN_ATTRS
257 _mm_hadd_pi16(__m64 __a, __m64 __b)
258 {
259     return __trunc64(__builtin_ia32_phaddw128(
260         (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
261 }
262 
263 /// Horizontally adds the adjacent pairs of values contained in 2 packed
264 ///    64-bit vectors of [2 x i32].
265 ///
266 /// \headerfile <x86intrin.h>
267 ///
268 /// This intrinsic corresponds to the \c PHADDD instruction.
269 ///
270 /// \param __a
271 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
272 ///    horizontal sums of the values are stored in the lower bits of the
273 ///    destination.
274 /// \param __b
275 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
276 ///    horizontal sums of the values are stored in the upper bits of the
277 ///    destination.
278 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
279 ///    operands.
280 static __inline__ __m64 __DEFAULT_FN_ATTRS
281 _mm_hadd_pi32(__m64 __a, __m64 __b)
282 {
283     return __trunc64(__builtin_ia32_phaddd128(
284         (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
285 }
286 
287 /// Horizontally adds, with saturation, the adjacent pairs of values contained
288 ///    in two packed 128-bit vectors of [8 x i16].
289 ///
290 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
291 ///    less than 0x8000 are saturated to 0x8000.
292 ///
293 /// \headerfile <x86intrin.h>
294 ///
295 /// This intrinsic corresponds to the \c VPHADDSW instruction.
296 ///
297 /// \param __a
298 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
299 ///    horizontal sums of the values are stored in the lower bits of the
300 ///    destination.
301 /// \param __b
302 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
303 ///    horizontal sums of the values are stored in the upper bits of the
304 ///    destination.
305 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
306 ///    sums of both operands.
307 static __inline__ __m128i __DEFAULT_FN_ATTRS
308 _mm_hadds_epi16(__m128i __a, __m128i __b)
309 {
310     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
311 }
312 
313 /// Horizontally adds, with saturation, the adjacent pairs of values contained
314 ///    in two packed 64-bit vectors of [4 x i16].
315 ///
316 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
317 ///    less than 0x8000 are saturated to 0x8000.
318 ///
319 /// \headerfile <x86intrin.h>
320 ///
321 /// This intrinsic corresponds to the \c PHADDSW instruction.
322 ///
323 /// \param __a
324 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
325 ///    horizontal sums of the values are stored in the lower bits of the
326 ///    destination.
327 /// \param __b
328 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
329 ///    horizontal sums of the values are stored in the upper bits of the
330 ///    destination.
331 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
332 ///    sums of both operands.
333 static __inline__ __m64 __DEFAULT_FN_ATTRS
334 _mm_hadds_pi16(__m64 __a, __m64 __b)
335 {
336     return __trunc64(__builtin_ia32_phaddsw128(
337         (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
338 }
339 
340 /// Horizontally subtracts the adjacent pairs of values contained in 2
341 ///    packed 128-bit vectors of [8 x i16].
342 ///
343 /// \headerfile <x86intrin.h>
344 ///
345 /// This intrinsic corresponds to the \c VPHSUBW instruction.
346 ///
347 /// \param __a
348 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
349 ///    horizontal differences between the values are stored in the lower bits of
350 ///    the destination.
351 /// \param __b
352 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
353 ///    horizontal differences between the values are stored in the upper bits of
354 ///    the destination.
355 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
356 ///    of both operands.
357 static __inline__ __m128i __DEFAULT_FN_ATTRS
358 _mm_hsub_epi16(__m128i __a, __m128i __b)
359 {
360     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
361 }
362 
363 /// Horizontally subtracts the adjacent pairs of values contained in 2
364 ///    packed 128-bit vectors of [4 x i32].
365 ///
366 /// \headerfile <x86intrin.h>
367 ///
368 /// This intrinsic corresponds to the \c VPHSUBD instruction.
369 ///
370 /// \param __a
371 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
372 ///    horizontal differences between the values are stored in the lower bits of
373 ///    the destination.
374 /// \param __b
375 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
376 ///    horizontal differences between the values are stored in the upper bits of
377 ///    the destination.
378 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
379 ///    of both operands.
380 static __inline__ __m128i __DEFAULT_FN_ATTRS
381 _mm_hsub_epi32(__m128i __a, __m128i __b)
382 {
383     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
384 }
385 
386 /// Horizontally subtracts the adjacent pairs of values contained in 2
387 ///    packed 64-bit vectors of [4 x i16].
388 ///
389 /// \headerfile <x86intrin.h>
390 ///
391 /// This intrinsic corresponds to the \c PHSUBW instruction.
392 ///
393 /// \param __a
394 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
395 ///    horizontal differences between the values are stored in the lower bits of
396 ///    the destination.
397 /// \param __b
398 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
399 ///    horizontal differences between the values are stored in the upper bits of
400 ///    the destination.
401 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
402 ///    of both operands.
403 static __inline__ __m64 __DEFAULT_FN_ATTRS
404 _mm_hsub_pi16(__m64 __a, __m64 __b)
405 {
406     return __trunc64(__builtin_ia32_phsubw128(
407         (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
408 }
409 
410 /// Horizontally subtracts the adjacent pairs of values contained in 2
411 ///    packed 64-bit vectors of [2 x i32].
412 ///
413 /// \headerfile <x86intrin.h>
414 ///
415 /// This intrinsic corresponds to the \c PHSUBD instruction.
416 ///
417 /// \param __a
418 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
419 ///    horizontal differences between the values are stored in the lower bits of
420 ///    the destination.
421 /// \param __b
422 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
423 ///    horizontal differences between the values are stored in the upper bits of
424 ///    the destination.
425 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
426 ///    of both operands.
427 static __inline__ __m64 __DEFAULT_FN_ATTRS
428 _mm_hsub_pi32(__m64 __a, __m64 __b)
429 {
430     return __trunc64(__builtin_ia32_phsubd128(
431         (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
432 }
433 
434 /// Horizontally subtracts, with saturation, the adjacent pairs of values
435 ///    contained in two packed 128-bit vectors of [8 x i16].
436 ///
437 ///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
438 ///    Negative differences less than 0x8000 are saturated to 0x8000.
439 ///
440 /// \headerfile <x86intrin.h>
441 ///
442 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
443 ///
444 /// \param __a
445 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
446 ///    horizontal differences between the values are stored in the lower bits of
447 ///    the destination.
448 /// \param __b
449 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
450 ///    horizontal differences between the values are stored in the upper bits of
451 ///    the destination.
452 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
453 ///    differences of both operands.
454 static __inline__ __m128i __DEFAULT_FN_ATTRS
455 _mm_hsubs_epi16(__m128i __a, __m128i __b)
456 {
457     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
458 }
459 
460 /// Horizontally subtracts, with saturation, the adjacent pairs of values
461 ///    contained in two packed 64-bit vectors of [4 x i16].
462 ///
463 ///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
464 ///    Negative differences less than 0x8000 are saturated to 0x8000.
465 ///
466 /// \headerfile <x86intrin.h>
467 ///
468 /// This intrinsic corresponds to the \c PHSUBSW instruction.
469 ///
470 /// \param __a
471 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
472 ///    horizontal differences between the values are stored in the lower bits of
473 ///    the destination.
474 /// \param __b
475 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
476 ///    horizontal differences between the values are stored in the upper bits of
477 ///    the destination.
478 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
479 ///    differences of both operands.
480 static __inline__ __m64 __DEFAULT_FN_ATTRS
481 _mm_hsubs_pi16(__m64 __a, __m64 __b)
482 {
483     return __trunc64(__builtin_ia32_phsubsw128(
484         (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
485 }
486 
487 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
488 ///    values contained in the first source operand and packed 8-bit signed
489 ///    integer values contained in the second source operand, adds pairs of
490 ///    contiguous products with signed saturation, and writes the 16-bit sums to
491 ///    the corresponding bits in the destination.
492 ///
493 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
494 ///    both operands are multiplied, and the sum of both results is written to
495 ///    bits [15:0] of the destination.
496 ///
497 /// \headerfile <x86intrin.h>
498 ///
499 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
500 ///
501 /// \param __a
502 ///    A 128-bit integer vector containing the first source operand.
503 /// \param __b
504 ///    A 128-bit integer vector containing the second source operand.
505 /// \returns A 128-bit integer vector containing the sums of products of both
506 ///    operands: \n
507 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
508 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
509 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
510 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
511 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
512 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
513 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
514 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
515 static __inline__ __m128i __DEFAULT_FN_ATTRS
516 _mm_maddubs_epi16(__m128i __a, __m128i __b)
517 {
518     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
519 }
520 
521 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
522 ///    values contained in the first source operand and packed 8-bit signed
523 ///    integer values contained in the second source operand, adds pairs of
524 ///    contiguous products with signed saturation, and writes the 16-bit sums to
525 ///    the corresponding bits in the destination.
526 ///
527 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
528 ///    both operands are multiplied, and the sum of both results is written to
529 ///    bits [15:0] of the destination.
530 ///
531 /// \headerfile <x86intrin.h>
532 ///
533 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
534 ///
535 /// \param __a
536 ///    A 64-bit integer vector containing the first source operand.
537 /// \param __b
538 ///    A 64-bit integer vector containing the second source operand.
539 /// \returns A 64-bit integer vector containing the sums of products of both
540 ///    operands: \n
541 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
542 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
543 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
544 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
545 static __inline__ __m64 __DEFAULT_FN_ATTRS
546 _mm_maddubs_pi16(__m64 __a, __m64 __b)
547 {
548     return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
549                                                  (__v16qi)__anyext128(__b)));
550 }
551 
552 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
553 ///    products to the 18 most significant bits by right-shifting, rounds the
554 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
555 ///
556 /// \headerfile <x86intrin.h>
557 ///
558 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
559 ///
560 /// \param __a
561 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
562 /// \param __b
563 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
564 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
565 ///    products of both operands.
566 static __inline__ __m128i __DEFAULT_FN_ATTRS
567 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
568 {
569     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
570 }
571 
572 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
573 ///    products to the 18 most significant bits by right-shifting, rounds the
574 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
575 ///
576 /// \headerfile <x86intrin.h>
577 ///
578 /// This intrinsic corresponds to the \c PMULHRSW instruction.
579 ///
580 /// \param __a
581 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
582 /// \param __b
583 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
584 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
585 ///    products of both operands.
586 static __inline__ __m64 __DEFAULT_FN_ATTRS
587 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
588 {
589     return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
590                                                 (__v8hi)__anyext128(__b)));
591 }
592 
593 /// Copies the 8-bit integers from a 128-bit integer vector to the
594 ///    destination or clears 8-bit values in the destination, as specified by
595 ///    the second source operand.
596 ///
597 /// \headerfile <x86intrin.h>
598 ///
599 /// This intrinsic corresponds to the \c VPSHUFB instruction.
600 ///
601 /// \param __a
602 ///    A 128-bit integer vector containing the values to be copied.
603 /// \param __b
604 ///    A 128-bit integer vector containing control bytes corresponding to
605 ///    positions in the destination:
606 ///    Bit 7: \n
607 ///    1: Clear the corresponding byte in the destination. \n
608 ///    0: Copy the selected source byte to the corresponding byte in the
609 ///    destination. \n
610 ///    Bits [6:4] Reserved.  \n
611 ///    Bits [3:0] select the source byte to be copied.
612 /// \returns A 128-bit integer vector containing the copied or cleared values.
613 static __inline__ __m128i __DEFAULT_FN_ATTRS
614 _mm_shuffle_epi8(__m128i __a, __m128i __b)
615 {
616     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
617 }
618 
619 /// Copies the 8-bit integers from a 64-bit integer vector to the
620 ///    destination or clears 8-bit values in the destination, as specified by
621 ///    the second source operand.
622 ///
623 /// \headerfile <x86intrin.h>
624 ///
625 /// This intrinsic corresponds to the \c PSHUFB instruction.
626 ///
627 /// \param __a
628 ///    A 64-bit integer vector containing the values to be copied.
629 /// \param __b
630 ///    A 64-bit integer vector containing control bytes corresponding to
631 ///    positions in the destination:
632 ///    Bit 7: \n
633 ///    1: Clear the corresponding byte in the destination. \n
634 ///    0: Copy the selected source byte to the corresponding byte in the
635 ///    destination. \n
636 ///    Bits [2:0] select the source byte to be copied.
637 /// \returns A 64-bit integer vector containing the copied or cleared values.
638 static __inline__ __m64 __DEFAULT_FN_ATTRS
639 _mm_shuffle_pi8(__m64 __a, __m64 __b)
640 {
641     return __trunc64(__builtin_ia32_pshufb128(
642         (__v16qi)__builtin_shufflevector(
643             (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
644         (__v16qi)__anyext128(__b)));
645 }
646 
647 /// For each 8-bit integer in the first source operand, perform one of
648 ///    the following actions as specified by the second source operand.
649 ///
650 ///    If the byte in the second source is negative, calculate the two's
651 ///    complement of the corresponding byte in the first source, and write that
652 ///    value to the destination. If the byte in the second source is positive,
653 ///    copy the corresponding byte from the first source to the destination. If
654 ///    the byte in the second source is zero, clear the corresponding byte in
655 ///    the destination.
656 ///
657 /// \headerfile <x86intrin.h>
658 ///
659 /// This intrinsic corresponds to the \c VPSIGNB instruction.
660 ///
661 /// \param __a
662 ///    A 128-bit integer vector containing the values to be copied.
663 /// \param __b
664 ///    A 128-bit integer vector containing control bytes corresponding to
665 ///    positions in the destination.
666 /// \returns A 128-bit integer vector containing the resultant values.
667 static __inline__ __m128i __DEFAULT_FN_ATTRS
668 _mm_sign_epi8(__m128i __a, __m128i __b)
669 {
670     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
671 }
672 
673 /// For each 16-bit integer in the first source operand, perform one of
674 ///    the following actions as specified by the second source operand.
675 ///
676 ///    If the word in the second source is negative, calculate the two's
677 ///    complement of the corresponding word in the first source, and write that
678 ///    value to the destination. If the word in the second source is positive,
679 ///    copy the corresponding word from the first source to the destination. If
680 ///    the word in the second source is zero, clear the corresponding word in
681 ///    the destination.
682 ///
683 /// \headerfile <x86intrin.h>
684 ///
685 /// This intrinsic corresponds to the \c VPSIGNW instruction.
686 ///
687 /// \param __a
688 ///    A 128-bit integer vector containing the values to be copied.
689 /// \param __b
690 ///    A 128-bit integer vector containing control words corresponding to
691 ///    positions in the destination.
692 /// \returns A 128-bit integer vector containing the resultant values.
693 static __inline__ __m128i __DEFAULT_FN_ATTRS
694 _mm_sign_epi16(__m128i __a, __m128i __b)
695 {
696     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
697 }
698 
699 /// For each 32-bit integer in the first source operand, perform one of
700 ///    the following actions as specified by the second source operand.
701 ///
702 ///    If the doubleword in the second source is negative, calculate the two's
703 ///    complement of the corresponding word in the first source, and write that
704 ///    value to the destination. If the doubleword in the second source is
705 ///    positive, copy the corresponding word from the first source to the
706 ///    destination. If the doubleword in the second source is zero, clear the
707 ///    corresponding word in the destination.
708 ///
709 /// \headerfile <x86intrin.h>
710 ///
711 /// This intrinsic corresponds to the \c VPSIGND instruction.
712 ///
713 /// \param __a
714 ///    A 128-bit integer vector containing the values to be copied.
715 /// \param __b
716 ///    A 128-bit integer vector containing control doublewords corresponding to
717 ///    positions in the destination.
718 /// \returns A 128-bit integer vector containing the resultant values.
719 static __inline__ __m128i __DEFAULT_FN_ATTRS
720 _mm_sign_epi32(__m128i __a, __m128i __b)
721 {
722     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
723 }
724 
725 /// For each 8-bit integer in the first source operand, perform one of
726 ///    the following actions as specified by the second source operand.
727 ///
728 ///    If the byte in the second source is negative, calculate the two's
729 ///    complement of the corresponding byte in the first source, and write that
730 ///    value to the destination. If the byte in the second source is positive,
731 ///    copy the corresponding byte from the first source to the destination. If
732 ///    the byte in the second source is zero, clear the corresponding byte in
733 ///    the destination.
734 ///
735 /// \headerfile <x86intrin.h>
736 ///
737 /// This intrinsic corresponds to the \c PSIGNB instruction.
738 ///
739 /// \param __a
740 ///    A 64-bit integer vector containing the values to be copied.
741 /// \param __b
742 ///    A 64-bit integer vector containing control bytes corresponding to
743 ///    positions in the destination.
744 /// \returns A 64-bit integer vector containing the resultant values.
745 static __inline__ __m64 __DEFAULT_FN_ATTRS
746 _mm_sign_pi8(__m64 __a, __m64 __b)
747 {
748     return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
749                                               (__v16qi)__anyext128(__b)));
750 }
751 
752 /// For each 16-bit integer in the first source operand, perform one of
753 ///    the following actions as specified by the second source operand.
754 ///
755 ///    If the word in the second source is negative, calculate the two's
756 ///    complement of the corresponding word in the first source, and write that
757 ///    value to the destination. If the word in the second source is positive,
758 ///    copy the corresponding word from the first source to the destination. If
759 ///    the word in the second source is zero, clear the corresponding word in
760 ///    the destination.
761 ///
762 /// \headerfile <x86intrin.h>
763 ///
764 /// This intrinsic corresponds to the \c PSIGNW instruction.
765 ///
766 /// \param __a
767 ///    A 64-bit integer vector containing the values to be copied.
768 /// \param __b
769 ///    A 64-bit integer vector containing control words corresponding to
770 ///    positions in the destination.
771 /// \returns A 64-bit integer vector containing the resultant values.
772 static __inline__ __m64 __DEFAULT_FN_ATTRS
773 _mm_sign_pi16(__m64 __a, __m64 __b)
774 {
775     return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
776                                               (__v8hi)__anyext128(__b)));
777 }
778 
779 /// For each 32-bit integer in the first source operand, perform one of
780 ///    the following actions as specified by the second source operand.
781 ///
782 ///    If the doubleword in the second source is negative, calculate the two's
783 ///    complement of the corresponding doubleword in the first source, and
784 ///    write that value to the destination. If the doubleword in the second
785 ///    source is positive, copy the corresponding doubleword from the first
786 ///    source to the destination. If the doubleword in the second source is
787 ///    zero, clear the corresponding doubleword in the destination.
788 ///
789 /// \headerfile <x86intrin.h>
790 ///
791 /// This intrinsic corresponds to the \c PSIGND instruction.
792 ///
793 /// \param __a
794 ///    A 64-bit integer vector containing the values to be copied.
795 /// \param __b
796 ///    A 64-bit integer vector containing two control doublewords corresponding
797 ///    to positions in the destination.
798 /// \returns A 64-bit integer vector containing the resultant values.
799 static __inline__ __m64 __DEFAULT_FN_ATTRS
800 _mm_sign_pi32(__m64 __a, __m64 __b)
801 {
802     return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
803                                               (__v4si)__anyext128(__b)));
804 }
805 
/* Remove the helper macros defined near the top of this header so that they
   are no longer defined after it has been included. */
806 #undef __anyext128
807 #undef __trunc64
808 #undef __DEFAULT_FN_ATTRS
809 
810 #endif /* __TMMINTRIN_H */
811