/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
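
/* Usage sketch (illustrative, not part of the original header): the error
   above is suppressed by defining NO_WARN_X86_INTRINSICS before this header
   is included, either on the command line or in the source file, e.g.:

     // cc -DNO_WARN_X86_INTRINSICS -mcpu=power8 port.c
     #define NO_WARN_X86_INTRINSICS
     #include <smmintrin.h>
 */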

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
  case _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC: /* _MM_FROUND_NEARBYINT.  */
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}
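
/* Usage sketch (illustrative, not part of the original header): round a
   vector of doubles to the nearest integer value while suppressing
   floating-point exceptions; the variable names are hypothetical.

     __m128d __vals = _mm_set_pd(2.5, -1.5);
     __m128d __rounded =
         _mm_round_pd(__vals, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */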

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
  case _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC: /* _MM_FROUND_NEARBYINT.  */
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
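
/* Usage sketch (illustrative, not part of the original header): the ceil and
   floor convenience macros expand to _mm_round_* calls with the matching
   rounding mode; the variable names are hypothetical.

     __m128 __v = _mm_set_ps(1.2f, -1.2f, 3.7f, -3.7f);
     __m128 __up = _mm_ceil_ps(__v);    // per-element ceiling
     __m128 __down = _mm_floor_ps(__v); // per-element floor
 */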

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
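
/* Usage sketch (illustrative, not part of the original header): the insert
   and extract helpers operate on one lane selected by the index argument;
   the variable names are hypothetical.

     __m128i __v = _mm_set1_epi32(0);
     __v = _mm_insert_epi32(__v, 42, 2);      // write 42 into 32-bit lane 2
     int __lane2 = _mm_extract_epi32(__v, 2); // reads back 42
 */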

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qi __charmask = vec_splats((signed char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}
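
/* Usage sketch (illustrative, not part of the original header): for
   _mm_blendv_epi8 the high bit of each mask byte selects the source, so a
   byte of 0x80 or above takes the second operand and anything below takes
   the first; the variable names are hypothetical.

     __m128i __a = _mm_set1_epi8(1);
     __m128i __b = _mm_set1_epi8(2);
     __m128i __mask = _mm_set1_epi8((char)0x80); // all lanes select __b
     __m128i __mixed = _mm_blendv_epi8(__a, __b, __mask);
 */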

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
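
/* Usage sketch (illustrative, not part of the original header): unlike the
   x86 PTEST instruction, these helpers return plain int results rather than
   setting condition flags; the variable names are hypothetical.

     __m128i __a = _mm_set1_epi32(0x0f0f0f0f);
     __m128i __b = _mm_set1_epi32((int)0xf0f0f0f0);
     int __no_overlap = _mm_testz_si128(__a, __b);            // 1: a & b == 0
     int __all_ones = _mm_test_all_ones(_mm_set1_epi32(-1));  // 1
 */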

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif
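
/* Usage sketch (illustrative, not part of the original header):
   _mm_mullo_epi32 keeps the low 32 bits of each of the four products, while
   _mm_mul_epi32 (available when _ARCH_PWR8 is defined) produces two full
   64-bit products from the even-numbered lanes; the variable names are
   hypothetical.

     __m128i __x = _mm_set1_epi32(100000);
     __m128i __lo = _mm_mullo_epi32(__x, __x); // four truncated products
     __m128i __wide = _mm_mul_epi32(__x, __x); // two 64-bit products
 */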

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}
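
/* Usage sketch (illustrative, not part of the original header): the
   _mm_cvtepi*_* helpers sign-extend and the _mm_cvtepu*_* helpers
   zero-extend the low elements of their argument; the variable names are
   hypothetical.

     __m128i __bytes = _mm_set1_epi8((char)0xff);
     __m128i __signed16 = _mm_cvtepi8_epi16(__bytes);   // each lane is -1
     __m128i __unsigned16 = _mm_cvtepu8_epi16(__bytes); // each lane is 255
 */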

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
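
/* Usage sketch (illustrative, not part of the original header): the minimum
   lands in the low word of the result and its lane index in the next word,
   matching the layout described above; the variable names are hypothetical.

     __m128i __words = _mm_set_epi16(1, 8, 7, 6, 5, 4, 3, 2);
     __m128i __res = _mm_minpos_epu16(__words);
     int __min = _mm_extract_epi16(__res, 0);   // 1
     int __index = _mm_extract_epi16(__res, 1); // 7
 */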

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))   \
        */

#endif /* SMMINTRIN_H_ */