/* Copyright (C) 2003-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

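/* Absolute value of each signed element of __A.  The __m64 (_pi)
   variants splat the 64-bit operand into a full vector, operate on it,
   and return one doubleword of the result.  */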
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

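/* Concatenate __A (high half) and __B (low half) into a 32-byte value
   and return the 16 bytes starting at byte offset __count.  Counts of
   32 or more yield zero; a compile-time constant count below 16 maps
   directly to a single vec_sld.  */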
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
	{
	  const __v16qu __zero = { 0 };
	  return (__m128i) __zero;
	}
      else
	{
	  const __v16qu __shift =
	    vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
	  return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
	  return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
	}
    }
  else
    {
      const __v16qu __shiftA =
	vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

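/* Horizontal addition: add adjacent pairs of elements within __A and
   within __B and interleave the results.  The _mm_hadds_* forms clamp
   each 16-bit sum with signed saturation.  */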
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

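/* Horizontal subtraction: for each adjacent pair, subtract the
   odd-indexed element from the even-indexed element.  The _mm_hsubs_*
   forms use signed saturation.  */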
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

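/* Shuffle the bytes of __A according to the selector bytes of __B; any
   byte whose selector has the high bit set is forced to zero.  */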
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

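/* Sign operations: negate each element of __A where the corresponding
   element of __B is negative, zero it where __B is zero, and keep it
   unchanged where __B is positive.  These are provided only when
   _ARCH_PWR8 is defined.  */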
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif

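/* Treat the bytes of __A as unsigned and the bytes of __B as signed,
   multiply corresponding bytes, and add adjacent pairs of the 16-bit
   products with signed saturation.  */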
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

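/* Multiply the signed 16-bit elements into 32-bit products, shift each
   product right by 14, add one and shift right by one more bit to
   round, then pack the low halfwords of the results.  */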
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#endif